├── LICENSE.txt ├── README.rst ├── docs ├── Makefile ├── acknowledgements.rst ├── blocks.rst ├── blocks_proper.rst ├── calculating_covariants.rst ├── conf.py ├── constructor_or_non_standard_sequence.rst ├── custom_regressors_into_purifiers.rst ├── defines.rst ├── examples.rst ├── getting_insights_about_the_model.rst ├── hello_user.rst ├── index.rst ├── installation.rst ├── make.bat ├── nice_abstract.rst ├── sequential_fitting.rst ├── symlinks │ └── constructing_machine_learning_potential.ipynb ├── theory.rst └── utilities.rst ├── enumerate ├── README.md ├── indep-1-1.dat ├── indep-1-8.dat ├── indep-7-1.dat ├── indep-7-4.dat ├── linear_reduce.nb └── polynomial_reduce.nb ├── examples ├── methane_home_pc.ipynb ├── methane_medium.ipynb ├── methane_small.ipynb ├── qm9_home_pc.ipynb └── qm9_small.ipynb ├── nice ├── __init__.py ├── blocks │ ├── __init__.py │ ├── compressors.py │ ├── expansioners.py │ ├── grouping.py │ ├── miscellaneous.py │ └── purifiers.py ├── clebsch_gordan.py ├── contracted_pca.py ├── nice_utilities.pxd ├── nice_utilities.pyx ├── packing.pyx ├── rascal_coefficients.pyx ├── thresholding.pyx ├── unrolling_individual_pca.pyx ├── unrolling_pca.pyx └── utilities.py ├── pyproject.toml ├── reference_configurations ├── methane_100.extxyz └── readme.txt ├── requirements.txt ├── setup.py ├── tests ├── compare_kernels.py └── readme.txt ├── tutorials ├── calculating_covariants.ipynb ├── constructing_machine_learning_potential.ipynb ├── constructor_or_non_standard_sequence.ipynb ├── custom_regressors_into_purifiers.ipynb ├── getting_insights_about_the_model.ipynb └── sequential_fitting.ipynb └── update_docs.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Sergey Pozdnyakov, Jigyasa Nigam, Michele Ceriotti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. role:: bash(code) 2 | :language: bash 3 | 4 | NICE 5 | ==== 6 | 7 | NICE (N-body Iteratively Contracted Equivariants) is a set of tools designed for the calculation of 8 | invariant and covariant atomic structure representations. It allows for 9 | automatic selection of the most informative combinations of high order spectrum elements 10 | and performs their efficient computation using recurrence relations. 
11 | 12 | Although it is designed specifically for atomistic machine learning, NICE in principle 13 | can be applied to other machine learning tasks, such as those that involve signals in a ball or on a sphere, all of which require invariant or covariant outputs. 14 | 15 | ++++++++++++ 16 | Installation 17 | ++++++++++++ 18 | 19 | 1. Install `librascal `_ 20 | 2. git clone or download archive with nice and unpack 21 | 3. cd to root nice directory and run :bash:`pip3 install .` 22 | 23 | +++++++++++++ 24 | Documentation 25 | +++++++++++++ 26 | 27 | Documentation can be found `here `_ 28 | 29 | ++++++++++ 30 | References 31 | ++++++++++ 32 | 33 | If you are using NICE, please cite `this article <https://aip.scitation.org/doi/10.1063/5.0021116>`_. 34 | 35 | [1] Jigyasa Nigam, Sergey Pozdnyakov, and Michele Ceriotti. "Recursive evaluation and iterative contraction of N-body equivariant features." The Journal of Chemical Physics 153.12 (2020): 121101. 36 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = ../../build/ 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/acknowledgements.rst: -------------------------------------------------------------------------------- 1 | Acknowledgements 2 | ================ 3 | We are very thankful to Felix Musil for valuable advice, especially 4 | on tools for building documentation. 5 | -------------------------------------------------------------------------------- /docs/blocks.rst: -------------------------------------------------------------------------------- 1 | Blocks 2 | ====== 3 | 4 | NICE 5 | ---------------------------- 6 | 7 | .. automodule:: blocks.grouping 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | Compressors 13 | ------------------------------- 14 | 15 | .. automodule:: blocks.compressors 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | Expansioners 21 | -------------------------------- 22 | 23 | .. automodule:: blocks.expansioners 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | Purifiers 29 | ----------------------------- 30 | 31 | .. automodule:: blocks.purifiers 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | Miscellaneous 37 | --------------------------------- 38 | 39 | .. automodule:: blocks.miscellaneous 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | -------------------------------------------------------------------------------- /docs/blocks_proper.rst: -------------------------------------------------------------------------------- 1 | NICE 2 | ---------------------------- 3 | 4 | ..
automodule:: blocks.grouping 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Compressors 10 | ------------------------------- 11 | 12 | .. automodule:: blocks.compressors 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | 17 | Expansioners 18 | -------------------------------- 19 | 20 | .. automodule:: blocks.expansioners 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | Purifiers 26 | ----------------------------- 27 | 28 | .. automodule:: blocks.purifiers 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | Miscellaneous 34 | --------------------------------- 35 | 36 | .. automodule:: blocks.miscellaneous 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | -------------------------------------------------------------------------------- /docs/calculating_covariants.rst: -------------------------------------------------------------------------------- 1 | .. include:: cutted/calculating_covariants_before_collapsible/calculating_covariants_before_collapsible.rst 2 | 3 | .. admonition:: Preliminaries 4 | :class: toggle 5 | 6 | .. include:: cutted/calculating_covariants_collapsible/calculating_covariants_collapsible.rst 7 | 8 | .. include:: cutted/calculating_covariants_after_collapsible/calculating_covariants_after_collapsible.rst 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('.')) 16 | sys.path.insert(0, os.path.abspath('../nice')) 17 | sys.path.insert(0, os.path.abspath('../nice/blocks')) 18 | 19 | import sphinx_rtd_theme 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'NICE' 23 | copyright = '2020, Jigyasa Nigam, Sergey Pozdnyakov, Michele Ceriotti' 24 | author = 'Jigyasa Nigam, Sergey Pozdnyakov, Michele Ceriotti' 25 | 26 | 27 | # -- General configuration --------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | 33 | 34 | 35 | extensions = [ 36 | "sphinx_rtd_theme", "nbsphinx", "sphinxcontrib.napoleon", "sphinx_togglebutton" 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | 50 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 51 | # a list of builtin themes. 52 | # 53 | html_theme = 'sphinx_rtd_theme' 54 | 55 | # Add any paths that contain custom static files (such as style sheets) here, 56 | # relative to this directory. They are copied after the builtin static files, 57 | # so a file named "default.css" will overwrite the builtin "default.css". 58 | html_static_path = ['_static'] 59 | -------------------------------------------------------------------------------- /docs/constructor_or_non_standard_sequence.rst: -------------------------------------------------------------------------------- 1 | .. include:: cutted/constructor_or_non_standard_sequence_before_collapsible/constructor_or_non_standard_sequence_before_collapsible.rst 2 | 3 | 4 | .. admonition:: Preliminaries 5 | :class: toggle 6 | 7 | .. include:: cutted/constructor_or_non_standard_sequence_collapsible/constructor_or_non_standard_sequence_collapsible.rst 8 | 9 | 10 | .. include:: cutted/constructor_or_non_standard_sequence_after_collapsible/constructor_or_non_standard_sequence_after_collapsible.rst 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/custom_regressors_into_purifiers.rst: -------------------------------------------------------------------------------- 1 | .. include:: cutted/custom_regressors_into_purifiers_before_collapsible/custom_regressors_into_purifiers_before_collapsible.rst 2 | 3 | 4 | .. admonition:: Preliminaries 5 | :class: toggle 6 | 7 | .. include:: cutted/custom_regressors_into_purifiers_collapsible/custom_regressors_into_purifiers_collapsible.rst 8 | 9 | 10 | .. include:: cutted/custom_regressors_into_purifiers_after_collapsible/custom_regressors_into_purifiers_after_collapsible.rst 11 | 12 | -------------------------------------------------------------------------------- /docs/defines.rst: -------------------------------------------------------------------------------- 1 | .. role:: bash(code) 2 | :language: bash 3 | 4 | 5 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Real-Life Examples of NICE 2 | ========================== 3 | 4 | 5 | Among other things, this repository contains scripts and notebooks that put NICE to work on real-world problems. These examples are similar to the procedures reported in Jigyasa Nigam, Sergey Pozdnyakov, and Michele Ceriotti. "Recursive evaluation and iterative contraction of N-body equivariant features." The Journal of Chemical Physics 153.12 (2020): 121101, but they are not direct reproductions. 6 | 7 | The notebooks `qm9_home_pc.ipynb` and `qm9_small.ipynb` construct similar machine-learned potentials for the QM9 dataset (see below). `qm9_home_pc.ipynb` is intended to run on a local workstation, whereas `qm9_small.ipynb` is best suited for HPC resources. We have also provided examples for the methane dataset (https://archive.materialscloud.org/record/2020.110). All notebooks include general advice on 8 | appropriate real-life hyperparameters. 9 | 10 | The QM9 dataset is `available `_ 11 | as separate .xyz files, one per molecule, in a format 12 | that cannot be read directly by `ase `_. 13 | The first cells of the qm9_home_pc.ipynb and qm9_small.ipynb notebooks contain code that fetches the raw QM9 dataset and parses it into a single ase .extxyz file.
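Once the combined file has been produced, loading it back is a single call. The following is a minimal sketch (assuming the file name ``qm9_main.extxyz`` written by the notebooks and the ``U0`` energies that the parsing code stores in ``info``):

.. code-block:: python

    import ase.io

    # read every structure from the combined file written by the notebooks' first cells
    structures = ase.io.read('qm9_main.extxyz', index=':')
    # per-molecule properties parsed from the raw QM9 files are stored in structure.info
    energies = [structure.info['U0'] for structure in structures]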
14 | -------------------------------------------------------------------------------- /docs/getting_insights_about_the_model.rst: -------------------------------------------------------------------------------- 1 | .. include:: cutted/getting_insights_about_the_model_before_collapsible/getting_insights_about_the_model_before_collapsible.rst 2 | 3 | 4 | .. admonition:: Preliminaries 5 | :class: toggle 6 | 7 | .. include:: cutted/getting_insights_about_the_model_collapsible/getting_insights_about_the_model_collapsible.rst 8 | 9 | 10 | .. include:: cutted/getting_insights_about_the_model_after_collapsible/getting_insights_about_the_model_after_collapsible.rst 11 | 12 | -------------------------------------------------------------------------------- /docs/hello_user.rst: -------------------------------------------------------------------------------- 1 | NICE 2 | ========================== 3 | 4 | .. include:: nice_abstract.rst 5 | 6 | Installation 7 | ------------ 8 | 9 | .. include:: installation.rst 10 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. NICE documentation master file, created by 2 | sphinx-quickstart on Wed Sep 23 16:53:53 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | 8 | .. include:: hello_user.rst 9 | 10 | .. toctree:: 11 | :glob: 12 | :maxdepth: 1 13 | :caption: Theory in a nutshell 14 | 15 | theory 16 | 17 | .. toctree:: 18 | :glob: 19 | :maxdepth: 1 20 | :caption: Tutorials 21 | 22 | symlinks/constructing_machine_learning_potential 23 | calculating_covariants 24 | getting_insights_about_the_model 25 | constructor_or_non_standard_sequence 26 | sequential_fitting 27 | custom_regressors_into_purifiers 28 | 29 | .. toctree:: 30 | :glob: 31 | :maxdepth: 1 32 | :caption: Examples 33 | 34 | examples 35 | 36 | .. toctree:: 37 | :glob: 38 | :maxdepth: 5 39 | :caption: Reference guide 40 | 41 | blocks 42 | utilities 43 | 44 | .. toctree:: 45 | :glob: 46 | :maxdepth: 1 47 | :caption: Acknowledgements 48 | 49 | acknowledgements 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. include:: defines.rst 2 | 3 | 1. Install `librascal `_ 4 | 2. git clone or download archive with nice and unpack 5 | 3. cd to root nice directory and run :bash:`pip3 install .` 6 | 7 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR= ../../build/ 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/nice_abstract.rst: -------------------------------------------------------------------------------- 1 | NICE (N-body Iteratively Contracted Equivariants) is a set of tools designed for the calculation of invariant and covariant atomic structure representations. It allows one to 2 | automatically select the most informative combinations of high-order spectrum elements 3 | and to perform their efficient computation using recurrence relations. 4 | 5 | Although it is designed specifically for atomistic machine learning, NICE can, in principle, be applied to other machine learning tasks that require invariant or covariant outputs and that involve signals within a spherical cutoff, either on the sphere surface (a 2D manifold) or within the enclosed ball (a 3D manifold). 6 | -------------------------------------------------------------------------------- /docs/sequential_fitting.rst: -------------------------------------------------------------------------------- 1 | .. include:: cutted/sequential_fitting_before_collapsible/sequential_fitting_before_collapsible.rst 2 | 3 | 4 | .. admonition:: Preliminaries 5 | :class: toggle 6 | 7 | .. include:: cutted/sequential_fitting_collapsible/sequential_fitting_collapsible.rst 8 | 9 | 10 | .. include:: cutted/sequential_fitting_after_collapsible/sequential_fitting_after_collapsible.rst 11 | 12 | -------------------------------------------------------------------------------- /docs/symlinks/constructing_machine_learning_potential.ipynb: -------------------------------------------------------------------------------- 1 | ../../tutorials/constructing_machine_learning_potential.ipynb -------------------------------------------------------------------------------- /docs/theory.rst: -------------------------------------------------------------------------------- 1 | Theory in a nutshell 2 | _____________________________ 3 | 4 | One can use this toolbox as a black box that calculates proper atomic 5 | structure representations. In this case, we refer the reader to the tutorials, along with the examples folder, to borrow 6 | appropriate hyperparameters for real-life scenarios. 7 | 8 | In order to meaningfully select hypers or design your calculations, some understanding of 9 | what is going on is required. The most comprehensive description is given in [Ref]_, which 10 | might appear to be quite time-consuming for people not from the field. Thus, this 11 | section is designed to give a short overview of the method without any proofs or unnecessary 12 | details. 13 | 14 | For various purposes in atomistic machine learning, there is a need to describe atomic environments 15 | by invariant or covariant values. The most widespread case is the construction of so-called 16 | machine learning potentials. In this case, the goal is to construct a mapping function from an atomistic structure, 17 | whether it is a molecule, crystal, or amorphous solid, to the energy of this configuration. Energy is 18 | an `extensive `_ property, which allows one to represent the total energy as a sum of atomic contributions, each defined by the central atom species along with 19 | the atomic environment.
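In equation form (a restatement of the sentence above; :math:`N_{at}` denotes the number of atoms in the structure and :math:`A_i` the environment of the :math:`i`-th atom, so nothing beyond the text is assumed):

.. math::
    E_{total} = \sum\limits_{i=1}^{N_{at}} E_{atomic}(A_i)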
20 | 21 | Most machine learning algorithms don't exhibit the required symmetries, such as rotational symmetry, out of the box. 22 | Thus, there is a need to calculate an atomic environment representation which is invariant with respect to certain transformations. 23 | For the prediction of other properties, there is also a need for covariant representations, which transform in a certain way under rotations. 24 | 25 | 26 | The atomic environment is described by an unordered set of 27 | relative positions of neighbors within a given cut-off radius, along with their species 28 | :math:`\{\{\vec{r_1}, \alpha_1\}, \{\vec{r_2}, \alpha_{2}\}... \{\vec{r_n}, \alpha_{n}\}\}`. 29 | The number of neighbors can vary. The goal is to provide a description 30 | of fixed size consisting of features that are invariant or covariant with respect 31 | to permutations of atoms of the same species, along with rotations of the environment. 32 | 33 | The invariance with respect to the permutation of atoms is achieved by introducing "neighbor 34 | density functions": 35 | :math:`\rho_{\alpha}(\vec{r}) = \sum\limits_i g(\vec{r} - \vec{r_i}) \delta_{\alpha, \alpha_i}`, 36 | where :math:`g` is some local function, such as a Gaussian or even a delta function. After that, 37 | fingerprints are expressed as functionals of :math:`\rho`. 38 | 39 | To deal with neighbor density functions, spherical expansion coefficients are introduced: 40 | 41 | .. math:: 42 | < \{n, \alpha\} \lambda m | \rho^1> = \int d\vec{r} R_{n}(r) Y_{\lambda}^m(\hat{r}) \rho_{\alpha}(\vec{r}) 43 | , where :math:`\hat{r}` is the unit direction vector, :math:`r = |\vec{r}|`, :math:`R_{n}(r)` is 44 | some complete radial basis (which one in particular does not really matter), and 45 | :math:`Y_{\lambda}^m(\hat{r})` are 46 | `spherical harmonics `_. The :math:`\lambda` index runs from :math:`0` 47 | to :math:`+\infty`, and 48 | :math:`m` runs from :math:`-\lambda` to :math:`\lambda`. 49 | 50 | The :math:`\{n, \alpha\}` indices are never used separately from each other and, thus, for simplicity, 51 | in the further narrative, we will refer to them as just :math:`n`. 52 | 53 | It is known how the coefficients :math:`< n \lambda m | \rho^1>` transform under rotations of the environment. 54 | In particular, coefficients with :math:`\lambda = 0` remain constant under rotations, i.e., they are invariants, 55 | while the general transformation rule is 56 | 57 | .. math:: 58 | < n \lambda m | \hat{R} | \rho^1> = \sum\limits_{m'} D^{\lambda}_{mm'} < n \lambda m' | \rho^1> 59 | 60 | where :math:`< n \lambda m | \hat{R} | \rho^1>` are the spherical expansion coefficients 61 | for the rotated environment, :math:`\hat{R}` is the rotation, described, for instance, 62 | by `Euler angles `_ 63 | , and :math:`D^{\lambda}_{mm'}(\hat{R})` are 64 | `Wigner D matrices `_. 65 | 66 | Let's look at this transformation more closely. First of all, we see that the spherical expansion 67 | coefficients of the rotated environment depend only on the coefficients of the initial environment 68 | with the same :math:`n` and :math:`\lambda` indices. I.e., one can group the coefficients into vectors 69 | corresponding to fixed :math:`n` and :math:`\lambda`, of size :math:`2 \lambda + 1` and indexed by :math:`m`. 70 | The transformation itself is nothing else but a matrix-vector multiplication. 71 | 72 | Within this framework, we work only with this transformation rule. Further, we will refer to 73 | any vector of odd size which transforms in this way as a covariant feature/fingerprint.
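To make this bookkeeping concrete, here is a minimal sketch (not part of the NICE code) of what "transforms as a covariant" means in practice. The Wigner D matrix is assumed to come from some external routine; only the shape bookkeeping and the matrix-vector product are shown:

.. code-block:: python

    import numpy as np

    def rotate_covariant(covariant, wigner_d):
        # covariant: array of shape [2 * lambda + 1] for some lambda
        # wigner_d: array of shape [2 * lambda + 1, 2 * lambda + 1], the Wigner D
        #           matrix of the rotation, supplied by any external routine
        lam = (covariant.shape[0] - 1) // 2
        assert wigner_d.shape == (2 * lam + 1, 2 * lam + 1)
        # the transformation is nothing else but a matrix-vector multiplication
        return wigner_d @ covariant

    # trivial usage example: for the identity rotation the Wigner D matrix is the identity
    v = np.array([0.0, 1.0, 0.0])  # a lambda = 1 covariant
    assert np.allclose(rotate_covariant(v, np.eye(3)), v)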
74 | 75 | 76 | 77 | Some transformations upon covariant vectors also lead to covariant vectors. Some do not. 78 | For instance, we can apply elementwise squaring of the vector elements, which clearly would 79 | result in a non-covariant vector. 80 | 81 | There are several ways to combine covariants to get a covariant output. The most obvious is to 82 | construct a linear combination of covariants: 83 | 84 | .. math:: 85 | :label: first_expansion 86 | 87 | {output}^{\lambda}_m = \sum\limits_i (input_i)^{\lambda}_m * q_i 88 | 89 | 90 | 91 | 92 | where :math:`q_i` are arbitrary coefficients. The less obvious way is to do a Clebsch-Gordan 93 | iteration: 94 | 95 | .. math:: 96 | :label: second_expansion 97 | 98 | {output}^{\lambda}_m = \sum\limits_{m_1 m_2} <l_1 m_1; l_2 m_2 | \lambda m> 99 | (first\:input)^{l_1}_{m_1} (second\:input)^{l_2}_{m_2} 100 | 101 | , where :math:`<l_1 m_1; l_2 m_2 | \lambda m>` are 102 | `Clebsch-Gordan coefficients `_. 103 | 104 | Let's take a look at the second construction rule in more detail. It takes 105 | two covariant vectors as input and constructs several covariant outputs, indexed 106 | by the natural index :math:`\lambda`. (Actually, :math:`\lambda` is bounded between 107 | :math:`| l_1 - l_2 |` and :math:`l_1 + l_2`; otherwise the Clebsch-Gordan coefficients are zero.) 108 | 109 | 110 | For further purposes, it is necessary to introduce the concept of body order. 111 | 112 | It is clear that by combining the transformation rules :eq:`first_expansion` and :eq:`second_expansion`, we get covariants 113 | which depend polynomially on the entries of the initial spherical expansion coefficients. 114 | 115 | If all monomials have the same power :math:`\nu`, then we define the body order of the 116 | corresponding covariant vector to be :math:`\nu`. If the monomials have different powers, 117 | then the body order is undefined. 118 | 119 | If we apply a linear combination to covariants of body order :math:`\nu`, then the result also 120 | has body order :math:`\nu`. If we do a Clebsch-Gordan iteration with covariants of body orders 121 | :math:`\nu_1` and :math:`\nu_2`, then the result has body order :math:`\nu_1 + \nu_2`. 122 | 123 | Consider the following procedure. Initially, we 124 | have :math:`\nu = 1`, and the initial spherical expansion 125 | coefficients :math:`< n \lambda m | \rho^1>`. Let's apply the construction rule 126 | :eq:`second_expansion` for each pair of spherical expansion coefficients 127 | and for each possible output :math:`\lambda`. The result would be a set 128 | of :math:`\nu=2` body order covariants. As the next step, let's do the same 129 | for each pair of the obtained :math:`\nu=2` covariants and 130 | initial :math:`\nu=1` spherical expansion coefficients. The result would 131 | be a set of :math:`\nu=3` covariants. And so on. 132 | 133 | 134 | There are two important statements: 135 | 136 | 1. Completeness a. 137 | For each :math:`\nu`, the set of covariants obtained by the previously discussed procedure is a complete basis in the space of :math:`\nu` order functionals 138 | from :math:`\rho(*)` to invariant/covariant output. It means 139 | that any :math:`\nu` order functional can be expressed as a linear combination 140 | of :math:`\nu` order covariants/invariants. 141 | 142 | 2. Completeness b. 143 | For each :math:`\nu`, the set of covariants obtained by the previously discussed 144 | procedure is a complete basis in the space of :math:`\nu` body order potentials.
145 | It means that any function of an atomic structure given by a sum of contributions 146 | over all subsets of :math:`\nu` atoms can be represented as a linear 147 | combination of :math:`\nu` order covariants/invariants. In particular, any 148 | two-body potential, such as the `LJ potential `_, 149 | can be represented as 150 | a linear combination of first-order invariants, any three-body potential 151 | can be represented as a linear combination of second-order invariants, 152 | and so on. 153 | 154 | 155 | Taking into account these facts, it looks like the recipe for machine learning 156 | potentials is very clear. Just iterate over the body order 157 | until convergence. 158 | 159 | The problem is that the size of the :math:`\nu` order covariants explodes 160 | exponentially with :math:`\nu`. Indeed, when we go from order :math:`\nu - 1` to 161 | order :math:`\nu`, the number of entries is multiplied by the number 162 | of :math:`\nu=1` order covariant vectors and by the number of 163 | different :math:`\lambda`-s. Thus, it is not computationally feasible to 164 | go to high body orders with this naive approach. 165 | 166 | In practice, for particular distributions in phase space, given by particular 167 | datasets, by far not all components of the covariants are relevant. Namely, 168 | in real-life scenarios the `PCA `_ 169 | spectrum decreases very rapidly. So, 170 | in fact, we need only a few components out of a great many. 171 | 172 | There is a way to construct only the relevant components iteratively. 173 | It consists of iterative PCA and Clebsch-Gordan expansions. For each 174 | transition from :math:`\nu-1` body order to :math:`\nu` body order, we do PCA 175 | of the :math:`\nu-1` body order covariants and use only those with the highest 176 | variance or importance for the subsequent expansion. The number of components 177 | to take can be either fixed or selected dynamically in such a way as to cover a certain percentage of the 178 | variance in the dataset. 179 | 180 | It is clear that in this way, most of the variance is kept. Indeed, 181 | let's imagine that we had exact linear dependencies at some step, and, thus, 182 | after PCA, some components have exactly zero variance. Substituting a vector of zeros into the 183 | expansion rule :eq:`second_expansion`, we see that the result is ... also zeros. 184 | The same applies to small components - components with small variance also 185 | "give birth" to components with small variance. Thus, neglecting them 186 | would not affect the covariants with higher body orders much. 187 | 188 | There is another important observation: on a particular dataset, covariants of different body orders can correlate with each other. Thus, 189 | it is a good idea to preserve at each iteration not the components with 190 | the highest absolute variance, but the components with the 191 | highest "purified variance" or "new variance", i.e., the components 192 | with the highest residuals, which cannot be explained by linear regression 193 | based on previous body orders. Using 194 | "`sklearn `_ language", the purification 195 | step can be viewed as: 196 | 197 | .. code-block:: python 198 | 199 | purified_covariants = covariants - linear_regressor.fit( 200 | all_covariants_of_smaller_body_order, covariants).predict(all_covariants_of_smaller_body_order) 201 | 202 | 203 | To conclude, NICE consists of iterations, each comprising three steps: 204 | 205 | 1. Expansion - raising the body order by one using the Clebsch-Gordan iteration :eq:`second_expansion`. 206 | 2.
Purification - getting rid of the variance that is explainable by previous body-order covariants. 207 | 3. PCA - grouping most of the variance into a small subset of components. 208 | 209 | 210 | In principle, one can apply this machinery to other invariant/covariant machine learning tasks 211 | not related to atomistic machine learning. The only difference is that in this case, the 212 | input spherical expansion coefficients :math:`< n \lambda m | \rho^1>` would be obtained from 213 | some other sphere/ball signal, not from the sum of Gaussians as in the case of atomistic machine learning. 214 | 215 | In the current implementation, there is also a separate branch of invariants only, 216 | which allows choosing hyperparameters, such as the number of components to expand, 217 | separately for invariants and covariants, which is very useful in practice. 218 | 219 | More about this can be found in the first tutorial, "Constructing machine learning potential". 220 | 221 | 222 | 223 | .. [Ref] https://aip.scitation.org/doi/10.1063/5.0021116 224 | -------------------------------------------------------------------------------- /docs/utilities.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ========= 3 | 4 | .. automodule:: utilities 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /enumerate/README.md: -------------------------------------------------------------------------------- 1 | Enumeration of linearly and polynomially-independent invariants 2 | =============================================================== 3 | 4 | The Mathematica notebooks `linear_reduce.nb` and `polynomial_reduce.nb` 5 | use computer algebra to list all of the coefficients of density-correlation 6 | equivariants that are linearly independent, or that cannot be computed as 7 | a polynomial of lower-order invariants. 8 | 9 | The code is not optimized, and cannot go beyond relatively low body order 10 | and nmax,lmax thresholds. It shows, however, that there are relatively few 11 | invariants that can be dropped beyond those that can be identified based 12 | on angular momentum recoupling theory. 13 | 14 | The repository also contains a few examples of the listings, named as 15 | `indep-nmax-lmax.dat`. 16 | Entries in each file list the indices of the nonzero (and linearly independent) 17 | invariants, labeled as 18 | 19 | ``` 20 | # nu sigma lambda n1 l1 k1 [n2 l2 k2 .....]
21 | ``` 22 | 23 | following the notation from https://arxiv.org/abs/2007.03407 24 | -------------------------------------------------------------------------------- /enumerate/indep-1-1.dat: -------------------------------------------------------------------------------- 1 | "# nu sigma lambda n1 l1 k1 [n2 l2 k2 .....]" 2 | 1 1 0 1 0 0 3 | 1 1 1 1 1 1 4 | 2 1 0 1 0 0 1 0 0 5 | 2 1 0 1 1 1 1 1 1 6 | 2 1 1 1 0 0 1 1 0 7 | 3 1 0 1 0 0 1 0 0 1 0 0 8 | 3 1 0 1 0 0 1 1 0 1 1 1 9 | 3 1 1 1 0 0 1 0 0 1 1 0 10 | 3 1 1 1 1 1 1 1 1 1 1 0 11 | 4 1 0 1 0 0 1 0 0 1 0 0 1 0 0 12 | 4 1 0 1 0 0 1 0 0 1 1 0 1 1 1 13 | 4 1 0 1 1 1 1 1 1 1 1 0 1 1 1 14 | 4 1 1 1 0 0 1 0 0 1 0 0 1 1 0 15 | 4 1 1 1 0 0 1 1 0 1 1 1 1 1 0 16 | 5 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 17 | 5 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 18 | 5 1 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 19 | 5 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 20 | 5 1 1 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 21 | 5 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 22 | 6 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 23 | 6 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 24 | 6 1 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 25 | 6 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 26 | 6 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 27 | 6 1 1 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 28 | 6 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 29 | 7 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 30 | 7 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 31 | 7 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 32 | 7 1 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 33 | 7 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 34 | 7 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 35 | 7 1 1 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 36 | 7 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 37 | 8 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 38 | 8 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 39 | 8 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 40 | 8 1 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 41 | 8 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 42 | 8 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 43 | 8 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 44 | 8 1 1 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 45 | 8 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 -------------------------------------------------------------------------------- /examples/methane_home_pc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 10 | "\n", 11 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 12 | "!gunzip -k methane.extxyz.gz" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import numpy as np\n", 22 | "import ase.io\n", 23 | "import tqdm\n", 24 | "from nice.blocks import *\n", 25 | "from nice.utilities import *\n", 26 | "from matplotlib import pyplot as plt\n", 27 | "from sklearn.linear_model import BayesianRidge" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "HARTREE_TO_EV = 27.211386245988\n", 37 | "train_subset = \"0:10000\" #input for ase.io.read command\n", 38 | "test_subset = \"10000:15000\" #input 
to ase.io.read command\n", 39 | "environments_for_fitting = 1000 #number of environments to fit nice transfomers\n", 40 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000, 3000, 5000, 7500,\n", 41 | " 10000] #for learning curve\n", 42 | "\n", 43 | "#HYPERS for librascal spherical expansion coefficients\n", 44 | "HYPERS = {\n", 45 | " 'interaction_cutoff': 6.3,\n", 46 | " 'max_radial': 5,\n", 47 | " 'max_angular': 5,\n", 48 | " 'gaussian_sigma_type': 'Constant',\n", 49 | " 'gaussian_sigma_constant': 0.05,\n", 50 | " 'cutoff_smooth_width': 0.3,\n", 51 | " 'radial_basis': 'GTO'\n", 52 | "}" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "#our model:\n", 62 | "def get_nice():\n", 63 | " return StandardSequence([\n", 64 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 65 | " CovariantsPurifierBoth(max_take=10),\n", 66 | " IndividualLambdaPCAsBoth(n_components=50),\n", 67 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 68 | " InvariantsPurifier(max_take=50),\n", 69 | " InvariantsPCA(n_components=200)),\n", 70 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 71 | " CovariantsPurifierBoth(max_take=10),\n", 72 | " IndividualLambdaPCAsBoth(n_components=50),\n", 73 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 74 | " InvariantsPurifier(max_take=50),\n", 75 | " InvariantsPCA(n_components=200)),\n", 76 | " StandardBlock(None, None, None,\n", 77 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 78 | " InvariantsPurifier(max_take=50),\n", 79 | " InvariantsPCA(n_components=200))\n", 80 | " ],\n", 81 | " initial_scaler=InitialScaler(\n", 82 | " mode='signal integral', individually=True))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "train_structures = ase.io.read('methane.extxyz', index=train_subset)\n", 92 | "\n", 93 | "test_structures = ase.io.read('methane.extxyz', index=test_subset)\n", 94 | "\n", 95 | "all_species = get_all_species(train_structures + test_structures)\n", 96 | "\n", 97 | "train_coefficients = get_spherical_expansion(train_structures, HYPERS,\n", 98 | " all_species)\n", 99 | "\n", 100 | "test_coefficients = get_spherical_expansion(test_structures, HYPERS,\n", 101 | " all_species)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "#individual nice transformers for each atomic specie in the dataset\n", 111 | "nice = {}\n", 112 | "for key in train_coefficients.keys():\n", 113 | " nice[key] = get_nice()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "for key in train_coefficients.keys():\n", 123 | " nice[key].fit(train_coefficients[key][:environments_for_fitting])" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "train_features = transform_sequentially(nice, train_structures, HYPERS,\n", 133 | " all_species)\n", 134 | "test_features = transform_sequentially(nice, test_structures, HYPERS,\n", 135 | " all_species)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "train_energies = [structure.info['energy'] for structure in 
train_structures]\n", 145 | "train_energies = np.array(train_energies) * HARTREE_TO_EV\n", 146 | "\n", 147 | "test_energies = [structure.info['energy'] for structure in test_structures]\n", 148 | "test_energies = np.array(test_energies) * HARTREE_TO_EV" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def get_rmse(first, second):\n", 158 | " return np.sqrt(np.mean((first - second)**2))\n", 159 | "\n", 160 | "\n", 161 | "def get_standard_deviation(values):\n", 162 | " return np.sqrt(np.mean((values - np.mean(values))**2))\n", 163 | "\n", 164 | "\n", 165 | "def get_relative_performance(predictions, values):\n", 166 | " return get_rmse(predictions, values) / get_standard_deviation(values)\n", 167 | "\n", 168 | "\n", 169 | "def estimate_performance(regressor, data_train, data_test, targets_train,\n", 170 | " targets_test):\n", 171 | " regressor.fit(data_train, targets_train)\n", 172 | " return get_relative_performance(regressor.predict(data_test), targets_test)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "errors = []\n", 182 | "for el in tqdm.tqdm(grid):\n", 183 | " errors.append(\n", 184 | " estimate_performance(BayesianRidge(), train_features[:el],\n", 185 | " test_features, train_energies[:el],\n", 186 | " test_energies))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "print(errors)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "from matplotlib import pyplot as plt\n", 205 | "plt.plot(grid, errors, 'bo')\n", 206 | "plt.plot(grid, errors, 'b')\n", 207 | "plt.xlabel(\"number of structures\")\n", 208 | "plt.ylabel(\"relative error\")\n", 209 | "plt.xscale('log')\n", 210 | "plt.yscale('log')\n", 211 | "plt.show()" 212 | ] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.6.9" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 4 236 | } 237 | -------------------------------------------------------------------------------- /examples/methane_medium.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 10 | "\n", 11 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 12 | "!gunzip -k methane.extxyz.gz" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import numpy as np\n", 22 | "import ase.io\n", 23 | "import tqdm\n", 24 | "from nice.blocks import *\n", 25 | "from nice.utilities import *\n", 26 | "from matplotlib import pyplot as 
plt\n", 27 | "from sklearn.linear_model import BayesianRidge" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "HARTREE_TO_EV = 27.211386245988\n", 37 | "train_subset = \"0:100000\" #input for ase.io.read command\n", 38 | "test_subset = \"3050000:3130000\" #input to ase.io.read command\n", 39 | "environments_for_fitting = 5000 #number of environments to fit nice transfomers\n", 40 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000, 3000,\n", 41 | " 5000, 7500, 10000, 15000, 20000,\n", 42 | " 30000, 50000, 75000, 100000] #for learning curve\n", 43 | "\n", 44 | "#HYPERS for librascal spherical expansion coefficients\n", 45 | "HYPERS = {\n", 46 | "'interaction_cutoff': 6.3,\n", 47 | "'max_radial': 5,\n", 48 | "'max_angular': 5,\n", 49 | "'gaussian_sigma_type': 'Constant',\n", 50 | "'gaussian_sigma_constant': 0.05,\n", 51 | "'cutoff_smooth_width': 0.3,\n", 52 | "'radial_basis': 'GTO'\n", 53 | "}" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "#our model:\n", 63 | "def get_transformer():\n", 64 | " return StandardSequence([StandardBlock(ThresholdExpansioner(),\n", 65 | " CovariantsPurifierBoth(max_take = 100),\n", 66 | " IndividualLambdaPCAsBoth(),\n", 67 | " None,\n", 68 | " None,\n", 69 | " None),\n", 70 | " StandardBlock(ThresholdExpansioner(num_expand = 10000),\n", 71 | " CovariantsPurifierBoth(max_take = 100),\n", 72 | " IndividualLambdaPCAsBoth(2000),\n", 73 | " ThresholdExpansioner(num_expand = 50000, mode = 'invariants'),\n", 74 | " InvariantsPurifier(max_take = 100),\n", 75 | " InvariantsPCA(n_components = 2000)),\n", 76 | " StandardBlock(None,\n", 77 | " None,\n", 78 | " None,\n", 79 | " ThresholdExpansioner(num_expand = 50000, mode = 'invariants'),\n", 80 | " InvariantsPurifier(max_take = 100),\n", 81 | " InvariantsPCA(n_components = 5000))\n", 82 | " ],\n", 83 | " initial_scaler = InitialScaler(mode = 'signal integral',\n", 84 | " individually = True))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "train_structures = ase.io.read('methane.extxyz', \n", 94 | " index = train_subset)\n", 95 | "\n", 96 | "test_structures = ase.io.read('methane.extxyz', \n", 97 | " index = test_subset)\n", 98 | "\n", 99 | "all_species = get_all_species(train_structures + test_structures)\n", 100 | "\n", 101 | "train_coefficients = get_spherical_expansion(train_structures, HYPERS, all_species)\n", 102 | "\n", 103 | "\n", 104 | "\n", 105 | "test_coefficients = get_spherical_expansion(test_structures, HYPERS, all_species)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "#individual transformers for each atomic specie in dataset\n", 115 | "transformers = {}\n", 116 | "for key in train_coefficients.keys():\n", 117 | " transformers[key] = get_transformer()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "for key in train_coefficients.keys():\n", 127 | " transformers[key].fit(train_coefficients[key][:environments_for_fitting])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "train_features = transform_sequentially(transformers, 
\n", 137 | " train_structures, HYPERS, all_species)\n", 138 | "test_features = transform_sequentially(transformers,\n", 139 | " test_structures, HYPERS, all_species)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "train_energies = [structure.info['energy'] for structure in train_structures]\n", 149 | "train_energies = np.array(train_energies) * HARTREE_TO_EV\n", 150 | "\n", 151 | "test_energies = [structure.info['energy'] for structure in test_structures]\n", 152 | "test_energies = np.array(test_energies) * HARTREE_TO_EV\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "def get_rmse(first, second):\n", 162 | " return np.sqrt(np.mean((first - second) ** 2))\n", 163 | "\n", 164 | "def get_standard_deviation(values):\n", 165 | " return np.sqrt(np.mean((values - np.mean(values)) ** 2))\n", 166 | "\n", 167 | "def get_relative_performance(predictions, values):\n", 168 | " return get_rmse(predictions, values) / get_standard_deviation(values)\n", 169 | "\n", 170 | "def estimate_performance(clf, data_train, data_test, targets_train, targets_test):\n", 171 | " clf.fit(data_train, targets_train)\n", 172 | " return get_relative_performance(clf.predict(data_test), targets_test)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "errors = []\n", 182 | "for el in tqdm.tqdm(grid): \n", 183 | " errors.append(estimate_performance(BayesianRidge(), train_features[:el],\n", 184 | " test_features, train_energies[:el],\n", 185 | " test_energies))" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "print(errors)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "from matplotlib import pyplot as plt\n", 204 | "plt.plot(grid, errors, 'bo')\n", 205 | "plt.plot(grid, errors, 'b')\n", 206 | "plt.xlabel(\"number of structures\")\n", 207 | "plt.ylabel(\"relative error\")\n", 208 | "plt.xscale('log')\n", 209 | "plt.yscale('log')\n", 210 | "plt.show()" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 3", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.6.9" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 4 242 | } 243 | -------------------------------------------------------------------------------- /examples/methane_small.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 10 | "\n", 11 | "!wget 
\"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 12 | "!gunzip -k methane.extxyz.gz" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import numpy as np\n", 22 | "import ase.io\n", 23 | "import tqdm\n", 24 | "from nice.blocks import *\n", 25 | "from nice.utilities import *\n", 26 | "from matplotlib import pyplot as plt\n", 27 | "from sklearn.linear_model import BayesianRidge" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "HARTREE_TO_EV = 27.211386245988\n", 37 | "train_subset = \"0:100000\" #input for ase.io.read command\n", 38 | "test_subset = \"3050000:3130000\" #input to ase.io.read command\n", 39 | "environments_for_fitting = 5000 #number of environments to fit nice transfomers\n", 40 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000, 3000,\n", 41 | " 5000, 7500, 10000, 15000, 20000,\n", 42 | " 30000, 50000, 75000, 100000] #for learning curve\n", 43 | "\n", 44 | "#HYPERS for librascal spherical expansion coefficients\n", 45 | "HYPERS = {\n", 46 | "'interaction_cutoff': 6.3,\n", 47 | "'max_radial': 5,\n", 48 | "'max_angular': 5,\n", 49 | "'gaussian_sigma_type': 'Constant',\n", 50 | "'gaussian_sigma_constant': 0.05,\n", 51 | "'cutoff_smooth_width': 0.3,\n", 52 | "'radial_basis': 'GTO'\n", 53 | "}" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "#our model:\n", 63 | "def get_transformer():\n", 64 | " return StandardSequence([StandardBlock(ThresholdExpansioner(num_expand = 1000),\n", 65 | " CovariantsPurifierBoth(max_take = 100),\n", 66 | " IndividualLambdaPCAsBoth(500),\n", 67 | " None,\n", 68 | " None,\n", 69 | " None),\n", 70 | " StandardBlock(ThresholdExpansioner(num_expand = 3000),\n", 71 | " CovariantsPurifierBoth(max_take = 100),\n", 72 | " IndividualLambdaPCAsBoth(500),\n", 73 | " ThresholdExpansioner(num_expand = 5000, mode = 'invariants'),\n", 74 | " InvariantsPurifier(max_take = 100),\n", 75 | " InvariantsPCA(n_components = 1000)),\n", 76 | " StandardBlock(None,\n", 77 | " None,\n", 78 | " None,\n", 79 | " ThresholdExpansioner(num_expand = 5000, mode = 'invariants'),\n", 80 | " InvariantsPurifier(max_take = 100),\n", 81 | " InvariantsPCA(n_components = 2000))\n", 82 | " ],\n", 83 | " initial_scaler = InitialScaler(mode = 'signal integral',\n", 84 | " individually = True))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "train_structures = ase.io.read('methane.extxyz', \n", 94 | " index = train_subset)\n", 95 | "\n", 96 | "test_structures = ase.io.read('methane.extxyz', \n", 97 | " index = test_subset)\n", 98 | "\n", 99 | "all_species = get_all_species(train_structures + test_structures)\n", 100 | "\n", 101 | "train_coefficients = get_spherical_expansion(train_structures, HYPERS, all_species)\n", 102 | "\n", 103 | "\n", 104 | "\n", 105 | "test_coefficients = get_spherical_expansion(test_structures, HYPERS, all_species)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "#individual transformers for each atomic specie in dataset\n", 115 | "transformers = {}\n", 116 | "for key in 
train_coefficients.keys():\n", 117 | " transformers[key] = get_transformer()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "for key in train_coefficients.keys():\n", 127 | " transformers[key].fit(train_coefficients[key][:environments_for_fitting])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "train_features = transform_sequentially(transformers, \n", 137 | " train_structures, HYPERS, all_species)\n", 138 | "test_features = transform_sequentially(transformers,\n", 139 | " test_structures, HYPERS, all_species)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "train_energies = [structure.info['energy'] for structure in train_structures]\n", 149 | "train_energies = np.array(train_energies) * HARTREE_TO_EV\n", 150 | "\n", 151 | "test_energies = [structure.info['energy'] for structure in test_structures]\n", 152 | "test_energies = np.array(test_energies) * HARTREE_TO_EV\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "def get_rmse(first, second):\n", 162 | " return np.sqrt(np.mean((first - second) ** 2))\n", 163 | "\n", 164 | "def get_standard_deviation(values):\n", 165 | " return np.sqrt(np.mean((values - np.mean(values)) ** 2))\n", 166 | "\n", 167 | "def get_relative_performance(predictions, values):\n", 168 | " return get_rmse(predictions, values) / get_standard_deviation(values)\n", 169 | "\n", 170 | "def estimate_performance(clf, data_train, data_test, targets_train, targets_test):\n", 171 | " clf.fit(data_train, targets_train)\n", 172 | " return get_relative_performance(clf.predict(data_test), targets_test)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "errors = []\n", 182 | "for el in tqdm.tqdm(grid): \n", 183 | " errors.append(estimate_performance(BayesianRidge(), train_features[:el],\n", 184 | " test_features, train_energies[:el],\n", 185 | " test_energies))" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "print(errors)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "from matplotlib import pyplot as plt\n", 204 | "plt.plot(grid, errors, 'bo')\n", 205 | "plt.plot(grid, errors, 'b')\n", 206 | "plt.xlabel(\"number of structures\")\n", 207 | "plt.ylabel(\"relative error\")\n", 208 | "plt.xscale('log')\n", 209 | "plt.yscale('log')\n", 210 | "plt.show()" 211 | ] 212 | } 213 | ], 214 | "metadata": { 215 | "kernelspec": { 216 | "display_name": "Python 3", 217 | "language": "python", 218 | "name": "python3" 219 | }, 220 | "language_info": { 221 | "codemirror_mode": { 222 | "name": "ipython", 223 | "version": 3 224 | }, 225 | "file_extension": ".py", 226 | "mimetype": "text/x-python", 227 | "name": "python", 228 | "nbconvert_exporter": "python", 229 | "pygments_lexer": "ipython3", 230 | "version": "3.6.9" 231 | } 232 | }, 233 | "nbformat": 4, 234 | "nbformat_minor": 4 235 | } 236 | -------------------------------------------------------------------------------- 
/examples/qm9_home_pc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import ase\n", 11 | "from ase import Atoms\n", 12 | "import numpy as np\n", 13 | "import tqdm\n", 14 | "import ase.io\n", 15 | "from nice.blocks import *\n", 16 | "from nice.utilities import *\n", 17 | "from matplotlib import pyplot as plt\n", 18 | "from sklearn.linear_model import BayesianRidge" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "PROPERTIES_NAMES = [\n", 28 | " 'tag', 'index', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2',\n", 29 | " 'zpve', 'U0', 'U', 'H', 'G', 'Cv'\n", 30 | "]\n", 31 | "\n", 32 | "\n", 33 | "def string_to_float(element):\n", 34 | " '''because shit like 2.1997*^-6 happens'''\n", 35 | " return float(element.replace('*^', 'e'))\n", 36 | "\n", 37 | "\n", 38 | "PROPERTIES_HANDLERS = [str, int\n", 39 | " ] + [string_to_float] * (len(PROPERTIES_NAMES) - 2)\n", 40 | "\n", 41 | "\n", 42 | "def parse_qm9_xyz(path):\n", 43 | " with open(path, 'r') as f:\n", 44 | " lines = list(f)\n", 45 | " #print(lines)\n", 46 | " n_atoms = int(lines[0])\n", 47 | " properties = {\n", 48 | " name: handler(value)\n", 49 | " for handler, name, value in zip(PROPERTIES_HANDLERS, PROPERTIES_NAMES,\n", 50 | " lines[1].strip().split())\n", 51 | " }\n", 52 | " composition = \"\"\n", 53 | " positions = []\n", 54 | " for i in range(2, 2 + n_atoms):\n", 55 | " composition += lines[i].strip().split()[0]\n", 56 | " positions.append([\n", 57 | " string_to_float(value) for value in lines[i].strip().split()[1:4]\n", 58 | " ])\n", 59 | "\n", 60 | " positions = np.array(positions)\n", 61 | " result = Atoms(composition, positions=np.array(positions))\n", 62 | " result.info.update(properties)\n", 63 | " return result\n", 64 | "\n", 65 | "\n", 66 | "def parse_index(path):\n", 67 | " with open(path, \"r\") as f:\n", 68 | " lines = list(f)\n", 69 | " proper_lines = lines[9:-1]\n", 70 | " result = [int(line.strip().split()[0]) for line in proper_lines]\n", 71 | " return np.array(result, dtype=int)\n", 72 | "\n", 73 | "\n", 74 | "def download_qm9(clean=True):\n", 75 | " #downloading from https://figshare.com/collections/Quantum_chemistry_structures_and_properties_of_134_kilo_molecules/978904\n", 76 | " os.system(\n", 77 | " \"wget https://ndownloader.figshare.com/files/3195389 -O qm9_main.xyz.tar.bz2\"\n", 78 | " )\n", 79 | " os.system(\n", 80 | " \"wget https://ndownloader.figshare.com/files/3195404 -O problematic_index.txt\"\n", 81 | " )\n", 82 | " os.system(\"mkdir qm9_main_structures\")\n", 83 | " os.system(\"tar xjf qm9_main.xyz.tar.bz2 -C qm9_main_structures\")\n", 84 | "\n", 85 | " names = [\n", 86 | " name for name in os.listdir('qm9_main_structures/')\n", 87 | " if name.endswith('.xyz')\n", 88 | " ]\n", 89 | " names = sorted(names)\n", 90 | "\n", 91 | " structures = [\n", 92 | " parse_qm9_xyz('qm9_main_structures/{}'.format(name))\n", 93 | " for name in tqdm.tqdm(names)\n", 94 | " ]\n", 95 | "\n", 96 | " problematic_index = parse_index('problematic_index.txt')\n", 97 | " np.save('problematic_index.npy', problematic_index)\n", 98 | " ase.io.write('qm9_main.extxyz', structures)\n", 99 | " if (clean):\n", 100 | " os.system(\"rm -r qm9_main_structures\")\n", 101 | " os.system(\"rm problematic_index.txt\")\n", 102 | " os.system(\"rm 
qm9_main.xyz.tar.bz2\")\n", 103 | " return structures, problematic_index\n", 104 | "\n", 105 | "\n", 106 | "def get_qm9(clean=True):\n", 107 | " if ('qm9_main.extxyz' in os.listdir('.')) and \\\n", 108 | " ('problematic_index.npy' in os.listdir('.')):\n", 109 | " structures = ase.io.read('qm9_main.extxyz', index=':')\n", 110 | " problematic_index = np.load('problematic_index.npy')\n", 111 | " return structures, problematic_index\n", 112 | " else:\n", 113 | " return download_qm9(clean=clean)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "structures, problematic_index = get_qm9()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "HARTREE_TO_EV = 27.211386245988\n", 132 | "USE_PROBLEMATIC_INDEX = False\n", 133 | "np.random.seed(0)\n", 134 | "\n", 135 | "if (not USE_PROBLEMATIC_INDEX):\n", 136 | " structures = [\n", 137 | " structure for structure in structures\n", 138 | " if structure.info['index'] not in problematic_index\n", 139 | " ]\n", 140 | "\n", 141 | "del problematic_index #it borrows indexing from 1 from qm9, deleting it away from sin\n", 142 | "\n", 143 | "permutation = np.random.permutation(len(structures))\n", 144 | "train_indices = permutation[0:2000]\n", 145 | "test_indices = permutation[2000:2500]\n", 146 | "environments_for_fitting = 1000 #number of environments to fit nice transfomers\n", 147 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000] #for learning curve\n", 148 | "\n", 149 | "#HYPERS for librascal spherical expansion coefficients\n", 150 | "HYPERS = {\n", 151 | " 'interaction_cutoff': 5,\n", 152 | " 'max_radial': 15,\n", 153 | " 'max_angular': 5,\n", 154 | " 'gaussian_sigma_type': 'Constant',\n", 155 | " 'gaussian_sigma_constant': 0.05,\n", 156 | " 'cutoff_smooth_width': 0.3,\n", 157 | " 'radial_basis': 'GTO'\n", 158 | "}" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "#our model:\n", 168 | "def get_nice():\n", 169 | " return StandardSequence([\n", 170 | " StandardBlock(ThresholdExpansioner(num_expand=300),\n", 171 | " CovariantsPurifierBoth(max_take=10),\n", 172 | " IndividualLambdaPCAsBoth(n_components=100),\n", 173 | " ThresholdExpansioner(num_expand=1000, mode='invariants'),\n", 174 | " InvariantsPurifier(max_take=10),\n", 175 | " InvariantsPCA(n_components=200)),\n", 176 | " StandardBlock(ThresholdExpansioner(num_expand=300),\n", 177 | " CovariantsPurifierBoth(max_take=10),\n", 178 | " IndividualLambdaPCAsBoth(n_components=100),\n", 179 | " ThresholdExpansioner(num_expand=1000, mode='invariants'),\n", 180 | " InvariantsPurifier(max_take=10),\n", 181 | " InvariantsPCA(n_components=200)),\n", 182 | " StandardBlock(None, None, None,\n", 183 | " ThresholdExpansioner(num_expand=1000, mode='invariants'),\n", 184 | " InvariantsPurifier(max_take=10),\n", 185 | " InvariantsPCA(n_components=100))\n", 186 | " ],\n", 187 | " initial_scaler=InitialScaler(\n", 188 | " mode='signal integral', individually=True))" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "train_structures = [structures[i] for i in train_indices]\n", 198 | "test_structures = [structures[i] for i in test_indices]\n", 199 | "\n", 200 | "all_species = get_all_species(train_structures + 
test_structures)\n", 201 | "\n", 202 | "train_coefficients = get_spherical_expansion(train_structures, HYPERS,\n", 203 | " all_species)\n", 204 | "test_coefficients = get_spherical_expansion(test_structures, HYPERS,\n", 205 | " all_species)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "all_coefficients = [\n", 215 | " train_coefficients[key] for key in train_coefficients.keys()\n", 216 | "]\n", 217 | "all_coefficients = np.concatenate(all_coefficients, axis=0)\n", 218 | "np.random.shuffle(all_coefficients)\n", 219 | "all_coefficients = all_coefficients[0:environments_for_fitting]" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "nice_single = get_nice()\n", 229 | "nice_single.fit(all_coefficients)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# using same nice transformer regardless of central specie\n", 239 | "nice = {specie: nice_single for specie in all_species}" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "train_features = transform_sequentially(nice, train_structures, HYPERS,\n", 249 | " all_species)\n", 250 | "test_features = transform_sequentially(nice, test_structures, HYPERS,\n", 251 | " all_species)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "train_c_features = get_compositional_features(train_structures, all_species)\n", 261 | "test_c_features = get_compositional_features(test_structures, all_species)\n", 262 | "\n", 263 | "train_features = np.concatenate([train_features, train_c_features], axis=1)\n", 264 | "test_features = np.concatenate([test_features, test_c_features], axis=1)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "train_energies = [structure.info['U0'] for structure in train_structures]\n", 274 | "train_energies = np.array(train_energies) * HARTREE_TO_EV\n", 275 | "\n", 276 | "test_energies = [structure.info['U0'] for structure in test_structures]\n", 277 | "test_energies = np.array(test_energies) * HARTREE_TO_EV" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "def get_rmse(first, second):\n", 287 | " return np.sqrt(np.mean((first - second)**2))\n", 288 | "\n", 289 | "\n", 290 | "def get_mae(first, second):\n", 291 | " return np.mean(np.abs(first - second))\n", 292 | "\n", 293 | "\n", 294 | "def estimate_performance(regressor, data_train, data_test, targets_train,\n", 295 | " targets_test):\n", 296 | " regressor.fit(data_train, targets_train)\n", 297 | " predictions = regressor.predict(data_test)\n", 298 | " return get_rmse(predictions,\n", 299 | " targets_test), get_mae(predictions, targets_test)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "errors_compositional = []\n", 309 | "for el in tqdm.tqdm(grid):\n", 310 | " errors_compositional.append(\n", 311 | " estimate_performance(BayesianRidge(), train_c_features[:el],\n", 312 | " 
test_c_features, train_energies[:el],\n", 313 | " test_energies))\n", 314 | "\n", 315 | "errors_compositional = np.array(errors_compositional)\n", 316 | "errors_nice = []\n", 317 | "for el in tqdm.tqdm(grid):\n", 318 | " # because without this step with residuals\n", 319 | " # joint fitting might face problems due to\n", 320 | " # regularization\n", 321 | " regressor = BayesianRidge()\n", 322 | " regressor.fit(train_c_features[:el], train_energies[:el])\n", 323 | "\n", 324 | " residuals_train = train_energies[:el] - regressor.predict(\n", 325 | " train_c_features[:el])\n", 326 | " residuals_test = test_energies - regressor.predict(test_c_features)\n", 327 | "\n", 328 | " errors_nice.append(\n", 329 | " estimate_performance(BayesianRidge(), train_features[:el],\n", 330 | " test_features, residuals_train, residuals_test))\n", 331 | "\n", 332 | "errors_nice = np.array(errors_nice)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "fig, axes = plt.subplots(1, 2)\n", 342 | "\n", 343 | "axes[0].plot(grid, errors_compositional[:, 0], 'ro')\n", 344 | "axes[0].plot(grid, errors_compositional[:, 0], 'r', label='only compositional')\n", 345 | "\n", 346 | "axes[0].plot(grid, errors_nice[:, 0], 'bo')\n", 347 | "axes[0].plot(grid, errors_nice[:, 0], 'b', label='nice')\n", 348 | "\n", 349 | "axes[0].set_xlabel(\"n_train\")\n", 350 | "axes[0].set_ylabel(\"rmse, eV\")\n", 351 | "axes[0].set_xscale('log')\n", 352 | "axes[0].set_yscale('log')\n", 353 | "\n", 354 | "axes[1].plot(grid, errors_compositional[:, 1], 'ro')\n", 355 | "axes[1].plot(grid, errors_compositional[:, 1], 'r', label='only compositional')\n", 356 | "\n", 357 | "axes[1].plot(grid, errors_nice[:, 1], 'bo')\n", 358 | "axes[1].plot(grid, errors_nice[:, 1], 'b', label='nice')\n", 359 | "\n", 360 | "axes[1].set_xlabel(\"n_train\")\n", 361 | "axes[1].set_ylabel(\"mae, eV\")\n", 362 | "axes[1].set_xscale('log')\n", 363 | "axes[1].set_yscale('log')\n", 364 | "plt.legend(loc='upper center')\n", 365 | "plt.subplots_adjust(wspace=0.4)\n", 366 | "plt.show()" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "print(\"mae: {} eV\".format(errors_nice[-1][0]))\n", 376 | "print(\"rmse: {} eV\".format(errors_nice[-1][1]))" 377 | ] 378 | } 379 | ], 380 | "metadata": { 381 | "kernelspec": { 382 | "display_name": "Python 3", 383 | "language": "python", 384 | "name": "python3" 385 | }, 386 | "language_info": { 387 | "codemirror_mode": { 388 | "name": "ipython", 389 | "version": 3 390 | }, 391 | "file_extension": ".py", 392 | "mimetype": "text/x-python", 393 | "name": "python", 394 | "nbconvert_exporter": "python", 395 | "pygments_lexer": "ipython3", 396 | "version": "3.6.9" 397 | } 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 4 401 | } 402 | -------------------------------------------------------------------------------- /nice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lab-cosmo/nice/2ff446824c88958497e2d354be271d618e3a3d3e/nice/__init__.py -------------------------------------------------------------------------------- /nice/blocks/__init__.py: -------------------------------------------------------------------------------- 1 | from nice.blocks.compressors import * 2 | from nice.blocks.expansioners import * 3 | from nice.blocks.purifiers import * 4 | from 
nice.blocks.grouping import * 5 | from nice.blocks.miscellaneous import * 6 | -------------------------------------------------------------------------------- /nice/blocks/compressors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nice.unrolling_individual_pca import UnrollingIndividualPCA 3 | 4 | # from cython.parallel cimport prange 5 | 6 | from nice.thresholding import get_thresholded_tasks 7 | from nice.nice_utilities import do_partial_expansion, Data, get_sizes 8 | from nice.clebsch_gordan import ClebschGordan, check_clebsch_gordan 9 | from nice.packing import unite_parallel, subtract_parallel 10 | from nice.packing import pack_dense, unpack_dense 11 | from parse import parse 12 | import warnings 13 | from sklearn.linear_model import Ridge 14 | from sklearn.base import clone 15 | from sklearn.exceptions import NotFittedError 16 | from sklearn.decomposition import PCA 17 | 18 | 19 | def get_num_fit(desired_num, block_size): 20 | if desired_num % block_size == 0: 21 | return desired_num // block_size 22 | else: 23 | return (desired_num // block_size) + 1 24 | 25 | 26 | class IndividualLambdaPCAs: 27 | ''' Block to do pca step for covariants of single parity. It operates with instances of Data class''' 28 | def __init__(self, n_components=None, num_to_fit="10x"): 29 | self.n_components_ = n_components 30 | self.num_to_fit_ = num_to_fit 31 | self.fitted_ = False 32 | 33 | def get_importances(self): 34 | if not self.fitted_: 35 | raise NotFittedError( 36 | ("instance of {} is not fitted. " 37 | "Thus importances are not available.").format( 38 | type(self).__name__)) 39 | result = np.empty([self.max_n_components_, self.l_max_ + 1]) 40 | for lambd in range(self.l_max_ + 1): 41 | if self.pcas_[lambd] is not None: 42 | result[:self.pcas_[lambd].n_components, 43 | lambd] = self.pcas_[lambd].importances_ 44 | 45 | actual_sizes = [] 46 | for lambd in range(self.l_max_ + 1): 47 | if self.pcas_[lambd] is not None: 48 | actual_sizes.append(self.pcas_[lambd].n_components) 49 | else: 50 | actual_sizes.append(0) 51 | return result 52 | 53 | def fit(self, data): 54 | 55 | self.l_max_ = data.covariants_.shape[2] - 1 56 | self.pcas_ = [] 57 | self.reduction_happened_ = False 58 | self.max_n_components_ = -1 59 | for lambd in range(self.l_max_ + 1): 60 | if data.actual_sizes_[lambd] > 0: 61 | if self.n_components_ is None: 62 | n_components_now = data.actual_sizes_[lambd] 63 | else: 64 | n_components_now = self.n_components_ 65 | 66 | self.max_n_components_ = max(self.max_n_components_, 67 | n_components_now) 68 | 69 | if data.covariants_.shape[0] * (lambd + 1) < n_components_now: 70 | raise ValueError(( 71 | "not enough data to fit pca, number of vectors is {}, " 72 | "dimensionality of single vector (lambd + 1) is {}, " 73 | "i. e. total number of points is {}, " 74 | "while number of components is {}.").format( 75 | data.covariants_.shape[0], 76 | lambd + 1, 77 | data.covariants_.shape[0] * (lambd + 1), 78 | n_components_now, 79 | )) 80 | 81 | if type(self.num_to_fit_) is str: 82 | multiplier = int(parse("{}x", self.num_to_fit_)[0]) 83 | num_fit_now = get_num_fit(multiplier * n_components_now, 84 | (lambd + 1)) 85 | else: 86 | num_fit_now = self.num_to_fit_ 87 | if num_fit_now * (lambd + 1) < n_components_now: 88 | raise ValueError( 89 | ("specified parameter num fit ({}) is too " 90 | "small to fit pca with number of components {}." 
91 | ).format(num_fit_now, n_components_now)) 92 | 93 | if data.covariants_.shape[0] * (lambd + 1) < num_fit_now: 94 | warnings.warn( 95 | ("given data is less than desired number " 96 | "of points to fit pca. " 97 | "Desired number of points to fit pca is {}, " 98 | "while number of vectors is {}, " 99 | "dimensionality of single vector (lambd + 1) is {}, " 100 | "i. e. total number of points is {}. " 101 | "Number of pca components is {}.").format( 102 | num_fit_now, 103 | data.covariants_.shape[0], 104 | (lambd + 1), 105 | data.covariants_.shape[0] * (lambd + 1), 106 | n_components_now, 107 | ), 108 | RuntimeWarning, 109 | ) 110 | 111 | if n_components_now < data.actual_sizes_[lambd]: 112 | self.reduction_happened_ = True 113 | pca = UnrollingIndividualPCA(n_components=n_components_now) 114 | pca.fit( 115 | data.covariants_[:num_fit_now, :data.actual_sizes_[lambd], 116 | lambd, :], 117 | lambd, 118 | ) 119 | self.pcas_.append(pca) 120 | else: 121 | self.pcas_.append(None) 122 | self.fitted_ = True 123 | self.importances_ = self.get_importances() 124 | 125 | def transform(self, data): 126 | if not self.fitted_: 127 | raise NotFittedError( 128 | ("instance of {} is not fitted. " 129 | "It can not transform anything.").format(type(self).__name__)) 130 | result = np.empty([ 131 | data.covariants_.shape[0], 132 | self.max_n_components_, 133 | self.l_max_ + 1, 134 | 2 * self.l_max_ + 1, 135 | ]) 136 | new_actual_sizes = np.zeros([self.l_max_ + 1], dtype=np.int32) 137 | for lambd in range(self.l_max_ + 1): 138 | if self.pcas_[lambd] is not None: 139 | now = self.pcas_[lambd].transform( 140 | data.covariants_[:, :data.actual_sizes_[lambd], lambd, :], 141 | lambd) 142 | result[:, :now.shape[1], lambd, :(2 * lambd + 1)] = now 143 | new_actual_sizes[lambd] = now.shape[1] 144 | else: 145 | new_actual_sizes[lambd] = 0 146 | 147 | return Data(result, new_actual_sizes, importances=self.importances_) 148 | 149 | def is_fitted(self): 150 | return self.fitted_ 151 | 152 | 153 | class IndividualLambdaPCAsBoth: 154 | ''' Block to do pca step for covariants of both parities. It operates with even-odd pairs of instances of Data class''' 155 | def __init__(self, *args, **kwargs): 156 | self.even_pca_ = IndividualLambdaPCAs(*args, **kwargs) 157 | self.odd_pca_ = IndividualLambdaPCAs(*args, **kwargs) 158 | self.fitted_ = False 159 | 160 | def fit(self, data_even, data_odd): 161 | 162 | self.even_pca_.fit(data_even) 163 | self.odd_pca_.fit(data_odd) 164 | self.fitted_ = True 165 | 166 | def transform(self, data_even, data_odd): 167 | if not self.fitted_: 168 | raise NotFittedError( 169 | ("instance of {} is not fitted. " 170 | "It can not transform anything.").format(type(self).__name__)) 171 | return self.even_pca_.transform(data_even), self.odd_pca_.transform( 172 | data_odd) 173 | 174 | def is_fitted(self): 175 | return self.fitted_ 176 | 177 | 178 | class InvariantsPCA(PCA): 179 | ''' Block to do pca step for invariants. 
It operates with 2d numpy arrays''' 180 | def __init__(self, *args, num_to_fit="10x", **kwargs): 181 | self.num_to_fit_ = num_to_fit 182 | self.fitted_ = False 183 | return super().__init__(*args, **kwargs) 184 | 185 | def _my_representation(self): 186 | if (self.fitted_): 187 | return "Instance of InvariantsPCA, fitted" 188 | else: 189 | return "Instance of InvariantsPCA, not fitted" 190 | 191 | def __repr__(self): 192 | return self._my_representation() 193 | 194 | def __str__(self): 195 | return self._my_representation() 196 | 197 | def process_input(self, X): 198 | if (self.n_components is None): 199 | self.n_components = X.shape[1] 200 | if (self.n_components > X.shape[1]): 201 | self.n_components = X.shape[1] 202 | 203 | if type(self.num_to_fit_) is str: 204 | multiplier = int(parse("{}x", self.num_to_fit_)[0]) 205 | num_fit_now = multiplier * self.n_components 206 | else: 207 | num_fit_now = self.num_to_fit_ 208 | 209 | if self.n_components > X.shape[0]: 210 | raise ValueError( 211 | ("not enough data to fit pca. " 212 | "Number of environments is {}, number of components is {}." 213 | ).format(X.shape[0], self.n_components)) 214 | 215 | if num_fit_now > X.shape[0]: 216 | warnings.warn(("Amount of provided data is less " 217 | "than the desired one to fit PCA. " 218 | "Number of components is {}, " 219 | "desired number of environments is {}, " 220 | "actual number of environments is {}.").format( 221 | self.n_components, num_fit_now, X.shape[0])) 222 | 223 | return X[:num_fit_now] 224 | 225 | def fit(self, X): 226 | 227 | res = super().fit(self.process_input(X)) 228 | self.fitted_ = True 229 | return res 230 | 231 | def fit_transform(self, X): 232 | res = super().fit_transform(self.process_input(X)) 233 | self.fitted_ = True 234 | return res 235 | 236 | def transform(self, X): 237 | if not self.fitted_: 238 | raise NotFittedError( 239 | ("instance of {} is not fitted. " 240 | "It can not transform anything.").format(type(self).__name__)) 241 | return super().transform(X) 242 | 243 | def is_fitted(self): 244 | return self.fitted_ 245 | -------------------------------------------------------------------------------- /nice/blocks/expansioners.py: -------------------------------------------------------------------------------- 1 | from nice.thresholding import get_thresholded_tasks 2 | from nice.nice_utilities import do_partial_expansion, Data, get_sizes 3 | from nice.clebsch_gordan import ClebschGordan, check_clebsch_gordan 4 | import numpy as np 5 | from sklearn.exceptions import NotFittedError 6 | 7 | 8 | class ThresholdExpansioner: 9 | ''' Block to do Clebsch-Gordan iteration. It uses two even-odd pairs of Data instances with covariants 10 | to produce new ones. If first even-odd pair contains covariants of body order v1, and the second v2, body 11 | order of the result would be v1 + v2. 
''' 12 | def __init__(self, num_expand=None, mode="covariants", num_threads=None): 13 | if num_expand is None: 14 | self.num_expand_ = -1 15 | else: 16 | self.num_expand_ = num_expand 17 | 18 | self.mode_ = mode 19 | self.num_threads_ = num_threads 20 | self.fitted_ = False 21 | 22 | def fit(self, 23 | first_even, 24 | first_odd, 25 | second_even, 26 | second_odd, 27 | clebsch_gordan=None): 28 | 29 | self.l_max_ = first_even.covariants_.shape[2] - 1 30 | 31 | if (first_even.importances_ is None) or (first_odd.importances_ is None) \ 32 | or (second_even.importances_ is None) or (second_odd.importances_ is None): 33 | raise ValueError( 34 | "For thresholding importances of features should be specified") 35 | 36 | ( 37 | self.task_even_even_, 38 | self.task_odd_odd_, 39 | self.task_even_odd_, 40 | self.task_odd_even_, 41 | ) = get_thresholded_tasks( 42 | first_even, 43 | first_odd, 44 | second_even, 45 | second_odd, 46 | self.num_expand_, 47 | self.l_max_, 48 | self.mode_, 49 | ) 50 | 51 | if clebsch_gordan is None: 52 | self.clebsch_ = ClebschGordan(self.l_max_) 53 | else: 54 | check_clebsch_gordan(clebsch_gordan, self.l_max_) 55 | self.clebsch_ = clebsch_gordan 56 | 57 | self.new_even_size_ = np.max( 58 | get_sizes(self.l_max_, self.task_even_even_[0], self.mode_) + 59 | get_sizes(self.l_max_, self.task_odd_odd_[0], self.mode_)) 60 | 61 | self.new_odd_size_ = np.max( 62 | get_sizes(self.l_max_, self.task_even_odd_[0], self.mode_) + 63 | get_sizes(self.l_max_, self.task_odd_even_[0], self.mode_)) 64 | 65 | self.new_even_raw_importances_ = np.concatenate( 66 | [self.task_even_even_[1], self.task_odd_odd_[1]], axis=0) 67 | self.new_odd_raw_importances_ = np.concatenate( 68 | [self.task_even_odd_[1], self.task_odd_even_[1]], axis=0) 69 | self.fitted_ = True 70 | 71 | def transform(self, first_even, first_odd, second_even, second_odd): 72 | if not self.fitted_: 73 | raise NotFittedError( 74 | "instance of {} is not fitted. It can not transform anything". 
75 | format(type(self).__name__)) 76 | 77 | if self.mode_ == "covariants": 78 | new_even = np.empty([ 79 | first_even.covariants_.shape[0], 80 | self.new_even_size_, 81 | self.l_max_ + 1, 82 | 2 * self.l_max_ + 1, 83 | ]) 84 | new_odd = np.empty([ 85 | first_even.covariants_.shape[0], 86 | self.new_odd_size_, 87 | self.l_max_ + 1, 88 | 2 * self.l_max_ + 1, 89 | ]) 90 | else: 91 | new_even = np.empty( 92 | [first_even.covariants_.shape[0], self.new_even_size_, 1]) 93 | new_odd = np.empty( 94 | [first_even.covariants_.shape[0], self.new_odd_size_, 1]) 95 | 96 | if self.mode_ == "covariants": 97 | new_even_actual_sizes = np.zeros([self.l_max_ + 1], dtype=np.int32) 98 | new_odd_actual_sizes = np.zeros([self.l_max_ + 1], dtype=np.int32) 99 | else: 100 | new_even_actual_sizes = np.zeros([1], dtype=np.int32) 101 | new_odd_actual_sizes = np.zeros([1], dtype=np.int32) 102 | 103 | do_partial_expansion( 104 | self.clebsch_.precomputed_, 105 | first_even.covariants_, 106 | second_even.covariants_, 107 | self.l_max_, 108 | self.task_even_even_[0], 109 | new_even, 110 | new_even_actual_sizes, 111 | self.mode_, 112 | num_threads=self.num_threads_, 113 | ) 114 | # print(new_even_actual_sizes) 115 | do_partial_expansion( 116 | self.clebsch_.precomputed_, 117 | first_odd.covariants_, 118 | second_odd.covariants_, 119 | self.l_max_, 120 | self.task_odd_odd_[0], 121 | new_even, 122 | new_even_actual_sizes, 123 | self.mode_, 124 | num_threads=self.num_threads_, 125 | ) 126 | # print(new_even_actual_sizes) 127 | do_partial_expansion( 128 | self.clebsch_.precomputed_, 129 | first_even.covariants_, 130 | second_odd.covariants_, 131 | self.l_max_, 132 | self.task_even_odd_[0], 133 | new_odd, 134 | new_odd_actual_sizes, 135 | self.mode_, 136 | num_threads=self.num_threads_, 137 | ) 138 | 139 | do_partial_expansion( 140 | self.clebsch_.precomputed_, 141 | first_odd.covariants_, 142 | second_even.covariants_, 143 | self.l_max_, 144 | self.task_odd_even_[0], 145 | new_odd, 146 | new_odd_actual_sizes, 147 | self.mode_, 148 | num_threads=self.num_threads_, 149 | ) 150 | if self.mode_ == "covariants": 151 | return Data(new_even, 152 | new_even_actual_sizes), Data(new_odd, 153 | new_odd_actual_sizes) 154 | else: 155 | return ( 156 | new_even[:, :new_even_actual_sizes[0], 0], 157 | new_odd[:, :new_odd_actual_sizes[0], 0], 158 | ) 159 | 160 | def is_fitted(self): 161 | return self.fitted_ 162 | -------------------------------------------------------------------------------- /nice/blocks/miscellaneous.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nice.nice_utilities import Data 3 | from sklearn.exceptions import NotFittedError 4 | 5 | 6 | class ParityDefinitionChanger(): 7 | '''Block to change parity definition from even-odd to true-pseudo and vice versa''' 8 | def _init__(self): 9 | self.fitted_ = True 10 | 11 | def is_fitted(self): 12 | return self.fitted_ 13 | 14 | def transform(self, first_data, second_data): 15 | l_max = first_data.covariants_.shape[2] - 1 16 | new_first_sizes, new_second_sizes = [], [] 17 | for lambd in range(l_max + 1): 18 | if (lambd % 2 == 0): 19 | new_first_sizes.append(first_data.actual_sizes_[lambd]) 20 | new_second_sizes.append(second_data.actual_sizes_[lambd]) 21 | else: 22 | new_first_sizes.append(second_data.actual_sizes_[lambd]) 23 | new_second_sizes.append(first_data.actual_sizes_[lambd]) 24 | 25 | new_first_sizes, new_second_sizes = np.array(new_first_sizes, 26 | dtype=np.int32), np.array( 27 | new_second_sizes, 28 | 
dtype=np.int32) 29 | new_first_shape = list(first_data.covariants_.shape) 30 | new_first_shape[1] = np.max(new_first_sizes) 31 | 32 | new_second_shape = list(second_data.covariants_.shape) 33 | new_second_shape[1] = np.max(new_second_sizes) 34 | 35 | new_first_covariants = np.empty(new_first_shape) 36 | new_second_covariants = np.empty(new_second_shape) 37 | 38 | for lambd in range(l_max + 1): # todo may be do copying in parallel 39 | if (lambd % 2 == 0): 40 | new_first_covariants[:, :new_first_sizes[lambd], lambd, :( 41 | 2 * lambd + 42 | 1)] = first_data.covariants_[:, :new_first_sizes[lambd], 43 | lambd, :(2 * lambd + 1)] 44 | new_second_covariants[:, :new_second_sizes[lambd], lambd, :( 45 | 2 * lambd + 46 | 1)] = second_data.covariants_[:, :new_second_sizes[lambd], 47 | lambd, :(2 * lambd + 1)] 48 | else: 49 | new_first_covariants[:, :new_first_sizes[lambd], lambd, :( 50 | 2 * lambd + 51 | 1)] = second_data.covariants_[:, :new_first_sizes[lambd], 52 | lambd, :(2 * lambd + 1)] 53 | new_second_covariants[:, :new_second_sizes[lambd], lambd, :( 54 | 2 * lambd + 55 | 1)] = first_data.covariants_[:, :new_second_sizes[lambd], 56 | lambd, :(2 * lambd + 1)] 57 | 58 | if (first_data.importances_ is None) or (second_data.importances_ is 59 | None): 60 | new_first_importances = None 61 | new_second_importances = None 62 | else: 63 | new_first_importances = np.empty( 64 | [np.max(new_first_sizes), l_max + 1]) 65 | new_second_importances = np.empty( 66 | [np.max(new_second_sizes), l_max + 1]) 67 | 68 | for lambd in range(l_max + 1): 69 | if (lambd % 2 == 0): 70 | new_first_importances[:new_first_sizes[ 71 | lambd], lambd] = first_data.importances_[: 72 | new_first_sizes[ 73 | lambd], 74 | lambd] 75 | new_second_importances[:new_second_sizes[ 76 | lambd], lambd] = second_data.importances_[: 77 | new_second_sizes[ 78 | lambd], 79 | lambd] 80 | else: 81 | new_first_importances[:new_first_sizes[ 82 | lambd], lambd] = second_data.importances_[: 83 | new_first_sizes[ 84 | lambd], 85 | lambd] 86 | new_second_importances[:new_second_sizes[ 87 | lambd], lambd] = first_data.importances_[: 88 | new_second_sizes[ 89 | lambd], 90 | lambd] 91 | 92 | return Data(new_first_covariants, new_first_sizes, new_first_importances), \ 93 | Data(new_second_covariants, new_second_sizes, new_second_importances) 94 | 95 | 96 | class InitialScaler(): 97 | '''Block to scale initial spherical expansion coefficients in a certain way. 
It allows to both 98 | normalize coefficients for each environment individually, and to multiply whole array to single 99 | scaling factor, thus, preserving information about relative scale''' 100 | def __init__(self, mode="signal integral", individually=False): 101 | self.individually_ = individually 102 | 103 | if self.individually_: 104 | self.fitted_ = True 105 | else: 106 | self.fitted_ = False 107 | 108 | self.mode_ = mode 109 | if (self.mode_ != "signal integral") and (self.mode_ != "variance"): 110 | raise ValueError("mode should be ethier " 111 | "\"signal integral\" ethier \"variance\".") 112 | 113 | def _get_variance_multiplier(self, coefficients): 114 | total = 0.0 115 | total_values = 0 116 | 117 | for l in range(coefficients.shape[2]): 118 | if self.individually_: 119 | total += np.sum((coefficients[:, :, l, 0:(2 * l + 1)])**2, 120 | axis=(1, 2)) 121 | total_values += coefficients.shape[1] * (2 * l + 1) 122 | 123 | else: 124 | total += np.sum((coefficients[:, :, l, 0:(2 * l + 1)])**2) 125 | total_values += coefficients.shape[0] * coefficients.shape[ 126 | 1] * (2 * l + 1) 127 | 128 | average = total / total_values 129 | result = 1.0 / np.sqrt(average) 130 | if (self.individually_): 131 | return result[:, np.newaxis, np.newaxis, np.newaxis] 132 | else: 133 | return result 134 | 135 | def _get_signal_integral_multiplier(self, coefficients): 136 | if self.individually_: 137 | result = 1.0 / np.sqrt(np.sum(coefficients[:, :, 0, 0]**2, axis=1)) 138 | return result[:, np.newaxis, np.newaxis, np.newaxis] 139 | else: 140 | return 1.0 / np.sqrt( 141 | np.mean(np.sum(coefficients[:, :, 0, 0]**2, axis=1))) 142 | 143 | def fit(self, coefficients): 144 | if not self.individually_: 145 | if (self.mode_ == "signal integral"): 146 | self.multiplier_ = self._get_signal_integral_multiplier( 147 | coefficients) 148 | 149 | if (self.mode_ == "variance"): 150 | self.multiplier_ = self._get_variance_multiplier(coefficients) 151 | 152 | self.fitted_ = True 153 | 154 | def transform(self, coefficients): 155 | if (not self.fitted_): 156 | raise NotFittedError("instance of {} is not fitted. 
" 157 | "It can not transform anything.".format( 158 | type(self).__name__)) 159 | if (self.individually_): 160 | if (self.mode_ == "signal integral"): 161 | multipliers = self._get_signal_integral_multiplier( 162 | coefficients) 163 | 164 | if (self.mode_ == "variance"): 165 | multipliers = self._get_variance_multiplier(coefficients) 166 | return coefficients * multipliers 167 | else: 168 | return coefficients * self.multiplier_ 169 | 170 | def is_fitted(self): 171 | return self.fitted_ 172 | 173 | 174 | class InitialTransformer(): 175 | '''Utility block to split spherical expansion coefficients stored in the form of single numpy array to 176 | even-odd pair of Data instances''' 177 | def __init__(self): 178 | self.fitted_ = True 179 | 180 | def transform(self, coefficients): 181 | l_max = coefficients.shape[2] - 1 182 | even_coefficients = np.copy(coefficients) 183 | even_coefficients_sizes = [ 184 | coefficients.shape[1] if i % 2 == 0 else 0 185 | for i in range(l_max + 1) 186 | ] 187 | 188 | odd_coefficients = np.copy(coefficients) 189 | odd_coefficients_sizes = [ 190 | coefficients.shape[1] if i % 2 == 1 else 0 191 | for i in range(l_max + 1) 192 | ] 193 | 194 | return Data(even_coefficients, 195 | even_coefficients_sizes), Data(odd_coefficients, 196 | odd_coefficients_sizes) 197 | 198 | def is_fitted(self): 199 | return self.fitted_ 200 | -------------------------------------------------------------------------------- /nice/blocks/purifiers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from nice.nice_utilities import Data 4 | 5 | from nice.packing import unite_parallel, subtract_parallel 6 | from nice.packing import pack_dense, unpack_dense 7 | from parse import parse 8 | import warnings 9 | from sklearn.linear_model import Ridge 10 | from sklearn.base import clone 11 | from sklearn.exceptions import NotFittedError 12 | 13 | DEFAULT_LINEAR_REGRESSOR = Ridge(alpha=1e-12) 14 | 15 | 16 | class InvariantsPurifier: 17 | ''' Block to purify invariants. It operates with numpy 2d arrays containing invariants''' 18 | def __init__(self, regressor=None, num_to_fit='10x', max_take=None): 19 | if (regressor is None): 20 | self.regressor_ = clone(DEFAULT_LINEAR_REGRESSOR) 21 | else: 22 | self.regressor_ = regressor 23 | 24 | self.fitted_ = False 25 | self.num_to_fit_ = num_to_fit 26 | self.max_take_ = max_take 27 | if (type(self.max_take_) == list): 28 | self.max_take_ = np.array(self.max_take_) 29 | if (self.max_take_ 30 | is not None) and (type(self.max_take_) != np.ndarray): 31 | self.max_take_ = int(self.max_take_) 32 | 33 | def fit(self, old_blocks, new_block): 34 | total_num = 0 35 | for i in range(len(old_blocks)): 36 | if (self.max_take_ is None): 37 | total_num += old_blocks[i].shape[1] 38 | else: 39 | if (type(self.max_take_) is int): 40 | total_num += min(old_blocks[i].shape[1], self.max_take_) 41 | else: 42 | total_num += min(old_blocks[i].shape[1], self.max_take_[i]) 43 | 44 | if (type(self.num_to_fit_) is str): 45 | multiplier = int(parse('{}x', self.num_to_fit_)[0]) 46 | num_fit_now = multiplier * total_num 47 | else: 48 | num_fit_now = self.num_to_fit_ 49 | 50 | if (num_fit_now > new_block.shape[0]): 51 | warnings.warn("Amount of provided data is less than " 52 | "the desired one to fit InvariantsPurifer. 
" 53 | "Number of old features is {}, " 54 | "desired number of environments is {}, " 55 | "actual number of environments is {}.".format( 56 | total_num, num_fit_now, new_block.shape[0])) 57 | 58 | if (self.max_take_ is None): 59 | restricted_blocks = [ 60 | old_block[:num_fit_now, :] for old_block in old_blocks 61 | ] 62 | else: 63 | if (type(self.max_take_) is int): 64 | restricted_blocks = [ 65 | old_block[:num_fit_now, :self.max_take_] 66 | for old_block in old_blocks 67 | ] 68 | else: 69 | restricted_blocks = [ 70 | old_blocks[i][:num_fit_now, :self.max_take_[i]] 71 | for i in range(len(old_blocks)) 72 | ] 73 | 74 | old_uniting = unite_parallel(restricted_blocks) 75 | self.regressor_.fit(old_uniting, new_block[:num_fit_now, :]) 76 | 77 | self.fitted_ = True 78 | 79 | def transform(self, old_blocks, new_block): 80 | if (not self.fitted_): 81 | raise NotFittedError( 82 | "instance of {} is not fitted. It can not transform anything". 83 | format(type(self).__name__)) 84 | 85 | if (self.max_take_ is None): 86 | restricted_blocks = [old_block[:, :] for old_block in old_blocks] 87 | else: 88 | if (type(self.max_take_) is int): 89 | restricted_blocks = [ 90 | old_block[:, :self.max_take_] for old_block in old_blocks 91 | ] 92 | else: 93 | restricted_blocks = [ 94 | old_blocks[i][:, :self.max_take_[i]] 95 | for i in range(len(old_blocks)) 96 | ] 97 | 98 | old_uniting = unite_parallel(restricted_blocks) 99 | predictions = self.regressor_.predict(old_uniting) 100 | return subtract_parallel(new_block, predictions) 101 | 102 | def is_fitted(self): 103 | return self.fitted_ 104 | 105 | 106 | class CovariantsIndividualPurifier: 107 | '''Block to purify single covariants lambda channel. It operates with 3 dimensional numpy arrays 108 | with indexing [environmental_index, feature_index, m]''' 109 | def __init__(self, regressor=None, num_to_fit='10x', max_take=None): 110 | if (regressor is None): 111 | self.regressor_ = clone(DEFAULT_LINEAR_REGRESSOR) 112 | self.regressor_.set_params(**{"fit_intercept": False}) 113 | else: 114 | self.regressor_ = regressor 115 | 116 | self.fitted_ = False 117 | self.num_to_fit_ = num_to_fit 118 | self.max_take_ = max_take 119 | if (type(self.max_take_) == list): 120 | self.max_take_ = np.array(self.max_take_) 121 | 122 | if (self.max_take_ 123 | is not None) and (type(self.max_take_) != np.ndarray): 124 | self.max_take_ = int(self.max_take_) 125 | 126 | def fit(self, old_blocks, new_block, l): 127 | total_num = 0 128 | for i in range(len(old_blocks)): 129 | if (self.max_take_ is None): 130 | total_num += old_blocks[i].shape[1] 131 | else: 132 | if (type(self.max_take_) is int): 133 | total_num += min(old_blocks[i].shape[1], self.max_take_) 134 | else: 135 | total_num += min(old_blocks[i].shape[1], self.max_take_[i]) 136 | 137 | if (type(self.num_to_fit_) is str): 138 | multiplier = int(parse('{}x', self.num_to_fit_)[0]) 139 | num_fit_now = multiplier * total_num 140 | else: 141 | num_fit_now = self.num_to_fit_ 142 | 143 | if (num_fit_now > new_block.shape[0] * (l + 1)): 144 | warnings.warn( 145 | "Amount of provided data is less than " 146 | "the desired one to fit InvariantsPurifer. 
" 147 | "Number of old features is {}, " 148 | "desired number of data points is {}, " 149 | "actual number of data points (n_env * (l + 1)) is {}, " 150 | "since number of environments is {}, and l is {}.".format( 151 | total_num, num_fit_now, new_block.shape[0] * (l + 1), 152 | new_block.shape[0], l)) 153 | 154 | if (num_fit_now % (l + 1) == 0): 155 | num_fit_now = num_fit_now // (l + 1) 156 | else: 157 | num_fit_now = (num_fit_now // (l + 1)) + 1 158 | 159 | if (self.max_take_ is None): 160 | old_blocks_reshaped = [] 161 | for old_block in old_blocks: 162 | old_blocks_reshaped.append( 163 | pack_dense(old_block[:num_fit_now], l, old_block.shape[1], 164 | old_block.shape[1])) 165 | else: 166 | if (type(self.max_take_) is int): 167 | size_now = self.max_take_ 168 | else: 169 | size_now = self.max_take_[i] 170 | 171 | old_blocks_reshaped = [] 172 | for old_block in old_blocks: 173 | old_blocks_reshaped.append( 174 | pack_dense( 175 | old_block[:num_fit_now, :min(size_now, old_block. 176 | shape[1])], l, 177 | min(size_now, old_block.shape[1]), 178 | min(size_now, old_block.shape[1]))) 179 | 180 | old_uniting = unite_parallel(old_blocks_reshaped) 181 | new_reshaped = pack_dense(new_block[:num_fit_now], l, 182 | new_block.shape[1], new_block.shape[1]) 183 | self.regressor_.fit(old_uniting, new_reshaped) 184 | self.fitted_ = True 185 | 186 | def transform(self, old_blocks, new_block, l): 187 | if (not self.fitted_): 188 | raise NotFittedError( 189 | "instance of {} is not fitted. It can not transform anything". 190 | format(type(self).__name__)) 191 | 192 | if (self.max_take_ is None): 193 | old_blocks_reshaped = [] 194 | for old_block in old_blocks: 195 | old_blocks_reshaped.append( 196 | pack_dense(old_block, l, old_block.shape[1], 197 | old_block.shape[1])) 198 | else: 199 | if (type(self.max_take_) is int): 200 | size_now = self.max_take_ 201 | else: 202 | size_now = self.max_take_[i] 203 | old_blocks_reshaped = [] 204 | for old_block in old_blocks: 205 | old_blocks_reshaped.append( 206 | pack_dense( 207 | old_block[:, :min(size_now, old_block.shape[1])], l, 208 | min(size_now, old_block.shape[1]), 209 | min(size_now, old_block.shape[1]))) 210 | 211 | old_uniting = unite_parallel(old_blocks_reshaped) 212 | new_reshaped = pack_dense(new_block, l, new_block.shape[1], 213 | new_block.shape[1]) 214 | predictions = self.regressor_.predict(old_uniting) 215 | result = subtract_parallel(new_reshaped, predictions) 216 | return unpack_dense(result, new_block.shape[0], l, new_block.shape[1]) 217 | 218 | def is_fitted(self): 219 | return self.fitted_ 220 | 221 | 222 | class CovariantsPurifier: 223 | '''Block to purify covariants of single parity. 
It operates with instances of Data class with covariants''' 224 | def __init__(self, regressor=None, num_to_fit='10x', max_take=None): 225 | if (regressor is None): 226 | self.regressor_ = clone(DEFAULT_LINEAR_REGRESSOR) 227 | self.regressor_.set_params(**{"fit_intercept": False}) 228 | else: 229 | self.regressor_ = regressor 230 | 231 | self.regressor_.set_params(**{"fit_intercept": False}) 232 | self.fitted_ = False 233 | self.num_to_fit_ = num_to_fit 234 | self.max_take_ = max_take 235 | if (type(self.max_take_) == list): 236 | self.max_take_ = np.array(self.max_take_) 237 | if (self.max_take_ 238 | is not None) and (type(self.max_take_) != np.ndarray): 239 | self.max_take_ = int(self.max_take_) 240 | 241 | def fit(self, old_datas, new_data): 242 | 243 | self.l_max_ = new_data.covariants_.shape[2] - 1 244 | self.purifiers_ = [] 245 | 246 | for l in range(self.l_max_ + 1): 247 | if (self.regressor_ is None): 248 | current_regressor = None 249 | else: 250 | current_regressor = clone(self.regressor_) 251 | self.purifiers_.append( 252 | CovariantsIndividualPurifier(regressor=current_regressor, 253 | num_to_fit=self.num_to_fit_, 254 | max_take=self.max_take_)) 255 | 256 | for l in range(self.l_max_ + 1): 257 | old_blocks_now = [] 258 | for old_data in old_datas: 259 | if (old_data.actual_sizes_[l] > 0): 260 | old_blocks_now.append( 261 | old_data.covariants_[:, :old_data.actual_sizes_[l], 262 | l, :]) 263 | 264 | new_block_now = new_data.covariants_[:, :new_data.actual_sizes_[l], 265 | l, :] 266 | 267 | old_total_size = 0 268 | for old_data in old_datas: 269 | old_total_size += old_data.actual_sizes_[l] 270 | new_size = new_data.actual_sizes_[l] 271 | if (old_total_size == 0) or (new_size == 0): 272 | self.purifiers_[l] = None 273 | else: 274 | self.purifiers_[l].fit(old_blocks_now, new_block_now, l) 275 | 276 | self.fitted_ = True 277 | 278 | def transform(self, old_datas, new_data): 279 | if (not self.fitted_): 280 | raise NotFittedError( 281 | "instance of {} is not fitted. It can not transform anything". 282 | format(type(self).__name__)) 283 | ans = Data(np.empty(new_data.covariants_.shape), 284 | np.copy(new_data.actual_sizes_), 285 | importances=None) 286 | 287 | for l in range(self.l_max_ + 1): 288 | if (self.purifiers_[l] is not None): 289 | old_blocks_now = [ 290 | old_data.covariants_[:, :old_data.actual_sizes_[l], l, :] 291 | for old_data in old_datas 292 | ] 293 | new_block_now = new_data.covariants_[:, :new_data. 294 | actual_sizes_[l], l, :] 295 | now = self.purifiers_[l].transform(old_blocks_now, 296 | new_block_now, l) 297 | ans.covariants_[:, :now.shape[1], 298 | l, :(2 * l + 1)] = now # todo parallelize it 299 | else: 300 | if (ans.actual_sizes_[l] > 0): 301 | ans.covariants_[:, :ans.actual_sizes_[l], l, :(2 * l + 1)] = \ 302 | new_data.covariants_[:, :ans.actual_sizes_[l], l, :(2 * l + 1)] # todo parallelize it 303 | 304 | return ans 305 | 306 | def is_fitted(self): 307 | return self.fitted_ 308 | 309 | 310 | class CovariantsPurifierBoth: 311 | '''Block to purify covariants of both parities. 
It operates with pairs of instances of Data class with covariants''' 312 | def __init__(self, regressor=None, num_to_fit='10x', max_take=None): 313 | self.num_to_fit_ = num_to_fit 314 | self.max_take_ = max_take 315 | if (self.max_take_ 316 | is not None) and (type(self.max_take_) != np.ndarray): 317 | self.max_take_ = int(self.max_take_) 318 | if (regressor is None): 319 | even_regressor, odd_regressor = None, None 320 | else: 321 | even_regressor, odd_regressor = clone(regressor), clone(regressor) 322 | even_regressor.set_params(**{"fit_intercept": False}) 323 | odd_regressor.set_params(**{"fit_intercept": False}) 324 | 325 | self.even_purifier_ = CovariantsPurifier(regressor=even_regressor, 326 | num_to_fit=self.num_to_fit_, 327 | max_take=self.max_take_) 328 | self.odd_purifier_ = CovariantsPurifier(regressor=odd_regressor, 329 | num_to_fit=self.num_to_fit_, 330 | max_take=self.max_take_) 331 | self.fitted_ = False 332 | 333 | def fit(self, old_datas_even, new_data_even, old_datas_odd, new_data_odd): 334 | 335 | self.even_purifier_.fit(old_datas_even, new_data_even) 336 | self.odd_purifier_.fit(old_datas_odd, new_data_odd) 337 | self.fitted_ = True 338 | 339 | def transform(self, old_datas_even, new_data_even, old_datas_odd, 340 | new_data_odd): 341 | if (not self.fitted_): 342 | raise NotFittedError( 343 | "instance of {} is not fitted. It can not transform anything". 344 | format(type(self).__name__)) 345 | return self.even_purifier_.transform(old_datas_even, new_data_even),\ 346 | self.odd_purifier_.transform(old_datas_odd, new_data_odd) 347 | 348 | def is_fitted(self): 349 | return self.fitted_ 350 | -------------------------------------------------------------------------------- /nice/clebsch_gordan.py: -------------------------------------------------------------------------------- 1 | from sympy.physics.wigner import clebsch_gordan 2 | from sympy import S 3 | import numpy as np 4 | 5 | 6 | def get_single(l1, l2, l, m1, m2): 7 | return float(clebsch_gordan(S(l1), S(l2), S(l), S(m1), S(m2), S(m1 + m2))) 8 | 9 | 10 | class ClebschGordan: 11 | def __init__(self, l_max): 12 | self.l_max_ = l_max 13 | self.precomputed_ = np.zeros( 14 | [l_max + 1, l_max + 1, l_max + 1, 2 * l_max + 1, 2 * l_max + 1]) 15 | 16 | for l1 in range(l_max + 1): 17 | for l2 in range(l_max + 1): 18 | for l in range(l_max + 1): 19 | for m1 in range(-l_max, l_max + 1): 20 | for m2 in range(-l_max, l_max + 1): 21 | now = get_single(l1, l2, l, m1, m2) 22 | self.precomputed_[l1, l2, l, m1 + l1, 23 | m2 + l2] = now 24 | 25 | 26 | def check_clebsch_gordan(clebsch_gordan, required_l_max): 27 | if (type(clebsch_gordan) != ClebschGordan): 28 | raise ValueError("type of precomputed clebsch gordan " 29 | "coefficients should be ClebschGordan class.") 30 | if (clebsch_gordan.l_max_ < required_l_max): 31 | raise ValueError("given precomputed clebsch gordan coefficients " 32 | "have smaller l_max than required one.") 33 | -------------------------------------------------------------------------------- /nice/contracted_pca.py: -------------------------------------------------------------------------------- 1 | from sklearn.utils.extmath import randomized_svd 2 | import numpy as np 3 | 4 | 5 | def do_sign_covariant_pca(X, n_components): 6 | sums = np.sum(X, axis=1) 7 | signs = ((sums <= 0) - 0.5) * 2.0 8 | X_normalized = signs[:, np.newaxis] * X 9 | U, S, V = randomized_svd(X_normalized, 10 | n_components=n_components, 11 | flip_sign=True) 12 | return U * signs[:, np.newaxis] 13 | 14 | 15 | def do_pca_step(features, 
n_components, normalize=True, epsilon=1e-8): 16 | shape_initial = features.shape 17 | features = np.transpose(features, axes=(0, 2, 3, 1)) 18 | features = np.reshape(features, [-1, features.shape[-1]]) 19 | #features = np.vstack([np.real(features), np.imag(features)]) 20 | 21 | if (normalize): 22 | stds = np.sqrt(np.mean(features * features, axis=0)) 23 | stds = np.maximum(stds, epsilon) 24 | features = features / stds[np.newaxis, :] 25 | 26 | features = do_sign_covariant_pca(features, n_components) 27 | #features = features[0:(features.shape[0] // 2)] + 1j * features[(features.shape[0] // 2):] 28 | features = np.reshape(features, [ 29 | shape_initial[0], shape_initial[2], shape_initial[3], 30 | features.shape[-1] 31 | ]) 32 | features = np.transpose(features, axes=(0, 3, 1, 2)) 33 | return features 34 | -------------------------------------------------------------------------------- /nice/nice_utilities.pxd: -------------------------------------------------------------------------------- 1 | cdef void single_contraction(const double[:, :, :, :, :] clebsh_gordan, 2 | double* first_covariant, int l1, 3 | double* second_covariant, int l2, 4 | int lambd, double* ans_placeholder, 5 | double** buff) nogil 6 | 7 | 8 | cdef int min_c(int a, int b) nogil 9 | cdef int max_c(int a, int b) nogil 10 | 11 | cdef int abs_c(int a) nogil 12 | 13 | 14 | -------------------------------------------------------------------------------- /nice/packing.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import os 3 | import numpy as np 4 | from cython.parallel import prange 5 | from multiprocessing import cpu_count 6 | 7 | cdef int switch_to_parallel_after = 36000000 8 | 9 | 10 | 11 | @cython.boundscheck(False) 12 | @cython.wraparound(False) 13 | cpdef pack_dense(double[:, :, :] covariants, int l, 14 | int n_feat, int desired_n_feat, num_threads = None): 15 | cdef int num_threads_int 16 | if (num_threads is None): 17 | num_threads_int = len(os.sched_getaffinity(0)) 18 | else: 19 | num_threads_int = num_threads 20 | 21 | cdef int n_envs = covariants.shape[0] 22 | cdef int num_per_feat = (l + 1) 23 | res = np.empty([n_envs * (2 * l + 1), desired_n_feat]) 24 | cdef double[:, :] res_view = res 25 | cdef int env_ind, feat_ind, now, m 26 | 27 | if (n_feat * (2 * l + 1) * n_envs) > switch_to_parallel_after: 28 | for env_ind in prange(n_envs, nogil = True, schedule = 'static', num_threads = num_threads_int): 29 | for feat_ind in range(n_feat): 30 | for m in range(2 * l + 1): 31 | res_view[m + env_ind * (2 * l + 1), feat_ind] = covariants[env_ind, feat_ind, m] 32 | 33 | for env_ind in prange(n_envs, nogil = True, schedule = 'static', num_threads = num_threads_int): 34 | for feat_ind in range(n_feat, desired_n_feat): 35 | for m in range(2 * l + 1): 36 | res_view[m + env_ind * (2 * l + 1), feat_ind] = 0.0 37 | 38 | else: 39 | for env_ind in range(n_envs): 40 | for feat_ind in range(n_feat): 41 | for m in range(2 * l + 1): 42 | res_view[m + env_ind * (2 * l + 1), feat_ind] = covariants[env_ind, feat_ind, m] 43 | 44 | for env_ind in range(n_envs): 45 | for feat_ind in range(n_feat, desired_n_feat): 46 | for m in range(2 * l + 1): 47 | res_view[m + env_ind * (2 * l + 1), feat_ind] = 0.0 48 | '''for feat_ind in prange(n_feat, nogil = True, schedule = 'static', num_threads = num_threads_int): 49 | now = 0 50 | for env_ind in range(n_envs): 51 | for m in range(2 * l + 1): 52 | res_view[now, feat_ind] = covariants[env_ind, feat_ind, m] 53 | now = now + 1 54 | 55 | for 
feat_ind in prange(n_feat, desired_n_feat, nogil = True, schedule = 'static', num_threads = num_threads_int): 56 | now = 0 57 | for env_ind in range(n_envs): 58 | for m in range(2 * l + 1): 59 | res_view[now, feat_ind] = 0.0 60 | now = now + 1''' 61 | 62 | return res 63 | 64 | '''@cython.boundscheck(False) 65 | @cython.wraparound(False) 66 | cdef transform_inplace(double[:, :, :] covariants, double[:, :] components, 67 | int l, int n_feat): 68 | cdef int n_envs = covariants.shape[0] 69 | res = np.zeros([n_envs, components.shape[0], 2 * l + 1]) 70 | cdef double[:, :, :] res_view = res 71 | cdef int feat_ind, env_ind, m, i 72 | 73 | for env_ind in range(n_envs): 74 | for feat_ind in range(components.shape[0]): 75 | for m in range(2 * l + 1): 76 | for i in range(n_feat): 77 | res_view[env_ind, feat_ind, m] += components[feat_ind, i] * covariants[env_ind, i, m] 78 | return res''' 79 | 80 | @cython.boundscheck(False) 81 | @cython.wraparound(False) 82 | cpdef unpack_dense(double[:, :] packed, int n_envs, int l, int n_feat, num_threads = None): 83 | cdef int num_threads_int 84 | if (num_threads is None): 85 | num_threads_int = len(os.sched_getaffinity(0)) 86 | else: 87 | num_threads_int = num_threads 88 | 89 | res = np.empty([n_envs, n_feat, 2 * l + 1]) 90 | cdef double[:, :, :] res_view = res 91 | cdef int feat_ind, now, env_ind, m 92 | 93 | '''for feat_ind in prange(n_feat, nogil = True, schedule = 'static', num_threads = num_threads_int): 94 | now = 0 95 | for env_ind in range(n_envs): 96 | for m in range(2 * l + 1): 97 | res_view[env_ind, feat_ind, m] = packed[now, feat_ind] 98 | now = now + 1''' 99 | if (n_feat * (2 * l + 1) * n_envs) > switch_to_parallel_after: 100 | for env_ind in prange(n_envs, nogil = True, schedule = 'static', num_threads = num_threads_int): 101 | for feat_ind in range(n_feat): 102 | for m in range(2 * l + 1): 103 | res_view[env_ind, feat_ind, m] = packed[m + env_ind * (2 * l + 1), feat_ind] 104 | 105 | else: 106 | for env_ind in range(n_envs): 107 | for feat_ind in range(n_feat): 108 | for m in range(2 * l + 1): 109 | res_view[env_ind, feat_ind, m] = packed[m + env_ind * (2 * l + 1), feat_ind] 110 | return res 111 | 112 | @cython.boundscheck(False) 113 | @cython.wraparound(False) 114 | cpdef copy_parallel(double[:, :] source, double[:, :] destination, num_threads = None): 115 | cdef int num_threads_int 116 | if (num_threads is None): 117 | num_threads_int = len(os.sched_getaffinity(0)) 118 | else: 119 | num_threads_int = num_threads 120 | 121 | cdef int env_ind, feat_ind 122 | cdef int n_feat = source.shape[1] 123 | if (source.shape[0] * source.shape[1] > switch_to_parallel_after): 124 | for env_ind in prange(source.shape[0], nogil = True, schedule = 'static', num_threads = num_threads_int): 125 | for feat_ind in range(n_feat): 126 | destination[env_ind, feat_ind] = source[env_ind, feat_ind] 127 | else: 128 | for env_ind in range(source.shape[0]): 129 | for feat_ind in range(n_feat): 130 | destination[env_ind, feat_ind] = source[env_ind, feat_ind] 131 | 132 | 133 | def unite_parallel(blocks, num_threads = None): 134 | total_size = 0 135 | for block in blocks: 136 | total_size += block.shape[1] 137 | res = np.empty([blocks[0].shape[0], total_size]) 138 | now = 0 139 | for block in blocks: 140 | copy_parallel(block, res[:, now : now + block.shape[1]], num_threads = num_threads) 141 | now += block.shape[1] 142 | return res 143 | 144 | @cython.boundscheck(False) 145 | @cython.wraparound(False) 146 | cpdef subtract_parallel(double[:, :] a, double[:, :] b, num_threads = 
None): 147 | result = np.empty([a.shape[0], a.shape[1]]) 148 | cdef double[:, :] result_view = result 149 | 150 | cdef int num_threads_int 151 | if (num_threads is None): 152 | num_threads_int = len(os.sched_getaffinity(0)) 153 | else: 154 | num_threads_int = num_threads 155 | 156 | cdef int env_ind, feat_ind 157 | cdef int n_feat = a.shape[1] 158 | if (a.shape[0] * a.shape[1] > switch_to_parallel_after): 159 | for env_ind in prange(a.shape[0], nogil = True, schedule = 'static', num_threads = num_threads_int): 160 | for feat_ind in range(n_feat): 161 | result_view[env_ind, feat_ind] = a[env_ind, feat_ind] - b[env_ind, feat_ind] 162 | else: 163 | for env_ind in range(a.shape[0]): 164 | for feat_ind in range(n_feat): 165 | result_view[env_ind, feat_ind] = a[env_ind, feat_ind] - b[env_ind, feat_ind] 166 | 167 | return result 168 | 169 | 170 | '''@cython.boundscheck(False) 171 | @cython.wraparound(False) 172 | cpdef accumulate(double[:, :] values, int[:] structure_indices, int central_now, 173 | double[:, :] ans): 174 | 175 | cdef int env_ind, feat_ind, n_feat = values.shape[1] 176 | cdef int now = 0 177 | for env_ind in range(values.shape[0]): 178 | for feat_ind in range(n_feat): 179 | ans[structure_indices[env_ind], feat_ind] += values[env_ind, feat_ind] 180 | 181 | def accumulate_to_structures(structures, values): 182 | all_species = [] 183 | for structure in structures: 184 | all_species.append(structure.get_atomic_numbers()) 185 | all_species = np.concatenate(all_species, axis = 0) 186 | species = np.unique(all_species) 187 | all_species = all_species.astype(np.int32) 188 | species = species.astype(np.int32) 189 | 190 | result = {} 191 | for specie in tqdm.tqdm(species): 192 | num_now = np.sum(all_species == specie) 193 | result[specie] = np.empty([num_now, coefficients.shape[1], coefficients.shape[2], coefficients.shape[3]]) 194 | copy_coefs(coefficients, all_species, specie, result[specie]) 195 | return result ''' 196 | -------------------------------------------------------------------------------- /nice/rascal_coefficients.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport cython 3 | from nice_utilities cimport single_contraction, min_c, abs_c, max_c 4 | from libc.math cimport sin, M_PI, sqrt, fmax 5 | import tqdm 6 | import rascal 7 | import os 8 | from ase import Atoms 9 | from rascal.representations import SphericalInvariants as SOAP 10 | from rascal.representations import SphericalExpansion as SPH 11 | from rascal.neighbourlist.structure_manager import ( 12 | mask_center_atoms_by_species, mask_center_atoms_by_id) 13 | import warnings 14 | import copy 15 | from multiprocessing import Pool, cpu_count 16 | 17 | 18 | @cython.boundscheck(False) 19 | @cython.wraparound(False) 20 | cpdef void copy_coefs(double[:, :, :, :] coefficients, int[:] central_species, int central_now, 21 | double[:, :, :, :] ans): 22 | cdef int now = 0 23 | cdef int n_radial = coefficients.shape[1] 24 | cdef int l_max = coefficients.shape[2] - 1 25 | cdef int env_ind, radial_ind, l, m 26 | 27 | 28 | for env_ind in range(coefficients.shape[0]): 29 | if central_species[env_ind] == central_now: 30 | for radial_ind in range(n_radial): 31 | for l in range(l_max + 1): 32 | for m in range(2 * l_max + 1): 33 | ans[now, radial_ind, l, m] = coefficients[env_ind, radial_ind, l, m] 34 | now += 1 35 | 36 | 37 | def split_by_central_specie(all_species, species, coefficients, show_progress = True): 38 | result = {} 39 | for specie in tqdm.tqdm(species, 
disable = not show_progress): 40 | num_now = np.sum(all_species == specie) 41 | result[specie] = np.empty([num_now, coefficients.shape[1], coefficients.shape[2], coefficients.shape[3]]) 42 | copy_coefs(coefficients, all_species, specie, result[specie]) 43 | return result 44 | 45 | 46 | 47 | @cython.boundscheck(False) 48 | @cython.wraparound(False) 49 | cpdef convert_rascal_coefficients(double[:, :] coefficients, int n_max, int n_types, int l_max): 50 | cdef int n_envs = coefficients.shape[0] 51 | cdef int env_ind, n, l, m 52 | cdef int n_radial = n_max * n_types 53 | cdef int now 54 | ans = np.zeros([n_envs, n_radial, l_max + 1, 2 * l_max + 1]) 55 | cdef double[:, :, :, :] ans_view = ans 56 | 57 | for env_ind in range(n_envs): 58 | now = 0 59 | for n in range(n_radial): 60 | for l in range(l_max + 1): 61 | for m in range(-l, l + 1): 62 | ans_view[env_ind, n, l, m + l] = coefficients[env_ind, now] 63 | now += 1 64 | return ans 65 | 66 | 67 | 68 | def process_structures(structures, delta = 0.1): 69 | """Satisfying librascal desire of having all atoms 70 | inside the cell even if structure is not periodic. 71 | (changes only non periodic structures) 72 | """ 73 | 74 | result = [] 75 | for structure in structures: 76 | if True in structure.pbc: 77 | result.append(copy.deepcopy(structure)) 78 | else: 79 | current = copy.deepcopy(structure) 80 | for dim in range(3): 81 | min_now = np.min( current.positions[:, dim]) 82 | current.positions[:, dim] = current.positions[:, dim] - min_now + delta 83 | 84 | spreads = [] 85 | for dim in range(3): 86 | spreads.append(np.max(current.positions[:, dim]) + delta) 87 | current.cell = spreads 88 | result.append(current) 89 | return result 90 | 91 | 92 | def get_rascal_coefficients(structures, HYPERS, n_types): 93 | 94 | 95 | sph = SPH(**HYPERS) 96 | try: 97 | n_max = HYPERS['max_radial'] 98 | l_max = HYPERS['max_angular'] 99 | except KeyError: 100 | raise KeyError("max_radial and max_angular should be specified") 101 | 102 | structures = process_structures(structures) 103 | 104 | feat = sph.transform(structures).get_features(sph) 105 | res = convert_rascal_coefficients(feat, n_max, n_types, l_max) 106 | 107 | #if (normalize): 108 | # normalize_by_ps(res) 109 | return np.array(res) 110 | 111 | 112 | def get_rascal_coefficients_stared(task): 113 | return get_rascal_coefficients(*task) 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /nice/thresholding.pyx: -------------------------------------------------------------------------------- 1 | from libc.math cimport sin, M_PI, sqrt, fmax 2 | cimport cython 3 | import numpy as np 4 | #from cython.parallel cimport prange 5 | from nice_utilities cimport min_c, abs_c, max_c 6 | cdef double sqrt_2 = sqrt(2.0) 7 | #from nice_utilities import Data 8 | 9 | cdef enum Mode: 10 | covariants, invariants 11 | 12 | 13 | 14 | cdef get_thresholded_task(double[:, :] first_importances, int[:] first_actual_sizes, 15 | double[:, :] second_importances, int[:] second_actual_sizes, 16 | double threshold, int known_num, int l_max, Mode mode): 17 | if mode == Mode.covariants: 18 | return get_thresholded_task_covariants(first_importances, first_actual_sizes, 19 | second_importances, second_actual_sizes, 20 | threshold, known_num, l_max) 21 | if mode == Mode.invariants: 22 | return get_thresholded_task_invariants(first_importances, first_actual_sizes, 23 | second_importances, second_actual_sizes, 24 | threshold, known_num, l_max) 25 | 26 | cdef get_thresholded_task_invariants(double[:, 
:] first_importances, int[:] first_actual_sizes, 27 | double[:, :] second_importances, int[:] second_actual_sizes, 28 | double threshold, int known_num, int l_max): 29 | 30 | ans = np.empty([known_num, 4], dtype = np.int32) 31 | 32 | raw_importances = np.empty([known_num]) 33 | 34 | cdef int[:, :] ans_view = ans 35 | 36 | cdef int l, first_ind, second_ind, lambd 37 | cdef int pos = 0 38 | 39 | for l in range(l_max + 1): 40 | for first_ind in range(first_actual_sizes[l]): 41 | for second_ind in range(second_actual_sizes[l]): 42 | if (first_importances[first_ind, l] * second_importances[second_ind, l] >= threshold): 43 | ans_view[pos, 0] = first_ind 44 | ans_view[pos, 1] = l 45 | ans_view[pos, 2] = second_ind 46 | ans_view[pos, 3] = l 47 | raw_importances[pos] = first_importances[first_ind, l] * second_importances[second_ind, l] 48 | pos += 1 49 | 50 | return [ans[:pos], raw_importances[:pos]] 51 | 52 | cdef get_thresholded_task_covariants(double[:, :] first_importances, int[:] first_actual_sizes, 53 | double[:, :] second_importances, int[:] second_actual_sizes, 54 | double threshold, int known_num, int l_max): 55 | 56 | ans = np.empty([known_num, 4], dtype = np.int32) 57 | 58 | raw_importances = np.empty([known_num]) 59 | 60 | cdef int[:, :] ans_view = ans 61 | 62 | cdef int l1, l2, first_ind, second_ind, lambd 63 | cdef int pos = 0 64 | 65 | for l1 in range(l_max + 1): 66 | for l2 in range(l_max + 1): 67 | for first_ind in range(first_actual_sizes[l1]): 68 | for second_ind in range(second_actual_sizes[l2]): 69 | if (first_importances[first_ind, l1] * second_importances[second_ind, l2] >= threshold): 70 | ans_view[pos, 0] = first_ind 71 | ans_view[pos, 1] = l1 72 | ans_view[pos, 2] = second_ind 73 | ans_view[pos, 3] = l2 74 | raw_importances[pos] = first_importances[first_ind, l1] * second_importances[second_ind, l2] 75 | pos += 1 76 | 77 | return [ans[:pos], raw_importances[:pos]] 78 | 79 | 80 | cpdef get_thresholded_tasks(first_even, first_odd, second_even, second_odd, int desired_num, int l_max, mode_string): 81 | 82 | cdef Mode mode 83 | if mode_string == 'covariants': 84 | mode = Mode.covariants 85 | if mode_string == 'invariants': 86 | mode = Mode.invariants 87 | 88 | cdef double threshold_even 89 | cdef int num_even_even, num_odd_odd 90 | threshold_even, num_even_even, num_odd_odd = get_threshold(first_even.importances_, first_even.actual_sizes_, 91 | second_even.importances_, second_even.actual_sizes_, 92 | first_odd.importances_, first_odd.actual_sizes_, 93 | second_odd.importances_, second_odd.actual_sizes_, 94 | desired_num, mode) 95 | 96 | cdef double threshold_odd 97 | cdef int num_even_odd, num_odd_even 98 | threshold_odd, num_even_odd, num_odd_even = get_threshold(first_even.importances_, first_even.actual_sizes_, 99 | second_odd.importances_, second_odd.actual_sizes_, 100 | first_odd.importances_, first_odd.actual_sizes_, 101 | second_even.importances_, second_even.actual_sizes_, 102 | desired_num, mode) 103 | 104 | 105 | 106 | task_even_even = get_thresholded_task(first_even.importances_, first_even.actual_sizes_, 107 | second_even.importances_, second_even.actual_sizes_, 108 | threshold_even, num_even_even, l_max, mode) 109 | 110 | task_odd_odd = get_thresholded_task(first_odd.importances_, first_odd.actual_sizes_, 111 | second_odd.importances_, second_odd.actual_sizes_, 112 | threshold_even, num_odd_odd, l_max, mode) 113 | 114 | task_even_odd = get_thresholded_task(first_even.importances_, first_even.actual_sizes_, 115 | second_odd.importances_, second_odd.actual_sizes_, 
116 | threshold_odd, num_even_odd, l_max, mode) 117 | 118 | task_odd_even = get_thresholded_task(first_odd.importances_, first_odd.actual_sizes_, 119 | second_even.importances_, second_even.actual_sizes_, 120 | threshold_odd, num_odd_even, l_max, mode) 121 | 122 | return task_even_even, task_odd_odd, task_even_odd, task_odd_even 123 | 124 | 125 | 126 | cdef get_threshold(double[:, :] first_importances_1, int[:] first_actual_sizes_1, 127 | double[:, :] second_importances_1, int[:] second_actual_sizes_1, 128 | double[:, :] first_importances_2, int[:] first_actual_sizes_2, 129 | double[:, :] second_importances_2, int[:] second_actual_sizes_2, 130 | int desired_num, Mode mode, int min_iterations = 50): 131 | 132 | 133 | if (desired_num == -1): 134 | num_1_1 = get_total_num_full(first_importances_1, first_actual_sizes_1, second_importances_1, second_actual_sizes_1, -1.0, mode) 135 | num_2_2 = get_total_num_full(first_importances_2, first_actual_sizes_2, second_importances_2, second_actual_sizes_2, -1.0, mode) 136 | return -1.0, num_1_1, num_2_2 137 | 138 | cdef double left = -1.0 139 | cdef double first = get_upper_threshold(first_importances_1, first_actual_sizes_1, second_importances_1, second_actual_sizes_1, mode) + 1.0 140 | cdef double second = get_upper_threshold(first_importances_2, first_actual_sizes_2, second_importances_2, second_actual_sizes_2, mode) + 1.0 141 | 142 | cdef double right = fmax(first, second) 143 | cdef double middle = (left + right) / 2.0 144 | cdef int num_now, num_previous = -1 145 | cdef int num_it_no_change = 0 146 | while (True): 147 | middle = (left + right) / 2.0 148 | num_now = get_total_num_full(first_importances_1, first_actual_sizes_1, second_importances_1, second_actual_sizes_1, middle, mode) + get_total_num_full(first_importances_2, first_actual_sizes_2, second_importances_2, second_actual_sizes_2, middle, mode) 149 | 150 | if (num_now == desired_num): 151 | left = middle 152 | break 153 | if (num_now > desired_num): 154 | left = middle 155 | if (num_now < desired_num): 156 | right = middle 157 | 158 | if (num_now == num_previous): 159 | num_it_no_change += 1 160 | if (num_it_no_change > min_iterations): 161 | break 162 | else: 163 | num_it_no_change = 0 164 | num_previous = num_now 165 | 166 | num_1_1 = get_total_num_full(first_importances_1, first_actual_sizes_1, second_importances_1, second_actual_sizes_1, left, mode) 167 | num_2_2 = get_total_num_full(first_importances_2, first_actual_sizes_2, second_importances_2, second_actual_sizes_2, left, mode) 168 | return left, num_1_1, num_2_2 169 | 170 | 171 | cdef double get_upper_threshold(double[:, :] first_importances, int[:] first_actual_sizes, 172 | double[:, :] second_importances, int[:] second_actual_sizes, Mode mode): 173 | if mode == Mode.covariants: 174 | return get_upper_threshold_covariants(first_importances, first_actual_sizes, 175 | second_importances, second_actual_sizes) 176 | if mode == Mode.invariants: 177 | return get_upper_threshold_invariants(first_importances, first_actual_sizes, 178 | second_importances, second_actual_sizes) 179 | 180 | 181 | 182 | cdef double get_upper_threshold_invariants(double[:, :] first_importances, int[:] first_actual_sizes, 183 | double[:, :] second_importances, int[:] second_actual_sizes): 184 | cdef double ans = 0.0 185 | cdef int l 186 | 187 | for l in range(min_c(first_importances.shape[1], second_importances.shape[1])): 188 | if (first_actual_sizes[l] > 0) and (second_actual_sizes[l] > 0): 189 | if (first_importances[0, l] * second_importances[0, l] > 
ans): 190 | ans = first_importances[0, l] * second_importances[0, l] 191 | 192 | return ans 193 | 194 | 195 | cdef double get_upper_threshold_covariants(double[:, :] first_importances, int[:] first_actual_sizes, 196 | double[:, :] second_importances, int[:] second_actual_sizes): 197 | cdef double ans = 0.0 198 | cdef int l1, l2 199 | 200 | cdef int second_size = second_importances.shape[1] 201 | for l1 in range(first_importances.shape[1]): 202 | for l2 in range(second_size): 203 | if (first_actual_sizes[l1] > 0) and (second_actual_sizes[l2] > 0): 204 | if (first_importances[0, l1] * second_importances[0, l2] > ans): 205 | ans = first_importances[0, l1] * second_importances[0, l2] 206 | 207 | return ans 208 | 209 | 210 | cdef int get_total_num_full(double[:, :] first_importances, int[:] first_actual_sizes, 211 | double[:, :] second_importances, int[:] second_actual_sizes, 212 | double threshold, Mode mode): 213 | if mode == Mode.covariants: 214 | return get_total_num_full_covariants(first_importances, first_actual_sizes, 215 | second_importances, second_actual_sizes, 216 | threshold) 217 | if mode == Mode.invariants: 218 | return get_total_num_full_invariants(first_importances, first_actual_sizes, 219 | second_importances, second_actual_sizes, 220 | threshold) 221 | 222 | cdef int get_total_num_full_invariants(double[:, :] first_importances, int[:] first_actual_sizes, 223 | double[:, :] second_importances, int[:] second_actual_sizes, 224 | double threshold): 225 | cdef int l 226 | cdef int second_size = second_importances.shape[1] 227 | cdef int res = 0 228 | for l in range(min_c(first_importances.shape[1], second_importances.shape[1])): 229 | if (first_actual_sizes[l] > 0) and (second_actual_sizes[l] > 0): 230 | res += get_total_num(first_importances[:first_actual_sizes[l], l], 231 | second_importances[:second_actual_sizes[l], l], threshold) 232 | return res 233 | 234 | 235 | 236 | cdef int get_total_num_full_covariants(double[:, :] first_importances, int[:] first_actual_sizes, 237 | double[:, :] second_importances, int[:] second_actual_sizes, 238 | double threshold): 239 | cdef int l1, l2 240 | cdef int second_size = second_importances.shape[1] 241 | cdef int res = 0 242 | for l1 in range(first_importances.shape[1]): 243 | for l2 in range(second_size): 244 | if (first_actual_sizes[l1] > 0) and (second_actual_sizes[l2] > 0): 245 | res += get_total_num(first_importances[:first_actual_sizes[l1], l1], 246 | second_importances[:second_actual_sizes[l2], l2], threshold) 247 | 248 | return res 249 | 250 | cdef int get_total_num(double[:] a, double[:] b, double threshold): 251 | cdef int b_size = b.shape[0] 252 | cdef int i, j, ans 253 | i = 0 254 | j = b_size 255 | ans = 0 256 | for i in range(a.shape[0]): 257 | while ((j > 0) and (a[i] * b[j - 1] < threshold)): 258 | j -= 1 259 | ans += j 260 | return ans 261 | -------------------------------------------------------------------------------- /nice/unrolling_individual_pca.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import numpy as np 3 | from nice.packing import pack_dense, unpack_dense 4 | 5 | from sklearn.decomposition import TruncatedSVD #not center the data 6 | class UnrollingIndividualPCA(TruncatedSVD): 7 | def __init__(self, *args, normalize_importances = True, **kwargs): 8 | self.normalize_importances_ = normalize_importances 9 | super().__init__(*args, **kwargs) 10 | 11 | def fit(self, *args): 12 | if (len(args) == 1): 13 | return super().fit(args[0]) 14 | #print("num 
components: ", self.n_components) 15 | covariants, l = args 16 | n_feat = covariants.shape[1] 17 | if (self.n_components > n_feat): 18 | #print("in if: ", self.n_components, n_feat) 19 | self.n_components = n_feat 20 | 21 | self.l_ = l 22 | if (self.n_components < n_feat): 23 | packed = pack_dense(covariants, l, n_feat, n_feat) 24 | if (self.n_components == n_feat): 25 | packed = pack_dense(covariants, l, n_feat, n_feat + 1) 26 | res = super().fit_transform(packed) 27 | 28 | self.importances_ = np.mean(res * res, axis = 0) 29 | if (self.normalize_importances_): 30 | self.importances_ = self.importances_ / np.sum(self.importances_) 31 | indices = np.argsort(self.importances_)[::-1] 32 | self.importances_ = self.importances_[indices] 33 | self.components_ = self.components_[indices] 34 | self.explained_variance_ = self.explained_variance_[indices] 35 | self.explained_variance_ratio_ = self.explained_variance_ratio_[indices] 36 | self.singular_values_ = self.singular_values_[indices] 37 | 38 | def fit_transform(self, *args): 39 | if (len(args) ==1): 40 | return super().fit_transform(args[0]) 41 | covariants, l = args 42 | n_feat = covariants.shape[1] 43 | #print("num components: ", self.n_components) 44 | if (self.n_components > n_feat): 45 | #print("in if: ", self.n_components, n_feat) 46 | self.n_components = n_feat 47 | 48 | self.l_ = l 49 | if (self.n_components < n_feat): 50 | packed = pack_dense(covariants, l, n_feat, n_feat) 51 | if (self.n_components == n_feat): 52 | packed = pack_dense(covariants, l, n_feat, n_feat + 1) 53 | 54 | res = super().fit_transform(packed) 55 | self.importances_ = np.mean(res * res, axis = 0) 56 | if (self.normalize_importances_): 57 | self.importances_ = self.importances_ / np.sum(self.importances_) 58 | indices = np.argsort(self.importances_)[::-1] 59 | self.importances_ = self.importances_[indices] 60 | self.components_ = self.components_[indices] 61 | 62 | res = super().transform(packed) 63 | return unpack_dense(res, covariants.shape[0], 64 | self.l_, self.n_components) 65 | 66 | 67 | def transform(self, *args): 68 | 69 | 70 | if (len(args) == 1): 71 | return super().transform(args) 72 | #print("components shape: ", self.components_.shape) 73 | #print("num components: ", self.n_components) 74 | covariants, l = args 75 | n_feat = covariants.shape[1] 76 | 77 | if (self.n_components < n_feat): 78 | packed = pack_dense(covariants, l, n_feat, n_feat) 79 | if (self.n_components == n_feat): 80 | packed = pack_dense(covariants, l, n_feat, n_feat + 1) 81 | res = super().transform(packed) 82 | return unpack_dense(res, covariants.shape[0], 83 | self.l_, self.n_components) 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /nice/unrolling_pca.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import numpy as np 3 | from libc.math cimport fabs 4 | 5 | @cython.boundscheck(False) 6 | @cython.wraparound(False) 7 | cpdef pack_dense(double[:, :, :, :] coefficients): 8 | cdef int n_envs = coefficients.shape[0] 9 | cdef int n_feat = coefficients.shape[1] 10 | cdef int l_max = coefficients.shape[2] - 1 11 | 12 | 13 | 14 | cdef int num_per_feat = (l_max + 1) * (l_max + 1) 15 | res = np.zeros([num_per_feat * n_envs, n_feat]) 16 | cdef double[:, :] res_view = res 17 | 18 | cdef int env_ind, feat_ind, now, l, m 19 | for feat_ind in range(n_feat): 20 | now = 0 21 | for env_ind in range(n_envs): 22 | for l in range(l_max + 1): 23 | for m in range(2 * l + 1): 24 | 
res_view[now, feat_ind] = coefficients[env_ind, feat_ind, l, m] 25 | now += 1 26 | 27 | return res 28 | 29 | cpdef unpack_dense(double[:, :] packed, int n_envs, int l_max): 30 | cdef int n_feat = packed.shape[1] 31 | 32 | res = np.zeros([n_envs, n_feat, l_max + 1, 2 * l_max + 1]) 33 | cdef double[:, :, :, :] res_view = res 34 | cdef int feat_ind, now, env_ind, l, m 35 | 36 | for feat_ind in range(n_feat): 37 | now = 0 38 | for env_ind in range(n_envs): 39 | for l in range(l_max + 1): 40 | for m in range(2 * l + 1): 41 | res_view[env_ind, feat_ind, l, m] = packed[now, feat_ind] 42 | now += 1 43 | return res 44 | 45 | cpdef get_signs(double[:, :] ar, epsilon = 1e-10): 46 | res = np.zeros([ar.shape[0]]) 47 | cdef double[:] res_view = res 48 | cdef int n_feat = ar.shape[1] 49 | cdef int i, j 50 | cdef double max_absolute_now 51 | for i in range(ar.shape[0]): 52 | max_absolute_now = ar[i, 0] 53 | for j in range(n_feat): 54 | if (fabs(ar[i, j]) > fabs(max_absolute_now)): 55 | max_absolute_now = ar[i, j] 56 | 57 | if (max_absolute_now > epsilon): 58 | res_view[i] = 1.0 59 | if (max_absolute_now < epsilon): 60 | res_view[i] = -1.0 61 | 62 | return res 63 | 64 | 65 | from sklearn.decomposition import TruncatedSVD #not center the data 66 | class UnrollingPCA(TruncatedSVD): 67 | def __init__(self, *args, **kwargs): 68 | super().__init__(*args, **kwargs) 69 | 70 | def fit_transform(self, coefficients): 71 | if (len(coefficients.shape) == 2): 72 | return super().fit_transform(coefficients) 73 | self.n_feat_ = coefficients.shape[1] 74 | self.l_max_ = coefficients.shape[2] - 1 75 | packed = pack_dense(coefficients) 76 | res = super().fit_transform(packed) 77 | return unpack_dense(res, coefficients.shape[0], 78 | self.l_max_) 79 | 80 | def fit(self, coefficients): 81 | if (len(coefficients.shape) == 2): 82 | return super().fit_transform(coefficients) 83 | self.n_feat_ = coefficients.shape[1] 84 | self.l_max_ = coefficients.shape[2] - 1 85 | packed = pack_dense(coefficients) 86 | super().fit(packed) 87 | 88 | def transform(self, coefficients): 89 | if (len(coefficients.shape) == 2): 90 | return super().fit_transform(coefficients) 91 | if (self.n_feat_ != coefficients.shape[1]): 92 | raise ValueError("wrong shape") 93 | if (self.l_max_ != coefficients.shape[2] - 1): 94 | raise ValueError("wrong shape") 95 | packed = pack_dense(coefficients) 96 | res = super().transform(packed) 97 | return unpack_dense(res, coefficients.shape[0], 98 | self.l_max_) 99 | 100 | -------------------------------------------------------------------------------- /nice/utilities.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | import numpy as np 3 | import nice.rascal_coefficients 4 | import copy 5 | import os 6 | from multiprocessing import Pool, cpu_count 7 | import warnings 8 | 9 | 10 | def get_all_species(structures): 11 | ''' getting all unique atomic species among the structures 12 | 13 | Args: 14 | structures: list of ase atoms objects 15 | 16 | Returns: 17 | sorted numpy array with ints with all unique species in the format where 18 | 1 states for H, 2 for He and so on. 
(inherits from ase function 19 |         atoms_object.get_atomic_numbers()) 20 | 21 |     ''' 22 |     all_species = [] 23 |     for structure in structures: 24 |         all_species.append(np.array(structure.get_atomic_numbers())) 25 |     all_species = np.concatenate(all_species, axis=0) 26 |     all_species = np.sort(np.unique(all_species)) 27 |     return all_species 28 | 29 | 30 | def get_compositional_features(structures, all_species): 31 |     ''' getting compositional features suitable for linear regression, which contain information 32 |     about the number of atoms of particular species in the structure 33 | 34 |     Args: 35 |         structures: list of ASE Atoms objects 36 |         all_species: numpy array with ints of all unique species in the dataset. \ 37 |             If the all_species argument is the same for several calls of this function, the resulting \ 38 |             blocks of compositional features are guaranteed to be consistent with each other 39 | 40 |     Return: 41 |         numpy array with shape [len(structures), len(all_species)] with compositional features 42 |     ''' 43 |     result = np.zeros([len(structures), len(all_species)]) 44 |     for i, structure in tqdm.tqdm(enumerate(structures)): 45 |         species_now = structure.get_atomic_numbers() 46 |         for j, specie in enumerate(all_species): 47 |             num = np.sum(species_now == specie) 48 |             result[i, j] = num 49 |     return result 50 | 51 | 52 | def get_spherical_expansion(structures, 53 |                             rascal_hypers, 54 |                             all_species, 55 |                             task_size=100, 56 |                             num_threads=None, 57 |                             split_by_central_specie = True, 58 |                             show_progress=True): 59 |     '''getting spherical expansion coefficients 60 | 61 |     Args: 62 |         structures: list of ASE Atoms objects 63 |         rascal_hypers: dictionary with parameters for librascal controlling the spherical expansion 64 |         all_species: numpy array with ints of all unique species in the dataset. \ 65 |             If the all_species argument is the same for several calls of this function, the resulting \ 66 |             blocks of spherical expansion coefficients are guaranteed to be consistent with each other 67 |         task_size: number of structures per chunk for multiprocessing 68 |         num_threads: number of threads for multiprocessing. If None then all available \ 69 |             (len(os.sched_getaffinity(0))) threads are used 70 |         split_by_central_specie: whether or not to group spherical expansion coefficients by central species 71 |         show_progress: whether or not to show progress via tqdm 72 | 73 |     Return: 74 |         dictionary in which keys are elements of all_species and entries are numpy arrays with indexing 75 |         [environmental index, radial basis/neighbor specie index, lambda, m] with spherical expansion coefficients for 76 |         environments around atoms with the species indicated in the key. Coefficients are stored from the beginning, 77 |         i. e. 
[:, : lambda, :(2 * lambda + 1)] elements are valid 78 | ''' 79 | hypers = copy.deepcopy(rascal_hypers) 80 | 81 | if ('expansion_by_species_method' in hypers.keys()): 82 | if (hypers['expansion_by_species_method'] != 'user defined'): 83 | raise ValueError( 84 | "for proper packing spherical expansion coefficients into [env index, radial/specie index, l, m] shape output should be uniform, thus 'expansion_by_species_method' must be 'user defined'" 85 | ) 86 | 87 | hypers['expansion_by_species_method'] = 'user defined' 88 | 89 | species_list = [] 90 | for structure in structures: 91 | species_list.append(structure.get_atomic_numbers()) 92 | species_list = np.concatenate(species_list, axis=0) 93 | species_list = species_list.astype(np.int32) 94 | all_species = all_species.astype(np.int32) 95 | 96 | if ('global_species' not in hypers.keys()): 97 | hypers['global_species'] = [int(specie) for specie in all_species] 98 | else: 99 | for specie in all_species: 100 | if (specie not in hypers['global_species']): 101 | warnings.warn( 102 | "atom with type {} is presented in the all_species argument to this function but it is not listed in the global_species, adding it" 103 | .format(specie)) 104 | hypers['global_species'].append(int(specie)) 105 | 106 | all_species = np.array(hypers['global_species']).astype(np.int32) 107 | 108 | if (num_threads is None): 109 | num_threads = len(os.sched_getaffinity(0)) 110 | 111 | p = Pool(num_threads) 112 | tasks = [] 113 | for i in range(0, len(structures), task_size): 114 | tasks.append([structures[i:i + task_size], hypers, len(all_species)]) 115 | 116 | result = [ 117 | res for res in tqdm.tqdm(p.imap( 118 | nice.rascal_coefficients.get_rascal_coefficients_stared, tasks), 119 | total=len(tasks), 120 | disable=not show_progress) 121 | ] 122 | p.close() 123 | p.join() 124 | result = np.concatenate(result, axis=0) 125 | if (split_by_central_specie): 126 | return nice.rascal_coefficients.split_by_central_specie( 127 | species_list, all_species, result, show_progress=show_progress) 128 | else: 129 | return result 130 | 131 | 132 | def make_structural_features(features, 133 | structures, 134 | all_species, 135 | show_progress=True): 136 | ''' getting structural features suitable for linear regression which consist of sums \ 137 | over atomic features 138 | 139 | Args: 140 | features: nested dictionary with atomic features. First level keys are central species, \ 141 | second level keys are body orders. Entries are 2-dimensional numpy arrays. 142 | structures: list of Ase atoms objects 143 | all_species: numpy array with ints of all unique species in the dataset. \ 144 | If all species argument is the same for several calls of this function, resulting \ 145 | blocks of structural features are guaranteed to be consistent with each other. \ 146 | If for given block of structures there are no atoms of some particular specie,\ 147 | features dictionary still have to contain key with this specie. It should contain \ 148 | numpy arrays with shapes [0, number of features]. This is need to get proper placing\ 149 | of features to fulfill consistency. 150 | show_progress: whether or not show progress via tqdm 151 | 152 | Return: 153 | numpy array with shape [len(structures), number of structural features] with structural features 154 | ''' 155 | 156 | for specie in all_species: 157 | if (specie not in features.keys()): 158 | raise ValueError( 159 | "all_species contains atomic specie {}, " 160 | "but there are no features for it. 
" 161 | "In case of absence of such atoms in given set " 162 | "of structures provide empty array with shape " 163 | "[0, num_features] which is needed to " 164 | "determine proper shape of output ".format(specie)) 165 | 166 | start_indices, end_indices = {}, {} 167 | now = 0 168 | for specie_index in all_species: 169 | start_indices[specie_index] = {} 170 | end_indices[specie_index] = {} 171 | for body_order_index in features[specie_index].keys(): 172 | start_indices[specie_index][body_order_index] = now 173 | now += features[specie_index][body_order_index].shape[1] 174 | end_indices[specie_index][body_order_index] = now 175 | 176 | total_size = now 177 | 178 | result = np.zeros([len(structures), total_size]) 179 | 180 | current_positions = {} 181 | for specie in all_species: 182 | current_positions[specie] = 0 183 | 184 | for i in tqdm.tqdm(range(len(structures)), disable=not (show_progress)): 185 | species_now = structures[i].get_atomic_numbers() 186 | for specie in all_species: 187 | num_atoms_now = np.sum(species_now == specie) 188 | if (num_atoms_now == 0): 189 | continue 190 | 191 | for body_order in features[specie].keys(): 192 | features_now = np.sum( 193 | features[specie][body_order][current_positions[specie]:( 194 | current_positions[specie] + num_atoms_now)], 195 | axis=0) 196 | result[i, start_indices[specie][body_order]:end_indices[specie] 197 | [body_order]] = features_now 198 | 199 | current_positions[specie] += num_atoms_now 200 | 201 | return result 202 | 203 | 204 | def transform_sequentially(nice, 205 | structures, 206 | rascal_hypers, 207 | all_species, 208 | block_size=500, 209 | show_progress=True): 210 | ''' transforming structures into structural features by chunks in order to use less amount of RAM 211 | 212 | Args: 213 | nice: dictionary where keys are species and entries are nice transformers.\ 214 | If you want to use single nice transformer to all environments regardless of central\ 215 | specie just pass {key : nice_single for specie in all_species} 216 | structures: list of Ase atoms objects 217 | rascal_hypers: dictionary with parameters for librascal controlling spherical expansion.\ 218 | Should be the same as used for fitting nice transformers 219 | all_species: numpy array with ints of all unique species in the dataset. 
220 | block_size: size of chunks measured in number of environments 221 | show_progress: whether or not show progress via tqdm 222 | 223 | 224 | Return: 225 | numpy array with shape [len(structures), number of structural features] with structural features 226 | ''' 227 | 228 | pieces = [] 229 | 230 | for i in tqdm.tqdm(range(0, len(structures), block_size), 231 | disable=not show_progress): 232 | now = {} 233 | coefficients = get_spherical_expansion(structures[i:i + block_size], 234 | rascal_hypers, 235 | all_species, 236 | show_progress=False) 237 | for specie in all_species: 238 | if (coefficients[specie].shape[0] != 0): 239 | now[specie] = nice[specie].transform( 240 | coefficients[specie], return_only_invariants=True) 241 | else: 242 | # determining size of output 243 | dummy_shape = coefficients[specie].shape 244 | dummy_shape = list(dummy_shape) 245 | dummy_shape[0] = 1 246 | dummy_data = np.ones(dummy_shape) 247 | dummy_output = nice[specie].transform( 248 | dummy_data, return_only_invariants=True) 249 | current_block = {} 250 | for key in dummy_output.keys(): 251 | current_block[key] = np.zeros( 252 | [0, dummy_output[key].shape[1]]) 253 | now[specie] = current_block 254 | 255 | pieces.append( 256 | make_structural_features(now, 257 | structures[i:i + block_size], 258 | all_species, 259 | show_progress=False)) 260 | 261 | return np.concatenate(pieces, axis=0) 262 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "Cython"] 3 | -------------------------------------------------------------------------------- /reference_configurations/readme.txt: -------------------------------------------------------------------------------- 1 | some reference configurations for tests 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | numpy 3 | ase 4 | tqdm 5 | scikit-learn 6 | sympy 7 | parse 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension, find_packages 2 | from Cython.Build import cythonize 3 | 4 | with open('requirements.txt', 'r') as f: 5 | requirements = [ 6 | line.strip() for line in f if not line.strip().startswith('#') 7 | ] 8 | 9 | extensions = [ 10 | Extension("nice.*", ["nice/*.pyx"], 11 | extra_compile_args=['-O3', '-fopenmp'], 12 | extra_link_args=['-fopenmp']) 13 | ] 14 | setup( 15 | name='nice', 16 | packages=find_packages(), 17 | install_requires=requirements, 18 | ext_modules=cythonize(extensions), 19 | zip_safe=False, 20 | ) 21 | -------------------------------------------------------------------------------- /tests/compare_kernels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import ase.io 3 | from nice.blocks import * 4 | from nice.utilities import * 5 | from nice.rascal_coefficients import process_structures 6 | import copy 7 | from rascal.representations import SphericalInvariants 8 | 9 | def get_nice_powerspectrum(): 10 | return StandardSequence([ 11 | StandardBlock(None, None, None, 12 | ThresholdExpansioner(num_expand=None, mode='invariants'), 13 | None, None) 14 | ], 15 | initial_scaler=None) 16 | 17 | def 
get_nice_ps_kernel(structures, hypers): 18 | 19 | all_species = get_all_species(structures) 20 | coefficients = get_spherical_expansion(structures, hypers, all_species, split_by_central_specie=False, 21 | show_progress = False) 22 | nice = get_nice_powerspectrum() 23 | nice.fit(coefficients) 24 | nice_ps = nice.transform(coefficients, return_only_invariants = True)[2] 25 | return nice_ps.dot(nice_ps.T) 26 | 27 | def get_rascal_ps_kernel(structures, hypers): 28 | structures = process_structures(structures) 29 | soap = SphericalInvariants(**hypers) 30 | librascal_ps = soap.transform(structures).get_features(soap) 31 | return librascal_ps.dot(librascal_ps.T) 32 | 33 | def test_powerspectrum_kernels(epsilon = 1e-10): 34 | structures = ase.io.read('../reference_configurations/methane_100.extxyz', index = ':') 35 | HYPERS = { 36 | 'interaction_cutoff': 6.3, 37 | 'max_radial': 5, 38 | 'max_angular': 5, 39 | 'gaussian_sigma_type': 'Constant', 40 | 'gaussian_sigma_constant': 0.3, 41 | 'cutoff_smooth_width': 0.3, 42 | 'radial_basis': 'GTO', 43 | 44 | } 45 | 46 | HYPERS_PS = copy.deepcopy(HYPERS) 47 | HYPERS_PS['normalize'] = False 48 | HYPERS_PS['soap_type'] = 'PowerSpectrum' 49 | 50 | nice_kernel = get_nice_ps_kernel(structures, HYPERS) 51 | rascal_kernel = get_rascal_ps_kernel(structures, HYPERS_PS) 52 | 53 | nice_kernel = np.reshape(nice_kernel, [-1]) 54 | rascal_kernel = np.reshape(rascal_kernel, [-1]) 55 | 56 | mask = rascal_kernel > epsilon 57 | nice_kernel = nice_kernel[mask] 58 | rascal_kernel = rascal_kernel[mask] 59 | 60 | ratios = nice_kernel / rascal_kernel 61 | discrepancy = (np.max(ratios) - np.min(ratios)) / np.mean(ratios) 62 | assert discrepancy < epsilon 63 | 64 | -------------------------------------------------------------------------------- /tests/readme.txt: -------------------------------------------------------------------------------- 1 | python3 -m pytest compare_kernels.py 2 | -------------------------------------------------------------------------------- /tutorials/calculating_covariants.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Calculating covariants" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In the previous tutorial, we calculated invariant representations of atomic environments and used them for the prediction of energies - invariant properties. \n", 15 | "\n", 16 | "In the case when there is a need to predict covariant properties, covariants instead of invariants are required. This tutorial shows how to calculate them.\n", 17 | "\n", 18 | "First of all, we need to get **fitted** instance of the model as in the previous tutorial. 
It is done by the following preliminaries cell: (with the only difference that since we want to calculate covariants, we clearly shouldn't leave the covariants branch of the last block empty) " 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# cell to wrap in collapsible in future\n", 28 | "\n", 29 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 30 | "\n", 31 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 32 | "!gunzip -k methane.extxyz.gz\n", 33 | "\n", 34 | "import numpy as np\n", 35 | "import ase.io\n", 36 | "import tqdm\n", 37 | "from nice.blocks import *\n", 38 | "from nice.utilities import *\n", 39 | "from matplotlib import pyplot as plt\n", 40 | "from sklearn.linear_model import BayesianRidge\n", 41 | "\n", 42 | "HARTREE_TO_EV = 27.211386245988\n", 43 | "train_subset = \"0:10000\" #input for ase.io.read command\n", 44 | "test_subset = \"10000:15000\" #input to ase.io.read command\n", 45 | "environments_for_fitting = 1000 #number of environments to fit nice transfomers\n", 46 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000, 3000, 5000, 7500,\n", 47 | " 10000] #for learning curve\n", 48 | "\n", 49 | "#HYPERS for librascal spherical expansion coefficients\n", 50 | "HYPERS = {\n", 51 | " 'interaction_cutoff': 6.3,\n", 52 | " 'max_radial': 5,\n", 53 | " 'max_angular': 5,\n", 54 | " 'gaussian_sigma_type': 'Constant',\n", 55 | " 'gaussian_sigma_constant': 0.05,\n", 56 | " 'cutoff_smooth_width': 0.3,\n", 57 | " 'radial_basis': 'GTO'\n", 58 | "}\n", 59 | "\n", 60 | "\n", 61 | "#our model:\n", 62 | "def get_nice():\n", 63 | " return StandardSequence([\n", 64 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 65 | " CovariantsPurifierBoth(max_take=10),\n", 66 | " IndividualLambdaPCAsBoth(n_components=50),\n", 67 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 68 | " InvariantsPurifier(max_take=50),\n", 69 | " InvariantsPCA(n_components=200)),\n", 70 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 71 | " CovariantsPurifierBoth(max_take=10),\n", 72 | " IndividualLambdaPCAsBoth(n_components=10),\n", 73 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 74 | " InvariantsPurifier(max_take=50),\n", 75 | " InvariantsPCA(n_components=200)),\n", 76 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 77 | " CovariantsPurifierBoth(max_take=10), None,\n", 78 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 79 | " InvariantsPurifier(max_take=50),\n", 80 | " InvariantsPCA(n_components=200))\n", 81 | " ],\n", 82 | " initial_scaler=InitialScaler(\n", 83 | " mode='signal integral', individually=True))\n", 84 | "\n", 85 | "\n", 86 | "train_structures = ase.io.read('methane.extxyz', index=train_subset)\n", 87 | "\n", 88 | "test_structures = ase.io.read('methane.extxyz', index=test_subset)\n", 89 | "\n", 90 | "all_species = get_all_species(train_structures + test_structures)\n", 91 | "\n", 92 | "train_coefficients = get_spherical_expansion(train_structures, HYPERS,\n", 93 | " all_species)\n", 94 | "\n", 95 | "test_coefficients = get_spherical_expansion(test_structures, HYPERS,\n", 96 | " all_species)\n", 97 | "\n", 98 | "#individual nice transformers for each atomic specie in the dataset\n", 99 | "nice = {}\n", 100 | "for key in train_coefficients.keys():\n", 101 | " nice[key] = 
get_nice()\n", 102 | "\n", 103 | "for key in train_coefficients.keys():\n", 104 | " nice[key].fit(train_coefficients[key][:environments_for_fitting])" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Now we need to call **.transform** method with **return_only_invariants = False**, which is the default value:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "data_even, data_odd, invariants_even = nice[1].transform(train_coefficients[1])" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Result is **data_even**, **data_odd** and **invariants_even**. The first two objects are covariants. The last one is invariants. \n", 128 | "\n", 129 | "There is another important symmetry in addition to the translational and rotational one. Usually, atomic properties, such as energy, also transform in a certain way with respect to inversion. Particularly, energy is invariant with respect to it. \n", 130 | "\n", 131 | "In NICE, features are separated into two groups - the ones which are invariant with respect to inversion and the ones that change their sign. The first ones are called even; the second ones are called odd. \n", 132 | "\n", 133 | "Now let's take a look at the returned objects more closely:" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "**Invariants** is the same object as in the previous tutorial - dictionary, where keys are body order." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "for key in invariants_even.keys():\n", 150 | " print(invariants_even[key].shape)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Returned covariants are covariants after the last block, i. e. in our case of body order 4. \n", 158 | "(functionality to get all covariants of all body order from **StandardSequence** will be added in the next version of NICE)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Even covariants are packed in the class Data, which has two relevant fields - \n", 166 | "**.covariants_** and **.actual_sizes_**. (getters are also to be added in the next version) First is np.array with covariants themselves. It has following indexing -**[environmental_index, feature_index, lambda, m]**. But the problem is that for each lambda channel, the actual number of features is different. Thus, the shape of this array doesn't reflect the real number of meaningful entries. 
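In practice one therefore always slices with the stored sizes. A minimal sketch, using the data_even object obtained above (the variable names below are only illustrative):

lambd = 3  # any lambda channel up to max_angular
n_actual = data_even.actual_sizes_[lambd]
# only the first n_actual slots along the feature axis of this lambda channel carry data;
# the remaining slots are padding
meaningful_block = data_even.covariants_[:, :n_actual, lambd, :]
print(meaningful_block.shape)
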
Information about the actual number of features is stored in **.actual_sizes_**:" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "print(type(data_even))\n", 176 | "print(\"shape of even covariants array: {}\".format(data_even.covariants_.shape))\n", 177 | "print(\"actual sizes of even covariants: {}\".format(data_even.actual_sizes_))" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "It is the same for odd covariants:" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "print(\"shape of odd covariants array: {}\".format(data_odd.covariants_.shape))\n", 194 | "print(\"actual sizes of odd covariants: {}\".format(data_odd.actual_sizes_))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "There is one other point - for each lambda channel the size of covariant vectors is (2 * lambda + 1). These vectors are stored from the beginning. It means that the meaningful entries for each lambda are located in **[:, :, lambda, :(2 * lambda + 1)]**" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "In the [nice article](https://aip.scitation.org/doi/10.1063/5.0021116) another definition of **parity** is used. Covariants are split into **true** and **pseudo** groups. All the covariants in the **true** group are transformed with respect to inversion as (-1)^lambda, while all the covariants in the **pseudo** group are transformed as (-1) ^ (lambda + 1). \n", 209 | "\n", 210 | "There is a special class - **ParityDefinitionChanger** to switch between these definitions:" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "data_true, data_pseudo = ParityDefinitionChanger().transform(\n", 220 | " data_even, data_odd)\n", 221 | "\n", 222 | "print(data_true.covariants_.shape)\n", 223 | "print(data_true.actual_sizes_)\n", 224 | "\n", 225 | "print(data_pseudo.covariants_.shape)\n", 226 | "print(data_pseudo.actual_sizes_)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Since this transformation is symmetric, we can use this once again to go back from the true and pseudo covariants to even and odd:" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "data_even, data_odd = ParityDefinitionChanger().transform(\n", 243 | " data_true, data_pseudo)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "There is one other discrepancy - covariants defined in the nice article, are smaller by the factor of (2 * lambda + 1). 
Thus, the last step to get full compliance is the following:" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "for lambd in range(6):\n", 260 | " data_true.covariants_[:, :data_true.actual_sizes_[lambd],\n", 261 | " lambd, :(2 * lambd + 1)] /= (2 * lambd + 1)\n", 262 | " data_pseudo.covariants_[:, :data_pseudo.actual_sizes_[lambd],\n", 263 | " lambd, :(2 * lambd + 1)] /= (2 * lambd + 1)" 264 | ] 265 | } 266 | ], 267 | "metadata": { 268 | "kernelspec": { 269 | "display_name": "Python 3", 270 | "language": "python", 271 | "name": "python3" 272 | }, 273 | "language_info": { 274 | "codemirror_mode": { 275 | "name": "ipython", 276 | "version": 3 277 | }, 278 | "file_extension": ".py", 279 | "mimetype": "text/x-python", 280 | "name": "python", 281 | "nbconvert_exporter": "python", 282 | "pygments_lexer": "ipython3", 283 | "version": "3.6.9" 284 | }, 285 | "toc": { 286 | "base_numbering": 1, 287 | "nav_menu": {}, 288 | "number_sections": true, 289 | "sideBar": true, 290 | "skip_h1_title": false, 291 | "title_cell": "Table of Contents", 292 | "title_sidebar": "Contents", 293 | "toc_cell": false, 294 | "toc_position": {}, 295 | "toc_section_display": true, 296 | "toc_window_display": false 297 | } 298 | }, 299 | "nbformat": 4, 300 | "nbformat_minor": 4 301 | } 302 | -------------------------------------------------------------------------------- /tutorials/custom_regressors_into_purifiers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Custom regressors into purifiers" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "As was already mentioned in the first tutorial, purifiers can accept arbitrarily linear regressors from sklearn.linear_model. In order to feed it with a custom linear regressor, some requirements should be fulfilled. Firstly, it should have the same interface as linear regressors from sklearn with the fit and predict methods. Secondly, it should fulfill sklearn requirements to make it possible to clone with [sklearn.base.clone](https://scikit-learn.org/stable/modules/generated/sklearn.base.clone.html) function. This tutorial shows an example of such a class. 
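Schematically, the bare minimum such a class has to provide can be sketched as follows (a hypothetical skeleton, with sklearn's Ridge standing in for whatever fitting logic you actually want; the full working example of this tutorial follows below):

from sklearn.linear_model import Ridge


class MinimalRegressor:
    def fit(self, X, y):
        # any fitting logic can go here; plain Ridge is used only as a placeholder
        self.ridge_ = Ridge(alpha=1.0, fit_intercept=False)
        self.ridge_.fit(X, y)
        return self

    def predict(self, X):
        return self.ridge_.predict(X)

    # these two methods make the class clonable with sklearn.base.clone
    def get_params(self, deep=True):
        return {}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
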
\n", 15 | "\n", 16 | "As before, let's calculate spherical expansion coefficients for H environments: " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 26 | "\n", 27 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 28 | "!gunzip -k methane.extxyz.gz\n", 29 | "\n", 30 | "import numpy as np\n", 31 | "import ase.io\n", 32 | "import tqdm\n", 33 | "from nice.blocks import *\n", 34 | "from nice.utilities import *\n", 35 | "from matplotlib import pyplot as plt\n", 36 | "from sklearn.linear_model import BayesianRidge\n", 37 | "\n", 38 | "structures = ase.io.read('methane.extxyz', index='0:1000')\n", 39 | "\n", 40 | "HYPERS = {\n", 41 | " 'interaction_cutoff': 6.3,\n", 42 | " 'max_radial': 5,\n", 43 | " 'max_angular': 5,\n", 44 | " 'gaussian_sigma_type': 'Constant',\n", 45 | " 'gaussian_sigma_constant': 0.05,\n", 46 | " 'cutoff_smooth_width': 0.3,\n", 47 | " 'radial_basis': 'GTO'\n", 48 | "}\n", 49 | "\n", 50 | "all_species = get_all_species(structures)\n", 51 | "\n", 52 | "coefficients = get_spherical_expansion(structures, HYPERS, all_species)\n", 53 | "coefficients = coefficients[1]" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "Our custom class looks like this:" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "from sklearn.model_selection import cross_val_predict\n", 70 | "from sklearn.linear_model import Ridge\n", 71 | "\n", 72 | "\n", 73 | "class AdaptiveRidge:\n", 74 | " def __init__(self):\n", 75 | " pass\n", 76 | "\n", 77 | " def fit(self, X, y):\n", 78 | " minimum = None\n", 79 | " self.best_alpha_ = None\n", 80 | " for alpha in np.logspace(-25, 10, 300):\n", 81 | " regressor = Ridge(alpha=alpha, fit_intercept=False)\n", 82 | " predictions = cross_val_predict(regressor, X, y)\n", 83 | " now = np.mean((predictions - y)**2)\n", 84 | " if (minimum is None) or (now < minimum):\n", 85 | " minimum = now\n", 86 | " self.best_alpha_ = alpha\n", 87 | "\n", 88 | " self.ridge_ = Ridge(alpha=self.best_alpha_, fit_intercept=False)\n", 89 | " self.ridge_.fit(X, y)\n", 90 | "\n", 91 | " def predict(self, X):\n", 92 | " return self.ridge_.predict(X)\n", 93 | "\n", 94 | " def get_params(self, deep=True):\n", 95 | " return {}\n", 96 | "\n", 97 | " def set_params(self, **parameters):\n", 98 | " for parameter, value in parameters.items():\n", 99 | " setattr(self, parameter, value)\n", 100 | " return self" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "During fitting it estimates best value of regularization by cross validation using training data. There are additional methods get_params and set_params. These methods are required for sklearn.base.clone function. More details about it [here](https://scikit-learn.org/stable/developers/develop.html) (It is necessary to read only cloning section). 
\n", 108 | "\n", 109 | "Let's use it:" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "from scipy.linalg import LinAlgWarning\n", 119 | "\n", 120 | "nice = StandardSequence([\n", 121 | " StandardBlock(ThresholdExpansioner(50), None, IndividualLambdaPCAsBoth(20),\n", 122 | " ThresholdExpansioner(50, mode='invariants'), None, None),\n", 123 | " StandardBlock(\n", 124 | " ThresholdExpansioner(50),\n", 125 | " CovariantsPurifierBoth(regressor=AdaptiveRidge(), max_take=20),\n", 126 | " IndividualLambdaPCAsBoth(10),\n", 127 | " ThresholdExpansioner(50, mode='invariants'),\n", 128 | " InvariantsPurifier(regressor=AdaptiveRidge(), max_take=20),\n", 129 | " InvariantsPCA(20)),\n", 130 | "])\n", 131 | "\n", 132 | "with warnings.catch_warnings():\n", 133 | " # a lot of ill conditioned matrices with super small alpha\n", 134 | " warnings.filterwarnings(\"ignore\", category=LinAlgWarning)\n", 135 | " nice.fit(coefficients)\n", 136 | "\n", 137 | "res = nice.transform(coefficients)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "It is possible to access best alpha parameters for all paritiies and lambda chanels in the final model: \n", 145 | "\n", 146 | "(convenient getters might be added in the next version of NICE)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "for lambd in range(6):\n", 156 | " if (nice.blocks_[1].covariants_purifier_.even_purifier_.purifiers_[lambd]):\n", 157 | " print(\"parity: even; lambda: {}; best alpha: {}\".format(\n", 158 | " lambd, nice.blocks_[1].covariants_purifier_.even_purifier_.\n", 159 | " purifiers_[lambd].regressor_.best_alpha_))\n", 160 | " if (nice.blocks_[1].covariants_purifier_.odd_purifier_.purifiers_[lambd]):\n", 161 | " print(\"parity odd; lambda: {}; best alpha: {}\".format(\n", 162 | " lambd, nice.blocks_[1].covariants_purifier_.odd_purifier_.\n", 163 | " purifiers_[lambd].regressor_.best_alpha_))" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "The same for InvariantsPurifier:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "print(\"best alpha of invariants purifier: \",\n", 180 | " nice.blocks_[1].invariants_purifier_.regressor_.best_alpha_)" 181 | ] 182 | } 183 | ], 184 | "metadata": { 185 | "kernelspec": { 186 | "display_name": "Python 3", 187 | "language": "python", 188 | "name": "python3" 189 | }, 190 | "language_info": { 191 | "codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.6.9" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 4 205 | } 206 | -------------------------------------------------------------------------------- /tutorials/getting_insights_about_the_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Getting insights about the model" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In the first tutorial, we calculated 
invariant representations of atomic environments and used them for the prediction of energies.\n", 15 | "\n", 16 | "But it is always good to have some understanding of the model. This tutorial will show how to get spectrums of pca along with the number of covariants after each transformation.\n", 17 | "\n", 18 | "First of all, we need **fitted** model. This preliminary cell reproduces the corresponding part of the first tutorial, \"constructing machine learning potential\": (few hypers are changed)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# cell to wrap in collapsible in future\n", 28 | "\n", 29 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 30 | "\n", 31 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 32 | "!gunzip -k methane.extxyz.gz\n", 33 | "\n", 34 | "import numpy as np\n", 35 | "import ase.io\n", 36 | "import tqdm\n", 37 | "from nice.blocks import *\n", 38 | "from nice.utilities import *\n", 39 | "from matplotlib import pyplot as plt\n", 40 | "from sklearn.linear_model import BayesianRidge\n", 41 | "\n", 42 | "HARTREE_TO_EV = 27.211386245988\n", 43 | "train_subset = \"0:10000\" #input for ase.io.read command\n", 44 | "test_subset = \"10000:15000\" #input to ase.io.read command\n", 45 | "environments_for_fitting = 1000 #number of environments to fit nice transfomers\n", 46 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000, 3000, 5000, 7500,\n", 47 | " 10000] #for learning curve\n", 48 | "\n", 49 | "#HYPERS for librascal spherical expansion coefficients\n", 50 | "HYPERS = {\n", 51 | " 'interaction_cutoff': 6.3,\n", 52 | " 'max_radial': 5,\n", 53 | " 'max_angular': 5,\n", 54 | " 'gaussian_sigma_type': 'Constant',\n", 55 | " 'gaussian_sigma_constant': 0.05,\n", 56 | " 'cutoff_smooth_width': 0.3,\n", 57 | " 'radial_basis': 'GTO'\n", 58 | "}\n", 59 | "\n", 60 | "\n", 61 | "#our model:\n", 62 | "def get_nice():\n", 63 | " return StandardSequence([\n", 64 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 65 | " CovariantsPurifierBoth(max_take=10),\n", 66 | " IndividualLambdaPCAsBoth(n_components=50),\n", 67 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 68 | " InvariantsPurifier(max_take=50),\n", 69 | " InvariantsPCA(n_components=200)),\n", 70 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 71 | " CovariantsPurifierBoth(max_take=10),\n", 72 | " IndividualLambdaPCAsBoth(n_components=50),\n", 73 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 74 | " InvariantsPurifier(max_take=50),\n", 75 | " InvariantsPCA(n_components=200)),\n", 76 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 77 | " CovariantsPurifierBoth(max_take=10),\n", 78 | " IndividualLambdaPCAsBoth(n_components=50),\n", 79 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 80 | " InvariantsPurifier(max_take=50),\n", 81 | " InvariantsPCA(n_components=200))\n", 82 | " ],\n", 83 | " initial_scaler=InitialScaler(\n", 84 | " mode='signal integral', individually=True))\n", 85 | "\n", 86 | "\n", 87 | "train_structures = ase.io.read('methane.extxyz', index=train_subset)\n", 88 | "\n", 89 | "test_structures = ase.io.read('methane.extxyz', index=test_subset)\n", 90 | "\n", 91 | "all_species = get_all_species(train_structures + test_structures)\n", 92 | "\n", 93 | "train_coefficients = 
get_spherical_expansion(train_structures, HYPERS,\n", 94 |     "                                                all_species)\n", 95 |     "\n", 96 |     "test_coefficients = get_spherical_expansion(test_structures, HYPERS,\n", 97 |     "                                               all_species)\n", 98 |     "\n", 99 |     "#individual nice transformers for each atomic species in the dataset\n", 100 |     "nice = {}\n", 101 |     "for key in train_coefficients.keys():\n", 102 |     "    nice[key] = get_nice()\n", 103 |     "\n", 104 |     "for key in train_coefficients.keys():\n", 105 |     "    nice[key].fit(train_coefficients[key][:environments_for_fitting])" 106 |    ] 107 |   }, 108 |   { 109 |    "cell_type": "markdown", 110 |    "metadata": {}, 111 |    "source": [ 112 |     "As was discussed in the first tutorial, **ThresholdExpansioner** sorts all pairs of inputs by their pairwise importances and, after that, produces the output only for a fixed number of the most important pairs. This number is controlled by **num_expand**. \n", 113 |     "\n", 114 |     "However, there are two reasons why the real number of covariants after **ThresholdExpansioner** might be different from the specified one. \n", 115 |     "1) Some pairs of input covariants do not produce features in all lambda channels. In particular, a pair of input covariants with some l1 and l2 produces covariants only in the lambda channels where |l1 - l2| <= lambda <= l1 + l2. Thus, the real number of features after **ThresholdExpansioner** would be smaller than the number specified in **num_expand**.\n", 116 |     "\n", 117 |     "2) Pairwise importances can have a lot of collisions. For instance, it is impossible to select a threshold that filters out exactly 3 pairs from a set of pairs with the importances [1, 1, 2, 2]. It is possible to filter out either 0, 2, or 4, but not exactly 3. \n", 118 |     "\n", 119 |     "Thus, it is a good idea to be able to look at the actual number of intermediate features." 120 |    ] 121 |   }, 122 |   { 123 |    "cell_type": "markdown", 124 |    "metadata": {}, 125 |    "source": [ 126 |     "**StandardSequence** has a method **get_intermediate_shapes()**. 
It returns intermediate shapes in the form of nested dictionary:" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "intermediate_shapes = nice[1].get_intermediate_shapes()\n", 136 | "\n", 137 | "for key in intermediate_shapes.keys():\n", 138 | " print(key, ':', intermediate_shapes[key], end='\\n\\n\\n')" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "Spectrums of pcas can be accessed in the following way: \n", 146 | "(convenient getters will be inserted in the next version of NICE)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "def proper_log_plot(array, *args, **kwargs):\n", 156 | " '''avoiding log(0)'''\n", 157 | " plt.plot(np.arange(len(array)) + 1, array, *args, **kwargs)\n", 158 | " plt.ylim([1e-3, 1e0])\n", 159 | "\n", 160 | "\n", 161 | "colors = ['r', 'g', 'b', 'orange', 'yellow', 'purple']\n", 162 | "\n", 163 | "print(\"nu: \", 1)\n", 164 | "for i in range(6): # loop over lambda channels\n", 165 | " if (nice[6].initial_pca_ is not None):\n", 166 | " if (nice[6].initial_pca_.even_pca_.pcas_[i] is not None):\n", 167 | " proper_log_plot(\n", 168 | " nice[6].initial_pca_.even_pca_.pcas_[i].importances_,\n", 169 | " color=colors[i],\n", 170 | " label=\"lambda = {}\".format(i))\n", 171 | "\n", 172 | "for i in range(6): # loop over lambda channels\n", 173 | " if (nice[6].initial_pca_ is not None):\n", 174 | " if (nice[6].initial_pca_.odd_pca_.pcas_[i] is not None):\n", 175 | " proper_log_plot(\n", 176 | " nice[6].initial_pca_.odd_pca_.pcas_[i].importances_,\n", 177 | " '--',\n", 178 | " color=colors[i],\n", 179 | " label=\"lambda = {}\".format(i))\n", 180 | "\n", 181 | "plt.yscale('log')\n", 182 | "plt.xscale('log')\n", 183 | "plt.legend()\n", 184 | "plt.show()\n", 185 | "\n", 186 | "for nu in range(len(nice[6].blocks_)): # loop over body orders\n", 187 | " print(\"nu: \", nu + 2)\n", 188 | " for i in range(6): # loop over lambda channels\n", 189 | " if (nice[6].blocks_[nu].covariants_pca_ is not None):\n", 190 | " if (nice[6].blocks_[nu].covariants_pca_.even_pca_.pcas_[i]\n", 191 | " is not None):\n", 192 | " proper_log_plot(nice[6].blocks_[nu].covariants_pca_.even_pca_.\n", 193 | " pcas_[i].importances_,\n", 194 | " color=colors[i],\n", 195 | " label=\"lambda = {}\".format(i))\n", 196 | "\n", 197 | " for i in range(6): # loop over lambda channels\n", 198 | " if (nice[6].blocks_[nu].covariants_pca_ is not None):\n", 199 | " if (nice[6].blocks_[nu].covariants_pca_.odd_pca_.pcas_[i]\n", 200 | " is not None):\n", 201 | " proper_log_plot(nice[6].blocks_[nu].covariants_pca_.odd_pca_.\n", 202 | " pcas_[i].importances_,\n", 203 | " '--',\n", 204 | " color=colors[i])\n", 205 | "\n", 206 | " plt.yscale('log')\n", 207 | " plt.xscale('log')\n", 208 | " plt.legend()\n", 209 | " plt.show()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "(checks if pca instance is **None** are needed since it would be **None** if the number of features for corresponding lambda channel would be zero after the expansion step)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Inner class for single Lambda channel inherits from sklearn.decomposition.TruncatedSVD (PCA without centering the data, which would break covariant transformation). 
Thus, in addition to **.importances_**, **.explained_variance_** and **.explained_variance_ratio_** are also accessible. \n", 224 | "\n", 225 | "**importances_** (which are used by subsequent **TresholdExpansioners**) are **explained_variance_** normalized not to variance of input as **explained_variance_ratio_**, but to variance of output:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "print(np.sum(nice[6].blocks_[1].\\\n", 235 | " covariants_pca_.even_pca_.pcas_[2].explained_variance_))\n", 236 | "print(np.sum(nice[6].blocks_[1].\\\n", 237 | " covariants_pca_.even_pca_.pcas_[2].explained_variance_ratio_))\n", 238 | "print(np.sum(nice[6].blocks_[1].\\\n", 239 | " covariants_pca_.even_pca_.pcas_[2].importances_))" 240 | ] 241 | } 242 | ], 243 | "metadata": { 244 | "kernelspec": { 245 | "display_name": "Python 3", 246 | "language": "python", 247 | "name": "python3" 248 | }, 249 | "language_info": { 250 | "codemirror_mode": { 251 | "name": "ipython", 252 | "version": 3 253 | }, 254 | "file_extension": ".py", 255 | "mimetype": "text/x-python", 256 | "name": "python", 257 | "nbconvert_exporter": "python", 258 | "pygments_lexer": "ipython3", 259 | "version": "3.6.9" 260 | } 261 | }, 262 | "nbformat": 4, 263 | "nbformat_minor": 4 264 | } 265 | -------------------------------------------------------------------------------- /tutorials/sequential_fitting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Sequential fitting" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "It is not always clear how to select good hyperparameters for calculations. The second tutorial \"Getting insights about the model, \" showed how to plot PCA spectrums for all lambda channels and parities. This information, along with the other one, such as regression accuracy, might be useful to select better hypers. Particularly, the most straightforward way is to select the number of PCA components in such a way as to cover the most part of the variance and do it successively from block to block. \n", 15 | "\n", 16 | "In this case, it is very undesirable to fit all parts of the model, including not changed ones from scratch. One possible way around is to do all things by hand, as was described in the tutorial \"Constructor or non standard_sequence,\" but there would be an additional headache with packing resulting blocks into a single model with a convenient .transform method. 
Nice toolbox has the capability to do it very succinctly.\n", 17 | "\n", 18 | "First of all, we need to get spherical expansion coefficients the same way as in previous tutorials:" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 28 | "\n", 29 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 30 | "!gunzip -k methane.extxyz.gz\n", 31 | "\n", 32 | "import numpy as np\n", 33 | "import ase.io\n", 34 | "import tqdm\n", 35 | "from nice.blocks import *\n", 36 | "from nice.utilities import *\n", 37 | "from matplotlib import pyplot as plt\n", 38 | "from sklearn.linear_model import BayesianRidge\n", 39 | "\n", 40 | "structures = ase.io.read('methane.extxyz', index='0:1000')\n", 41 | "\n", 42 | "HYPERS = {\n", 43 | " 'interaction_cutoff': 6.3,\n", 44 | " 'max_radial': 5,\n", 45 | " 'max_angular': 5,\n", 46 | " 'gaussian_sigma_type': 'Constant',\n", 47 | " 'gaussian_sigma_constant': 0.05,\n", 48 | " 'cutoff_smooth_width': 0.3,\n", 49 | " 'radial_basis': 'GTO'\n", 50 | "}\n", 51 | "\n", 52 | "all_species = get_all_species(structures)\n", 53 | "\n", 54 | "coefficients = get_spherical_expansion(structures, HYPERS, all_species)\n", 55 | "coefficients = coefficients[1]" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "coefficients are now spherical expansion coefficients for H centered environments:" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "print(coefficients.shape)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Let's do the first steps from standar sequence:" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "even_0, odd_0 = InitialTransformer().transform(coefficients)\n", 88 | "initial_pca = IndividualLambdaPCAsBoth()\n", 89 | "initial_pca.fit(even_0, odd_0)\n", 90 | "even_0_t, odd_0_t = initial_pca.transform(even_0, odd_0)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "Now we can fit couple of standard blocks:" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "block_1 = StandardBlock(ThresholdExpansioner(100), None,\n", 107 | " IndividualLambdaPCAsBoth(20))\n", 108 | "block_1.fit(even_0_t, odd_0_t, even_0_t, odd_0_t)\n", 109 | "even_1, odd_1, _ = block_1.transform(even_0_t, odd_0_t, even_0_t, odd_0_t)\n", 110 | "\n", 111 | "block_2 = StandardBlock(None, None, None,\n", 112 | " ThresholdExpansioner(100, mode='invariants'))\n", 113 | "block_2.fit(even_1, odd_1, even_0_t, odd_0_t)\n", 114 | "_, _, even_invariants = block_2.transform(even_1, odd_1, even_0_t, odd_0_t)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "At his moment we have all parts of this standard sequence fitted:\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "nice = StandardSequence(initial_pca=initial_pca, blocks=[block_1, block_2])\n", 131 | 
"print(initial_pca.is_fitted())\n", 132 | "print(block_1.is_fitted())\n", 133 | "print(block_2.is_fitted())" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "what about full model?" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "print(nice.is_fitted())" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Nope. \n", 157 | "\n", 158 | "At this point, there is a very high probability of making a mistake. Particularly one can feed StandardSequence with some fitted initial_pca along with blocks, which were fitted based not on the same initial_pca, with different initial_normalizer, or even on different data. In order to prevent it, there is a requirement to pass an additional flag guaranteed_parts_fitted_consistently = True to the model:" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "nice = StandardSequence(initial_pca=initial_pca,\n", 168 | " blocks=[block_1, block_2],\n", 169 | " guaranteed_parts_fitted_consistently=True)\n", 170 | "print(nice.is_fitted())" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "Model is considered to be fitted if 1) all parts are fitted and 2) if guaranteed_parts_fitted_consistently is set to be True\n", 178 | "\n", 179 | "**Golden rule:** Every time you pass guaranteed_parts_fitted_consistently = True make a pause and think twice. \n", 180 | "\n", 181 | "Let's check consistency:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "even_invariants_2 = nice.transform(coefficients,\n", 191 | " return_only_invariants=True)[3]\n", 192 | "print(np.sum(np.abs(even_invariants - even_invariants_2)))" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "This also works in other direction:" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "initial_pca = IndividualLambdaPCAsBoth()\n", 209 | "block_1 = StandardBlock(ThresholdExpansioner(100), None,\n", 210 | " IndividualLambdaPCAsBoth(20))\n", 211 | "block_2 = StandardBlock(None, None, None,\n", 212 | " ThresholdExpansioner(100, mode='invariants'))\n", 213 | "\n", 214 | "print(initial_pca.is_fitted())\n", 215 | "print(block_1.is_fitted())\n", 216 | "print(block_2.is_fitted())\n", 217 | "\n", 218 | "nice = StandardSequence(initial_pca=initial_pca, blocks=[block_1, block_2])\n", 219 | "nice.fit(coefficients)\n", 220 | "\n", 221 | "print(initial_pca.is_fitted())\n", 222 | "print(block_1.is_fitted())\n", 223 | "print(block_2.is_fitted())" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "StandardBlock behaves the same way:\n", 231 | " " 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "expansioner, pca = ThresholdExpansioner(100), IndividualLambdaPCAsBoth(20)\n", 241 | "print(expansioner.is_fitted())\n", 242 | "print(pca.is_fitted())\n", 243 | "\n", 244 | "block = StandardBlock(expansioner, None, pca)\n", 245 | "block.fit(even_0_t, odd_0_t, even_0_t, odd_0_t)\n", 246 | 
"\n", 247 | "print(expansioner.is_fitted())\n", 248 | "print(pca.is_fitted())" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "expansioner, pca = ThresholdExpansioner(100), IndividualLambdaPCAsBoth(20)\n", 258 | "expansioner.fit(even_0_t, odd_0_t, even_0_t, odd_0_t)\n", 259 | "even_1, odd_1 = expansioner.transform(even_0_t, odd_0_t, even_0_t, odd_0_t)\n", 260 | "pca.fit(even_1, odd_1)\n", 261 | "\n", 262 | "block = StandardBlock(expansioner,\n", 263 | " None,\n", 264 | " pca,\n", 265 | " guaranteed_parts_fitted_consistently=True)\n", 266 | "\n", 267 | "print(block.is_fitted())" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "There is another group of blocks that accepts classes, such as sklearn.linear_model.Ridge in the initialization. But in their case, there is a need to apply several distinct regressors separately for each lambda channel and parity. Thus, the input regressor is cloned, and initial instances are not touched in any way. So, the material of this tutorial does not apply to purifiers. " 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [] 283 | } 284 | ], 285 | "metadata": { 286 | "kernelspec": { 287 | "display_name": "Python 3", 288 | "language": "python", 289 | "name": "python3" 290 | }, 291 | "language_info": { 292 | "codemirror_mode": { 293 | "name": "ipython", 294 | "version": 3 295 | }, 296 | "file_extension": ".py", 297 | "mimetype": "text/x-python", 298 | "name": "python", 299 | "nbconvert_exporter": "python", 300 | "pygments_lexer": "ipython3", 301 | "version": "3.6.9" 302 | }, 303 | "toc": { 304 | "base_numbering": 1, 305 | "nav_menu": {}, 306 | "number_sections": true, 307 | "sideBar": true, 308 | "skip_h1_title": false, 309 | "title_cell": "Table of Contents", 310 | "title_sidebar": "Contents", 311 | "toc_cell": false, 312 | "toc_position": {}, 313 | "toc_section_display": true, 314 | "toc_window_display": false 315 | } 316 | }, 317 | "nbformat": 4, 318 | "nbformat_minor": 4 319 | } 320 | -------------------------------------------------------------------------------- /update_docs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import copy 4 | 5 | # cutting notebooks 6 | def split(name, destination): 7 | with open("tutorials/{}".format(name), "r") as f: 8 | notebook = json.load(f) 9 | 10 | before_collapsible = copy.deepcopy(notebook) 11 | before_collapsible['cells'] = before_collapsible['cells'][0:2] 12 | 13 | collapsible = copy.deepcopy(notebook) 14 | collapsible['cells'] = [collapsible['cells'][2]] 15 | 16 | after_collapsible = copy.deepcopy(notebook) 17 | after_collapsible['cells'] = after_collapsible['cells'][3:] 18 | 19 | clean_name = name.strip().split('.')[0] 20 | with open("{}/{}_before_collapsible.ipynb".format(destination, 21 | clean_name), "w") as f: 22 | json.dump(before_collapsible, f) 23 | 24 | with open("{}/{}_collapsible.ipynb".format(destination, 25 | clean_name), "w") as f: 26 | json.dump(collapsible, f) 27 | 28 | 29 | with open("{}/{}_after_collapsible.ipynb".format(destination, 30 | clean_name), "w") as f: 31 | json.dump(after_collapsible, f) 32 | 33 | os.system("mkdir docs/cutted") 34 | split('calculating_covariants.ipynb', 'docs/cutted/') 35 | split('getting_insights_about_the_model.ipynb', 'docs/cutted/') 36 | 
split('constructor_or_non_standard_sequence.ipynb', 'docs/cutted/') 37 | split('sequential_fitting.ipynb', 'docs/cutted/') 38 | split('custom_regressors_into_purifiers.ipynb', 'docs/cutted/') 39 | 40 | # converting notebooks to rst 41 | 42 | def make_substitution(lines, index): 43 | lines_before = lines[0:index] 44 | end = len(lines) 45 | for j in range(index + 1, len(lines)): 46 | if not(lines[j].strip() == "" or lines[j].startswith(' ')): 47 | end = j 48 | break 49 | 50 | lines_raw = lines[index + 1 : end] 51 | raw_from = 0 52 | for i in range(len(lines_raw)): 53 | if (lines_raw[i].strip() != ''): 54 | raw_from = i 55 | break 56 | lines_raw = lines_raw[raw_from:] 57 | 58 | raw_to = 0 59 | for i in range(len(lines_raw)): 60 | if (lines_raw[i].strip() != ''): 61 | raw_to = i + 1 62 | 63 | lines_raw = lines_raw[:raw_to] 64 | 65 | 66 | lines_for_insertion = [".. raw:: html\n", 67 | "\n", 68 | "\n", 69 | "
"    <embed>\n",
 70 |                            '    <div style="white-space: pre;">\n']  # wrapper tags reconstructed approximately
 71 |         lines_for_insertion = lines_for_insertion + lines_raw
 72 |         lines_for_insertion = lines_for_insertion + ["    </div>\n", "    </embed>
\n", "\n", '\n'] 73 | 74 | for i in range(1, len(lines_for_insertion)): 75 | lines_for_insertion[i] = ' ' + lines_for_insertion[i] 76 | 77 | return lines_before + lines_for_insertion + lines[end:] 78 | 79 | return lines[index : end] 80 | 81 | def get_bad_block(lines): 82 | for i in range(len(lines)): 83 | if (lines[i].strip() == ".. parsed-literal::"): 84 | return i 85 | return None 86 | 87 | def iterate(lines): 88 | while True: 89 | index = get_bad_block(lines) 90 | if index is None: 91 | return lines 92 | lines = make_substitution(lines, index) 93 | 94 | def fix_awful_nvconvert_format(file): 95 | lines = [] 96 | with open(file, "r") as f: 97 | lines = list(f) 98 | lines = iterate(lines) 99 | with open(file, "w") as f: 100 | for line in lines: 101 | f.write(line) 102 | 103 | os.chdir('docs/cutted/') 104 | names = [name for name in os.listdir('.') if name.endswith('.ipynb')] 105 | 106 | for name in names: 107 | dir_name = name.split('.')[0] 108 | os.system("mkdir {}".format(dir_name)) 109 | os.system("cp {} {}/".format(name, dir_name)) 110 | os.chdir(dir_name) 111 | os.system('jupyter nbconvert --to rst {}'.format(name)) 112 | fix_awful_nvconvert_format(name.split('.')[0] + '.rst') 113 | names_inner = os.listdir('.') 114 | for name_inner in names_inner: 115 | if (name_inner.endswith('_files')): 116 | os.system('cp -r {} ../../'.format(name_inner)) 117 | os.chdir('../') 118 | 119 | 120 | os.chdir('../..') 121 | 122 | os.system("rm -r ../build/*") 123 | os.chdir("./docs") 124 | os.system("sphinx-apidoc -f -o . ../nice") 125 | os.system("make html") 126 | os.chdir("../") 127 | os.system("git checkout -f gh-pages") 128 | os.system("git rm -r *") 129 | os.system("cp -r ../build/html/* .") 130 | with open(".nojekyll", "w") as f: 131 | pass 132 | 133 | os.system("git add *") 134 | os.system("git add .nojekyll") 135 | os.system("git commit -m 'automatic docs build'") 136 | os.system("git push") 137 | os.system("git checkout master") 138 | 139 | --------------------------------------------------------------------------------
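
To make the rst post-processing above easier to follow, here is a minimal standalone sketch of what the make_substitution and iterate routines do to the output of jupyter nbconvert. This is not the project's code: the <embed> and <div> wrapper strings are placeholders, and only the overall mechanism (locate a ".. parsed-literal::" block, then re-emit its body inside a ".. raw:: html" block) mirrors the script.

# Standalone sketch of the parsed-literal -> raw-html rewrite performed by
# update_docs.py; the <embed>/<div> tags are placeholders, not the real markup.
def wrap_parsed_literal(lines):
    out, i = [], 0
    while i < len(lines):
        if lines[i].strip() != ".. parsed-literal::":
            out.append(lines[i])
            i += 1
            continue
        # collect the indented body of the directive (blank lines included)
        j = i + 1
        while j < len(lines) and (lines[j].strip() == "" or lines[j].startswith("    ")):
            j += 1
        body = [line for line in lines[i + 1:j] if line.strip() != ""]
        out += [".. raw:: html\n", "\n", "    <embed>\n", "    <div>\n"]
        out += ["    " + line.lstrip() for line in body]
        out += ["    </div>\n", "    </embed>\n", "\n"]
        i = j
    return out

rst = ["Output of the cell:\n", "\n",
       ".. parsed-literal::\n", "\n",
       "    (1000, 10, 6, 11)\n", "\n",
       "Next paragraph.\n"]
print("".join(wrap_parsed_literal(rst)))

Running the sketch on the toy snippet prints the block rewrapped as raw HTML, which is essentially what update_docs.py applies to every ".. parsed-literal::" block in the converted tutorials.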
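
The two effects described in the "getting insights about the model" tutorial above, namely the selection rule |l1 - l2| <= lambda <= l1 + l2 and collisions between pairwise importances, can be illustrated with a few lines of plain Python. The helper allowed_lambdas is made up for illustration and is not part of the nice package.

import numpy as np

# Selection rule: a pair of covariants with angular momenta l1 and l2 only
# contributes to lambda channels between |l1 - l2| and l1 + l2 (capped by lambda_max).
def allowed_lambdas(l1, l2, lambda_max=5):
    return list(range(abs(l1 - l2), min(l1 + l2, lambda_max) + 1))

print(allowed_lambdas(2, 3))  # [1, 2, 3, 4, 5] -- no lambda = 0 contribution
print(allowed_lambdas(1, 1))  # [0, 1, 2]

# Collisions: with these pairwise importances no threshold keeps exactly 3 pairs.
importances = np.array([1.0, 1.0, 2.0, 2.0])
for threshold in [0.5, 1.5, 2.5]:
    print(threshold, np.sum(importances >= threshold))  # 4, 2, 0 pairs survive

With importances [1, 1, 2, 2], any threshold keeps 4, 2, or 0 pairs, never exactly 3, which is why the actual number of expanded features can differ from num_expand.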