├── LICENSE.txt ├── README.rst ├── docs ├── Makefile ├── acknowledgements.rst ├── blocks.rst ├── blocks_proper.rst ├── calculating_covariants.rst ├── conf.py ├── constructor_or_non_standard_sequence.rst ├── custom_regressors_into_purifiers.rst ├── defines.rst ├── examples.rst ├── getting_insights_about_the_model.rst ├── hello_user.rst ├── index.rst ├── installation.rst ├── make.bat ├── nice_abstract.rst ├── sequential_fitting.rst ├── symlinks │ └── constructing_machine_learning_potential.ipynb ├── theory.rst └── utilities.rst ├── enumerate ├── README.md ├── indep-1-1.dat ├── indep-1-8.dat ├── indep-7-1.dat ├── indep-7-4.dat ├── linear_reduce.nb └── polynomial_reduce.nb ├── examples ├── methane_home_pc.ipynb ├── methane_medium.ipynb ├── methane_small.ipynb ├── qm9_home_pc.ipynb └── qm9_small.ipynb ├── nice ├── __init__.py ├── blocks │ ├── __init__.py │ ├── compressors.py │ ├── expansioners.py │ ├── grouping.py │ ├── miscellaneous.py │ └── purifiers.py ├── clebsch_gordan.py ├── contracted_pca.py ├── nice_utilities.pxd ├── nice_utilities.pyx ├── packing.pyx ├── rascal_coefficients.pyx ├── thresholding.pyx ├── unrolling_individual_pca.pyx ├── unrolling_pca.pyx └── utilities.py ├── pyproject.toml ├── reference_configurations ├── methane_100.extxyz └── readme.txt ├── requirements.txt ├── setup.py ├── tests ├── compare_kernels.py └── readme.txt ├── tutorials ├── calculating_covariants.ipynb ├── constructing_machine_learning_potential.ipynb ├── constructor_or_non_standard_sequence.ipynb ├── custom_regressors_into_purifiers.ipynb ├── getting_insights_about_the_model.ipynb └── sequential_fitting.ipynb └── update_docs.py /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Sergey Pozdnyakov, Jigyasa Nigam, Michele Ceriotti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. role:: bash(code) 2 | :language: bash 3 | 4 | NICE 5 | ==== 6 | 7 | NICE (N-body Iteratively Contracted Equivariants) is a set of tools designed for the calculation of 8 | invariant and covariant atomic structure representations. It allows for 9 | automatic selection of the most informative combinations of high order spectrum elements 10 | and performs their efficient computation using recurrence relations. 
11 | 12 | Although it is designed specifically for atomistic machine learning, NICE in principle 13 | can be applied to other machine learning tasks, such as those that involve signals in a ball or on a sphere, all of which require invariant or covariant outputs. 14 | 15 | ++++++++++++ 16 | Installation 17 | ++++++++++++ 18 | 19 | 1. Install `librascal `_ 20 | 2. git clone or download archive with nice and unpack 21 | 3. cd to root nice directory and run :bash:`pip3 install .` 22 | 23 | +++++++++++++ 24 | Documentation 25 | +++++++++++++ 26 | 27 | Documentation can be found `here `_ 28 | 29 | ++++++++++ 30 | References 31 | ++++++++++ 32 | 33 | If you are using NICE, please cite `this article <https://aip.scitation.org/doi/10.1063/5.0021116>`_. 34 | 35 | [1] Jigyasa Nigam, Sergey Pozdnyakov, and Michele Ceriotti. "Recursive evaluation and iterative contraction of N-body equivariant features." The Journal of Chemical Physics 153.12 (2020): 121101. 36 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = ../../build/ 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/acknowledgements.rst: -------------------------------------------------------------------------------- 1 | Acknowledgements 2 | ================ 3 | We are very thankful to Felix Musil for valuable advice, especially 4 | on tools for building documentation. 5 | -------------------------------------------------------------------------------- /docs/blocks.rst: -------------------------------------------------------------------------------- 1 | Blocks 2 | ====== 3 | 4 | NICE 5 | ---------------------------- 6 | 7 | .. automodule:: blocks.grouping 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | Compressors 13 | ------------------------------- 14 | 15 | .. automodule:: blocks.compressors 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | Expansioners 21 | -------------------------------- 22 | 23 | .. automodule:: blocks.expansioners 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | Purifiers 29 | ----------------------------- 30 | 31 | .. automodule:: blocks.purifiers 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | Miscellaneous 37 | --------------------------------- 38 | 39 | .. automodule:: blocks.miscellaneous 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | -------------------------------------------------------------------------------- /docs/blocks_proper.rst: -------------------------------------------------------------------------------- 1 | NICE 2 | ---------------------------- 3 | 4 | ..
automodule:: blocks.grouping 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Compressors 10 | ------------------------------- 11 | 12 | .. automodule:: blocks.compressors 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | 17 | Expansioners 18 | -------------------------------- 19 | 20 | .. automodule:: blocks.expansioners 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | Purifiers 26 | ----------------------------- 27 | 28 | .. automodule:: blocks.purifiers 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | Miscellaneous 34 | --------------------------------- 35 | 36 | .. automodule:: blocks.miscellaneous 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | -------------------------------------------------------------------------------- /docs/calculating_covariants.rst: -------------------------------------------------------------------------------- 1 | .. include:: cutted/calculating_covariants_before_collapsible/calculating_covariants_before_collapsible.rst 2 | 3 | .. admonition:: Preliminaries 4 | :class: toggle 5 | 6 | .. include:: cutted/calculating_covariants_collapsible/calculating_covariants_collapsible.rst 7 | 8 | .. include:: cutted/calculating_covariants_after_collapsible/calculating_covariants_after_collapsible.rst 9 | 10 | 11 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('.')) 16 | sys.path.insert(0, os.path.abspath('../nice')) 17 | sys.path.insert(0, os.path.abspath('../nice/blocks')) 18 | 19 | import sphinx_rtd_theme 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'NICE' 23 | copyright = '2020, Jigyasa Nigam, Sergey Pozdnyakov, Michele Ceriotti' 24 | author = 'Jigyasa Nigam, Sergey Pozdnyakov, Michele Ceriotti' 25 | 26 | 27 | # -- General configuration --------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | 33 | 34 | 35 | extensions = [ 36 | "sphinx_rtd_theme", "nbsphinx", "sphinxcontrib.napoleon", "sphinx_togglebutton" 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ['_templates'] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 46 | 47 | 48 | # -- Options for HTML output ------------------------------------------------- 49 | 50 | # The theme to use for HTML and HTML Help pages. 
See the documentation for 51 | # a list of builtin themes. 52 | # 53 | html_theme = 'sphinx_rtd_theme' 54 | 55 | # Add any paths that contain custom static files (such as style sheets) here, 56 | # relative to this directory. They are copied after the builtin static files, 57 | # so a file named "default.css" will overwrite the builtin "default.css". 58 | html_static_path = ['_static'] 59 | -------------------------------------------------------------------------------- /docs/constructor_or_non_standard_sequence.rst: -------------------------------------------------------------------------------- 1 | .. include:: cutted/constructor_or_non_standard_sequence_before_collapsible/constructor_or_non_standard_sequence_before_collapsible.rst 2 | 3 | 4 | .. admonition:: Preliminaries 5 | :class: toggle 6 | 7 | .. include:: cutted/constructor_or_non_standard_sequence_collapsible/constructor_or_non_standard_sequence_collapsible.rst 8 | 9 | 10 | .. include:: cutted/constructor_or_non_standard_sequence_after_collapsible/constructor_or_non_standard_sequence_after_collapsible.rst 11 | 12 | 13 | -------------------------------------------------------------------------------- /docs/custom_regressors_into_purifiers.rst: -------------------------------------------------------------------------------- 1 | .. include:: cutted/custom_regressors_into_purifiers_before_collapsible/custom_regressors_into_purifiers_before_collapsible.rst 2 | 3 | 4 | .. admonition:: Preliminaries 5 | :class: toggle 6 | 7 | .. include:: cutted/custom_regressors_into_purifiers_collapsible/custom_regressors_into_purifiers_collapsible.rst 8 | 9 | 10 | .. include:: cutted/custom_regressors_into_purifiers_after_collapsible/custom_regressors_into_purifiers_after_collapsible.rst 11 | 12 | -------------------------------------------------------------------------------- /docs/defines.rst: -------------------------------------------------------------------------------- 1 | .. role:: bash(code) 2 | :language: bash 3 | 4 | 5 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | Real-Life Examples of NICE 2 | ========================== 3 | 4 | 5 | Among other things, this repository contains scripts and notebooks that put NICE to work on real-world problems. These examples are similar to the procedures reported in Jigyasa Nigam, Sergey Pozdnyakov, and Michele Ceriotti. "Recursive evaluation and iterative contraction of N-body equivariant features." The Journal of Chemical Physics 153.12 (2020): 121101, but they are not direct reproductions. 6 | 7 | The notebooks `qm9_home_pc.ipynb` and `qm9_small.ipynb` construct similar machine-learned potentials for the QM9 dataset (see below). `qm9_home_pc.ipynb` is intended to run on a local workstation, whereas `qm9_small.ipynb` is best suited for HPC resources. We have also provided examples for the methane dataset (https://archive.materialscloud.org/record/2020.110). All notebooks include general advice on 8 | appropriate real-life hyperparameters. 9 | 10 | The QM9 dataset is `available `_ 11 | as separate .xyz files, one per molecule, in a format 12 | that cannot be read directly by `ase `_. 13 | The first cells of the qm9_home_pc.ipynb and qm9_small.ipynb notebooks contain code that fetches the raw QM9 dataset and parses it into a single ase .extxyz file.
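Once the combined file has been produced, loading it back is a single call. The following is a minimal sketch (assuming the file name ``qm9_main.extxyz`` written by the notebooks and the ``U0`` energies that the parsing code stores in ``info``):

.. code-block:: python

    import ase.io

    # read every structure from the combined file written by the notebooks' first cells
    structures = ase.io.read('qm9_main.extxyz', index=':')
    # per-molecule properties parsed from the raw QM9 files are stored in structure.info
    energies = [structure.info['U0'] for structure in structures]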
14 | -------------------------------------------------------------------------------- /docs/getting_insights_about_the_model.rst: -------------------------------------------------------------------------------- 1 | .. include:: cutted/getting_insights_about_the_model_before_collapsible/getting_insights_about_the_model_before_collapsible.rst 2 | 3 | 4 | .. admonition:: Preliminaries 5 | :class: toggle 6 | 7 | .. include:: cutted/getting_insights_about_the_model_collapsible/getting_insights_about_the_model_collapsible.rst 8 | 9 | 10 | .. include:: cutted/getting_insights_about_the_model_after_collapsible/getting_insights_about_the_model_after_collapsible.rst 11 | 12 | -------------------------------------------------------------------------------- /docs/hello_user.rst: -------------------------------------------------------------------------------- 1 | NICE 2 | ========================== 3 | 4 | .. include:: nice_abstract.rst 5 | 6 | Installation 7 | ------------ 8 | 9 | .. include:: installation.rst 10 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. NICE documentation master file, created by 2 | sphinx-quickstart on Wed Sep 23 16:53:53 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | 8 | .. include:: hello_user.rst 9 | 10 | .. toctree:: 11 | :glob: 12 | :maxdepth: 1 13 | :caption: Theory in a nutshell 14 | 15 | theory 16 | 17 | .. toctree:: 18 | :glob: 19 | :maxdepth: 1 20 | :caption: Tutorials 21 | 22 | symlinks/constructing_machine_learning_potential 23 | calculating_covariants 24 | getting_insights_about_the_model 25 | constructor_or_non_standard_sequence 26 | sequential_fitting 27 | custom_regressors_into_purifiers 28 | 29 | .. toctree:: 30 | :glob: 31 | :maxdepth: 1 32 | :caption: Examples 33 | 34 | examples 35 | 36 | .. toctree:: 37 | :glob: 38 | :maxdepth: 5 39 | :caption: Reference guide 40 | 41 | blocks 42 | utilities 43 | 44 | .. toctree:: 45 | :glob: 46 | :maxdepth: 1 47 | :caption: Acknowledgements 48 | 49 | acknowledgements 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. include:: defines.rst 2 | 3 | 1. Install `librascal `_ 4 | 2. git clone or download archive with nice and unpack 5 | 3. cd to root nice directory and run :bash:`pip3 install .` 6 | 7 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR= ../../build/ 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/nice_abstract.rst: -------------------------------------------------------------------------------- 1 | NICE (N-body Iteratively Contracted Equivariants) is a set of tools designed for the calculation of invariant and covariant atomic structure representations. It allows one to 2 | automatically select the most informative combinations of high-order spectrum elements 3 | and to perform their efficient computation using recurrence relations. 4 | 5 | Although it is designed specifically for atomistic machine learning, NICE can, in principle, be applied to other machine learning tasks that require invariant or covariant outputs and that involve signals within a spherical cutoff, either on the sphere surface (a 2D manifold) or within the enclosed ball (a 3D manifold). 6 | -------------------------------------------------------------------------------- /docs/sequential_fitting.rst: -------------------------------------------------------------------------------- 1 | .. include:: cutted/sequential_fitting_before_collapsible/sequential_fitting_before_collapsible.rst 2 | 3 | 4 | .. admonition:: Preliminaries 5 | :class: toggle 6 | 7 | .. include:: cutted/sequential_fitting_collapsible/sequential_fitting_collapsible.rst 8 | 9 | 10 | .. include:: cutted/sequential_fitting_after_collapsible/sequential_fitting_after_collapsible.rst 11 | 12 | -------------------------------------------------------------------------------- /docs/symlinks/constructing_machine_learning_potential.ipynb: -------------------------------------------------------------------------------- 1 | ../../tutorials/constructing_machine_learning_potential.ipynb -------------------------------------------------------------------------------- /docs/theory.rst: -------------------------------------------------------------------------------- 1 | Theory in a nutshell 2 | _____________________________ 3 | 4 | One can use this toolbox as a black box that calculates proper atomic 5 | structure representations. In this case, we refer the reader to the tutorials, along with the examples folder, to borrow 6 | appropriate hyperparameters for real-life scenarios. 7 | 8 | In order to meaningfully select hypers or design your calculations, some understanding of 9 | what is going on is required. The most comprehensive description is given in [Ref]_, which 10 | might appear to be quite time-consuming for people not from the field. Thus, this 11 | section is designed to give a short overview of the method without any proofs or unnecessary 12 | details. 13 | 14 | For various purposes in atomistic machine learning, there is a need to describe atomic environments 15 | by invariant or covariant values. The most widespread case is the construction of so-called 16 | machine learning potentials. In this case, the goal is to construct a mapping function from an atomistic structure, 17 | whether it is a molecule, crystal, or amorphous solid, to the energy of this configuration. Energy is 18 | an `extensive `_ property, which allows one to represent the total energy as a sum of atomic contributions, each defined by the central atom species along with 19 | the atomic environment.
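In equation form (a restatement of the sentence above; :math:`N_{at}` denotes the number of atoms in the structure and :math:`A_i` the environment of the :math:`i`-th atom, so nothing beyond the text is assumed):

.. math::
    E_{total} = \sum\limits_{i=1}^{N_{at}} E_{atomic}(A_i)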
20 | 21 | Most machine learning algorithms don't exhibit the required symmetries, such as rotational symmetry, out of the box. 22 | Thus, there is a need to calculate an atomic environment representation which is invariant with respect to certain transformations. 23 | For the prediction of other properties, there is also a need for covariant representations, which transform in a certain way under rotations. 24 | 25 | 26 | The atomic environment is described by an unordered set of 27 | relative positions of neighbors within a given cut-off radius, along with their species 28 | :math:`\{\{\vec{r_1}, \alpha_1\}, \{\vec{r_2}, \alpha_{2}\}... \{\vec{r_n}, \alpha_{n}\}\}`. 29 | The number of neighbors can vary. The goal is to provide a description 30 | of fixed size consisting of features that are invariant or covariant with respect 31 | to permutations of atoms of the same species, along with rotations of the environment. 32 | 33 | The invariance with respect to the permutation of atoms is achieved by introducing "neighbor 34 | density functions": 35 | :math:`\rho_{\alpha}(\vec{r}) = \sum\limits_i g(\vec{r} - \vec{r_i}) \delta_{\alpha, \alpha_i}`, 36 | where :math:`g` is some local function, such as a Gaussian or even a delta function. After that, 37 | fingerprints are expressed as functionals of :math:`\rho`. 38 | 39 | To deal with neighbor density functions, spherical expansion coefficients are introduced: 40 | 41 | .. math:: 42 | < \{n, \alpha\} \lambda m | \rho^1> = \int d\vec{r} R_{n}(r) Y_{\lambda}^m(\hat{r}) \rho_{\alpha}(\vec{r}) 43 | , where :math:`\hat{r}` is the unit direction vector, :math:`r = |\vec{r}|`, :math:`R_{n}(r)` is 44 | some complete radial basis (which one in particular does not really matter), and 45 | :math:`Y_{\lambda}^m(\hat{r})` are 46 | `spherical harmonics `_. The :math:`\lambda` index runs from :math:`0` 47 | to :math:`+\infty`, and 48 | :math:`m` runs from :math:`-\lambda` to :math:`\lambda`. 49 | 50 | The :math:`\{n, \alpha\}` indices are never used separately from each other and, thus, for simplicity, 51 | in the further narrative, we will refer to them as just :math:`n`. 52 | 53 | It is known how the coefficients :math:`< n \lambda m | \rho^1>` transform under rotations of the environment. 54 | In particular, coefficients with :math:`\lambda = 0` remain constant under rotations, i.e., they are invariants, 55 | while the general transformation rule is 56 | 57 | .. math:: 58 | < n \lambda m | \hat{R} | \rho^1> = \sum\limits_{m'} D^{\lambda}_{mm'} < n \lambda m' | \rho^1> 59 | 60 | where :math:`< n \lambda m | \hat{R} | \rho^1>` are the spherical expansion coefficients 61 | for the rotated environment, :math:`\hat{R}` is the rotation, described, for instance, 62 | by `Euler angles `_ 63 | , and :math:`D^{\lambda}_{mm'}(\hat{R})` are 64 | `Wigner D matrices `_. 65 | 66 | Let's look at this transformation more closely. First of all, we see that the spherical expansion 67 | coefficients of the rotated environment depend only on the coefficients of the initial environment 68 | with the same :math:`n` and :math:`\lambda` indices. I.e., one can group the coefficients into vectors 69 | corresponding to fixed :math:`n` and :math:`\lambda`, of size :math:`2 \lambda + 1` and indexed by :math:`m`. 70 | The transformation itself is nothing else but a matrix-vector multiplication. 71 | 72 | Within this framework, we work only with this transformation rule. Further, we will refer to 73 | any vector of odd size which transforms in this way as a covariant feature/fingerprint.
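To make this bookkeeping concrete, here is a minimal sketch (not part of the NICE code) of what "transforms as a covariant" means in practice. The Wigner D matrix is assumed to come from some external routine; only the shape bookkeeping and the matrix-vector product are shown:

.. code-block:: python

    import numpy as np

    def rotate_covariant(covariant, wigner_d):
        # covariant: array of shape [2 * lambda + 1] for some lambda
        # wigner_d: array of shape [2 * lambda + 1, 2 * lambda + 1], the Wigner D
        #           matrix of the rotation, supplied by any external routine
        lam = (covariant.shape[0] - 1) // 2
        assert wigner_d.shape == (2 * lam + 1, 2 * lam + 1)
        # the transformation is nothing else but a matrix-vector multiplication
        return wigner_d @ covariant

    # trivial usage example: for the identity rotation the Wigner D matrix is the identity
    v = np.array([0.0, 1.0, 0.0])  # a lambda = 1 covariant
    assert np.allclose(rotate_covariant(v, np.eye(3)), v)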
74 | 75 | 76 | 77 | Some transformations upon covariant vectors also lead to covariant vectors. Some do not. 78 | For instance, we can apply elementwise squaring of the vector elements, which clearly would 79 | result in a non-covariant vector. 80 | 81 | There are several ways to combine covariants to get a covariant output. The most obvious is to 82 | construct a linear combination of covariants: 83 | 84 | .. math:: 85 | :label: first_expansion 86 | 87 | {output}^{\lambda}_m = \sum\limits_i (input_i)^{\lambda}_m * q_i 88 | 89 | 90 | 91 | 92 | where :math:`q_i` are arbitrary coefficients. The less obvious way is to do a Clebsch-Gordan 93 | iteration: 94 | 95 | .. math:: 96 | :label: second_expansion 97 | 98 | {output}^{\lambda}_m = \sum\limits_{m_1 m_2} <l_1 m_1; l_2 m_2 | \lambda m> 99 | (first\:input)^{l_1}_{m_1} (second\:input)^{l_2}_{m_2} 100 | 101 | , where :math:`<l_1 m_1; l_2 m_2 | \lambda m>` are 102 | `Clebsch-Gordan coefficients `_. 103 | 104 | Let's take a look at the second construction rule in more detail. It takes 105 | two covariant vectors as input and constructs several covariant outputs, indexed 106 | by the natural index :math:`\lambda`. (Actually, :math:`\lambda` is bounded between 107 | :math:`| l_1 - l_2 |` and :math:`l_1 + l_2`; otherwise the Clebsch-Gordan coefficients are zero.) 108 | 109 | 110 | For further purposes, it is necessary to introduce the concept of body order. 111 | 112 | It is clear that by combining the transformation rules :eq:`first_expansion` and :eq:`second_expansion`, we get covariants 113 | which depend polynomially on the entries of the initial spherical expansion coefficients. 114 | 115 | If all monomials have the same power :math:`\nu`, then we define the body order of the 116 | corresponding covariant vector to be :math:`\nu`. If the monomials have different powers, 117 | then the body order is undefined. 118 | 119 | If we apply a linear combination to covariants of body order :math:`\nu`, then the result also 120 | has body order :math:`\nu`. If we do a Clebsch-Gordan iteration with covariants of body orders 121 | :math:`\nu_1` and :math:`\nu_2`, then the result has body order :math:`\nu_1 + \nu_2`. 122 | 123 | Consider the following procedure. Initially, we 124 | have :math:`\nu = 1`, and the initial spherical expansion 125 | coefficients :math:`< n \lambda m | \rho^1>`. Let's apply the construction rule 126 | :eq:`second_expansion` for each pair of spherical expansion coefficients 127 | and for each possible output :math:`\lambda`. The result would be a set 128 | of :math:`\nu=2` body order covariants. As the next step, let's do the same 129 | for each pair of the obtained :math:`\nu=2` covariants and 130 | initial :math:`\nu=1` spherical expansion coefficients. The result would 131 | be a set of :math:`\nu=3` covariants. And so on. 132 | 133 | 134 | There are two important statements: 135 | 136 | 1. Completeness a. 137 | For each :math:`\nu`, the set of covariants obtained by the previously discussed procedure is a complete basis in the space of :math:`\nu` order functionals 138 | from :math:`\rho(*)` to invariant/covariant output. It means 139 | that any :math:`\nu` order functional can be expressed as a linear combination 140 | of :math:`\nu` order covariants/invariants. 141 | 142 | 2. Completeness b. 143 | For each :math:`\nu`, the set of covariants obtained by the previously discussed 144 | procedure is a complete basis in the space of :math:`\nu` body order potentials.
145 | It means that any function of an atomic structure given by a sum of contributions 146 | over all subsets of :math:`\nu` atoms can be represented as a linear 147 | combination of :math:`\nu` order covariants/invariants. In particular, any 148 | two-body potential, such as the `LJ potential `_, 149 | can be represented as 150 | a linear combination of first-order invariants, any three-body potential 151 | can be represented as a linear combination of second-order invariants, 152 | and so on. 153 | 154 | 155 | Taking into account these facts, it looks like the recipe for machine learning 156 | potentials is very clear. Just iterate over the body order 157 | until convergence. 158 | 159 | The problem is that the size of the :math:`\nu` order covariants explodes 160 | exponentially with :math:`\nu`. Indeed, when we go from order :math:`\nu - 1` to 161 | order :math:`\nu`, the number of entries is multiplied by the number 162 | of :math:`\nu=1` order covariant vectors and by the number of 163 | different :math:`\lambda`-s. Thus, it is not computationally feasible to 164 | go to high body orders with this naive approach. 165 | 166 | In practice, for particular distributions in phase space, given by particular 167 | datasets, by far not all components of the covariants are relevant. Namely, 168 | in real-life scenarios the `PCA `_ 169 | spectrum decreases very rapidly. So, 170 | in fact, we need only a few components out of a great many. 171 | 172 | There is a way to construct only the relevant components iteratively. 173 | It consists of iterative PCA and Clebsch-Gordan expansions. For each 174 | transition from :math:`\nu-1` body order to :math:`\nu` body order, we do PCA 175 | of the :math:`\nu-1` body order covariants and use only those with the highest 176 | variance or importance for the subsequent expansion. The number of components 177 | to take can be either fixed or selected dynamically in such a way as to cover a certain percentage of the 178 | variance in the dataset. 179 | 180 | It is clear that in this way, most of the variance is kept. Indeed, 181 | let's imagine that we had exact linear dependencies at some step, and, thus, 182 | after PCA, some components have exactly zero variance. Substituting a vector of zeros into the 183 | expansion rule :eq:`second_expansion`, we see that the result is ... also zeros. 184 | The same applies to small components - components with small variance also 185 | "give birth" to components with small variance. Thus, neglecting them 186 | would not affect the covariants with higher body orders much. 187 | 188 | There is another important observation: on a particular dataset, covariants of different body orders can correlate with each other. Thus, 189 | it is a good idea to preserve at each iteration not the components with 190 | the highest absolute variance, but the components with the 191 | highest "purified variance" or "new variance", i.e., the components 192 | with the highest residuals, which cannot be explained by linear regression 193 | based on previous body orders. Using 194 | "`sklearn `_ language", the purification 195 | step can be viewed as: 196 | 197 | .. code-block:: python 198 | 199 | purified_covariants = covariants - linear_regressor.fit( 200 | all_covariants_of_smaller_body_order, covariants).predict(all_covariants_of_smaller_body_order) 201 | 202 | 203 | To conclude, NICE consists of iterations, each comprising three steps: 204 | 205 | 1. Expansion - raising the body order by one using the Clebsch-Gordan iteration :eq:`second_expansion`. 206 | 2.
Purification - getting rid of the variance that is explainable by previous body-order covariants. 207 | 3. PCA - grouping most of the variance into a small subset of components. 208 | 209 | 210 | In principle, one can apply this machinery to other invariant/covariant machine learning tasks 211 | not related to atomistic machine learning. The only difference is that in this case, the 212 | input spherical expansion coefficients :math:`< n \lambda m | \rho^1>` would be obtained from 213 | some other sphere/ball signal, not from the sum of Gaussians as in the case of atomistic machine learning. 214 | 215 | In the current implementation, there is also a separate branch of invariants only, 216 | which allows choosing hyperparameters, such as the number of components to expand, 217 | separately for invariants and covariants, which is very useful in practice. 218 | 219 | More about this can be found in the first tutorial, "Constructing machine learning potential". 220 | 221 | 222 | 223 | .. [Ref] https://aip.scitation.org/doi/10.1063/5.0021116 224 | -------------------------------------------------------------------------------- /docs/utilities.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ========= 3 | 4 | .. automodule:: utilities 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /enumerate/README.md: -------------------------------------------------------------------------------- 1 | Enumeration of linearly and polynomially-independent invariants 2 | =============================================================== 3 | 4 | The Mathematica notebooks `linear_reduce.nb` and `polynomial_reduce.nb` 5 | use computer algebra to list all of the coefficients of density-correlation 6 | equivariants that are linearly independent, or that cannot be computed as 7 | a polynomial of lower-order invariants. 8 | 9 | The code is not optimized, and cannot go beyond relatively low body order 10 | and nmax,lmax thresholds. It shows, however, that there are relatively few 11 | invariants that can be dropped beyond those that can be identified based 12 | on angular momentum recoupling theory. 13 | 14 | The repository also contains a few examples of the listings, named as 15 | `indep-nmax-lmax.dat`. 16 | Entries in each file list the indices of the nonzero (and linearly independent) 17 | invariants, labeled as 18 | 19 | ``` 20 | # nu sigma lambda n1 l1 k1 [n2 l2 k2 .....]
21 | ``` 22 | 23 | following the notation from https://arxiv.org/abs/2007.03407 24 | -------------------------------------------------------------------------------- /enumerate/indep-1-1.dat: -------------------------------------------------------------------------------- 1 | "# nu sigma lambda n1 l1 k1 [n2 l2 k2 .....]" 2 | 1 1 0 1 0 0 3 | 1 1 1 1 1 1 4 | 2 1 0 1 0 0 1 0 0 5 | 2 1 0 1 1 1 1 1 1 6 | 2 1 1 1 0 0 1 1 0 7 | 3 1 0 1 0 0 1 0 0 1 0 0 8 | 3 1 0 1 0 0 1 1 0 1 1 1 9 | 3 1 1 1 0 0 1 0 0 1 1 0 10 | 3 1 1 1 1 1 1 1 1 1 1 0 11 | 4 1 0 1 0 0 1 0 0 1 0 0 1 0 0 12 | 4 1 0 1 0 0 1 0 0 1 1 0 1 1 1 13 | 4 1 0 1 1 1 1 1 1 1 1 0 1 1 1 14 | 4 1 1 1 0 0 1 0 0 1 0 0 1 1 0 15 | 4 1 1 1 0 0 1 1 0 1 1 1 1 1 0 16 | 5 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 17 | 5 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 18 | 5 1 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 19 | 5 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 20 | 5 1 1 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 21 | 5 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 22 | 6 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 23 | 6 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 24 | 6 1 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 25 | 6 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 26 | 6 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 27 | 6 1 1 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 28 | 6 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 29 | 7 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 30 | 7 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 31 | 7 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 32 | 7 1 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 33 | 7 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 34 | 7 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 35 | 7 1 1 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 36 | 7 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 37 | 8 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 38 | 8 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 39 | 8 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 40 | 8 1 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 41 | 8 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 42 | 8 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 43 | 8 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 44 | 8 1 1 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 45 | 8 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 -------------------------------------------------------------------------------- /examples/methane_home_pc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 10 | "\n", 11 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 12 | "!gunzip -k methane.extxyz.gz" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import numpy as np\n", 22 | "import ase.io\n", 23 | "import tqdm\n", 24 | "from nice.blocks import *\n", 25 | "from nice.utilities import *\n", 26 | "from matplotlib import pyplot as plt\n", 27 | "from sklearn.linear_model import BayesianRidge" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "HARTREE_TO_EV = 27.211386245988\n", 37 | "train_subset = \"0:10000\" #input for ase.io.read command\n", 38 | "test_subset = \"10000:15000\" #input 
to ase.io.read command\n", 39 | "environments_for_fitting = 1000 #number of environments to fit nice transfomers\n", 40 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000, 3000, 5000, 7500,\n", 41 | " 10000] #for learning curve\n", 42 | "\n", 43 | "#HYPERS for librascal spherical expansion coefficients\n", 44 | "HYPERS = {\n", 45 | " 'interaction_cutoff': 6.3,\n", 46 | " 'max_radial': 5,\n", 47 | " 'max_angular': 5,\n", 48 | " 'gaussian_sigma_type': 'Constant',\n", 49 | " 'gaussian_sigma_constant': 0.05,\n", 50 | " 'cutoff_smooth_width': 0.3,\n", 51 | " 'radial_basis': 'GTO'\n", 52 | "}" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "#our model:\n", 62 | "def get_nice():\n", 63 | " return StandardSequence([\n", 64 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 65 | " CovariantsPurifierBoth(max_take=10),\n", 66 | " IndividualLambdaPCAsBoth(n_components=50),\n", 67 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 68 | " InvariantsPurifier(max_take=50),\n", 69 | " InvariantsPCA(n_components=200)),\n", 70 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 71 | " CovariantsPurifierBoth(max_take=10),\n", 72 | " IndividualLambdaPCAsBoth(n_components=50),\n", 73 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 74 | " InvariantsPurifier(max_take=50),\n", 75 | " InvariantsPCA(n_components=200)),\n", 76 | " StandardBlock(None, None, None,\n", 77 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 78 | " InvariantsPurifier(max_take=50),\n", 79 | " InvariantsPCA(n_components=200))\n", 80 | " ],\n", 81 | " initial_scaler=InitialScaler(\n", 82 | " mode='signal integral', individually=True))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "train_structures = ase.io.read('methane.extxyz', index=train_subset)\n", 92 | "\n", 93 | "test_structures = ase.io.read('methane.extxyz', index=test_subset)\n", 94 | "\n", 95 | "all_species = get_all_species(train_structures + test_structures)\n", 96 | "\n", 97 | "train_coefficients = get_spherical_expansion(train_structures, HYPERS,\n", 98 | " all_species)\n", 99 | "\n", 100 | "test_coefficients = get_spherical_expansion(test_structures, HYPERS,\n", 101 | " all_species)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "#individual nice transformers for each atomic specie in the dataset\n", 111 | "nice = {}\n", 112 | "for key in train_coefficients.keys():\n", 113 | " nice[key] = get_nice()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "for key in train_coefficients.keys():\n", 123 | " nice[key].fit(train_coefficients[key][:environments_for_fitting])" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "train_features = transform_sequentially(nice, train_structures, HYPERS,\n", 133 | " all_species)\n", 134 | "test_features = transform_sequentially(nice, test_structures, HYPERS,\n", 135 | " all_species)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "train_energies = [structure.info['energy'] for structure in 
train_structures]\n", 145 | "train_energies = np.array(train_energies) * HARTREE_TO_EV\n", 146 | "\n", 147 | "test_energies = [structure.info['energy'] for structure in test_structures]\n", 148 | "test_energies = np.array(test_energies) * HARTREE_TO_EV" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "def get_rmse(first, second):\n", 158 | " return np.sqrt(np.mean((first - second)**2))\n", 159 | "\n", 160 | "\n", 161 | "def get_standard_deviation(values):\n", 162 | " return np.sqrt(np.mean((values - np.mean(values))**2))\n", 163 | "\n", 164 | "\n", 165 | "def get_relative_performance(predictions, values):\n", 166 | " return get_rmse(predictions, values) / get_standard_deviation(values)\n", 167 | "\n", 168 | "\n", 169 | "def estimate_performance(regressor, data_train, data_test, targets_train,\n", 170 | " targets_test):\n", 171 | " regressor.fit(data_train, targets_train)\n", 172 | " return get_relative_performance(regressor.predict(data_test), targets_test)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "errors = []\n", 182 | "for el in tqdm.tqdm(grid):\n", 183 | " errors.append(\n", 184 | " estimate_performance(BayesianRidge(), train_features[:el],\n", 185 | " test_features, train_energies[:el],\n", 186 | " test_energies))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "print(errors)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "from matplotlib import pyplot as plt\n", 205 | "plt.plot(grid, errors, 'bo')\n", 206 | "plt.plot(grid, errors, 'b')\n", 207 | "plt.xlabel(\"number of structures\")\n", 208 | "plt.ylabel(\"relative error\")\n", 209 | "plt.xscale('log')\n", 210 | "plt.yscale('log')\n", 211 | "plt.show()" 212 | ] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.6.9" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 4 236 | } 237 | -------------------------------------------------------------------------------- /examples/methane_medium.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 10 | "\n", 11 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 12 | "!gunzip -k methane.extxyz.gz" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import numpy as np\n", 22 | "import ase.io\n", 23 | "import tqdm\n", 24 | "from nice.blocks import *\n", 25 | "from nice.utilities import *\n", 26 | "from matplotlib import pyplot as 
plt\n", 27 | "from sklearn.linear_model import BayesianRidge" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "HARTREE_TO_EV = 27.211386245988\n", 37 | "train_subset = \"0:100000\" #input for ase.io.read command\n", 38 | "test_subset = \"3050000:3130000\" #input to ase.io.read command\n", 39 | "environments_for_fitting = 5000 #number of environments to fit nice transfomers\n", 40 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000, 3000,\n", 41 | " 5000, 7500, 10000, 15000, 20000,\n", 42 | " 30000, 50000, 75000, 100000] #for learning curve\n", 43 | "\n", 44 | "#HYPERS for librascal spherical expansion coefficients\n", 45 | "HYPERS = {\n", 46 | "'interaction_cutoff': 6.3,\n", 47 | "'max_radial': 5,\n", 48 | "'max_angular': 5,\n", 49 | "'gaussian_sigma_type': 'Constant',\n", 50 | "'gaussian_sigma_constant': 0.05,\n", 51 | "'cutoff_smooth_width': 0.3,\n", 52 | "'radial_basis': 'GTO'\n", 53 | "}" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "#our model:\n", 63 | "def get_transformer():\n", 64 | " return StandardSequence([StandardBlock(ThresholdExpansioner(),\n", 65 | " CovariantsPurifierBoth(max_take = 100),\n", 66 | " IndividualLambdaPCAsBoth(),\n", 67 | " None,\n", 68 | " None,\n", 69 | " None),\n", 70 | " StandardBlock(ThresholdExpansioner(num_expand = 10000),\n", 71 | " CovariantsPurifierBoth(max_take = 100),\n", 72 | " IndividualLambdaPCAsBoth(2000),\n", 73 | " ThresholdExpansioner(num_expand = 50000, mode = 'invariants'),\n", 74 | " InvariantsPurifier(max_take = 100),\n", 75 | " InvariantsPCA(n_components = 2000)),\n", 76 | " StandardBlock(None,\n", 77 | " None,\n", 78 | " None,\n", 79 | " ThresholdExpansioner(num_expand = 50000, mode = 'invariants'),\n", 80 | " InvariantsPurifier(max_take = 100),\n", 81 | " InvariantsPCA(n_components = 5000))\n", 82 | " ],\n", 83 | " initial_scaler = InitialScaler(mode = 'signal integral',\n", 84 | " individually = True))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "train_structures = ase.io.read('methane.extxyz', \n", 94 | " index = train_subset)\n", 95 | "\n", 96 | "test_structures = ase.io.read('methane.extxyz', \n", 97 | " index = test_subset)\n", 98 | "\n", 99 | "all_species = get_all_species(train_structures + test_structures)\n", 100 | "\n", 101 | "train_coefficients = get_spherical_expansion(train_structures, HYPERS, all_species)\n", 102 | "\n", 103 | "\n", 104 | "\n", 105 | "test_coefficients = get_spherical_expansion(test_structures, HYPERS, all_species)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "#individual transformers for each atomic specie in dataset\n", 115 | "transformers = {}\n", 116 | "for key in train_coefficients.keys():\n", 117 | " transformers[key] = get_transformer()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "for key in train_coefficients.keys():\n", 127 | " transformers[key].fit(train_coefficients[key][:environments_for_fitting])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "train_features = transform_sequentially(transformers, 
\n", 137 | " train_structures, HYPERS, all_species)\n", 138 | "test_features = transform_sequentially(transformers,\n", 139 | " test_structures, HYPERS, all_species)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "train_energies = [structure.info['energy'] for structure in train_structures]\n", 149 | "train_energies = np.array(train_energies) * HARTREE_TO_EV\n", 150 | "\n", 151 | "test_energies = [structure.info['energy'] for structure in test_structures]\n", 152 | "test_energies = np.array(test_energies) * HARTREE_TO_EV\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "def get_rmse(first, second):\n", 162 | " return np.sqrt(np.mean((first - second) ** 2))\n", 163 | "\n", 164 | "def get_standard_deviation(values):\n", 165 | " return np.sqrt(np.mean((values - np.mean(values)) ** 2))\n", 166 | "\n", 167 | "def get_relative_performance(predictions, values):\n", 168 | " return get_rmse(predictions, values) / get_standard_deviation(values)\n", 169 | "\n", 170 | "def estimate_performance(clf, data_train, data_test, targets_train, targets_test):\n", 171 | " clf.fit(data_train, targets_train)\n", 172 | " return get_relative_performance(clf.predict(data_test), targets_test)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "errors = []\n", 182 | "for el in tqdm.tqdm(grid): \n", 183 | " errors.append(estimate_performance(BayesianRidge(), train_features[:el],\n", 184 | " test_features, train_energies[:el],\n", 185 | " test_energies))" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "print(errors)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "from matplotlib import pyplot as plt\n", 204 | "plt.plot(grid, errors, 'bo')\n", 205 | "plt.plot(grid, errors, 'b')\n", 206 | "plt.xlabel(\"number of structures\")\n", 207 | "plt.ylabel(\"relative error\")\n", 208 | "plt.xscale('log')\n", 209 | "plt.yscale('log')\n", 210 | "plt.show()" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 3", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.6.9" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 4 242 | } 243 | -------------------------------------------------------------------------------- /examples/methane_small.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 10 | "\n", 11 | "!wget 
\"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 12 | "!gunzip -k methane.extxyz.gz" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import numpy as np\n", 22 | "import ase.io\n", 23 | "import tqdm\n", 24 | "from nice.blocks import *\n", 25 | "from nice.utilities import *\n", 26 | "from matplotlib import pyplot as plt\n", 27 | "from sklearn.linear_model import BayesianRidge" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "HARTREE_TO_EV = 27.211386245988\n", 37 | "train_subset = \"0:100000\" #input for ase.io.read command\n", 38 | "test_subset = \"3050000:3130000\" #input to ase.io.read command\n", 39 | "environments_for_fitting = 5000 #number of environments to fit nice transfomers\n", 40 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000, 3000,\n", 41 | " 5000, 7500, 10000, 15000, 20000,\n", 42 | " 30000, 50000, 75000, 100000] #for learning curve\n", 43 | "\n", 44 | "#HYPERS for librascal spherical expansion coefficients\n", 45 | "HYPERS = {\n", 46 | "'interaction_cutoff': 6.3,\n", 47 | "'max_radial': 5,\n", 48 | "'max_angular': 5,\n", 49 | "'gaussian_sigma_type': 'Constant',\n", 50 | "'gaussian_sigma_constant': 0.05,\n", 51 | "'cutoff_smooth_width': 0.3,\n", 52 | "'radial_basis': 'GTO'\n", 53 | "}" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "#our model:\n", 63 | "def get_transformer():\n", 64 | " return StandardSequence([StandardBlock(ThresholdExpansioner(num_expand = 1000),\n", 65 | " CovariantsPurifierBoth(max_take = 100),\n", 66 | " IndividualLambdaPCAsBoth(500),\n", 67 | " None,\n", 68 | " None,\n", 69 | " None),\n", 70 | " StandardBlock(ThresholdExpansioner(num_expand = 3000),\n", 71 | " CovariantsPurifierBoth(max_take = 100),\n", 72 | " IndividualLambdaPCAsBoth(500),\n", 73 | " ThresholdExpansioner(num_expand = 5000, mode = 'invariants'),\n", 74 | " InvariantsPurifier(max_take = 100),\n", 75 | " InvariantsPCA(n_components = 1000)),\n", 76 | " StandardBlock(None,\n", 77 | " None,\n", 78 | " None,\n", 79 | " ThresholdExpansioner(num_expand = 5000, mode = 'invariants'),\n", 80 | " InvariantsPurifier(max_take = 100),\n", 81 | " InvariantsPCA(n_components = 2000))\n", 82 | " ],\n", 83 | " initial_scaler = InitialScaler(mode = 'signal integral',\n", 84 | " individually = True))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "train_structures = ase.io.read('methane.extxyz', \n", 94 | " index = train_subset)\n", 95 | "\n", 96 | "test_structures = ase.io.read('methane.extxyz', \n", 97 | " index = test_subset)\n", 98 | "\n", 99 | "all_species = get_all_species(train_structures + test_structures)\n", 100 | "\n", 101 | "train_coefficients = get_spherical_expansion(train_structures, HYPERS, all_species)\n", 102 | "\n", 103 | "\n", 104 | "\n", 105 | "test_coefficients = get_spherical_expansion(test_structures, HYPERS, all_species)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "#individual transformers for each atomic specie in dataset\n", 115 | "transformers = {}\n", 116 | "for key in 
train_coefficients.keys():\n", 117 | " transformers[key] = get_transformer()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "for key in train_coefficients.keys():\n", 127 | " transformers[key].fit(train_coefficients[key][:environments_for_fitting])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "train_features = transform_sequentially(transformers, \n", 137 | " train_structures, HYPERS, all_species)\n", 138 | "test_features = transform_sequentially(transformers,\n", 139 | " test_structures, HYPERS, all_species)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "train_energies = [structure.info['energy'] for structure in train_structures]\n", 149 | "train_energies = np.array(train_energies) * HARTREE_TO_EV\n", 150 | "\n", 151 | "test_energies = [structure.info['energy'] for structure in test_structures]\n", 152 | "test_energies = np.array(test_energies) * HARTREE_TO_EV\n" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "def get_rmse(first, second):\n", 162 | " return np.sqrt(np.mean((first - second) ** 2))\n", 163 | "\n", 164 | "def get_standard_deviation(values):\n", 165 | " return np.sqrt(np.mean((values - np.mean(values)) ** 2))\n", 166 | "\n", 167 | "def get_relative_performance(predictions, values):\n", 168 | " return get_rmse(predictions, values) / get_standard_deviation(values)\n", 169 | "\n", 170 | "def estimate_performance(clf, data_train, data_test, targets_train, targets_test):\n", 171 | " clf.fit(data_train, targets_train)\n", 172 | " return get_relative_performance(clf.predict(data_test), targets_test)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "errors = []\n", 182 | "for el in tqdm.tqdm(grid): \n", 183 | " errors.append(estimate_performance(BayesianRidge(), train_features[:el],\n", 184 | " test_features, train_energies[:el],\n", 185 | " test_energies))" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "print(errors)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "from matplotlib import pyplot as plt\n", 204 | "plt.plot(grid, errors, 'bo')\n", 205 | "plt.plot(grid, errors, 'b')\n", 206 | "plt.xlabel(\"number of structures\")\n", 207 | "plt.ylabel(\"relative error\")\n", 208 | "plt.xscale('log')\n", 209 | "plt.yscale('log')\n", 210 | "plt.show()" 211 | ] 212 | } 213 | ], 214 | "metadata": { 215 | "kernelspec": { 216 | "display_name": "Python 3", 217 | "language": "python", 218 | "name": "python3" 219 | }, 220 | "language_info": { 221 | "codemirror_mode": { 222 | "name": "ipython", 223 | "version": 3 224 | }, 225 | "file_extension": ".py", 226 | "mimetype": "text/x-python", 227 | "name": "python", 228 | "nbconvert_exporter": "python", 229 | "pygments_lexer": "ipython3", 230 | "version": "3.6.9" 231 | } 232 | }, 233 | "nbformat": 4, 234 | "nbformat_minor": 4 235 | } 236 | -------------------------------------------------------------------------------- 
/examples/qm9_home_pc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import ase\n", 11 | "from ase import Atoms\n", 12 | "import numpy as np\n", 13 | "import tqdm\n", 14 | "import ase.io\n", 15 | "from nice.blocks import *\n", 16 | "from nice.utilities import *\n", 17 | "from matplotlib import pyplot as plt\n", 18 | "from sklearn.linear_model import BayesianRidge" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "PROPERTIES_NAMES = [\n", 28 | " 'tag', 'index', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2',\n", 29 | " 'zpve', 'U0', 'U', 'H', 'G', 'Cv'\n", 30 | "]\n", 31 | "\n", 32 | "\n", 33 | "def string_to_float(element):\n", 34 | " '''because shit like 2.1997*^-6 happens'''\n", 35 | " return float(element.replace('*^', 'e'))\n", 36 | "\n", 37 | "\n", 38 | "PROPERTIES_HANDLERS = [str, int\n", 39 | " ] + [string_to_float] * (len(PROPERTIES_NAMES) - 2)\n", 40 | "\n", 41 | "\n", 42 | "def parse_qm9_xyz(path):\n", 43 | " with open(path, 'r') as f:\n", 44 | " lines = list(f)\n", 45 | " #print(lines)\n", 46 | " n_atoms = int(lines[0])\n", 47 | " properties = {\n", 48 | " name: handler(value)\n", 49 | " for handler, name, value in zip(PROPERTIES_HANDLERS, PROPERTIES_NAMES,\n", 50 | " lines[1].strip().split())\n", 51 | " }\n", 52 | " composition = \"\"\n", 53 | " positions = []\n", 54 | " for i in range(2, 2 + n_atoms):\n", 55 | " composition += lines[i].strip().split()[0]\n", 56 | " positions.append([\n", 57 | " string_to_float(value) for value in lines[i].strip().split()[1:4]\n", 58 | " ])\n", 59 | "\n", 60 | " positions = np.array(positions)\n", 61 | " result = Atoms(composition, positions=np.array(positions))\n", 62 | " result.info.update(properties)\n", 63 | " return result\n", 64 | "\n", 65 | "\n", 66 | "def parse_index(path):\n", 67 | " with open(path, \"r\") as f:\n", 68 | " lines = list(f)\n", 69 | " proper_lines = lines[9:-1]\n", 70 | " result = [int(line.strip().split()[0]) for line in proper_lines]\n", 71 | " return np.array(result, dtype=int)\n", 72 | "\n", 73 | "\n", 74 | "def download_qm9(clean=True):\n", 75 | " #downloading from https://figshare.com/collections/Quantum_chemistry_structures_and_properties_of_134_kilo_molecules/978904\n", 76 | " os.system(\n", 77 | " \"wget https://ndownloader.figshare.com/files/3195389 -O qm9_main.xyz.tar.bz2\"\n", 78 | " )\n", 79 | " os.system(\n", 80 | " \"wget https://ndownloader.figshare.com/files/3195404 -O problematic_index.txt\"\n", 81 | " )\n", 82 | " os.system(\"mkdir qm9_main_structures\")\n", 83 | " os.system(\"tar xjf qm9_main.xyz.tar.bz2 -C qm9_main_structures\")\n", 84 | "\n", 85 | " names = [\n", 86 | " name for name in os.listdir('qm9_main_structures/')\n", 87 | " if name.endswith('.xyz')\n", 88 | " ]\n", 89 | " names = sorted(names)\n", 90 | "\n", 91 | " structures = [\n", 92 | " parse_qm9_xyz('qm9_main_structures/{}'.format(name))\n", 93 | " for name in tqdm.tqdm(names)\n", 94 | " ]\n", 95 | "\n", 96 | " problematic_index = parse_index('problematic_index.txt')\n", 97 | " np.save('problematic_index.npy', problematic_index)\n", 98 | " ase.io.write('qm9_main.extxyz', structures)\n", 99 | " if (clean):\n", 100 | " os.system(\"rm -r qm9_main_structures\")\n", 101 | " os.system(\"rm problematic_index.txt\")\n", 102 | " os.system(\"rm 
qm9_main.xyz.tar.bz2\")\n", 103 | " return structures, problematic_index\n", 104 | "\n", 105 | "\n", 106 | "def get_qm9(clean=True):\n", 107 | " if ('qm9_main.extxyz' in os.listdir('.')) and \\\n", 108 | " ('problematic_index.npy' in os.listdir('.')):\n", 109 | " structures = ase.io.read('qm9_main.extxyz', index=':')\n", 110 | " problematic_index = np.load('problematic_index.npy')\n", 111 | " return structures, problematic_index\n", 112 | " else:\n", 113 | " return download_qm9(clean=clean)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "structures, problematic_index = get_qm9()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "HARTREE_TO_EV = 27.211386245988\n", 132 | "USE_PROBLEMATIC_INDEX = False\n", 133 | "np.random.seed(0)\n", 134 | "\n", 135 | "if (not USE_PROBLEMATIC_INDEX):\n", 136 | " structures = [\n", 137 | " structure for structure in structures\n", 138 | " if structure.info['index'] not in problematic_index\n", 139 | " ]\n", 140 | "\n", 141 | "del problematic_index #it borrows indexing from 1 from qm9, deleting it away from sin\n", 142 | "\n", 143 | "permutation = np.random.permutation(len(structures))\n", 144 | "train_indices = permutation[0:2000]\n", 145 | "test_indices = permutation[2000:2500]\n", 146 | "environments_for_fitting = 1000 #number of environments to fit nice transfomers\n", 147 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000] #for learning curve\n", 148 | "\n", 149 | "#HYPERS for librascal spherical expansion coefficients\n", 150 | "HYPERS = {\n", 151 | " 'interaction_cutoff': 5,\n", 152 | " 'max_radial': 15,\n", 153 | " 'max_angular': 5,\n", 154 | " 'gaussian_sigma_type': 'Constant',\n", 155 | " 'gaussian_sigma_constant': 0.05,\n", 156 | " 'cutoff_smooth_width': 0.3,\n", 157 | " 'radial_basis': 'GTO'\n", 158 | "}" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "#our model:\n", 168 | "def get_nice():\n", 169 | " return StandardSequence([\n", 170 | " StandardBlock(ThresholdExpansioner(num_expand=300),\n", 171 | " CovariantsPurifierBoth(max_take=10),\n", 172 | " IndividualLambdaPCAsBoth(n_components=100),\n", 173 | " ThresholdExpansioner(num_expand=1000, mode='invariants'),\n", 174 | " InvariantsPurifier(max_take=10),\n", 175 | " InvariantsPCA(n_components=200)),\n", 176 | " StandardBlock(ThresholdExpansioner(num_expand=300),\n", 177 | " CovariantsPurifierBoth(max_take=10),\n", 178 | " IndividualLambdaPCAsBoth(n_components=100),\n", 179 | " ThresholdExpansioner(num_expand=1000, mode='invariants'),\n", 180 | " InvariantsPurifier(max_take=10),\n", 181 | " InvariantsPCA(n_components=200)),\n", 182 | " StandardBlock(None, None, None,\n", 183 | " ThresholdExpansioner(num_expand=1000, mode='invariants'),\n", 184 | " InvariantsPurifier(max_take=10),\n", 185 | " InvariantsPCA(n_components=100))\n", 186 | " ],\n", 187 | " initial_scaler=InitialScaler(\n", 188 | " mode='signal integral', individually=True))" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "train_structures = [structures[i] for i in train_indices]\n", 198 | "test_structures = [structures[i] for i in test_indices]\n", 199 | "\n", 200 | "all_species = get_all_species(train_structures + 
test_structures)\n", 201 | "\n", 202 | "train_coefficients = get_spherical_expansion(train_structures, HYPERS,\n", 203 | " all_species)\n", 204 | "test_coefficients = get_spherical_expansion(test_structures, HYPERS,\n", 205 | " all_species)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "all_coefficients = [\n", 215 | " train_coefficients[key] for key in train_coefficients.keys()\n", 216 | "]\n", 217 | "all_coefficients = np.concatenate(all_coefficients, axis=0)\n", 218 | "np.random.shuffle(all_coefficients)\n", 219 | "all_coefficients = all_coefficients[0:environments_for_fitting]" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "nice_single = get_nice()\n", 229 | "nice_single.fit(all_coefficients)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "# using same nice transformer regardless of central specie\n", 239 | "nice = {specie: nice_single for specie in all_species}" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "train_features = transform_sequentially(nice, train_structures, HYPERS,\n", 249 | " all_species)\n", 250 | "test_features = transform_sequentially(nice, test_structures, HYPERS,\n", 251 | " all_species)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "train_c_features = get_compositional_features(train_structures, all_species)\n", 261 | "test_c_features = get_compositional_features(test_structures, all_species)\n", 262 | "\n", 263 | "train_features = np.concatenate([train_features, train_c_features], axis=1)\n", 264 | "test_features = np.concatenate([test_features, test_c_features], axis=1)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "train_energies = [structure.info['U0'] for structure in train_structures]\n", 274 | "train_energies = np.array(train_energies) * HARTREE_TO_EV\n", 275 | "\n", 276 | "test_energies = [structure.info['U0'] for structure in test_structures]\n", 277 | "test_energies = np.array(test_energies) * HARTREE_TO_EV" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "def get_rmse(first, second):\n", 287 | " return np.sqrt(np.mean((first - second)**2))\n", 288 | "\n", 289 | "\n", 290 | "def get_mae(first, second):\n", 291 | " return np.mean(np.abs(first - second))\n", 292 | "\n", 293 | "\n", 294 | "def estimate_performance(regressor, data_train, data_test, targets_train,\n", 295 | " targets_test):\n", 296 | " regressor.fit(data_train, targets_train)\n", 297 | " predictions = regressor.predict(data_test)\n", 298 | " return get_rmse(predictions,\n", 299 | " targets_test), get_mae(predictions, targets_test)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "errors_compositional = []\n", 309 | "for el in tqdm.tqdm(grid):\n", 310 | " errors_compositional.append(\n", 311 | " estimate_performance(BayesianRidge(), train_c_features[:el],\n", 312 | " 
test_c_features, train_energies[:el],\n", 313 | " test_energies))\n", 314 | "\n", 315 | "errors_compositional = np.array(errors_compositional)\n", 316 | "errors_nice = []\n", 317 | "for el in tqdm.tqdm(grid):\n", 318 | " # because without this step with residuals\n", 319 | " # joint fitting might face problems due to\n", 320 | " # regularization\n", 321 | " regressor = BayesianRidge()\n", 322 | " regressor.fit(train_c_features[:el], train_energies[:el])\n", 323 | "\n", 324 | " residuals_train = train_energies[:el] - regressor.predict(\n", 325 | " train_c_features[:el])\n", 326 | " residuals_test = test_energies - regressor.predict(test_c_features)\n", 327 | "\n", 328 | " errors_nice.append(\n", 329 | " estimate_performance(BayesianRidge(), train_features[:el],\n", 330 | " test_features, residuals_train, residuals_test))\n", 331 | "\n", 332 | "errors_nice = np.array(errors_nice)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "fig, axes = plt.subplots(1, 2)\n", 342 | "\n", 343 | "axes[0].plot(grid, errors_compositional[:, 0], 'ro')\n", 344 | "axes[0].plot(grid, errors_compositional[:, 0], 'r', label='only compositional')\n", 345 | "\n", 346 | "axes[0].plot(grid, errors_nice[:, 0], 'bo')\n", 347 | "axes[0].plot(grid, errors_nice[:, 0], 'b', label='nice')\n", 348 | "\n", 349 | "axes[0].set_xlabel(\"n_train\")\n", 350 | "axes[0].set_ylabel(\"rmse, eV\")\n", 351 | "axes[0].set_xscale('log')\n", 352 | "axes[0].set_yscale('log')\n", 353 | "\n", 354 | "axes[1].plot(grid, errors_compositional[:, 1], 'ro')\n", 355 | "axes[1].plot(grid, errors_compositional[:, 1], 'r', label='only compositional')\n", 356 | "\n", 357 | "axes[1].plot(grid, errors_nice[:, 1], 'bo')\n", 358 | "axes[1].plot(grid, errors_nice[:, 1], 'b', label='nice')\n", 359 | "\n", 360 | "axes[1].set_xlabel(\"n_train\")\n", 361 | "axes[1].set_ylabel(\"mae, eV\")\n", 362 | "axes[1].set_xscale('log')\n", 363 | "axes[1].set_yscale('log')\n", 364 | "plt.legend(loc='upper center')\n", 365 | "plt.subplots_adjust(wspace=0.4)\n", 366 | "plt.show()" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "print(\"mae: {} eV\".format(errors_nice[-1][0]))\n", 376 | "print(\"rmse: {} eV\".format(errors_nice[-1][1]))" 377 | ] 378 | } 379 | ], 380 | "metadata": { 381 | "kernelspec": { 382 | "display_name": "Python 3", 383 | "language": "python", 384 | "name": "python3" 385 | }, 386 | "language_info": { 387 | "codemirror_mode": { 388 | "name": "ipython", 389 | "version": 3 390 | }, 391 | "file_extension": ".py", 392 | "mimetype": "text/x-python", 393 | "name": "python", 394 | "nbconvert_exporter": "python", 395 | "pygments_lexer": "ipython3", 396 | "version": "3.6.9" 397 | } 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 4 401 | } 402 | -------------------------------------------------------------------------------- /nice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lab-cosmo/nice/2ff446824c88958497e2d354be271d618e3a3d3e/nice/__init__.py -------------------------------------------------------------------------------- /nice/blocks/__init__.py: -------------------------------------------------------------------------------- 1 | from nice.blocks.compressors import * 2 | from nice.blocks.expansioners import * 3 | from nice.blocks.purifiers import * 4 | from 
nice.blocks.grouping import * 5 | from nice.blocks.miscellaneous import * 6 | -------------------------------------------------------------------------------- /nice/blocks/compressors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nice.unrolling_individual_pca import UnrollingIndividualPCA 3 | 4 | # from cython.parallel cimport prange 5 | 6 | from nice.thresholding import get_thresholded_tasks 7 | from nice.nice_utilities import do_partial_expansion, Data, get_sizes 8 | from nice.clebsch_gordan import ClebschGordan, check_clebsch_gordan 9 | from nice.packing import unite_parallel, subtract_parallel 10 | from nice.packing import pack_dense, unpack_dense 11 | from parse import parse 12 | import warnings 13 | from sklearn.linear_model import Ridge 14 | from sklearn.base import clone 15 | from sklearn.exceptions import NotFittedError 16 | from sklearn.decomposition import PCA 17 | 18 | 19 | def get_num_fit(desired_num, block_size): 20 | if desired_num % block_size == 0: 21 | return desired_num // block_size 22 | else: 23 | return (desired_num // block_size) + 1 24 | 25 | 26 | class IndividualLambdaPCAs: 27 | ''' Block to do pca step for covariants of single parity. It operates with instances of Data class''' 28 | def __init__(self, n_components=None, num_to_fit="10x"): 29 | self.n_components_ = n_components 30 | self.num_to_fit_ = num_to_fit 31 | self.fitted_ = False 32 | 33 | def get_importances(self): 34 | if not self.fitted_: 35 | raise NotFittedError( 36 | ("instance of {} is not fitted. " 37 | "Thus importances are not available.").format( 38 | type(self).__name__)) 39 | result = np.empty([self.max_n_components_, self.l_max_ + 1]) 40 | for lambd in range(self.l_max_ + 1): 41 | if self.pcas_[lambd] is not None: 42 | result[:self.pcas_[lambd].n_components, 43 | lambd] = self.pcas_[lambd].importances_ 44 | 45 | actual_sizes = [] 46 | for lambd in range(self.l_max_ + 1): 47 | if self.pcas_[lambd] is not None: 48 | actual_sizes.append(self.pcas_[lambd].n_components) 49 | else: 50 | actual_sizes.append(0) 51 | return result 52 | 53 | def fit(self, data): 54 | 55 | self.l_max_ = data.covariants_.shape[2] - 1 56 | self.pcas_ = [] 57 | self.reduction_happened_ = False 58 | self.max_n_components_ = -1 59 | for lambd in range(self.l_max_ + 1): 60 | if data.actual_sizes_[lambd] > 0: 61 | if self.n_components_ is None: 62 | n_components_now = data.actual_sizes_[lambd] 63 | else: 64 | n_components_now = self.n_components_ 65 | 66 | self.max_n_components_ = max(self.max_n_components_, 67 | n_components_now) 68 | 69 | if data.covariants_.shape[0] * (lambd + 1) < n_components_now: 70 | raise ValueError(( 71 | "not enough data to fit pca, number of vectors is {}, " 72 | "dimensionality of single vector (lambd + 1) is {}, " 73 | "i. e. total number of points is {}, " 74 | "while number of components is {}.").format( 75 | data.covariants_.shape[0], 76 | lambd + 1, 77 | data.covariants_.shape[0] * (lambd + 1), 78 | n_components_now, 79 | )) 80 | 81 | if type(self.num_to_fit_) is str: 82 | multiplier = int(parse("{}x", self.num_to_fit_)[0]) 83 | num_fit_now = get_num_fit(multiplier * n_components_now, 84 | (lambd + 1)) 85 | else: 86 | num_fit_now = self.num_to_fit_ 87 | if num_fit_now * (lambd + 1) < n_components_now: 88 | raise ValueError( 89 | ("specified parameter num fit ({}) is too " 90 | "small to fit pca with number of components {}." 
91 | ).format(num_fit_now, n_components_now)) 92 | 93 | if data.covariants_.shape[0] * (lambd + 1) < num_fit_now: 94 | warnings.warn( 95 | ("given data is less than desired number " 96 | "of points to fit pca. " 97 | "Desired number of points to fit pca is {}, " 98 | "while number of vectors is {}, " 99 | "dimensionality of single vector (lambd + 1) is {}, " 100 | "i. e. total number of points is {}. " 101 | "Number of pca components is {}.").format( 102 | num_fit_now, 103 | data.covariants_.shape[0], 104 | (lambd + 1), 105 | data.covariants_.shape[0] * (lambd + 1), 106 | n_components_now, 107 | ), 108 | RuntimeWarning, 109 | ) 110 | 111 | if n_components_now < data.actual_sizes_[lambd]: 112 | self.reduction_happened_ = True 113 | pca = UnrollingIndividualPCA(n_components=n_components_now) 114 | pca.fit( 115 | data.covariants_[:num_fit_now, :data.actual_sizes_[lambd], 116 | lambd, :], 117 | lambd, 118 | ) 119 | self.pcas_.append(pca) 120 | else: 121 | self.pcas_.append(None) 122 | self.fitted_ = True 123 | self.importances_ = self.get_importances() 124 | 125 | def transform(self, data): 126 | if not self.fitted_: 127 | raise NotFittedError( 128 | ("instance of {} is not fitted. " 129 | "It can not transform anything.").format(type(self).__name__)) 130 | result = np.empty([ 131 | data.covariants_.shape[0], 132 | self.max_n_components_, 133 | self.l_max_ + 1, 134 | 2 * self.l_max_ + 1, 135 | ]) 136 | new_actual_sizes = np.zeros([self.l_max_ + 1], dtype=np.int32) 137 | for lambd in range(self.l_max_ + 1): 138 | if self.pcas_[lambd] is not None: 139 | now = self.pcas_[lambd].transform( 140 | data.covariants_[:, :data.actual_sizes_[lambd], lambd, :], 141 | lambd) 142 | result[:, :now.shape[1], lambd, :(2 * lambd + 1)] = now 143 | new_actual_sizes[lambd] = now.shape[1] 144 | else: 145 | new_actual_sizes[lambd] = 0 146 | 147 | return Data(result, new_actual_sizes, importances=self.importances_) 148 | 149 | def is_fitted(self): 150 | return self.fitted_ 151 | 152 | 153 | class IndividualLambdaPCAsBoth: 154 | ''' Block to do pca step for covariants of both parities. It operates with even-odd pairs of instances of Data class''' 155 | def __init__(self, *args, **kwargs): 156 | self.even_pca_ = IndividualLambdaPCAs(*args, **kwargs) 157 | self.odd_pca_ = IndividualLambdaPCAs(*args, **kwargs) 158 | self.fitted_ = False 159 | 160 | def fit(self, data_even, data_odd): 161 | 162 | self.even_pca_.fit(data_even) 163 | self.odd_pca_.fit(data_odd) 164 | self.fitted_ = True 165 | 166 | def transform(self, data_even, data_odd): 167 | if not self.fitted_: 168 | raise NotFittedError( 169 | ("instance of {} is not fitted. " 170 | "It can not transform anything.").format(type(self).__name__)) 171 | return self.even_pca_.transform(data_even), self.odd_pca_.transform( 172 | data_odd) 173 | 174 | def is_fitted(self): 175 | return self.fitted_ 176 | 177 | 178 | class InvariantsPCA(PCA): 179 | ''' Block to do pca step for invariants. 
It operates with 2d numpy arrays''' 180 | def __init__(self, *args, num_to_fit="10x", **kwargs): 181 | self.num_to_fit_ = num_to_fit 182 | self.fitted_ = False 183 | return super().__init__(*args, **kwargs) 184 | 185 | def _my_representation(self): 186 | if (self.fitted_): 187 | return "Instance of InvariantsPCA, fitted" 188 | else: 189 | return "Instance of InvariantsPCA, not fitted" 190 | 191 | def __repr__(self): 192 | return self._my_representation() 193 | 194 | def __str__(self): 195 | return self._my_representation() 196 | 197 | def process_input(self, X): 198 | if (self.n_components is None): 199 | self.n_components = X.shape[1] 200 | if (self.n_components > X.shape[1]): 201 | self.n_components = X.shape[1] 202 | 203 | if type(self.num_to_fit_) is str: 204 | multiplier = int(parse("{}x", self.num_to_fit_)[0]) 205 | num_fit_now = multiplier * self.n_components 206 | else: 207 | num_fit_now = self.num_to_fit_ 208 | 209 | if self.n_components > X.shape[0]: 210 | raise ValueError( 211 | ("not enough data to fit pca. " 212 | "Number of environments is {}, number of components is {}." 213 | ).format(X.shape[0], self.n_components)) 214 | 215 | if num_fit_now > X.shape[0]: 216 | warnings.warn(("Amount of provided data is less " 217 | "than the desired one to fit PCA. " 218 | "Number of components is {}, " 219 | "desired number of environments is {}, " 220 | "actual number of environments is {}.").format( 221 | self.n_components, num_fit_now, X.shape[0])) 222 | 223 | return X[:num_fit_now] 224 | 225 | def fit(self, X): 226 | 227 | res = super().fit(self.process_input(X)) 228 | self.fitted_ = True 229 | return res 230 | 231 | def fit_transform(self, X): 232 | res = super().fit_transform(self.process_input(X)) 233 | self.fitted_ = True 234 | return res 235 | 236 | def transform(self, X): 237 | if not self.fitted_: 238 | raise NotFittedError( 239 | ("instance of {} is not fitted. " 240 | "It can not transform anything.").format(type(self).__name__)) 241 | return super().transform(X) 242 | 243 | def is_fitted(self): 244 | return self.fitted_ 245 | -------------------------------------------------------------------------------- /nice/blocks/expansioners.py: -------------------------------------------------------------------------------- 1 | from nice.thresholding import get_thresholded_tasks 2 | from nice.nice_utilities import do_partial_expansion, Data, get_sizes 3 | from nice.clebsch_gordan import ClebschGordan, check_clebsch_gordan 4 | import numpy as np 5 | from sklearn.exceptions import NotFittedError 6 | 7 | 8 | class ThresholdExpansioner: 9 | ''' Block to do Clebsch-Gordan iteration. It uses two even-odd pairs of Data instances with covariants 10 | to produce new ones. If first even-odd pair contains covariants of body order v1, and the second v2, body 11 | order of the result would be v1 + v2. 
''' 12 | def __init__(self, num_expand=None, mode="covariants", num_threads=None): 13 | if num_expand is None: 14 | self.num_expand_ = -1 15 | else: 16 | self.num_expand_ = num_expand 17 | 18 | self.mode_ = mode 19 | self.num_threads_ = num_threads 20 | self.fitted_ = False 21 | 22 | def fit(self, 23 | first_even, 24 | first_odd, 25 | second_even, 26 | second_odd, 27 | clebsch_gordan=None): 28 | 29 | self.l_max_ = first_even.covariants_.shape[2] - 1 30 | 31 | if (first_even.importances_ is None) or (first_odd.importances_ is None) \ 32 | or (second_even.importances_ is None) or (second_odd.importances_ is None): 33 | raise ValueError( 34 | "For thresholding importances of features should be specified") 35 | 36 | ( 37 | self.task_even_even_, 38 | self.task_odd_odd_, 39 | self.task_even_odd_, 40 | self.task_odd_even_, 41 | ) = get_thresholded_tasks( 42 | first_even, 43 | first_odd, 44 | second_even, 45 | second_odd, 46 | self.num_expand_, 47 | self.l_max_, 48 | self.mode_, 49 | ) 50 | 51 | if clebsch_gordan is None: 52 | self.clebsch_ = ClebschGordan(self.l_max_) 53 | else: 54 | check_clebsch_gordan(clebsch_gordan, self.l_max_) 55 | self.clebsch_ = clebsch_gordan 56 | 57 | self.new_even_size_ = np.max( 58 | get_sizes(self.l_max_, self.task_even_even_[0], self.mode_) + 59 | get_sizes(self.l_max_, self.task_odd_odd_[0], self.mode_)) 60 | 61 | self.new_odd_size_ = np.max( 62 | get_sizes(self.l_max_, self.task_even_odd_[0], self.mode_) + 63 | get_sizes(self.l_max_, self.task_odd_even_[0], self.mode_)) 64 | 65 | self.new_even_raw_importances_ = np.concatenate( 66 | [self.task_even_even_[1], self.task_odd_odd_[1]], axis=0) 67 | self.new_odd_raw_importances_ = np.concatenate( 68 | [self.task_even_odd_[1], self.task_odd_even_[1]], axis=0) 69 | self.fitted_ = True 70 | 71 | def transform(self, first_even, first_odd, second_even, second_odd): 72 | if not self.fitted_: 73 | raise NotFittedError( 74 | "instance of {} is not fitted. It can not transform anything". 
75 | format(type(self).__name__)) 76 | 77 | if self.mode_ == "covariants": 78 | new_even = np.empty([ 79 | first_even.covariants_.shape[0], 80 | self.new_even_size_, 81 | self.l_max_ + 1, 82 | 2 * self.l_max_ + 1, 83 | ]) 84 | new_odd = np.empty([ 85 | first_even.covariants_.shape[0], 86 | self.new_odd_size_, 87 | self.l_max_ + 1, 88 | 2 * self.l_max_ + 1, 89 | ]) 90 | else: 91 | new_even = np.empty( 92 | [first_even.covariants_.shape[0], self.new_even_size_, 1]) 93 | new_odd = np.empty( 94 | [first_even.covariants_.shape[0], self.new_odd_size_, 1]) 95 | 96 | if self.mode_ == "covariants": 97 | new_even_actual_sizes = np.zeros([self.l_max_ + 1], dtype=np.int32) 98 | new_odd_actual_sizes = np.zeros([self.l_max_ + 1], dtype=np.int32) 99 | else: 100 | new_even_actual_sizes = np.zeros([1], dtype=np.int32) 101 | new_odd_actual_sizes = np.zeros([1], dtype=np.int32) 102 | 103 | do_partial_expansion( 104 | self.clebsch_.precomputed_, 105 | first_even.covariants_, 106 | second_even.covariants_, 107 | self.l_max_, 108 | self.task_even_even_[0], 109 | new_even, 110 | new_even_actual_sizes, 111 | self.mode_, 112 | num_threads=self.num_threads_, 113 | ) 114 | # print(new_even_actual_sizes) 115 | do_partial_expansion( 116 | self.clebsch_.precomputed_, 117 | first_odd.covariants_, 118 | second_odd.covariants_, 119 | self.l_max_, 120 | self.task_odd_odd_[0], 121 | new_even, 122 | new_even_actual_sizes, 123 | self.mode_, 124 | num_threads=self.num_threads_, 125 | ) 126 | # print(new_even_actual_sizes) 127 | do_partial_expansion( 128 | self.clebsch_.precomputed_, 129 | first_even.covariants_, 130 | second_odd.covariants_, 131 | self.l_max_, 132 | self.task_even_odd_[0], 133 | new_odd, 134 | new_odd_actual_sizes, 135 | self.mode_, 136 | num_threads=self.num_threads_, 137 | ) 138 | 139 | do_partial_expansion( 140 | self.clebsch_.precomputed_, 141 | first_odd.covariants_, 142 | second_even.covariants_, 143 | self.l_max_, 144 | self.task_odd_even_[0], 145 | new_odd, 146 | new_odd_actual_sizes, 147 | self.mode_, 148 | num_threads=self.num_threads_, 149 | ) 150 | if self.mode_ == "covariants": 151 | return Data(new_even, 152 | new_even_actual_sizes), Data(new_odd, 153 | new_odd_actual_sizes) 154 | else: 155 | return ( 156 | new_even[:, :new_even_actual_sizes[0], 0], 157 | new_odd[:, :new_odd_actual_sizes[0], 0], 158 | ) 159 | 160 | def is_fitted(self): 161 | return self.fitted_ 162 | -------------------------------------------------------------------------------- /nice/blocks/miscellaneous.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nice.nice_utilities import Data 3 | from sklearn.exceptions import NotFittedError 4 | 5 | 6 | class ParityDefinitionChanger(): 7 | '''Block to change parity definition from even-odd to true-pseudo and vice versa''' 8 | def _init__(self): 9 | self.fitted_ = True 10 | 11 | def is_fitted(self): 12 | return self.fitted_ 13 | 14 | def transform(self, first_data, second_data): 15 | l_max = first_data.covariants_.shape[2] - 1 16 | new_first_sizes, new_second_sizes = [], [] 17 | for lambd in range(l_max + 1): 18 | if (lambd % 2 == 0): 19 | new_first_sizes.append(first_data.actual_sizes_[lambd]) 20 | new_second_sizes.append(second_data.actual_sizes_[lambd]) 21 | else: 22 | new_first_sizes.append(second_data.actual_sizes_[lambd]) 23 | new_second_sizes.append(first_data.actual_sizes_[lambd]) 24 | 25 | new_first_sizes, new_second_sizes = np.array(new_first_sizes, 26 | dtype=np.int32), np.array( 27 | new_second_sizes, 28 | 
dtype=np.int32) 29 | new_first_shape = list(first_data.covariants_.shape) 30 | new_first_shape[1] = np.max(new_first_sizes) 31 | 32 | new_second_shape = list(second_data.covariants_.shape) 33 | new_second_shape[1] = np.max(new_second_sizes) 34 | 35 | new_first_covariants = np.empty(new_first_shape) 36 | new_second_covariants = np.empty(new_second_shape) 37 | 38 | for lambd in range(l_max + 1): # todo may be do copying in parallel 39 | if (lambd % 2 == 0): 40 | new_first_covariants[:, :new_first_sizes[lambd], lambd, :( 41 | 2 * lambd + 42 | 1)] = first_data.covariants_[:, :new_first_sizes[lambd], 43 | lambd, :(2 * lambd + 1)] 44 | new_second_covariants[:, :new_second_sizes[lambd], lambd, :( 45 | 2 * lambd + 46 | 1)] = second_data.covariants_[:, :new_second_sizes[lambd], 47 | lambd, :(2 * lambd + 1)] 48 | else: 49 | new_first_covariants[:, :new_first_sizes[lambd], lambd, :( 50 | 2 * lambd + 51 | 1)] = second_data.covariants_[:, :new_first_sizes[lambd], 52 | lambd, :(2 * lambd + 1)] 53 | new_second_covariants[:, :new_second_sizes[lambd], lambd, :( 54 | 2 * lambd + 55 | 1)] = first_data.covariants_[:, :new_second_sizes[lambd], 56 | lambd, :(2 * lambd + 1)] 57 | 58 | if (first_data.importances_ is None) or (second_data.importances_ is 59 | None): 60 | new_first_importances = None 61 | new_second_importances = None 62 | else: 63 | new_first_importances = np.empty( 64 | [np.max(new_first_sizes), l_max + 1]) 65 | new_second_importances = np.empty( 66 | [np.max(new_second_sizes), l_max + 1]) 67 | 68 | for lambd in range(l_max + 1): 69 | if (lambd % 2 == 0): 70 | new_first_importances[:new_first_sizes[ 71 | lambd], lambd] = first_data.importances_[: 72 | new_first_sizes[ 73 | lambd], 74 | lambd] 75 | new_second_importances[:new_second_sizes[ 76 | lambd], lambd] = second_data.importances_[: 77 | new_second_sizes[ 78 | lambd], 79 | lambd] 80 | else: 81 | new_first_importances[:new_first_sizes[ 82 | lambd], lambd] = second_data.importances_[: 83 | new_first_sizes[ 84 | lambd], 85 | lambd] 86 | new_second_importances[:new_second_sizes[ 87 | lambd], lambd] = first_data.importances_[: 88 | new_second_sizes[ 89 | lambd], 90 | lambd] 91 | 92 | return Data(new_first_covariants, new_first_sizes, new_first_importances), \ 93 | Data(new_second_covariants, new_second_sizes, new_second_importances) 94 | 95 | 96 | class InitialScaler(): 97 | '''Block to scale initial spherical expansion coefficients in a certain way. 
It allows to both 98 | normalize coefficients for each environment individually, and to multiply whole array to single 99 | scaling factor, thus, preserving information about relative scale''' 100 | def __init__(self, mode="signal integral", individually=False): 101 | self.individually_ = individually 102 | 103 | if self.individually_: 104 | self.fitted_ = True 105 | else: 106 | self.fitted_ = False 107 | 108 | self.mode_ = mode 109 | if (self.mode_ != "signal integral") and (self.mode_ != "variance"): 110 | raise ValueError("mode should be ethier " 111 | "\"signal integral\" ethier \"variance\".") 112 | 113 | def _get_variance_multiplier(self, coefficients): 114 | total = 0.0 115 | total_values = 0 116 | 117 | for l in range(coefficients.shape[2]): 118 | if self.individually_: 119 | total += np.sum((coefficients[:, :, l, 0:(2 * l + 1)])**2, 120 | axis=(1, 2)) 121 | total_values += coefficients.shape[1] * (2 * l + 1) 122 | 123 | else: 124 | total += np.sum((coefficients[:, :, l, 0:(2 * l + 1)])**2) 125 | total_values += coefficients.shape[0] * coefficients.shape[ 126 | 1] * (2 * l + 1) 127 | 128 | average = total / total_values 129 | result = 1.0 / np.sqrt(average) 130 | if (self.individually_): 131 | return result[:, np.newaxis, np.newaxis, np.newaxis] 132 | else: 133 | return result 134 | 135 | def _get_signal_integral_multiplier(self, coefficients): 136 | if self.individually_: 137 | result = 1.0 / np.sqrt(np.sum(coefficients[:, :, 0, 0]**2, axis=1)) 138 | return result[:, np.newaxis, np.newaxis, np.newaxis] 139 | else: 140 | return 1.0 / np.sqrt( 141 | np.mean(np.sum(coefficients[:, :, 0, 0]**2, axis=1))) 142 | 143 | def fit(self, coefficients): 144 | if not self.individually_: 145 | if (self.mode_ == "signal integral"): 146 | self.multiplier_ = self._get_signal_integral_multiplier( 147 | coefficients) 148 | 149 | if (self.mode_ == "variance"): 150 | self.multiplier_ = self._get_variance_multiplier(coefficients) 151 | 152 | self.fitted_ = True 153 | 154 | def transform(self, coefficients): 155 | if (not self.fitted_): 156 | raise NotFittedError("instance of {} is not fitted. 
" 157 | "It can not transform anything.".format( 158 | type(self).__name__)) 159 | if (self.individually_): 160 | if (self.mode_ == "signal integral"): 161 | multipliers = self._get_signal_integral_multiplier( 162 | coefficients) 163 | 164 | if (self.mode_ == "variance"): 165 | multipliers = self._get_variance_multiplier(coefficients) 166 | return coefficients * multipliers 167 | else: 168 | return coefficients * self.multiplier_ 169 | 170 | def is_fitted(self): 171 | return self.fitted_ 172 | 173 | 174 | class InitialTransformer(): 175 | '''Utility block to split spherical expansion coefficients stored in the form of single numpy array to 176 | even-odd pair of Data instances''' 177 | def __init__(self): 178 | self.fitted_ = True 179 | 180 | def transform(self, coefficients): 181 | l_max = coefficients.shape[2] - 1 182 | even_coefficients = np.copy(coefficients) 183 | even_coefficients_sizes = [ 184 | coefficients.shape[1] if i % 2 == 0 else 0 185 | for i in range(l_max + 1) 186 | ] 187 | 188 | odd_coefficients = np.copy(coefficients) 189 | odd_coefficients_sizes = [ 190 | coefficients.shape[1] if i % 2 == 1 else 0 191 | for i in range(l_max + 1) 192 | ] 193 | 194 | return Data(even_coefficients, 195 | even_coefficients_sizes), Data(odd_coefficients, 196 | odd_coefficients_sizes) 197 | 198 | def is_fitted(self): 199 | return self.fitted_ 200 | -------------------------------------------------------------------------------- /nice/blocks/purifiers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from nice.nice_utilities import Data 4 | 5 | from nice.packing import unite_parallel, subtract_parallel 6 | from nice.packing import pack_dense, unpack_dense 7 | from parse import parse 8 | import warnings 9 | from sklearn.linear_model import Ridge 10 | from sklearn.base import clone 11 | from sklearn.exceptions import NotFittedError 12 | 13 | DEFAULT_LINEAR_REGRESSOR = Ridge(alpha=1e-12) 14 | 15 | 16 | class InvariantsPurifier: 17 | ''' Block to purify invariants. It operates with numpy 2d arrays containing invariants''' 18 | def __init__(self, regressor=None, num_to_fit='10x', max_take=None): 19 | if (regressor is None): 20 | self.regressor_ = clone(DEFAULT_LINEAR_REGRESSOR) 21 | else: 22 | self.regressor_ = regressor 23 | 24 | self.fitted_ = False 25 | self.num_to_fit_ = num_to_fit 26 | self.max_take_ = max_take 27 | if (type(self.max_take_) == list): 28 | self.max_take_ = np.array(self.max_take_) 29 | if (self.max_take_ 30 | is not None) and (type(self.max_take_) != np.ndarray): 31 | self.max_take_ = int(self.max_take_) 32 | 33 | def fit(self, old_blocks, new_block): 34 | total_num = 0 35 | for i in range(len(old_blocks)): 36 | if (self.max_take_ is None): 37 | total_num += old_blocks[i].shape[1] 38 | else: 39 | if (type(self.max_take_) is int): 40 | total_num += min(old_blocks[i].shape[1], self.max_take_) 41 | else: 42 | total_num += min(old_blocks[i].shape[1], self.max_take_[i]) 43 | 44 | if (type(self.num_to_fit_) is str): 45 | multiplier = int(parse('{}x', self.num_to_fit_)[0]) 46 | num_fit_now = multiplier * total_num 47 | else: 48 | num_fit_now = self.num_to_fit_ 49 | 50 | if (num_fit_now > new_block.shape[0]): 51 | warnings.warn("Amount of provided data is less than " 52 | "the desired one to fit InvariantsPurifer. 
" 53 | "Number of old features is {}, " 54 | "desired number of environments is {}, " 55 | "actual number of environments is {}.".format( 56 | total_num, num_fit_now, new_block.shape[0])) 57 | 58 | if (self.max_take_ is None): 59 | restricted_blocks = [ 60 | old_block[:num_fit_now, :] for old_block in old_blocks 61 | ] 62 | else: 63 | if (type(self.max_take_) is int): 64 | restricted_blocks = [ 65 | old_block[:num_fit_now, :self.max_take_] 66 | for old_block in old_blocks 67 | ] 68 | else: 69 | restricted_blocks = [ 70 | old_blocks[i][:num_fit_now, :self.max_take_[i]] 71 | for i in range(len(old_blocks)) 72 | ] 73 | 74 | old_uniting = unite_parallel(restricted_blocks) 75 | self.regressor_.fit(old_uniting, new_block[:num_fit_now, :]) 76 | 77 | self.fitted_ = True 78 | 79 | def transform(self, old_blocks, new_block): 80 | if (not self.fitted_): 81 | raise NotFittedError( 82 | "instance of {} is not fitted. It can not transform anything". 83 | format(type(self).__name__)) 84 | 85 | if (self.max_take_ is None): 86 | restricted_blocks = [old_block[:, :] for old_block in old_blocks] 87 | else: 88 | if (type(self.max_take_) is int): 89 | restricted_blocks = [ 90 | old_block[:, :self.max_take_] for old_block in old_blocks 91 | ] 92 | else: 93 | restricted_blocks = [ 94 | old_blocks[i][:, :self.max_take_[i]] 95 | for i in range(len(old_blocks)) 96 | ] 97 | 98 | old_uniting = unite_parallel(restricted_blocks) 99 | predictions = self.regressor_.predict(old_uniting) 100 | return subtract_parallel(new_block, predictions) 101 | 102 | def is_fitted(self): 103 | return self.fitted_ 104 | 105 | 106 | class CovariantsIndividualPurifier: 107 | '''Block to purify single covariants lambda channel. It operates with 3 dimensional numpy arrays 108 | with indexing [environmental_index, feature_index, m]''' 109 | def __init__(self, regressor=None, num_to_fit='10x', max_take=None): 110 | if (regressor is None): 111 | self.regressor_ = clone(DEFAULT_LINEAR_REGRESSOR) 112 | self.regressor_.set_params(**{"fit_intercept": False}) 113 | else: 114 | self.regressor_ = regressor 115 | 116 | self.fitted_ = False 117 | self.num_to_fit_ = num_to_fit 118 | self.max_take_ = max_take 119 | if (type(self.max_take_) == list): 120 | self.max_take_ = np.array(self.max_take_) 121 | 122 | if (self.max_take_ 123 | is not None) and (type(self.max_take_) != np.ndarray): 124 | self.max_take_ = int(self.max_take_) 125 | 126 | def fit(self, old_blocks, new_block, l): 127 | total_num = 0 128 | for i in range(len(old_blocks)): 129 | if (self.max_take_ is None): 130 | total_num += old_blocks[i].shape[1] 131 | else: 132 | if (type(self.max_take_) is int): 133 | total_num += min(old_blocks[i].shape[1], self.max_take_) 134 | else: 135 | total_num += min(old_blocks[i].shape[1], self.max_take_[i]) 136 | 137 | if (type(self.num_to_fit_) is str): 138 | multiplier = int(parse('{}x', self.num_to_fit_)[0]) 139 | num_fit_now = multiplier * total_num 140 | else: 141 | num_fit_now = self.num_to_fit_ 142 | 143 | if (num_fit_now > new_block.shape[0] * (l + 1)): 144 | warnings.warn( 145 | "Amount of provided data is less than " 146 | "the desired one to fit InvariantsPurifer. 
" 147 | "Number of old features is {}, " 148 | "desired number of data points is {}, " 149 | "actual number of data points (n_env * (l + 1)) is {}, " 150 | "since number of environments is {}, and l is {}.".format( 151 | total_num, num_fit_now, new_block.shape[0] * (l + 1), 152 | new_block.shape[0], l)) 153 | 154 | if (num_fit_now % (l + 1) == 0): 155 | num_fit_now = num_fit_now // (l + 1) 156 | else: 157 | num_fit_now = (num_fit_now // (l + 1)) + 1 158 | 159 | if (self.max_take_ is None): 160 | old_blocks_reshaped = [] 161 | for old_block in old_blocks: 162 | old_blocks_reshaped.append( 163 | pack_dense(old_block[:num_fit_now], l, old_block.shape[1], 164 | old_block.shape[1])) 165 | else: 166 | if (type(self.max_take_) is int): 167 | size_now = self.max_take_ 168 | else: 169 | size_now = self.max_take_[i] 170 | 171 | old_blocks_reshaped = [] 172 | for old_block in old_blocks: 173 | old_blocks_reshaped.append( 174 | pack_dense( 175 | old_block[:num_fit_now, :min(size_now, old_block. 176 | shape[1])], l, 177 | min(size_now, old_block.shape[1]), 178 | min(size_now, old_block.shape[1]))) 179 | 180 | old_uniting = unite_parallel(old_blocks_reshaped) 181 | new_reshaped = pack_dense(new_block[:num_fit_now], l, 182 | new_block.shape[1], new_block.shape[1]) 183 | self.regressor_.fit(old_uniting, new_reshaped) 184 | self.fitted_ = True 185 | 186 | def transform(self, old_blocks, new_block, l): 187 | if (not self.fitted_): 188 | raise NotFittedError( 189 | "instance of {} is not fitted. It can not transform anything". 190 | format(type(self).__name__)) 191 | 192 | if (self.max_take_ is None): 193 | old_blocks_reshaped = [] 194 | for old_block in old_blocks: 195 | old_blocks_reshaped.append( 196 | pack_dense(old_block, l, old_block.shape[1], 197 | old_block.shape[1])) 198 | else: 199 | if (type(self.max_take_) is int): 200 | size_now = self.max_take_ 201 | else: 202 | size_now = self.max_take_[i] 203 | old_blocks_reshaped = [] 204 | for old_block in old_blocks: 205 | old_blocks_reshaped.append( 206 | pack_dense( 207 | old_block[:, :min(size_now, old_block.shape[1])], l, 208 | min(size_now, old_block.shape[1]), 209 | min(size_now, old_block.shape[1]))) 210 | 211 | old_uniting = unite_parallel(old_blocks_reshaped) 212 | new_reshaped = pack_dense(new_block, l, new_block.shape[1], 213 | new_block.shape[1]) 214 | predictions = self.regressor_.predict(old_uniting) 215 | result = subtract_parallel(new_reshaped, predictions) 216 | return unpack_dense(result, new_block.shape[0], l, new_block.shape[1]) 217 | 218 | def is_fitted(self): 219 | return self.fitted_ 220 | 221 | 222 | class CovariantsPurifier: 223 | '''Block to purify covariants of single parity. 
It operates with instances of Data class with covariants''' 224 | def __init__(self, regressor=None, num_to_fit='10x', max_take=None): 225 | if (regressor is None): 226 | self.regressor_ = clone(DEFAULT_LINEAR_REGRESSOR) 227 | self.regressor_.set_params(**{"fit_intercept": False}) 228 | else: 229 | self.regressor_ = regressor 230 | 231 | self.regressor_.set_params(**{"fit_intercept": False}) 232 | self.fitted_ = False 233 | self.num_to_fit_ = num_to_fit 234 | self.max_take_ = max_take 235 | if (type(self.max_take_) == list): 236 | self.max_take_ = np.array(self.max_take_) 237 | if (self.max_take_ 238 | is not None) and (type(self.max_take_) != np.ndarray): 239 | self.max_take_ = int(self.max_take_) 240 | 241 | def fit(self, old_datas, new_data): 242 | 243 | self.l_max_ = new_data.covariants_.shape[2] - 1 244 | self.purifiers_ = [] 245 | 246 | for l in range(self.l_max_ + 1): 247 | if (self.regressor_ is None): 248 | current_regressor = None 249 | else: 250 | current_regressor = clone(self.regressor_) 251 | self.purifiers_.append( 252 | CovariantsIndividualPurifier(regressor=current_regressor, 253 | num_to_fit=self.num_to_fit_, 254 | max_take=self.max_take_)) 255 | 256 | for l in range(self.l_max_ + 1): 257 | old_blocks_now = [] 258 | for old_data in old_datas: 259 | if (old_data.actual_sizes_[l] > 0): 260 | old_blocks_now.append( 261 | old_data.covariants_[:, :old_data.actual_sizes_[l], 262 | l, :]) 263 | 264 | new_block_now = new_data.covariants_[:, :new_data.actual_sizes_[l], 265 | l, :] 266 | 267 | old_total_size = 0 268 | for old_data in old_datas: 269 | old_total_size += old_data.actual_sizes_[l] 270 | new_size = new_data.actual_sizes_[l] 271 | if (old_total_size == 0) or (new_size == 0): 272 | self.purifiers_[l] = None 273 | else: 274 | self.purifiers_[l].fit(old_blocks_now, new_block_now, l) 275 | 276 | self.fitted_ = True 277 | 278 | def transform(self, old_datas, new_data): 279 | if (not self.fitted_): 280 | raise NotFittedError( 281 | "instance of {} is not fitted. It can not transform anything". 282 | format(type(self).__name__)) 283 | ans = Data(np.empty(new_data.covariants_.shape), 284 | np.copy(new_data.actual_sizes_), 285 | importances=None) 286 | 287 | for l in range(self.l_max_ + 1): 288 | if (self.purifiers_[l] is not None): 289 | old_blocks_now = [ 290 | old_data.covariants_[:, :old_data.actual_sizes_[l], l, :] 291 | for old_data in old_datas 292 | ] 293 | new_block_now = new_data.covariants_[:, :new_data. 294 | actual_sizes_[l], l, :] 295 | now = self.purifiers_[l].transform(old_blocks_now, 296 | new_block_now, l) 297 | ans.covariants_[:, :now.shape[1], 298 | l, :(2 * l + 1)] = now # todo parallelize it 299 | else: 300 | if (ans.actual_sizes_[l] > 0): 301 | ans.covariants_[:, :ans.actual_sizes_[l], l, :(2 * l + 1)] = \ 302 | new_data.covariants_[:, :ans.actual_sizes_[l], l, :(2 * l + 1)] # todo parallelize it 303 | 304 | return ans 305 | 306 | def is_fitted(self): 307 | return self.fitted_ 308 | 309 | 310 | class CovariantsPurifierBoth: 311 | '''Block to purify covariants of both parities. 
It operates with pairs of instances of Data class with covariants''' 312 | def __init__(self, regressor=None, num_to_fit='10x', max_take=None): 313 | self.num_to_fit_ = num_to_fit 314 | self.max_take_ = max_take 315 | if (self.max_take_ 316 | is not None) and (type(self.max_take_) != np.ndarray): 317 | self.max_take_ = int(self.max_take_) 318 | if (regressor is None): 319 | even_regressor, odd_regressor = None, None 320 | else: 321 | even_regressor, odd_regressor = clone(regressor), clone(regressor) 322 | even_regressor.set_params(**{"fit_intercept": False}) 323 | odd_regressor.set_params(**{"fit_intercept": False}) 324 | 325 | self.even_purifier_ = CovariantsPurifier(regressor=even_regressor, 326 | num_to_fit=self.num_to_fit_, 327 | max_take=self.max_take_) 328 | self.odd_purifier_ = CovariantsPurifier(regressor=odd_regressor, 329 | num_to_fit=self.num_to_fit_, 330 | max_take=self.max_take_) 331 | self.fitted_ = False 332 | 333 | def fit(self, old_datas_even, new_data_even, old_datas_odd, new_data_odd): 334 | 335 | self.even_purifier_.fit(old_datas_even, new_data_even) 336 | self.odd_purifier_.fit(old_datas_odd, new_data_odd) 337 | self.fitted_ = True 338 | 339 | def transform(self, old_datas_even, new_data_even, old_datas_odd, 340 | new_data_odd): 341 | if (not self.fitted_): 342 | raise NotFittedError( 343 | "instance of {} is not fitted. It can not transform anything". 344 | format(type(self).__name__)) 345 | return self.even_purifier_.transform(old_datas_even, new_data_even),\ 346 | self.odd_purifier_.transform(old_datas_odd, new_data_odd) 347 | 348 | def is_fitted(self): 349 | return self.fitted_ 350 | -------------------------------------------------------------------------------- /nice/clebsch_gordan.py: -------------------------------------------------------------------------------- 1 | from sympy.physics.wigner import clebsch_gordan 2 | from sympy import S 3 | import numpy as np 4 | 5 | 6 | def get_single(l1, l2, l, m1, m2): 7 | return float(clebsch_gordan(S(l1), S(l2), S(l), S(m1), S(m2), S(m1 + m2))) 8 | 9 | 10 | class ClebschGordan: 11 | def __init__(self, l_max): 12 | self.l_max_ = l_max 13 | self.precomputed_ = np.zeros( 14 | [l_max + 1, l_max + 1, l_max + 1, 2 * l_max + 1, 2 * l_max + 1]) 15 | 16 | for l1 in range(l_max + 1): 17 | for l2 in range(l_max + 1): 18 | for l in range(l_max + 1): 19 | for m1 in range(-l_max, l_max + 1): 20 | for m2 in range(-l_max, l_max + 1): 21 | now = get_single(l1, l2, l, m1, m2) 22 | self.precomputed_[l1, l2, l, m1 + l1, 23 | m2 + l2] = now 24 | 25 | 26 | def check_clebsch_gordan(clebsch_gordan, required_l_max): 27 | if (type(clebsch_gordan) != ClebschGordan): 28 | raise ValueError("type of precomputed clebsch gordan " 29 | "coefficients should be ClebschGordan class.") 30 | if (clebsch_gordan.l_max_ < required_l_max): 31 | raise ValueError("given precomputed clebsch gordan coefficients " 32 | "have smaller l_max than required one.") 33 | -------------------------------------------------------------------------------- /nice/contracted_pca.py: -------------------------------------------------------------------------------- 1 | from sklearn.utils.extmath import randomized_svd 2 | import numpy as np 3 | 4 | 5 | def do_sign_covariant_pca(X, n_components): 6 | sums = np.sum(X, axis=1) 7 | signs = ((sums <= 0) - 0.5) * 2.0 8 | X_normalized = signs[:, np.newaxis] * X 9 | U, S, V = randomized_svd(X_normalized, 10 | n_components=n_components, 11 | flip_sign=True) 12 | return U * signs[:, np.newaxis] 13 | 14 | 15 | def do_pca_step(features, 
n_components, normalize=True, epsilon=1e-8): 16 | shape_initial = features.shape 17 | features = np.transpose(features, axes=(0, 2, 3, 1)) 18 | features = np.reshape(features, [-1, features.shape[-1]]) 19 | #features = np.vstack([np.real(features), np.imag(features)]) 20 | 21 | if (normalize): 22 | stds = np.sqrt(np.mean(features * features, axis=0)) 23 | stds = np.maximum(stds, epsilon) 24 | features = features / stds[np.newaxis, :] 25 | 26 | features = do_sign_covariant_pca(features, n_components) 27 | #features = features[0:(features.shape[0] // 2)] + 1j * features[(features.shape[0] // 2):] 28 | features = np.reshape(features, [ 29 | shape_initial[0], shape_initial[2], shape_initial[3], 30 | features.shape[-1] 31 | ]) 32 | features = np.transpose(features, axes=(0, 3, 1, 2)) 33 | return features 34 | -------------------------------------------------------------------------------- /nice/nice_utilities.pxd: -------------------------------------------------------------------------------- 1 | cdef void single_contraction(const double[:, :, :, :, :] clebsh_gordan, 2 | double* first_covariant, int l1, 3 | double* second_covariant, int l2, 4 | int lambd, double* ans_placeholder, 5 | double** buff) nogil 6 | 7 | 8 | cdef int min_c(int a, int b) nogil 9 | cdef int max_c(int a, int b) nogil 10 | 11 | cdef int abs_c(int a) nogil 12 | 13 | 14 | -------------------------------------------------------------------------------- /nice/packing.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import os 3 | import numpy as np 4 | from cython.parallel import prange 5 | from multiprocessing import cpu_count 6 | 7 | cdef int switch_to_parallel_after = 36000000 8 | 9 | 10 | 11 | @cython.boundscheck(False) 12 | @cython.wraparound(False) 13 | cpdef pack_dense(double[:, :, :] covariants, int l, 14 | int n_feat, int desired_n_feat, num_threads = None): 15 | cdef int num_threads_int 16 | if (num_threads is None): 17 | num_threads_int = len(os.sched_getaffinity(0)) 18 | else: 19 | num_threads_int = num_threads 20 | 21 | cdef int n_envs = covariants.shape[0] 22 | cdef int num_per_feat = (l + 1) 23 | res = np.empty([n_envs * (2 * l + 1), desired_n_feat]) 24 | cdef double[:, :] res_view = res 25 | cdef int env_ind, feat_ind, now, m 26 | 27 | if (n_feat * (2 * l + 1) * n_envs) > switch_to_parallel_after: 28 | for env_ind in prange(n_envs, nogil = True, schedule = 'static', num_threads = num_threads_int): 29 | for feat_ind in range(n_feat): 30 | for m in range(2 * l + 1): 31 | res_view[m + env_ind * (2 * l + 1), feat_ind] = covariants[env_ind, feat_ind, m] 32 | 33 | for env_ind in prange(n_envs, nogil = True, schedule = 'static', num_threads = num_threads_int): 34 | for feat_ind in range(n_feat, desired_n_feat): 35 | for m in range(2 * l + 1): 36 | res_view[m + env_ind * (2 * l + 1), feat_ind] = 0.0 37 | 38 | else: 39 | for env_ind in range(n_envs): 40 | for feat_ind in range(n_feat): 41 | for m in range(2 * l + 1): 42 | res_view[m + env_ind * (2 * l + 1), feat_ind] = covariants[env_ind, feat_ind, m] 43 | 44 | for env_ind in range(n_envs): 45 | for feat_ind in range(n_feat, desired_n_feat): 46 | for m in range(2 * l + 1): 47 | res_view[m + env_ind * (2 * l + 1), feat_ind] = 0.0 48 | '''for feat_ind in prange(n_feat, nogil = True, schedule = 'static', num_threads = num_threads_int): 49 | now = 0 50 | for env_ind in range(n_envs): 51 | for m in range(2 * l + 1): 52 | res_view[now, feat_ind] = covariants[env_ind, feat_ind, m] 53 | now = now + 1 54 | 55 | for 
feat_ind in prange(n_feat, desired_n_feat, nogil = True, schedule = 'static', num_threads = num_threads_int): 56 | now = 0 57 | for env_ind in range(n_envs): 58 | for m in range(2 * l + 1): 59 | res_view[now, feat_ind] = 0.0 60 | now = now + 1''' 61 | 62 | return res 63 | 64 | '''@cython.boundscheck(False) 65 | @cython.wraparound(False) 66 | cdef transform_inplace(double[:, :, :] covariants, double[:, :] components, 67 | int l, int n_feat): 68 | cdef int n_envs = covariants.shape[0] 69 | res = np.zeros([n_envs, components.shape[0], 2 * l + 1]) 70 | cdef double[:, :, :] res_view = res 71 | cdef int feat_ind, env_ind, m, i 72 | 73 | for env_ind in range(n_envs): 74 | for feat_ind in range(components.shape[0]): 75 | for m in range(2 * l + 1): 76 | for i in range(n_feat): 77 | res_view[env_ind, feat_ind, m] += components[feat_ind, i] * covariants[env_ind, i, m] 78 | return res''' 79 | 80 | @cython.boundscheck(False) 81 | @cython.wraparound(False) 82 | cpdef unpack_dense(double[:, :] packed, int n_envs, int l, int n_feat, num_threads = None): 83 | cdef int num_threads_int 84 | if (num_threads is None): 85 | num_threads_int = len(os.sched_getaffinity(0)) 86 | else: 87 | num_threads_int = num_threads 88 | 89 | res = np.empty([n_envs, n_feat, 2 * l + 1]) 90 | cdef double[:, :, :] res_view = res 91 | cdef int feat_ind, now, env_ind, m 92 | 93 | '''for feat_ind in prange(n_feat, nogil = True, schedule = 'static', num_threads = num_threads_int): 94 | now = 0 95 | for env_ind in range(n_envs): 96 | for m in range(2 * l + 1): 97 | res_view[env_ind, feat_ind, m] = packed[now, feat_ind] 98 | now = now + 1''' 99 | if (n_feat * (2 * l + 1) * n_envs) > switch_to_parallel_after: 100 | for env_ind in prange(n_envs, nogil = True, schedule = 'static', num_threads = num_threads_int): 101 | for feat_ind in range(n_feat): 102 | for m in range(2 * l + 1): 103 | res_view[env_ind, feat_ind, m] = packed[m + env_ind * (2 * l + 1), feat_ind] 104 | 105 | else: 106 | for env_ind in range(n_envs): 107 | for feat_ind in range(n_feat): 108 | for m in range(2 * l + 1): 109 | res_view[env_ind, feat_ind, m] = packed[m + env_ind * (2 * l + 1), feat_ind] 110 | return res 111 | 112 | @cython.boundscheck(False) 113 | @cython.wraparound(False) 114 | cpdef copy_parallel(double[:, :] source, double[:, :] destination, num_threads = None): 115 | cdef int num_threads_int 116 | if (num_threads is None): 117 | num_threads_int = len(os.sched_getaffinity(0)) 118 | else: 119 | num_threads_int = num_threads 120 | 121 | cdef int env_ind, feat_ind 122 | cdef int n_feat = source.shape[1] 123 | if (source.shape[0] * source.shape[1] > switch_to_parallel_after): 124 | for env_ind in prange(source.shape[0], nogil = True, schedule = 'static', num_threads = num_threads_int): 125 | for feat_ind in range(n_feat): 126 | destination[env_ind, feat_ind] = source[env_ind, feat_ind] 127 | else: 128 | for env_ind in range(source.shape[0]): 129 | for feat_ind in range(n_feat): 130 | destination[env_ind, feat_ind] = source[env_ind, feat_ind] 131 | 132 | 133 | def unite_parallel(blocks, num_threads = None): 134 | total_size = 0 135 | for block in blocks: 136 | total_size += block.shape[1] 137 | res = np.empty([blocks[0].shape[0], total_size]) 138 | now = 0 139 | for block in blocks: 140 | copy_parallel(block, res[:, now : now + block.shape[1]], num_threads = num_threads) 141 | now += block.shape[1] 142 | return res 143 | 144 | @cython.boundscheck(False) 145 | @cython.wraparound(False) 146 | cpdef subtract_parallel(double[:, :] a, double[:, :] b, num_threads = 
None): 147 | result = np.empty([a.shape[0], a.shape[1]]) 148 | cdef double[:, :] result_view = result 149 | 150 | cdef int num_threads_int 151 | if (num_threads is None): 152 | num_threads_int = len(os.sched_getaffinity(0)) 153 | else: 154 | num_threads_int = num_threads 155 | 156 | cdef int env_ind, feat_ind 157 | cdef int n_feat = a.shape[1] 158 | if (a.shape[0] * a.shape[1] > switch_to_parallel_after): 159 | for env_ind in prange(a.shape[0], nogil = True, schedule = 'static', num_threads = num_threads_int): 160 | for feat_ind in range(n_feat): 161 | result_view[env_ind, feat_ind] = a[env_ind, feat_ind] - b[env_ind, feat_ind] 162 | else: 163 | for env_ind in range(a.shape[0]): 164 | for feat_ind in range(n_feat): 165 | result_view[env_ind, feat_ind] = a[env_ind, feat_ind] - b[env_ind, feat_ind] 166 | 167 | return result 168 | 169 | 170 | '''@cython.boundscheck(False) 171 | @cython.wraparound(False) 172 | cpdef accumulate(double[:, :] values, int[:] structure_indices, int central_now, 173 | double[:, :] ans): 174 | 175 | cdef int env_ind, feat_ind, n_feat = values.shape[1] 176 | cdef int now = 0 177 | for env_ind in range(values.shape[0]): 178 | for feat_ind in range(n_feat): 179 | ans[structure_indices[env_ind], feat_ind] += values[env_ind, feat_ind] 180 | 181 | def accumulate_to_structures(structures, values): 182 | all_species = [] 183 | for structure in structures: 184 | all_species.append(structure.get_atomic_numbers()) 185 | all_species = np.concatenate(all_species, axis = 0) 186 | species = np.unique(all_species) 187 | all_species = all_species.astype(np.int32) 188 | species = species.astype(np.int32) 189 | 190 | result = {} 191 | for specie in tqdm.tqdm(species): 192 | num_now = np.sum(all_species == specie) 193 | result[specie] = np.empty([num_now, coefficients.shape[1], coefficients.shape[2], coefficients.shape[3]]) 194 | copy_coefs(coefficients, all_species, specie, result[specie]) 195 | return result ''' 196 | -------------------------------------------------------------------------------- /nice/rascal_coefficients.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport cython 3 | from nice_utilities cimport single_contraction, min_c, abs_c, max_c 4 | from libc.math cimport sin, M_PI, sqrt, fmax 5 | import tqdm 6 | import rascal 7 | import os 8 | from ase import Atoms 9 | from rascal.representations import SphericalInvariants as SOAP 10 | from rascal.representations import SphericalExpansion as SPH 11 | from rascal.neighbourlist.structure_manager import ( 12 | mask_center_atoms_by_species, mask_center_atoms_by_id) 13 | import warnings 14 | import copy 15 | from multiprocessing import Pool, cpu_count 16 | 17 | 18 | @cython.boundscheck(False) 19 | @cython.wraparound(False) 20 | cpdef void copy_coefs(double[:, :, :, :] coefficients, int[:] central_species, int central_now, 21 | double[:, :, :, :] ans): 22 | cdef int now = 0 23 | cdef int n_radial = coefficients.shape[1] 24 | cdef int l_max = coefficients.shape[2] - 1 25 | cdef int env_ind, radial_ind, l, m 26 | 27 | 28 | for env_ind in range(coefficients.shape[0]): 29 | if central_species[env_ind] == central_now: 30 | for radial_ind in range(n_radial): 31 | for l in range(l_max + 1): 32 | for m in range(2 * l_max + 1): 33 | ans[now, radial_ind, l, m] = coefficients[env_ind, radial_ind, l, m] 34 | now += 1 35 | 36 | 37 | def split_by_central_specie(all_species, species, coefficients, show_progress = True): 38 | result = {} 39 | for specie in tqdm.tqdm(species, 
disable = not show_progress): 40 | num_now = np.sum(all_species == specie) 41 | result[specie] = np.empty([num_now, coefficients.shape[1], coefficients.shape[2], coefficients.shape[3]]) 42 | copy_coefs(coefficients, all_species, specie, result[specie]) 43 | return result 44 | 45 | 46 | 47 | @cython.boundscheck(False) 48 | @cython.wraparound(False) 49 | cpdef convert_rascal_coefficients(double[:, :] coefficients, int n_max, int n_types, int l_max): 50 | cdef int n_envs = coefficients.shape[0] 51 | cdef int env_ind, n, l, m 52 | cdef int n_radial = n_max * n_types 53 | cdef int now 54 | ans = np.zeros([n_envs, n_radial, l_max + 1, 2 * l_max + 1]) 55 | cdef double[:, :, :, :] ans_view = ans 56 | 57 | for env_ind in range(n_envs): 58 | now = 0 59 | for n in range(n_radial): 60 | for l in range(l_max + 1): 61 | for m in range(-l, l + 1): 62 | ans_view[env_ind, n, l, m + l] = coefficients[env_ind, now] 63 | now += 1 64 | return ans 65 | 66 | 67 | 68 | def process_structures(structures, delta = 0.1): 69 | """Satisfying librascal desire of having all atoms 70 | inside the cell even if structure is not periodic. 71 | (changes only non periodic structures) 72 | """ 73 | 74 | result = [] 75 | for structure in structures: 76 | if True in structure.pbc: 77 | result.append(copy.deepcopy(structure)) 78 | else: 79 | current = copy.deepcopy(structure) 80 | for dim in range(3): 81 | min_now = np.min( current.positions[:, dim]) 82 | current.positions[:, dim] = current.positions[:, dim] - min_now + delta 83 | 84 | spreads = [] 85 | for dim in range(3): 86 | spreads.append(np.max(current.positions[:, dim]) + delta) 87 | current.cell = spreads 88 | result.append(current) 89 | return result 90 | 91 | 92 | def get_rascal_coefficients(structures, HYPERS, n_types): 93 | 94 | 95 | sph = SPH(**HYPERS) 96 | try: 97 | n_max = HYPERS['max_radial'] 98 | l_max = HYPERS['max_angular'] 99 | except KeyError: 100 | raise KeyError("max_radial and max_angular should be specified") 101 | 102 | structures = process_structures(structures) 103 | 104 | feat = sph.transform(structures).get_features(sph) 105 | res = convert_rascal_coefficients(feat, n_max, n_types, l_max) 106 | 107 | #if (normalize): 108 | # normalize_by_ps(res) 109 | return np.array(res) 110 | 111 | 112 | def get_rascal_coefficients_stared(task): 113 | return get_rascal_coefficients(*task) 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /nice/thresholding.pyx: -------------------------------------------------------------------------------- 1 | from libc.math cimport sin, M_PI, sqrt, fmax 2 | cimport cython 3 | import numpy as np 4 | #from cython.parallel cimport prange 5 | from nice_utilities cimport min_c, abs_c, max_c 6 | cdef double sqrt_2 = sqrt(2.0) 7 | #from nice_utilities import Data 8 | 9 | cdef enum Mode: 10 | covariants, invariants 11 | 12 | 13 | 14 | cdef get_thresholded_task(double[:, :] first_importances, int[:] first_actual_sizes, 15 | double[:, :] second_importances, int[:] second_actual_sizes, 16 | double threshold, int known_num, int l_max, Mode mode): 17 | if mode == Mode.covariants: 18 | return get_thresholded_task_covariants(first_importances, first_actual_sizes, 19 | second_importances, second_actual_sizes, 20 | threshold, known_num, l_max) 21 | if mode == Mode.invariants: 22 | return get_thresholded_task_invariants(first_importances, first_actual_sizes, 23 | second_importances, second_actual_sizes, 24 | threshold, known_num, l_max) 25 | 26 | cdef get_thresholded_task_invariants(double[:, 
:] first_importances, int[:] first_actual_sizes, 27 | double[:, :] second_importances, int[:] second_actual_sizes, 28 | double threshold, int known_num, int l_max): 29 | 30 | ans = np.empty([known_num, 4], dtype = np.int32) 31 | 32 | raw_importances = np.empty([known_num]) 33 | 34 | cdef int[:, :] ans_view = ans 35 | 36 | cdef int l, first_ind, second_ind, lambd 37 | cdef int pos = 0 38 | 39 | for l in range(l_max + 1): 40 | for first_ind in range(first_actual_sizes[l]): 41 | for second_ind in range(second_actual_sizes[l]): 42 | if (first_importances[first_ind, l] * second_importances[second_ind, l] >= threshold): 43 | ans_view[pos, 0] = first_ind 44 | ans_view[pos, 1] = l 45 | ans_view[pos, 2] = second_ind 46 | ans_view[pos, 3] = l 47 | raw_importances[pos] = first_importances[first_ind, l] * second_importances[second_ind, l] 48 | pos += 1 49 | 50 | return [ans[:pos], raw_importances[:pos]] 51 | 52 | cdef get_thresholded_task_covariants(double[:, :] first_importances, int[:] first_actual_sizes, 53 | double[:, :] second_importances, int[:] second_actual_sizes, 54 | double threshold, int known_num, int l_max): 55 | 56 | ans = np.empty([known_num, 4], dtype = np.int32) 57 | 58 | raw_importances = np.empty([known_num]) 59 | 60 | cdef int[:, :] ans_view = ans 61 | 62 | cdef int l1, l2, first_ind, second_ind, lambd 63 | cdef int pos = 0 64 | 65 | for l1 in range(l_max + 1): 66 | for l2 in range(l_max + 1): 67 | for first_ind in range(first_actual_sizes[l1]): 68 | for second_ind in range(second_actual_sizes[l2]): 69 | if (first_importances[first_ind, l1] * second_importances[second_ind, l2] >= threshold): 70 | ans_view[pos, 0] = first_ind 71 | ans_view[pos, 1] = l1 72 | ans_view[pos, 2] = second_ind 73 | ans_view[pos, 3] = l2 74 | raw_importances[pos] = first_importances[first_ind, l1] * second_importances[second_ind, l2] 75 | pos += 1 76 | 77 | return [ans[:pos], raw_importances[:pos]] 78 | 79 | 80 | cpdef get_thresholded_tasks(first_even, first_odd, second_even, second_odd, int desired_num, int l_max, mode_string): 81 | 82 | cdef Mode mode 83 | if mode_string == 'covariants': 84 | mode = Mode.covariants 85 | if mode_string == 'invariants': 86 | mode = Mode.invariants 87 | 88 | cdef double threshold_even 89 | cdef int num_even_even, num_odd_odd 90 | threshold_even, num_even_even, num_odd_odd = get_threshold(first_even.importances_, first_even.actual_sizes_, 91 | second_even.importances_, second_even.actual_sizes_, 92 | first_odd.importances_, first_odd.actual_sizes_, 93 | second_odd.importances_, second_odd.actual_sizes_, 94 | desired_num, mode) 95 | 96 | cdef double threshold_odd 97 | cdef int num_even_odd, num_odd_even 98 | threshold_odd, num_even_odd, num_odd_even = get_threshold(first_even.importances_, first_even.actual_sizes_, 99 | second_odd.importances_, second_odd.actual_sizes_, 100 | first_odd.importances_, first_odd.actual_sizes_, 101 | second_even.importances_, second_even.actual_sizes_, 102 | desired_num, mode) 103 | 104 | 105 | 106 | task_even_even = get_thresholded_task(first_even.importances_, first_even.actual_sizes_, 107 | second_even.importances_, second_even.actual_sizes_, 108 | threshold_even, num_even_even, l_max, mode) 109 | 110 | task_odd_odd = get_thresholded_task(first_odd.importances_, first_odd.actual_sizes_, 111 | second_odd.importances_, second_odd.actual_sizes_, 112 | threshold_even, num_odd_odd, l_max, mode) 113 | 114 | task_even_odd = get_thresholded_task(first_even.importances_, first_even.actual_sizes_, 115 | second_odd.importances_, second_odd.actual_sizes_, 
116 | threshold_odd, num_even_odd, l_max, mode) 117 | 118 | task_odd_even = get_thresholded_task(first_odd.importances_, first_odd.actual_sizes_, 119 | second_even.importances_, second_even.actual_sizes_, 120 | threshold_odd, num_odd_even, l_max, mode) 121 | 122 | return task_even_even, task_odd_odd, task_even_odd, task_odd_even 123 | 124 | 125 | 126 | cdef get_threshold(double[:, :] first_importances_1, int[:] first_actual_sizes_1, 127 | double[:, :] second_importances_1, int[:] second_actual_sizes_1, 128 | double[:, :] first_importances_2, int[:] first_actual_sizes_2, 129 | double[:, :] second_importances_2, int[:] second_actual_sizes_2, 130 | int desired_num, Mode mode, int min_iterations = 50): 131 | 132 | 133 | if (desired_num == -1): 134 | num_1_1 = get_total_num_full(first_importances_1, first_actual_sizes_1, second_importances_1, second_actual_sizes_1, -1.0, mode) 135 | num_2_2 = get_total_num_full(first_importances_2, first_actual_sizes_2, second_importances_2, second_actual_sizes_2, -1.0, mode) 136 | return -1.0, num_1_1, num_2_2 137 | 138 | cdef double left = -1.0 139 | cdef double first = get_upper_threshold(first_importances_1, first_actual_sizes_1, second_importances_1, second_actual_sizes_1, mode) + 1.0 140 | cdef double second = get_upper_threshold(first_importances_2, first_actual_sizes_2, second_importances_2, second_actual_sizes_2, mode) + 1.0 141 | 142 | cdef double right = fmax(first, second) 143 | cdef double middle = (left + right) / 2.0 144 | cdef int num_now, num_previous = -1 145 | cdef int num_it_no_change = 0 146 | while (True): 147 | middle = (left + right) / 2.0 148 | num_now = get_total_num_full(first_importances_1, first_actual_sizes_1, second_importances_1, second_actual_sizes_1, middle, mode) + get_total_num_full(first_importances_2, first_actual_sizes_2, second_importances_2, second_actual_sizes_2, middle, mode) 149 | 150 | if (num_now == desired_num): 151 | left = middle 152 | break 153 | if (num_now > desired_num): 154 | left = middle 155 | if (num_now < desired_num): 156 | right = middle 157 | 158 | if (num_now == num_previous): 159 | num_it_no_change += 1 160 | if (num_it_no_change > min_iterations): 161 | break 162 | else: 163 | num_it_no_change = 0 164 | num_previous = num_now 165 | 166 | num_1_1 = get_total_num_full(first_importances_1, first_actual_sizes_1, second_importances_1, second_actual_sizes_1, left, mode) 167 | num_2_2 = get_total_num_full(first_importances_2, first_actual_sizes_2, second_importances_2, second_actual_sizes_2, left, mode) 168 | return left, num_1_1, num_2_2 169 | 170 | 171 | cdef double get_upper_threshold(double[:, :] first_importances, int[:] first_actual_sizes, 172 | double[:, :] second_importances, int[:] second_actual_sizes, Mode mode): 173 | if mode == Mode.covariants: 174 | return get_upper_threshold_covariants(first_importances, first_actual_sizes, 175 | second_importances, second_actual_sizes) 176 | if mode == Mode.invariants: 177 | return get_upper_threshold_invariants(first_importances, first_actual_sizes, 178 | second_importances, second_actual_sizes) 179 | 180 | 181 | 182 | cdef double get_upper_threshold_invariants(double[:, :] first_importances, int[:] first_actual_sizes, 183 | double[:, :] second_importances, int[:] second_actual_sizes): 184 | cdef double ans = 0.0 185 | cdef int l 186 | 187 | for l in range(min_c(first_importances.shape[1], second_importances.shape[1])): 188 | if (first_actual_sizes[l] > 0) and (second_actual_sizes[l] > 0): 189 | if (first_importances[0, l] * second_importances[0, l] > 
ans): 190 | ans = first_importances[0, l] * second_importances[0, l] 191 | 192 | return ans 193 | 194 | 195 | cdef double get_upper_threshold_covariants(double[:, :] first_importances, int[:] first_actual_sizes, 196 | double[:, :] second_importances, int[:] second_actual_sizes): 197 | cdef double ans = 0.0 198 | cdef int l1, l2 199 | 200 | cdef int second_size = second_importances.shape[1] 201 | for l1 in range(first_importances.shape[1]): 202 | for l2 in range(second_size): 203 | if (first_actual_sizes[l1] > 0) and (second_actual_sizes[l2] > 0): 204 | if (first_importances[0, l1] * second_importances[0, l2] > ans): 205 | ans = first_importances[0, l1] * second_importances[0, l2] 206 | 207 | return ans 208 | 209 | 210 | cdef int get_total_num_full(double[:, :] first_importances, int[:] first_actual_sizes, 211 | double[:, :] second_importances, int[:] second_actual_sizes, 212 | double threshold, Mode mode): 213 | if mode == Mode.covariants: 214 | return get_total_num_full_covariants(first_importances, first_actual_sizes, 215 | second_importances, second_actual_sizes, 216 | threshold) 217 | if mode == Mode.invariants: 218 | return get_total_num_full_invariants(first_importances, first_actual_sizes, 219 | second_importances, second_actual_sizes, 220 | threshold) 221 | 222 | cdef int get_total_num_full_invariants(double[:, :] first_importances, int[:] first_actual_sizes, 223 | double[:, :] second_importances, int[:] second_actual_sizes, 224 | double threshold): 225 | cdef int l 226 | cdef int second_size = second_importances.shape[1] 227 | cdef int res = 0 228 | for l in range(min_c(first_importances.shape[1], second_importances.shape[1])): 229 | if (first_actual_sizes[l] > 0) and (second_actual_sizes[l] > 0): 230 | res += get_total_num(first_importances[:first_actual_sizes[l], l], 231 | second_importances[:second_actual_sizes[l], l], threshold) 232 | return res 233 | 234 | 235 | 236 | cdef int get_total_num_full_covariants(double[:, :] first_importances, int[:] first_actual_sizes, 237 | double[:, :] second_importances, int[:] second_actual_sizes, 238 | double threshold): 239 | cdef int l1, l2 240 | cdef int second_size = second_importances.shape[1] 241 | cdef int res = 0 242 | for l1 in range(first_importances.shape[1]): 243 | for l2 in range(second_size): 244 | if (first_actual_sizes[l1] > 0) and (second_actual_sizes[l2] > 0): 245 | res += get_total_num(first_importances[:first_actual_sizes[l1], l1], 246 | second_importances[:second_actual_sizes[l2], l2], threshold) 247 | 248 | return res 249 | 250 | cdef int get_total_num(double[:] a, double[:] b, double threshold): 251 | cdef int b_size = b.shape[0] 252 | cdef int i, j, ans 253 | i = 0 254 | j = b_size 255 | ans = 0 256 | for i in range(a.shape[0]): 257 | while ((j > 0) and (a[i] * b[j - 1] < threshold)): 258 | j -= 1 259 | ans += j 260 | return ans 261 | -------------------------------------------------------------------------------- /nice/unrolling_individual_pca.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import numpy as np 3 | from nice.packing import pack_dense, unpack_dense 4 | 5 | from sklearn.decomposition import TruncatedSVD #not center the data 6 | class UnrollingIndividualPCA(TruncatedSVD): 7 | def __init__(self, *args, normalize_importances = True, **kwargs): 8 | self.normalize_importances_ = normalize_importances 9 | super().__init__(*args, **kwargs) 10 | 11 | def fit(self, *args): 12 | if (len(args) == 1): 13 | return super().fit(args[0]) 14 | #print("num 
components: ", self.n_components) 15 | covariants, l = args 16 | n_feat = covariants.shape[1] 17 | if (self.n_components > n_feat): 18 | #print("in if: ", self.n_components, n_feat) 19 | self.n_components = n_feat 20 | 21 | self.l_ = l 22 | if (self.n_components < n_feat): 23 | packed = pack_dense(covariants, l, n_feat, n_feat) 24 | if (self.n_components == n_feat): 25 | packed = pack_dense(covariants, l, n_feat, n_feat + 1) 26 | res = super().fit_transform(packed) 27 | 28 | self.importances_ = np.mean(res * res, axis = 0) 29 | if (self.normalize_importances_): 30 | self.importances_ = self.importances_ / np.sum(self.importances_) 31 | indices = np.argsort(self.importances_)[::-1] 32 | self.importances_ = self.importances_[indices] 33 | self.components_ = self.components_[indices] 34 | self.explained_variance_ = self.explained_variance_[indices] 35 | self.explained_variance_ratio_ = self.explained_variance_ratio_[indices] 36 | self.singular_values_ = self.singular_values_[indices] 37 | 38 | def fit_transform(self, *args): 39 | if (len(args) ==1): 40 | return super().fit_transform(args[0]) 41 | covariants, l = args 42 | n_feat = covariants.shape[1] 43 | #print("num components: ", self.n_components) 44 | if (self.n_components > n_feat): 45 | #print("in if: ", self.n_components, n_feat) 46 | self.n_components = n_feat 47 | 48 | self.l_ = l 49 | if (self.n_components < n_feat): 50 | packed = pack_dense(covariants, l, n_feat, n_feat) 51 | if (self.n_components == n_feat): 52 | packed = pack_dense(covariants, l, n_feat, n_feat + 1) 53 | 54 | res = super().fit_transform(packed) 55 | self.importances_ = np.mean(res * res, axis = 0) 56 | if (self.normalize_importances_): 57 | self.importances_ = self.importances_ / np.sum(self.importances_) 58 | indices = np.argsort(self.importances_)[::-1] 59 | self.importances_ = self.importances_[indices] 60 | self.components_ = self.components_[indices] 61 | 62 | res = super().transform(packed) 63 | return unpack_dense(res, covariants.shape[0], 64 | self.l_, self.n_components) 65 | 66 | 67 | def transform(self, *args): 68 | 69 | 70 | if (len(args) == 1): 71 | return super().transform(args) 72 | #print("components shape: ", self.components_.shape) 73 | #print("num components: ", self.n_components) 74 | covariants, l = args 75 | n_feat = covariants.shape[1] 76 | 77 | if (self.n_components < n_feat): 78 | packed = pack_dense(covariants, l, n_feat, n_feat) 79 | if (self.n_components == n_feat): 80 | packed = pack_dense(covariants, l, n_feat, n_feat + 1) 81 | res = super().transform(packed) 82 | return unpack_dense(res, covariants.shape[0], 83 | self.l_, self.n_components) 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /nice/unrolling_pca.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import numpy as np 3 | from libc.math cimport fabs 4 | 5 | @cython.boundscheck(False) 6 | @cython.wraparound(False) 7 | cpdef pack_dense(double[:, :, :, :] coefficients): 8 | cdef int n_envs = coefficients.shape[0] 9 | cdef int n_feat = coefficients.shape[1] 10 | cdef int l_max = coefficients.shape[2] - 1 11 | 12 | 13 | 14 | cdef int num_per_feat = (l_max + 1) * (l_max + 1) 15 | res = np.zeros([num_per_feat * n_envs, n_feat]) 16 | cdef double[:, :] res_view = res 17 | 18 | cdef int env_ind, feat_ind, now, l, m 19 | for feat_ind in range(n_feat): 20 | now = 0 21 | for env_ind in range(n_envs): 22 | for l in range(l_max + 1): 23 | for m in range(2 * l + 1): 24 | 
res_view[now, feat_ind] = coefficients[env_ind, feat_ind, l, m] 25 | now += 1 26 | 27 | return res 28 | 29 | cpdef unpack_dense(double[:, :] packed, int n_envs, int l_max): 30 | cdef int n_feat = packed.shape[1] 31 | 32 | res = np.zeros([n_envs, n_feat, l_max + 1, 2 * l_max + 1]) 33 | cdef double[:, :, :, :] res_view = res 34 | cdef int feat_ind, now, env_ind, l, m 35 | 36 | for feat_ind in range(n_feat): 37 | now = 0 38 | for env_ind in range(n_envs): 39 | for l in range(l_max + 1): 40 | for m in range(2 * l + 1): 41 | res_view[env_ind, feat_ind, l, m] = packed[now, feat_ind] 42 | now += 1 43 | return res 44 | 45 | cpdef get_signs(double[:, :] ar, epsilon = 1e-10): 46 | res = np.zeros([ar.shape[0]]) 47 | cdef double[:] res_view = res 48 | cdef int n_feat = ar.shape[1] 49 | cdef int i, j 50 | cdef double max_absolute_now 51 | for i in range(ar.shape[0]): 52 | max_absolute_now = ar[i, 0] 53 | for j in range(n_feat): 54 | if (fabs(ar[i, j]) > fabs(max_absolute_now)): 55 | max_absolute_now = ar[i, j] 56 | 57 | if (max_absolute_now > epsilon): 58 | res_view[i] = 1.0 59 | if (max_absolute_now < epsilon): 60 | res_view[i] = -1.0 61 | 62 | return res 63 | 64 | 65 | from sklearn.decomposition import TruncatedSVD #not center the data 66 | class UnrollingPCA(TruncatedSVD): 67 | def __init__(self, *args, **kwargs): 68 | super().__init__(*args, **kwargs) 69 | 70 | def fit_transform(self, coefficients): 71 | if (len(coefficients.shape) == 2): 72 | return super().fit_transform(coefficients) 73 | self.n_feat_ = coefficients.shape[1] 74 | self.l_max_ = coefficients.shape[2] - 1 75 | packed = pack_dense(coefficients) 76 | res = super().fit_transform(packed) 77 | return unpack_dense(res, coefficients.shape[0], 78 | self.l_max_) 79 | 80 | def fit(self, coefficients): 81 | if (len(coefficients.shape) == 2): 82 | return super().fit_transform(coefficients) 83 | self.n_feat_ = coefficients.shape[1] 84 | self.l_max_ = coefficients.shape[2] - 1 85 | packed = pack_dense(coefficients) 86 | super().fit(packed) 87 | 88 | def transform(self, coefficients): 89 | if (len(coefficients.shape) == 2): 90 | return super().fit_transform(coefficients) 91 | if (self.n_feat_ != coefficients.shape[1]): 92 | raise ValueError("wrong shape") 93 | if (self.l_max_ != coefficients.shape[2] - 1): 94 | raise ValueError("wrong shape") 95 | packed = pack_dense(coefficients) 96 | res = super().transform(packed) 97 | return unpack_dense(res, coefficients.shape[0], 98 | self.l_max_) 99 | 100 | -------------------------------------------------------------------------------- /nice/utilities.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | import numpy as np 3 | import nice.rascal_coefficients 4 | import copy 5 | import os 6 | from multiprocessing import Pool, cpu_count 7 | import warnings 8 | 9 | 10 | def get_all_species(structures): 11 | ''' getting all unique atomic species among the structures 12 | 13 | Args: 14 | structures: list of ase atoms objects 15 | 16 | Returns: 17 | sorted numpy array with ints with all unique species in the format where 18 | 1 states for H, 2 for He and so on. 
(inherits from ase function 19 |         atoms_object.get_atomic_numbers()) 20 | 21 |     ''' 22 |     all_species = [] 23 |     for structure in structures: 24 |         all_species.append(np.array(structure.get_atomic_numbers())) 25 |     all_species = np.concatenate(all_species, axis=0) 26 |     all_species = np.sort(np.unique(all_species)) 27 |     return all_species 28 | 29 | 30 | def get_compositional_features(structures, all_species): 31 |     ''' getting compositional features suitable for linear regression, which contain information 32 |     about the number of atoms of particular species in the structure 33 | 34 |     Args: 35 |         structures: list of ASE Atoms objects 36 |         all_species: numpy array with ints of all unique species in the dataset. \ 37 |             If the all_species argument is the same for several calls of this function, the resulting \ 38 |             blocks of compositional features are guaranteed to be consistent with each other 39 | 40 |     Return: 41 |         numpy array with shape [len(structures), len(all_species)] with compositional features 42 |     ''' 43 |     result = np.zeros([len(structures), len(all_species)]) 44 |     for i, structure in tqdm.tqdm(enumerate(structures)): 45 |         species_now = structure.get_atomic_numbers() 46 |         for j, specie in enumerate(all_species): 47 |             num = np.sum(species_now == specie) 48 |             result[i, j] = num 49 |     return result 50 | 51 | 52 | def get_spherical_expansion(structures, 53 |                             rascal_hypers, 54 |                             all_species, 55 |                             task_size=100, 56 |                             num_threads=None, 57 |                             split_by_central_specie = True, 58 |                             show_progress=True): 59 |     '''getting spherical expansion coefficients 60 | 61 |     Args: 62 |         structures: list of ASE Atoms objects 63 |         rascal_hypers: dictionary with parameters for librascal controlling the spherical expansion 64 |         all_species: numpy array with ints of all unique species in the dataset. \ 65 |             If the all_species argument is the same for several calls of this function, the resulting \ 66 |             blocks of spherical expansion coefficients are guaranteed to be consistent with each other 67 |         task_size: number of structures per chunk for multiprocessing 68 |         num_threads: number of threads for multiprocessing. If None then all available \ 69 |             (len(os.sched_getaffinity(0))) threads are used 70 |         split_by_central_specie: whether or not to group spherical expansion coefficients by central species 71 |         show_progress: whether or not to show progress via tqdm 72 | 73 |     Return: 74 |         dictionary in which keys are elements of all_species and entries are numpy arrays with indexing 75 |         [environmental index, radial basis/neighbor specie index, lambda, m] with spherical expansion coefficients for 76 |         environments around atoms with the species indicated in the key. Coefficients are stored from the beginning, 77 |         i. e. 
[:, : lambda, :(2 * lambda + 1)] elements are valid 78 | ''' 79 | hypers = copy.deepcopy(rascal_hypers) 80 | 81 | if ('expansion_by_species_method' in hypers.keys()): 82 | if (hypers['expansion_by_species_method'] != 'user defined'): 83 | raise ValueError( 84 | "for proper packing spherical expansion coefficients into [env index, radial/specie index, l, m] shape output should be uniform, thus 'expansion_by_species_method' must be 'user defined'" 85 | ) 86 | 87 | hypers['expansion_by_species_method'] = 'user defined' 88 | 89 | species_list = [] 90 | for structure in structures: 91 | species_list.append(structure.get_atomic_numbers()) 92 | species_list = np.concatenate(species_list, axis=0) 93 | species_list = species_list.astype(np.int32) 94 | all_species = all_species.astype(np.int32) 95 | 96 | if ('global_species' not in hypers.keys()): 97 | hypers['global_species'] = [int(specie) for specie in all_species] 98 | else: 99 | for specie in all_species: 100 | if (specie not in hypers['global_species']): 101 | warnings.warn( 102 | "atom with type {} is presented in the all_species argument to this function but it is not listed in the global_species, adding it" 103 | .format(specie)) 104 | hypers['global_species'].append(int(specie)) 105 | 106 | all_species = np.array(hypers['global_species']).astype(np.int32) 107 | 108 | if (num_threads is None): 109 | num_threads = len(os.sched_getaffinity(0)) 110 | 111 | p = Pool(num_threads) 112 | tasks = [] 113 | for i in range(0, len(structures), task_size): 114 | tasks.append([structures[i:i + task_size], hypers, len(all_species)]) 115 | 116 | result = [ 117 | res for res in tqdm.tqdm(p.imap( 118 | nice.rascal_coefficients.get_rascal_coefficients_stared, tasks), 119 | total=len(tasks), 120 | disable=not show_progress) 121 | ] 122 | p.close() 123 | p.join() 124 | result = np.concatenate(result, axis=0) 125 | if (split_by_central_specie): 126 | return nice.rascal_coefficients.split_by_central_specie( 127 | species_list, all_species, result, show_progress=show_progress) 128 | else: 129 | return result 130 | 131 | 132 | def make_structural_features(features, 133 | structures, 134 | all_species, 135 | show_progress=True): 136 | ''' getting structural features suitable for linear regression which consist of sums \ 137 | over atomic features 138 | 139 | Args: 140 | features: nested dictionary with atomic features. First level keys are central species, \ 141 | second level keys are body orders. Entries are 2-dimensional numpy arrays. 142 | structures: list of Ase atoms objects 143 | all_species: numpy array with ints of all unique species in the dataset. \ 144 | If all species argument is the same for several calls of this function, resulting \ 145 | blocks of structural features are guaranteed to be consistent with each other. \ 146 | If for given block of structures there are no atoms of some particular specie,\ 147 | features dictionary still have to contain key with this specie. It should contain \ 148 | numpy arrays with shapes [0, number of features]. This is need to get proper placing\ 149 | of features to fulfill consistency. 150 | show_progress: whether or not show progress via tqdm 151 | 152 | Return: 153 | numpy array with shape [len(structures), number of structural features] with structural features 154 | ''' 155 | 156 | for specie in all_species: 157 | if (specie not in features.keys()): 158 | raise ValueError( 159 | "all_species contains atomic specie {}, " 160 | "but there are no features for it. 
" 161 | "In case of absence of such atoms in given set " 162 | "of structures provide empty array with shape " 163 | "[0, num_features] which is needed to " 164 | "determine proper shape of output ".format(specie)) 165 | 166 | start_indices, end_indices = {}, {} 167 | now = 0 168 | for specie_index in all_species: 169 | start_indices[specie_index] = {} 170 | end_indices[specie_index] = {} 171 | for body_order_index in features[specie_index].keys(): 172 | start_indices[specie_index][body_order_index] = now 173 | now += features[specie_index][body_order_index].shape[1] 174 | end_indices[specie_index][body_order_index] = now 175 | 176 | total_size = now 177 | 178 | result = np.zeros([len(structures), total_size]) 179 | 180 | current_positions = {} 181 | for specie in all_species: 182 | current_positions[specie] = 0 183 | 184 | for i in tqdm.tqdm(range(len(structures)), disable=not (show_progress)): 185 | species_now = structures[i].get_atomic_numbers() 186 | for specie in all_species: 187 | num_atoms_now = np.sum(species_now == specie) 188 | if (num_atoms_now == 0): 189 | continue 190 | 191 | for body_order in features[specie].keys(): 192 | features_now = np.sum( 193 | features[specie][body_order][current_positions[specie]:( 194 | current_positions[specie] + num_atoms_now)], 195 | axis=0) 196 | result[i, start_indices[specie][body_order]:end_indices[specie] 197 | [body_order]] = features_now 198 | 199 | current_positions[specie] += num_atoms_now 200 | 201 | return result 202 | 203 | 204 | def transform_sequentially(nice, 205 | structures, 206 | rascal_hypers, 207 | all_species, 208 | block_size=500, 209 | show_progress=True): 210 | ''' transforming structures into structural features by chunks in order to use less amount of RAM 211 | 212 | Args: 213 | nice: dictionary where keys are species and entries are nice transformers.\ 214 | If you want to use single nice transformer to all environments regardless of central\ 215 | specie just pass {key : nice_single for specie in all_species} 216 | structures: list of Ase atoms objects 217 | rascal_hypers: dictionary with parameters for librascal controlling spherical expansion.\ 218 | Should be the same as used for fitting nice transformers 219 | all_species: numpy array with ints of all unique species in the dataset. 
220 | block_size: size of chunks measured in number of environments 221 | show_progress: whether or not show progress via tqdm 222 | 223 | 224 | Return: 225 | numpy array with shape [len(structures), number of structural features] with structural features 226 | ''' 227 | 228 | pieces = [] 229 | 230 | for i in tqdm.tqdm(range(0, len(structures), block_size), 231 | disable=not show_progress): 232 | now = {} 233 | coefficients = get_spherical_expansion(structures[i:i + block_size], 234 | rascal_hypers, 235 | all_species, 236 | show_progress=False) 237 | for specie in all_species: 238 | if (coefficients[specie].shape[0] != 0): 239 | now[specie] = nice[specie].transform( 240 | coefficients[specie], return_only_invariants=True) 241 | else: 242 | # determining size of output 243 | dummy_shape = coefficients[specie].shape 244 | dummy_shape = list(dummy_shape) 245 | dummy_shape[0] = 1 246 | dummy_data = np.ones(dummy_shape) 247 | dummy_output = nice[specie].transform( 248 | dummy_data, return_only_invariants=True) 249 | current_block = {} 250 | for key in dummy_output.keys(): 251 | current_block[key] = np.zeros( 252 | [0, dummy_output[key].shape[1]]) 253 | now[specie] = current_block 254 | 255 | pieces.append( 256 | make_structural_features(now, 257 | structures[i:i + block_size], 258 | all_species, 259 | show_progress=False)) 260 | 261 | return np.concatenate(pieces, axis=0) 262 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "Cython"] 3 | -------------------------------------------------------------------------------- /reference_configurations/readme.txt: -------------------------------------------------------------------------------- 1 | some reference configurations for tests 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | numpy 3 | ase 4 | tqdm 5 | scikit-learn 6 | sympy 7 | parse 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension, find_packages 2 | from Cython.Build import cythonize 3 | 4 | with open('requirements.txt', 'r') as f: 5 | requirements = [ 6 | line.strip() for line in f if not line.strip().startswith('#') 7 | ] 8 | 9 | extensions = [ 10 | Extension("nice.*", ["nice/*.pyx"], 11 | extra_compile_args=['-O3', '-fopenmp'], 12 | extra_link_args=['-fopenmp']) 13 | ] 14 | setup( 15 | name='nice', 16 | packages=find_packages(), 17 | install_requires=requirements, 18 | ext_modules=cythonize(extensions), 19 | zip_safe=False, 20 | ) 21 | -------------------------------------------------------------------------------- /tests/compare_kernels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import ase.io 3 | from nice.blocks import * 4 | from nice.utilities import * 5 | from nice.rascal_coefficients import process_structures 6 | import copy 7 | from rascal.representations import SphericalInvariants 8 | 9 | def get_nice_powerspectrum(): 10 | return StandardSequence([ 11 | StandardBlock(None, None, None, 12 | ThresholdExpansioner(num_expand=None, mode='invariants'), 13 | None, None) 14 | ], 15 | initial_scaler=None) 16 | 17 | def 
get_nice_ps_kernel(structures, hypers): 18 | 19 | all_species = get_all_species(structures) 20 | coefficients = get_spherical_expansion(structures, hypers, all_species, split_by_central_specie=False, 21 | show_progress = False) 22 | nice = get_nice_powerspectrum() 23 | nice.fit(coefficients) 24 | nice_ps = nice.transform(coefficients, return_only_invariants = True)[2] 25 | return nice_ps.dot(nice_ps.T) 26 | 27 | def get_rascal_ps_kernel(structures, hypers): 28 | structures = process_structures(structures) 29 | soap = SphericalInvariants(**hypers) 30 | librascal_ps = soap.transform(structures).get_features(soap) 31 | return librascal_ps.dot(librascal_ps.T) 32 | 33 | def test_powerspectrum_kernels(epsilon = 1e-10): 34 | structures = ase.io.read('../reference_configurations/methane_100.extxyz', index = ':') 35 | HYPERS = { 36 | 'interaction_cutoff': 6.3, 37 | 'max_radial': 5, 38 | 'max_angular': 5, 39 | 'gaussian_sigma_type': 'Constant', 40 | 'gaussian_sigma_constant': 0.3, 41 | 'cutoff_smooth_width': 0.3, 42 | 'radial_basis': 'GTO', 43 | 44 | } 45 | 46 | HYPERS_PS = copy.deepcopy(HYPERS) 47 | HYPERS_PS['normalize'] = False 48 | HYPERS_PS['soap_type'] = 'PowerSpectrum' 49 | 50 | nice_kernel = get_nice_ps_kernel(structures, HYPERS) 51 | rascal_kernel = get_rascal_ps_kernel(structures, HYPERS_PS) 52 | 53 | nice_kernel = np.reshape(nice_kernel, [-1]) 54 | rascal_kernel = np.reshape(rascal_kernel, [-1]) 55 | 56 | mask = rascal_kernel > epsilon 57 | nice_kernel = nice_kernel[mask] 58 | rascal_kernel = rascal_kernel[mask] 59 | 60 | ratios = nice_kernel / rascal_kernel 61 | discrepancy = (np.max(ratios) - np.min(ratios)) / np.mean(ratios) 62 | assert discrepancy < epsilon 63 | 64 | -------------------------------------------------------------------------------- /tests/readme.txt: -------------------------------------------------------------------------------- 1 | python3 -m pytest compare_kernels.py 2 | -------------------------------------------------------------------------------- /tutorials/calculating_covariants.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Calculating covariants" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In the previous tutorial, we calculated invariant representations of atomic environments and used them for the prediction of energies - invariant properties. \n", 15 | "\n", 16 | "In the case when there is a need to predict covariant properties, covariants instead of invariants are required. This tutorial shows how to calculate them.\n", 17 | "\n", 18 | "First of all, we need to get **fitted** instance of the model as in the previous tutorial. 
It is done by the following preliminaries cell: (with the only difference that since we want to calculate covariants, we clearly shouldn't leave the covariants branch of the last block empty) " 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# cell to wrap in collapsible in future\n", 28 | "\n", 29 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 30 | "\n", 31 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 32 | "!gunzip -k methane.extxyz.gz\n", 33 | "\n", 34 | "import numpy as np\n", 35 | "import ase.io\n", 36 | "import tqdm\n", 37 | "from nice.blocks import *\n", 38 | "from nice.utilities import *\n", 39 | "from matplotlib import pyplot as plt\n", 40 | "from sklearn.linear_model import BayesianRidge\n", 41 | "\n", 42 | "HARTREE_TO_EV = 27.211386245988\n", 43 | "train_subset = \"0:10000\" #input for ase.io.read command\n", 44 | "test_subset = \"10000:15000\" #input to ase.io.read command\n", 45 | "environments_for_fitting = 1000 #number of environments to fit nice transfomers\n", 46 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000, 3000, 5000, 7500,\n", 47 | " 10000] #for learning curve\n", 48 | "\n", 49 | "#HYPERS for librascal spherical expansion coefficients\n", 50 | "HYPERS = {\n", 51 | " 'interaction_cutoff': 6.3,\n", 52 | " 'max_radial': 5,\n", 53 | " 'max_angular': 5,\n", 54 | " 'gaussian_sigma_type': 'Constant',\n", 55 | " 'gaussian_sigma_constant': 0.05,\n", 56 | " 'cutoff_smooth_width': 0.3,\n", 57 | " 'radial_basis': 'GTO'\n", 58 | "}\n", 59 | "\n", 60 | "\n", 61 | "#our model:\n", 62 | "def get_nice():\n", 63 | " return StandardSequence([\n", 64 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 65 | " CovariantsPurifierBoth(max_take=10),\n", 66 | " IndividualLambdaPCAsBoth(n_components=50),\n", 67 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 68 | " InvariantsPurifier(max_take=50),\n", 69 | " InvariantsPCA(n_components=200)),\n", 70 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 71 | " CovariantsPurifierBoth(max_take=10),\n", 72 | " IndividualLambdaPCAsBoth(n_components=10),\n", 73 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 74 | " InvariantsPurifier(max_take=50),\n", 75 | " InvariantsPCA(n_components=200)),\n", 76 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 77 | " CovariantsPurifierBoth(max_take=10), None,\n", 78 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 79 | " InvariantsPurifier(max_take=50),\n", 80 | " InvariantsPCA(n_components=200))\n", 81 | " ],\n", 82 | " initial_scaler=InitialScaler(\n", 83 | " mode='signal integral', individually=True))\n", 84 | "\n", 85 | "\n", 86 | "train_structures = ase.io.read('methane.extxyz', index=train_subset)\n", 87 | "\n", 88 | "test_structures = ase.io.read('methane.extxyz', index=test_subset)\n", 89 | "\n", 90 | "all_species = get_all_species(train_structures + test_structures)\n", 91 | "\n", 92 | "train_coefficients = get_spherical_expansion(train_structures, HYPERS,\n", 93 | " all_species)\n", 94 | "\n", 95 | "test_coefficients = get_spherical_expansion(test_structures, HYPERS,\n", 96 | " all_species)\n", 97 | "\n", 98 | "#individual nice transformers for each atomic specie in the dataset\n", 99 | "nice = {}\n", 100 | "for key in train_coefficients.keys():\n", 101 | " nice[key] = 
get_nice()\n", 102 | "\n", 103 | "for key in train_coefficients.keys():\n", 104 | " nice[key].fit(train_coefficients[key][:environments_for_fitting])" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Now we need to call **.transform** method with **return_only_invariants = False**, which is the default value:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "data_even, data_odd, invariants_even = nice[1].transform(train_coefficients[1])" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Result is **data_even**, **data_odd** and **invariants_even**. The first two objects are covariants. The last one is invariants. \n", 128 | "\n", 129 | "There is another important symmetry in addition to the translational and rotational one. Usually, atomic properties, such as energy, also transform in a certain way with respect to inversion. Particularly, energy is invariant with respect to it. \n", 130 | "\n", 131 | "In NICE, features are separated into two groups - the ones which are invariant with respect to inversion and the ones that change their sign. The first ones are called even; the second ones are called odd. \n", 132 | "\n", 133 | "Now let's take a look at the returned objects more closely:" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "**Invariants** is the same object as in the previous tutorial - dictionary, where keys are body order." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "for key in invariants_even.keys():\n", 150 | " print(invariants_even[key].shape)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Returned covariants are covariants after the last block, i. e. in our case of body order 4. \n", 158 | "(functionality to get all covariants of all body order from **StandardSequence** will be added in the next version of NICE)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Even covariants are packed in the class Data, which has two relevant fields - \n", 166 | "**.covariants_** and **.actual_sizes_**. (getters are also to be added in the next version) First is np.array with covariants themselves. It has following indexing -**[environmental_index, feature_index, lambda, m]**. But the problem is that for each lambda channel, the actual number of features is different. Thus, the shape of this array doesn't reflect the real number of meaningful entries. 
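In practice one therefore always slices with the stored sizes. A minimal sketch, using the data_even object obtained above (the variable names below are only illustrative):

lambd = 3  # any lambda channel up to max_angular
n_actual = data_even.actual_sizes_[lambd]
# only the first n_actual slots along the feature axis of this lambda channel carry data;
# the remaining slots are padding
meaningful_block = data_even.covariants_[:, :n_actual, lambd, :]
print(meaningful_block.shape)
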
Information about the actual number of features is stored in **.actual_sizes_**:" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "print(type(data_even))\n", 176 | "print(\"shape of even covariants array: {}\".format(data_even.covariants_.shape))\n", 177 | "print(\"actual sizes of even covariants: {}\".format(data_even.actual_sizes_))" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "It is the same for odd covariants:" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "print(\"shape of odd covariants array: {}\".format(data_odd.covariants_.shape))\n", 194 | "print(\"actual sizes of odd covariants: {}\".format(data_odd.actual_sizes_))" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "There is one other point - for each lambda channel the size of covariant vectors is (2 * lambda + 1). These vectors are stored from the beginning. It means that the meaningful entries for each lambda are located in **[:, :, lambda, :(2 * lambda + 1)]**" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "In the [nice article](https://aip.scitation.org/doi/10.1063/5.0021116) another definition of **parity** is used. Covariants are split into **true** and **pseudo** groups. All the covariants in the **true** group are transformed with respect to inversion as (-1)^lambda, while all the covariants in the **pseudo** group are transformed as (-1) ^ (lambda + 1). \n", 209 | "\n", 210 | "There is a special class - **ParityDefinitionChanger** to switch between these definitions:" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "data_true, data_pseudo = ParityDefinitionChanger().transform(\n", 220 | " data_even, data_odd)\n", 221 | "\n", 222 | "print(data_true.covariants_.shape)\n", 223 | "print(data_true.actual_sizes_)\n", 224 | "\n", 225 | "print(data_pseudo.covariants_.shape)\n", 226 | "print(data_pseudo.actual_sizes_)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Since this transformation is symmetric, we can use this once again to go back from the true and pseudo covariants to even and odd:" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "data_even, data_odd = ParityDefinitionChanger().transform(\n", 243 | " data_true, data_pseudo)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "There is one other discrepancy - covariants defined in the nice article, are smaller by the factor of (2 * lambda + 1). 
Thus, the last step to get full compliance is the following:" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "for lambd in range(6):\n", 260 | " data_true.covariants_[:, :data_true.actual_sizes_[lambd],\n", 261 | " lambd, :(2 * lambd + 1)] /= (2 * lambd + 1)\n", 262 | " data_pseudo.covariants_[:, :data_pseudo.actual_sizes_[lambd],\n", 263 | " lambd, :(2 * lambd + 1)] /= (2 * lambd + 1)" 264 | ] 265 | } 266 | ], 267 | "metadata": { 268 | "kernelspec": { 269 | "display_name": "Python 3", 270 | "language": "python", 271 | "name": "python3" 272 | }, 273 | "language_info": { 274 | "codemirror_mode": { 275 | "name": "ipython", 276 | "version": 3 277 | }, 278 | "file_extension": ".py", 279 | "mimetype": "text/x-python", 280 | "name": "python", 281 | "nbconvert_exporter": "python", 282 | "pygments_lexer": "ipython3", 283 | "version": "3.6.9" 284 | }, 285 | "toc": { 286 | "base_numbering": 1, 287 | "nav_menu": {}, 288 | "number_sections": true, 289 | "sideBar": true, 290 | "skip_h1_title": false, 291 | "title_cell": "Table of Contents", 292 | "title_sidebar": "Contents", 293 | "toc_cell": false, 294 | "toc_position": {}, 295 | "toc_section_display": true, 296 | "toc_window_display": false 297 | } 298 | }, 299 | "nbformat": 4, 300 | "nbformat_minor": 4 301 | } 302 | -------------------------------------------------------------------------------- /tutorials/custom_regressors_into_purifiers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Custom regressors into purifiers" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "As was already mentioned in the first tutorial, purifiers can accept arbitrarily linear regressors from sklearn.linear_model. In order to feed it with a custom linear regressor, some requirements should be fulfilled. Firstly, it should have the same interface as linear regressors from sklearn with the fit and predict methods. Secondly, it should fulfill sklearn requirements to make it possible to clone with [sklearn.base.clone](https://scikit-learn.org/stable/modules/generated/sklearn.base.clone.html) function. This tutorial shows an example of such a class. 
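Schematically, the bare minimum such a class has to provide can be sketched as follows (a hypothetical skeleton, with sklearn's Ridge standing in for whatever fitting logic you actually want; the full working example of this tutorial follows below):

from sklearn.linear_model import Ridge


class MinimalRegressor:
    def fit(self, X, y):
        # any fitting logic can go here; plain Ridge is used only as a placeholder
        self.ridge_ = Ridge(alpha=1.0, fit_intercept=False)
        self.ridge_.fit(X, y)
        return self

    def predict(self, X):
        return self.ridge_.predict(X)

    # these two methods make the class clonable with sklearn.base.clone
    def get_params(self, deep=True):
        return {}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
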
\n", 15 | "\n", 16 | "As before, let's calculate spherical expansion coefficients for H environments: " 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 26 | "\n", 27 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 28 | "!gunzip -k methane.extxyz.gz\n", 29 | "\n", 30 | "import numpy as np\n", 31 | "import ase.io\n", 32 | "import tqdm\n", 33 | "from nice.blocks import *\n", 34 | "from nice.utilities import *\n", 35 | "from matplotlib import pyplot as plt\n", 36 | "from sklearn.linear_model import BayesianRidge\n", 37 | "\n", 38 | "structures = ase.io.read('methane.extxyz', index='0:1000')\n", 39 | "\n", 40 | "HYPERS = {\n", 41 | " 'interaction_cutoff': 6.3,\n", 42 | " 'max_radial': 5,\n", 43 | " 'max_angular': 5,\n", 44 | " 'gaussian_sigma_type': 'Constant',\n", 45 | " 'gaussian_sigma_constant': 0.05,\n", 46 | " 'cutoff_smooth_width': 0.3,\n", 47 | " 'radial_basis': 'GTO'\n", 48 | "}\n", 49 | "\n", 50 | "all_species = get_all_species(structures)\n", 51 | "\n", 52 | "coefficients = get_spherical_expansion(structures, HYPERS, all_species)\n", 53 | "coefficients = coefficients[1]" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "Our custom class looks like this:" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "from sklearn.model_selection import cross_val_predict\n", 70 | "from sklearn.linear_model import Ridge\n", 71 | "\n", 72 | "\n", 73 | "class AdaptiveRidge:\n", 74 | " def __init__(self):\n", 75 | " pass\n", 76 | "\n", 77 | " def fit(self, X, y):\n", 78 | " minimum = None\n", 79 | " self.best_alpha_ = None\n", 80 | " for alpha in np.logspace(-25, 10, 300):\n", 81 | " regressor = Ridge(alpha=alpha, fit_intercept=False)\n", 82 | " predictions = cross_val_predict(regressor, X, y)\n", 83 | " now = np.mean((predictions - y)**2)\n", 84 | " if (minimum is None) or (now < minimum):\n", 85 | " minimum = now\n", 86 | " self.best_alpha_ = alpha\n", 87 | "\n", 88 | " self.ridge_ = Ridge(alpha=self.best_alpha_, fit_intercept=False)\n", 89 | " self.ridge_.fit(X, y)\n", 90 | "\n", 91 | " def predict(self, X):\n", 92 | " return self.ridge_.predict(X)\n", 93 | "\n", 94 | " def get_params(self, deep=True):\n", 95 | " return {}\n", 96 | "\n", 97 | " def set_params(self, **parameters):\n", 98 | " for parameter, value in parameters.items():\n", 99 | " setattr(self, parameter, value)\n", 100 | " return self" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "During fitting it estimates best value of regularization by cross validation using training data. There are additional methods get_params and set_params. These methods are required for sklearn.base.clone function. More details about it [here](https://scikit-learn.org/stable/developers/develop.html) (It is necessary to read only cloning section). 
\n", 108 | "\n", 109 | "Let's use it:" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "from scipy.linalg import LinAlgWarning\n", 119 | "\n", 120 | "nice = StandardSequence([\n", 121 | " StandardBlock(ThresholdExpansioner(50), None, IndividualLambdaPCAsBoth(20),\n", 122 | " ThresholdExpansioner(50, mode='invariants'), None, None),\n", 123 | " StandardBlock(\n", 124 | " ThresholdExpansioner(50),\n", 125 | " CovariantsPurifierBoth(regressor=AdaptiveRidge(), max_take=20),\n", 126 | " IndividualLambdaPCAsBoth(10),\n", 127 | " ThresholdExpansioner(50, mode='invariants'),\n", 128 | " InvariantsPurifier(regressor=AdaptiveRidge(), max_take=20),\n", 129 | " InvariantsPCA(20)),\n", 130 | "])\n", 131 | "\n", 132 | "with warnings.catch_warnings():\n", 133 | " # a lot of ill conditioned matrices with super small alpha\n", 134 | " warnings.filterwarnings(\"ignore\", category=LinAlgWarning)\n", 135 | " nice.fit(coefficients)\n", 136 | "\n", 137 | "res = nice.transform(coefficients)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "It is possible to access best alpha parameters for all paritiies and lambda chanels in the final model: \n", 145 | "\n", 146 | "(convenient getters might be added in the next version of NICE)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "for lambd in range(6):\n", 156 | " if (nice.blocks_[1].covariants_purifier_.even_purifier_.purifiers_[lambd]):\n", 157 | " print(\"parity: even; lambda: {}; best alpha: {}\".format(\n", 158 | " lambd, nice.blocks_[1].covariants_purifier_.even_purifier_.\n", 159 | " purifiers_[lambd].regressor_.best_alpha_))\n", 160 | " if (nice.blocks_[1].covariants_purifier_.odd_purifier_.purifiers_[lambd]):\n", 161 | " print(\"parity odd; lambda: {}; best alpha: {}\".format(\n", 162 | " lambd, nice.blocks_[1].covariants_purifier_.odd_purifier_.\n", 163 | " purifiers_[lambd].regressor_.best_alpha_))" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "The same for InvariantsPurifier:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "print(\"best alpha of invariants purifier: \",\n", 180 | " nice.blocks_[1].invariants_purifier_.regressor_.best_alpha_)" 181 | ] 182 | } 183 | ], 184 | "metadata": { 185 | "kernelspec": { 186 | "display_name": "Python 3", 187 | "language": "python", 188 | "name": "python3" 189 | }, 190 | "language_info": { 191 | "codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.6.9" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 4 205 | } 206 | -------------------------------------------------------------------------------- /tutorials/getting_insights_about_the_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Getting insights about the model" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In the first tutorial, we calculated 
invariant representations of atomic environments and used them for the prediction of energies.\n", 15 | "\n", 16 | "But it is always good to have some understanding of the model. This tutorial will show how to get spectrums of pca along with the number of covariants after each transformation.\n", 17 | "\n", 18 | "First of all, we need **fitted** model. This preliminary cell reproduces the corresponding part of the first tutorial, \"constructing machine learning potential\": (few hypers are changed)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# cell to wrap in collapsible in future\n", 28 | "\n", 29 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 30 | "\n", 31 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 32 | "!gunzip -k methane.extxyz.gz\n", 33 | "\n", 34 | "import numpy as np\n", 35 | "import ase.io\n", 36 | "import tqdm\n", 37 | "from nice.blocks import *\n", 38 | "from nice.utilities import *\n", 39 | "from matplotlib import pyplot as plt\n", 40 | "from sklearn.linear_model import BayesianRidge\n", 41 | "\n", 42 | "HARTREE_TO_EV = 27.211386245988\n", 43 | "train_subset = \"0:10000\" #input for ase.io.read command\n", 44 | "test_subset = \"10000:15000\" #input to ase.io.read command\n", 45 | "environments_for_fitting = 1000 #number of environments to fit nice transfomers\n", 46 | "grid = [150, 200, 350, 500, 750, 1000, 1500, 2000, 3000, 5000, 7500,\n", 47 | " 10000] #for learning curve\n", 48 | "\n", 49 | "#HYPERS for librascal spherical expansion coefficients\n", 50 | "HYPERS = {\n", 51 | " 'interaction_cutoff': 6.3,\n", 52 | " 'max_radial': 5,\n", 53 | " 'max_angular': 5,\n", 54 | " 'gaussian_sigma_type': 'Constant',\n", 55 | " 'gaussian_sigma_constant': 0.05,\n", 56 | " 'cutoff_smooth_width': 0.3,\n", 57 | " 'radial_basis': 'GTO'\n", 58 | "}\n", 59 | "\n", 60 | "\n", 61 | "#our model:\n", 62 | "def get_nice():\n", 63 | " return StandardSequence([\n", 64 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 65 | " CovariantsPurifierBoth(max_take=10),\n", 66 | " IndividualLambdaPCAsBoth(n_components=50),\n", 67 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 68 | " InvariantsPurifier(max_take=50),\n", 69 | " InvariantsPCA(n_components=200)),\n", 70 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 71 | " CovariantsPurifierBoth(max_take=10),\n", 72 | " IndividualLambdaPCAsBoth(n_components=50),\n", 73 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 74 | " InvariantsPurifier(max_take=50),\n", 75 | " InvariantsPCA(n_components=200)),\n", 76 | " StandardBlock(ThresholdExpansioner(num_expand=150),\n", 77 | " CovariantsPurifierBoth(max_take=10),\n", 78 | " IndividualLambdaPCAsBoth(n_components=50),\n", 79 | " ThresholdExpansioner(num_expand=300, mode='invariants'),\n", 80 | " InvariantsPurifier(max_take=50),\n", 81 | " InvariantsPCA(n_components=200))\n", 82 | " ],\n", 83 | " initial_scaler=InitialScaler(\n", 84 | " mode='signal integral', individually=True))\n", 85 | "\n", 86 | "\n", 87 | "train_structures = ase.io.read('methane.extxyz', index=train_subset)\n", 88 | "\n", 89 | "test_structures = ase.io.read('methane.extxyz', index=test_subset)\n", 90 | "\n", 91 | "all_species = get_all_species(train_structures + test_structures)\n", 92 | "\n", 93 | "train_coefficients = 
get_spherical_expansion(train_structures, HYPERS,\n", 94 |     "                                                all_species)\n", 95 |     "\n", 96 |     "test_coefficients = get_spherical_expansion(test_structures, HYPERS,\n", 97 |     "                                               all_species)\n", 98 |     "\n", 99 |     "#individual nice transformers for each atomic species in the dataset\n", 100 |     "nice = {}\n", 101 |     "for key in train_coefficients.keys():\n", 102 |     "    nice[key] = get_nice()\n", 103 |     "\n", 104 |     "for key in train_coefficients.keys():\n", 105 |     "    nice[key].fit(train_coefficients[key][:environments_for_fitting])" 106 |    ] 107 |   }, 108 |   { 109 |    "cell_type": "markdown", 110 |    "metadata": {}, 111 |    "source": [ 112 |     "As was discussed in the first tutorial, **ThresholdExpansioner** sorts all pairs of inputs by their pairwise importances and, after that, produces the output only for a fixed number of the most important pairs. This number is controlled by **num_expand**. \n", 113 |     "\n", 114 |     "However, there are two reasons why the real number of covariants after **ThresholdExpansioner** might be different from the specified one. \n", 115 |     "1) Some pairs of input covariants do not produce features in all lambda channels. In particular, a pair of input covariants with some l1 and l2 produces covariants only in the lambda channels where |l1 - l2| <= lambda <= l1 + l2. Thus, the real number of features after **ThresholdExpansioner** would be smaller than the number specified in **num_expand**.\n", 116 |     "\n", 117 |     "2) Pairwise importances can have a lot of collisions. For instance, it is impossible to select a threshold that filters out exactly 3 pairs from a set of pairs with the importances [1, 1, 2, 2]. It is possible to filter out either 0, 2, or 4, but not exactly 3. \n", 118 |     "\n", 119 |     "Thus, it is a good idea to be able to look at the actual number of intermediate features." 120 |    ] 121 |   }, 122 |   { 123 |    "cell_type": "markdown", 124 |    "metadata": {}, 125 |    "source": [ 126 |     "**StandardSequence** has a method **get_intermediate_shapes()**. 
It returns intermediate shapes in the form of nested dictionary:" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "intermediate_shapes = nice[1].get_intermediate_shapes()\n", 136 | "\n", 137 | "for key in intermediate_shapes.keys():\n", 138 | " print(key, ':', intermediate_shapes[key], end='\\n\\n\\n')" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "Spectrums of pcas can be accessed in the following way: \n", 146 | "(convenient getters will be inserted in the next version of NICE)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "def proper_log_plot(array, *args, **kwargs):\n", 156 | " '''avoiding log(0)'''\n", 157 | " plt.plot(np.arange(len(array)) + 1, array, *args, **kwargs)\n", 158 | " plt.ylim([1e-3, 1e0])\n", 159 | "\n", 160 | "\n", 161 | "colors = ['r', 'g', 'b', 'orange', 'yellow', 'purple']\n", 162 | "\n", 163 | "print(\"nu: \", 1)\n", 164 | "for i in range(6): # loop over lambda channels\n", 165 | " if (nice[6].initial_pca_ is not None):\n", 166 | " if (nice[6].initial_pca_.even_pca_.pcas_[i] is not None):\n", 167 | " proper_log_plot(\n", 168 | " nice[6].initial_pca_.even_pca_.pcas_[i].importances_,\n", 169 | " color=colors[i],\n", 170 | " label=\"lambda = {}\".format(i))\n", 171 | "\n", 172 | "for i in range(6): # loop over lambda channels\n", 173 | " if (nice[6].initial_pca_ is not None):\n", 174 | " if (nice[6].initial_pca_.odd_pca_.pcas_[i] is not None):\n", 175 | " proper_log_plot(\n", 176 | " nice[6].initial_pca_.odd_pca_.pcas_[i].importances_,\n", 177 | " '--',\n", 178 | " color=colors[i],\n", 179 | " label=\"lambda = {}\".format(i))\n", 180 | "\n", 181 | "plt.yscale('log')\n", 182 | "plt.xscale('log')\n", 183 | "plt.legend()\n", 184 | "plt.show()\n", 185 | "\n", 186 | "for nu in range(len(nice[6].blocks_)): # loop over body orders\n", 187 | " print(\"nu: \", nu + 2)\n", 188 | " for i in range(6): # loop over lambda channels\n", 189 | " if (nice[6].blocks_[nu].covariants_pca_ is not None):\n", 190 | " if (nice[6].blocks_[nu].covariants_pca_.even_pca_.pcas_[i]\n", 191 | " is not None):\n", 192 | " proper_log_plot(nice[6].blocks_[nu].covariants_pca_.even_pca_.\n", 193 | " pcas_[i].importances_,\n", 194 | " color=colors[i],\n", 195 | " label=\"lambda = {}\".format(i))\n", 196 | "\n", 197 | " for i in range(6): # loop over lambda channels\n", 198 | " if (nice[6].blocks_[nu].covariants_pca_ is not None):\n", 199 | " if (nice[6].blocks_[nu].covariants_pca_.odd_pca_.pcas_[i]\n", 200 | " is not None):\n", 201 | " proper_log_plot(nice[6].blocks_[nu].covariants_pca_.odd_pca_.\n", 202 | " pcas_[i].importances_,\n", 203 | " '--',\n", 204 | " color=colors[i])\n", 205 | "\n", 206 | " plt.yscale('log')\n", 207 | " plt.xscale('log')\n", 208 | " plt.legend()\n", 209 | " plt.show()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "(checks if pca instance is **None** are needed since it would be **None** if the number of features for corresponding lambda channel would be zero after the expansion step)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Inner class for single Lambda channel inherits from sklearn.decomposition.TruncatedSVD (PCA without centering the data, which would break covariant transformation). 
Thus, in addition to **.importances_**, **.explained_variance_** and **.explained_variance_ratio_** are also accessible. \n", 224 | "\n", 225 | "**importances_** (which are used by subsequent **TresholdExpansioners**) are **explained_variance_** normalized not to variance of input as **explained_variance_ratio_**, but to variance of output:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "print(np.sum(nice[6].blocks_[1].\\\n", 235 | " covariants_pca_.even_pca_.pcas_[2].explained_variance_))\n", 236 | "print(np.sum(nice[6].blocks_[1].\\\n", 237 | " covariants_pca_.even_pca_.pcas_[2].explained_variance_ratio_))\n", 238 | "print(np.sum(nice[6].blocks_[1].\\\n", 239 | " covariants_pca_.even_pca_.pcas_[2].importances_))" 240 | ] 241 | } 242 | ], 243 | "metadata": { 244 | "kernelspec": { 245 | "display_name": "Python 3", 246 | "language": "python", 247 | "name": "python3" 248 | }, 249 | "language_info": { 250 | "codemirror_mode": { 251 | "name": "ipython", 252 | "version": 3 253 | }, 254 | "file_extension": ".py", 255 | "mimetype": "text/x-python", 256 | "name": "python", 257 | "nbconvert_exporter": "python", 258 | "pygments_lexer": "ipython3", 259 | "version": "3.6.9" 260 | } 261 | }, 262 | "nbformat": 4, 263 | "nbformat_minor": 4 264 | } 265 | -------------------------------------------------------------------------------- /tutorials/sequential_fitting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Sequential fitting" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "It is not always clear how to select good hyperparameters for calculations. The second tutorial \"Getting insights about the model, \" showed how to plot PCA spectrums for all lambda channels and parities. This information, along with the other one, such as regression accuracy, might be useful to select better hypers. Particularly, the most straightforward way is to select the number of PCA components in such a way as to cover the most part of the variance and do it successively from block to block. \n", 15 | "\n", 16 | "In this case, it is very undesirable to fit all parts of the model, including not changed ones from scratch. One possible way around is to do all things by hand, as was described in the tutorial \"Constructor or non standard_sequence,\" but there would be an additional headache with packing resulting blocks into a single model with a convenient .transform method. 
Nice toolbox has the capability to do it very succinctly.\n", 17 | "\n", 18 | "First of all, we need to get spherical expansion coefficients the same way as in previous tutorials:" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# downloading dataset from https://archive.materialscloud.org/record/2020.110\n", 28 | "\n", 29 | "!wget \"https://archive.materialscloud.org/record/file?file_id=b612d8e3-58af-4374-96ba-b3551ac5d2f4&filename=methane.extxyz.gz&record_id=528\" -O methane.extxyz.gz\n", 30 | "!gunzip -k methane.extxyz.gz\n", 31 | "\n", 32 | "import numpy as np\n", 33 | "import ase.io\n", 34 | "import tqdm\n", 35 | "from nice.blocks import *\n", 36 | "from nice.utilities import *\n", 37 | "from matplotlib import pyplot as plt\n", 38 | "from sklearn.linear_model import BayesianRidge\n", 39 | "\n", 40 | "structures = ase.io.read('methane.extxyz', index='0:1000')\n", 41 | "\n", 42 | "HYPERS = {\n", 43 | " 'interaction_cutoff': 6.3,\n", 44 | " 'max_radial': 5,\n", 45 | " 'max_angular': 5,\n", 46 | " 'gaussian_sigma_type': 'Constant',\n", 47 | " 'gaussian_sigma_constant': 0.05,\n", 48 | " 'cutoff_smooth_width': 0.3,\n", 49 | " 'radial_basis': 'GTO'\n", 50 | "}\n", 51 | "\n", 52 | "all_species = get_all_species(structures)\n", 53 | "\n", 54 | "coefficients = get_spherical_expansion(structures, HYPERS, all_species)\n", 55 | "coefficients = coefficients[1]" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "coefficients are now spherical expansion coefficients for H centered environments:" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "print(coefficients.shape)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "Let's do the first steps from standar sequence:" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "even_0, odd_0 = InitialTransformer().transform(coefficients)\n", 88 | "initial_pca = IndividualLambdaPCAsBoth()\n", 89 | "initial_pca.fit(even_0, odd_0)\n", 90 | "even_0_t, odd_0_t = initial_pca.transform(even_0, odd_0)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "Now we can fit couple of standard blocks:" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "block_1 = StandardBlock(ThresholdExpansioner(100), None,\n", 107 | " IndividualLambdaPCAsBoth(20))\n", 108 | "block_1.fit(even_0_t, odd_0_t, even_0_t, odd_0_t)\n", 109 | "even_1, odd_1, _ = block_1.transform(even_0_t, odd_0_t, even_0_t, odd_0_t)\n", 110 | "\n", 111 | "block_2 = StandardBlock(None, None, None,\n", 112 | " ThresholdExpansioner(100, mode='invariants'))\n", 113 | "block_2.fit(even_1, odd_1, even_0_t, odd_0_t)\n", 114 | "_, _, even_invariants = block_2.transform(even_1, odd_1, even_0_t, odd_0_t)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "At his moment we have all parts of this standard sequence fitted:\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "nice = StandardSequence(initial_pca=initial_pca, blocks=[block_1, block_2])\n", 131 | 
"print(initial_pca.is_fitted())\n", 132 | "print(block_1.is_fitted())\n", 133 | "print(block_2.is_fitted())" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "what about full model?" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "print(nice.is_fitted())" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Nope. \n", 157 | "\n", 158 | "At this point, there is a very high probability of making a mistake. Particularly one can feed StandardSequence with some fitted initial_pca along with blocks, which were fitted based not on the same initial_pca, with different initial_normalizer, or even on different data. In order to prevent it, there is a requirement to pass an additional flag guaranteed_parts_fitted_consistently = True to the model:" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "nice = StandardSequence(initial_pca=initial_pca,\n", 168 | " blocks=[block_1, block_2],\n", 169 | " guaranteed_parts_fitted_consistently=True)\n", 170 | "print(nice.is_fitted())" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "Model is considered to be fitted if 1) all parts are fitted and 2) if guaranteed_parts_fitted_consistently is set to be True\n", 178 | "\n", 179 | "**Golden rule:** Every time you pass guaranteed_parts_fitted_consistently = True make a pause and think twice. \n", 180 | "\n", 181 | "Let's check consistency:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "even_invariants_2 = nice.transform(coefficients,\n", 191 | " return_only_invariants=True)[3]\n", 192 | "print(np.sum(np.abs(even_invariants - even_invariants_2)))" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "This also works in other direction:" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "initial_pca = IndividualLambdaPCAsBoth()\n", 209 | "block_1 = StandardBlock(ThresholdExpansioner(100), None,\n", 210 | " IndividualLambdaPCAsBoth(20))\n", 211 | "block_2 = StandardBlock(None, None, None,\n", 212 | " ThresholdExpansioner(100, mode='invariants'))\n", 213 | "\n", 214 | "print(initial_pca.is_fitted())\n", 215 | "print(block_1.is_fitted())\n", 216 | "print(block_2.is_fitted())\n", 217 | "\n", 218 | "nice = StandardSequence(initial_pca=initial_pca, blocks=[block_1, block_2])\n", 219 | "nice.fit(coefficients)\n", 220 | "\n", 221 | "print(initial_pca.is_fitted())\n", 222 | "print(block_1.is_fitted())\n", 223 | "print(block_2.is_fitted())" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "StandardBlock behaves the same way:\n", 231 | " " 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "expansioner, pca = ThresholdExpansioner(100), IndividualLambdaPCAsBoth(20)\n", 241 | "print(expansioner.is_fitted())\n", 242 | "print(pca.is_fitted())\n", 243 | "\n", 244 | "block = StandardBlock(expansioner, None, pca)\n", 245 | "block.fit(even_0_t, odd_0_t, even_0_t, odd_0_t)\n", 246 | 
"\n", 247 | "print(expansioner.is_fitted())\n", 248 | "print(pca.is_fitted())" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "expansioner, pca = ThresholdExpansioner(100), IndividualLambdaPCAsBoth(20)\n", 258 | "expansioner.fit(even_0_t, odd_0_t, even_0_t, odd_0_t)\n", 259 | "even_1, odd_1 = expansioner.transform(even_0_t, odd_0_t, even_0_t, odd_0_t)\n", 260 | "pca.fit(even_1, odd_1)\n", 261 | "\n", 262 | "block = StandardBlock(expansioner,\n", 263 | " None,\n", 264 | " pca,\n", 265 | " guaranteed_parts_fitted_consistently=True)\n", 266 | "\n", 267 | "print(block.is_fitted())" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "There is another group of blocks that accepts classes, such as sklearn.linear_model.Ridge in the initialization. But in their case, there is a need to apply several distinct regressors separately for each lambda channel and parity. Thus, the input regressor is cloned, and initial instances are not touched in any way. So, the material of this tutorial does not apply to purifiers. " 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [] 283 | } 284 | ], 285 | "metadata": { 286 | "kernelspec": { 287 | "display_name": "Python 3", 288 | "language": "python", 289 | "name": "python3" 290 | }, 291 | "language_info": { 292 | "codemirror_mode": { 293 | "name": "ipython", 294 | "version": 3 295 | }, 296 | "file_extension": ".py", 297 | "mimetype": "text/x-python", 298 | "name": "python", 299 | "nbconvert_exporter": "python", 300 | "pygments_lexer": "ipython3", 301 | "version": "3.6.9" 302 | }, 303 | "toc": { 304 | "base_numbering": 1, 305 | "nav_menu": {}, 306 | "number_sections": true, 307 | "sideBar": true, 308 | "skip_h1_title": false, 309 | "title_cell": "Table of Contents", 310 | "title_sidebar": "Contents", 311 | "toc_cell": false, 312 | "toc_position": {}, 313 | "toc_section_display": true, 314 | "toc_window_display": false 315 | } 316 | }, 317 | "nbformat": 4, 318 | "nbformat_minor": 4 319 | } 320 | -------------------------------------------------------------------------------- /update_docs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import copy 4 | 5 | # cutting notebooks 6 | def split(name, destination): 7 | with open("tutorials/{}".format(name), "r") as f: 8 | notebook = json.load(f) 9 | 10 | before_collapsible = copy.deepcopy(notebook) 11 | before_collapsible['cells'] = before_collapsible['cells'][0:2] 12 | 13 | collapsible = copy.deepcopy(notebook) 14 | collapsible['cells'] = [collapsible['cells'][2]] 15 | 16 | after_collapsible = copy.deepcopy(notebook) 17 | after_collapsible['cells'] = after_collapsible['cells'][3:] 18 | 19 | clean_name = name.strip().split('.')[0] 20 | with open("{}/{}_before_collapsible.ipynb".format(destination, 21 | clean_name), "w") as f: 22 | json.dump(before_collapsible, f) 23 | 24 | with open("{}/{}_collapsible.ipynb".format(destination, 25 | clean_name), "w") as f: 26 | json.dump(collapsible, f) 27 | 28 | 29 | with open("{}/{}_after_collapsible.ipynb".format(destination, 30 | clean_name), "w") as f: 31 | json.dump(after_collapsible, f) 32 | 33 | os.system("mkdir docs/cutted") 34 | split('calculating_covariants.ipynb', 'docs/cutted/') 35 | split('getting_insights_about_the_model.ipynb', 'docs/cutted/') 36 | 
split('constructor_or_non_standard_sequence.ipynb', 'docs/cutted/') 37 | split('sequential_fitting.ipynb', 'docs/cutted/') 38 | split('custom_regressors_into_purifiers.ipynb', 'docs/cutted/') 39 | 40 | # converting notebooks to rst 41 | 42 | def make_substitution(lines, index): 43 | lines_before = lines[0:index] 44 | end = len(lines) 45 | for j in range(index + 1, len(lines)): 46 | if not(lines[j].strip() == "" or lines[j].startswith(' ')): 47 | end = j 48 | break 49 | 50 | lines_raw = lines[index + 1 : end] 51 | raw_from = 0 52 | for i in range(len(lines_raw)): 53 | if (lines_raw[i].strip() != ''): 54 | raw_from = i 55 | break 56 | lines_raw = lines_raw[raw_from:] 57 | 58 | raw_to = 0 59 | for i in range(len(lines_raw)): 60 | if (lines_raw[i].strip() != ''): 61 | raw_to = i + 1 62 | 63 | lines_raw = lines_raw[:raw_to] 64 | 65 | 66 | lines_for_insertion = [".. raw:: html\n", 67 | "\n", 68 | "\n", 69 | "
"    <embed>\n",
 70 |                            '    <div style="white-space: pre;">\n']  # wrapper tags reconstructed approximately
 71 |         lines_for_insertion = lines_for_insertion + lines_raw
 72 |         lines_for_insertion = lines_for_insertion + ["    </div>\n", "    </embed>
\n", "\n", '\n'] 73 | 74 | for i in range(1, len(lines_for_insertion)): 75 | lines_for_insertion[i] = ' ' + lines_for_insertion[i] 76 | 77 | return lines_before + lines_for_insertion + lines[end:] 78 | 79 | return lines[index : end] 80 | 81 | def get_bad_block(lines): 82 | for i in range(len(lines)): 83 | if (lines[i].strip() == ".. parsed-literal::"): 84 | return i 85 | return None 86 | 87 | def iterate(lines): 88 | while True: 89 | index = get_bad_block(lines) 90 | if index is None: 91 | return lines 92 | lines = make_substitution(lines, index) 93 | 94 | def fix_awful_nvconvert_format(file): 95 | lines = [] 96 | with open(file, "r") as f: 97 | lines = list(f) 98 | lines = iterate(lines) 99 | with open(file, "w") as f: 100 | for line in lines: 101 | f.write(line) 102 | 103 | os.chdir('docs/cutted/') 104 | names = [name for name in os.listdir('.') if name.endswith('.ipynb')] 105 | 106 | for name in names: 107 | dir_name = name.split('.')[0] 108 | os.system("mkdir {}".format(dir_name)) 109 | os.system("cp {} {}/".format(name, dir_name)) 110 | os.chdir(dir_name) 111 | os.system('jupyter nbconvert --to rst {}'.format(name)) 112 | fix_awful_nvconvert_format(name.split('.')[0] + '.rst') 113 | names_inner = os.listdir('.') 114 | for name_inner in names_inner: 115 | if (name_inner.endswith('_files')): 116 | os.system('cp -r {} ../../'.format(name_inner)) 117 | os.chdir('../') 118 | 119 | 120 | os.chdir('../..') 121 | 122 | os.system("rm -r ../build/*") 123 | os.chdir("./docs") 124 | os.system("sphinx-apidoc -f -o . ../nice") 125 | os.system("make html") 126 | os.chdir("../") 127 | os.system("git checkout -f gh-pages") 128 | os.system("git rm -r *") 129 | os.system("cp -r ../build/html/* .") 130 | with open(".nojekyll", "w") as f: 131 | pass 132 | 133 | os.system("git add *") 134 | os.system("git add .nojekyll") 135 | os.system("git commit -m 'automatic docs build'") 136 | os.system("git push") 137 | os.system("git checkout master") 138 | 139 | --------------------------------------------------------------------------------
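
To make the rst post-processing above easier to follow, here is a minimal standalone sketch of what the make_substitution and iterate routines do to the output of jupyter nbconvert. This is not the project's code: the <embed> and <div> wrapper strings are placeholders, and only the overall mechanism (locate a ".. parsed-literal::" block, then re-emit its body inside a ".. raw:: html" block) mirrors the script.

# Standalone sketch of the parsed-literal -> raw-html rewrite performed by
# update_docs.py; the <embed>/<div> tags are placeholders, not the real markup.
def wrap_parsed_literal(lines):
    out, i = [], 0
    while i < len(lines):
        if lines[i].strip() != ".. parsed-literal::":
            out.append(lines[i])
            i += 1
            continue
        # collect the indented body of the directive (blank lines included)
        j = i + 1
        while j < len(lines) and (lines[j].strip() == "" or lines[j].startswith("    ")):
            j += 1
        body = [line for line in lines[i + 1:j] if line.strip() != ""]
        out += [".. raw:: html\n", "\n", "    <embed>\n", "    <div>\n"]
        out += ["    " + line.lstrip() for line in body]
        out += ["    </div>\n", "    </embed>\n", "\n"]
        i = j
    return out

rst = ["Output of the cell:\n", "\n",
       ".. parsed-literal::\n", "\n",
       "    (1000, 10, 6, 11)\n", "\n",
       "Next paragraph.\n"]
print("".join(wrap_parsed_literal(rst)))

Running the sketch on the toy snippet prints the block rewrapped as raw HTML, which is essentially what update_docs.py applies to every ".. parsed-literal::" block in the converted tutorials.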
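
The two effects described in the "getting insights about the model" tutorial above, namely the selection rule |l1 - l2| <= lambda <= l1 + l2 and collisions between pairwise importances, can be illustrated with a few lines of plain Python. The helper allowed_lambdas is made up for illustration and is not part of the nice package.

import numpy as np

# Selection rule: a pair of covariants with angular momenta l1 and l2 only
# contributes to lambda channels between |l1 - l2| and l1 + l2 (capped by lambda_max).
def allowed_lambdas(l1, l2, lambda_max=5):
    return list(range(abs(l1 - l2), min(l1 + l2, lambda_max) + 1))

print(allowed_lambdas(2, 3))  # [1, 2, 3, 4, 5] -- no lambda = 0 contribution
print(allowed_lambdas(1, 1))  # [0, 1, 2]

# Collisions: with these pairwise importances no threshold keeps exactly 3 pairs.
importances = np.array([1.0, 1.0, 2.0, 2.0])
for threshold in [0.5, 1.5, 2.5]:
    print(threshold, np.sum(importances >= threshold))  # 4, 2, 0 pairs survive

With importances [1, 1, 2, 2], any threshold keeps 4, 2, or 0 pairs, never exactly 3, which is why the actual number of expanded features can differ from num_expand.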