├── doc
│   ├── modules.rst
│   ├── index.rst
│   ├── license.rst
│   ├── bibliography.rst
│   ├── regressions.rst
│   ├── introduction.rst
│   ├── Makefile
│   ├── make.bat
│   └── conf.py
├── setup.py
├── LICENSE
├── README.md
├── pyproject.toml
├── regressions
│   ├── kernels.py
│   ├── mlr.py
│   ├── cls.py
│   ├── pls_sb.py
│   ├── __init__.py
│   ├── kernel_pls.py
│   ├── pls1.py
│   ├── pls2.py
│   ├── fitstats.py
│   └── pcr.py
└── examples
    ├── kpls_sinc.py
    └── kpls_example.py
/doc/modules.rst: -------------------------------------------------------------------------------- 1 | regressions 2 | =========== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | regressions 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='Regressions', 5 | version='0.1.0', 6 | packages=find_packages(), 7 | install_requires=['numpy>=1.10'], 8 | description='Implementations of various regression algorithms, including ' 9 | 'Partial Least Squares and Principal Components Regression', 10 | url='https://github.com/jhumphry/regressions', 11 | license='ISC', 12 | zip_safe=True, 13 | ) 14 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. regressions documentation master file, created by 2 | sphinx-quickstart on Fri Nov 6 18:03:08 2015. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to regressions' documentation! 7 | ====================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 4 13 | 14 | introduction 15 | regressions 16 | bibliography 17 | license 18 | 19 | Indices and tables 20 | ================== 21 | 22 | * :ref:`genindex` 23 | * :ref:`modindex` 24 | * :ref:`search` 25 | 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015-2025, James Humphry 2 | 3 | Permission to use, copy, modify, and/or distribute this software for 4 | any purpose with or without fee is hereby granted, provided that the 5 | above copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL 8 | WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED 9 | WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR 10 | BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES 11 | OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 12 | WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 13 | ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 14 | SOFTWARE. 15 | -------------------------------------------------------------------------------- /doc/license.rst: -------------------------------------------------------------------------------- 1 | License 2 | ======= 3 | 4 | Copyright (c) 2015-2025, James Humphry 5 | 6 | Permission to use, copy, modify, and/or distribute this software for any 7 | purpose with or without fee is hereby granted, provided that the above 8 | copyright notice and this permission notice appear in all copies.
9 | 10 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 11 | REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 12 | AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 13 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 14 | LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 15 | OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 16 | PERFORMANCE OF THIS SOFTWARE. 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Regressions 2 | 3 | This package provides various forms of regression. The aim of these modules is 4 | to achieve clarity of implementation with a clear connection to the 5 | mathematical descriptions of the algorithms. The motivation for creating the 6 | package was the desire to learn about and explore the use of Principal 7 | Components Regression, Partial Least Squares regression and non-linear 8 | kernel-based Partial Least Squares regression. 9 | 10 | Python 3.10 and Numpy 1.10 or greater are required, as the '@' matrix 11 | multiplication operator is used. If SciPy is available, some linear algebra 12 | routines may be used, as they can sometimes be faster than the routines in 13 | Numpy - however SciPy is not required. Matplotlib is used by the examples to 14 | display the results. 15 | 16 | Full documentation of the API is maintained using Sphinx - see the `doc` 17 | directory. 18 | -------------------------------------------------------------------------------- /doc/bibliography.rst: -------------------------------------------------------------------------------- 1 | Bibliography 2 | ============ 3 | 4 | The following resources were referred to in the course of writing this 5 | software. 6 | 7 | | :title-reference:`Lecture notes for ST02: Multivariate Data Analysis and Chemometrics` 8 | | Bent Jørgensen and Yuri Goegebeur 9 | | http://statmaster.sdu.dk/courses/ST02 10 | 11 | | :title-reference:`Overview and Recent Advances in Partial Least Squares` 12 | | Roman Rosipal and Nicole Krämer 13 | | SLSFS 2005, LNCS 3940, pp. 34–51, 2006 14 | 15 | | :title-reference:`Kernel Partial Least Squares Regression in Reproducing Kernel Hilbert Space` 16 | | R. Rosipal and L.J.
Trejo 17 | | Journal of Machine Learning Research, 2:97–123, 2001 18 | 19 | | :title-reference:`Kernel Partial Least Squares for Nonlinear Regression and Discrimination` 20 | | Roman Rosipal 21 | 22 | | :title-reference:`Nonlinear Partial Least Squares: An Overview` 23 | | Roman Rosipal 24 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Project file for the Regressions package 2 | 3 | [project] 4 | name = "Regressions" 5 | version = "0.1.0" 6 | description = """Implementations of various regression algorithms, including \ 7 | Partial Least Squares and Principal Components Regression""" 8 | readme = "README.md" 9 | requires-python = ">=3.10" 10 | license = { text = "ISC License" } 11 | authors = [{ name = "J Humphry", email = "git@binecho.co.uk" }] 12 | keywords = ["statistics", "PLS", "regression"] 13 | 14 | dependencies = ["numpy>=1.10"] 15 | 16 | [project.optional-dependencies] 17 | scipy = ["scipy"] 18 | 19 | [project.urls] 20 | repository = "https://github.com/jhumphry/regressions" 21 | 22 | [tool.pyright] 23 | include = ["regressions", "examples"] 24 | exclude = ["**/node_modules", "**/__pycache__"] 25 | 26 | reportMissingImports = true 27 | reportMissingTypeStubs = false 28 | 29 | pythonVersion = "3.10" 30 | 31 | [tool.ruff] 32 | exclude = ["**/__pycache__"] 33 | line-length = 100 34 | indent-width = 4 35 | target-version = "py311" 36 | 37 | [tool.ruff.format] 38 | indent-style = "space" 39 | quote-style = "single" 40 | 41 | [tool.ruff.lint] 42 | ignore = ["F403", "F405"] 43 | -------------------------------------------------------------------------------- /regressions/kernels.py: -------------------------------------------------------------------------------- 1 | """A collection of kernels and kernel generators 2 | 3 | These are mainly for use in kernel PLS. All of the kernels have the form 4 | K(x, y) where x and y are either floats or numpy.ndarray of float. 5 | """ 6 | 7 | import math 8 | 9 | from . import * 10 | from collections.abc import Callable 11 | 12 | Kernel_Function = Callable[ 13 | [ 14 | float | np.ndarray[tuple, np.dtype[np.float64]], 15 | float | np.ndarray[tuple, np.dtype[np.float64]], 16 | ], 17 | float, 18 | ] 19 | 20 | 21 | def std_gaussian( 22 | x: float | np.ndarray[tuple, np.dtype[np.float64]], 23 | y: float | np.ndarray[tuple, np.dtype[np.float64]], 24 | ) -> float: 25 | """A Gaussian kernel with width 1. 26 | 27 | The Gaussian kernel with standard deviation 1 is a routine choice. 28 | 29 | Args: 30 | x (float or numpy.ndarray of float): The x coordinate 31 | y (float or numpy.ndarray of float): The y coordinate 32 | """ 33 | 34 | return 0.3989422804014327 * math.exp(-0.5 * np.sum((x - y) ** 2)) 35 | 36 | 37 | def make_gaussian_kernel(width: float = 1.0) -> Kernel_Function: 38 | """Create a Gaussian kernel with adjustable width 39 | 40 | Args: 41 | width (float) : The standard deviation of the Gaussian function 42 | which adjusts the width of the resulting kernel. 43 | 44 | Returns: 45 | gaussian_kernel (function) : A function of two floats or 46 | numpy.ndarray of floats which computes the Gaussian kernel of 47 | the desired width. 
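Example (an illustrative sketch, not part of the original documentation): ``kern = make_gaussian_kernel(width=2.0)`` gives a kernel for which ``kern(np.array([0.0, 0.0]), np.array([1.0, 1.0]))`` returns a single float equal to the normalization constant times ``exp(-2.0 / 8.0)``; such a kernel is suitable as the ``X_kernel`` argument of ``kernel_pls.Kernel_PLS``.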
48 | """ 49 | 50 | normalization = 1.0 / math.sqrt(2.0 * math.pi * width) 51 | scale = 1.0 / (2.0 * width**2) 52 | 53 | def gaussian_kernel(x, y): 54 | return normalization * math.exp(-scale * np.sum((x - y) ** 2)) 55 | 56 | return gaussian_kernel 57 | -------------------------------------------------------------------------------- /doc/regressions.rst: -------------------------------------------------------------------------------- 1 | regressions package 2 | =================== 3 | 4 | Module contents 5 | --------------- 6 | 7 | .. automodule:: regressions 8 | :members: 9 | :undoc-members: 10 | :show-inheritance: 11 | 12 | 13 | Submodules 14 | ---------- 15 | 16 | regressions.mlr module 17 | ---------------------- 18 | 19 | .. automodule:: regressions.mlr 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | regressions.cls module 25 | ---------------------- 26 | 27 | .. automodule:: regressions.cls 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | regressions.pcr module 33 | ---------------------- 34 | 35 | .. automodule:: regressions.pcr 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | 40 | 41 | regressions.pls1 module 42 | ----------------------- 43 | 44 | .. automodule:: regressions.pls1 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: 48 | 49 | regressions.pls2 module 50 | ----------------------- 51 | 52 | .. automodule:: regressions.pls2 53 | :members: 54 | :undoc-members: 55 | :show-inheritance: 56 | 57 | regressions.pls_sb module 58 | ------------------------- 59 | 60 | .. automodule:: regressions.pls_sb 61 | :members: 62 | :undoc-members: 63 | :show-inheritance: 64 | 65 | regressions.kernel_pls module 66 | ----------------------------- 67 | 68 | .. automodule:: regressions.kernel_pls 69 | :members: 70 | :undoc-members: 71 | :show-inheritance: 72 | 73 | regressions.kernels module 74 | -------------------------- 75 | 76 | .. automodule:: regressions.kernels 77 | :members: 78 | :undoc-members: 79 | :show-inheritance: 80 | 81 | regressions.fitstats module 82 | --------------------------- 83 | 84 | .. automodule:: regressions.fitstats 85 | :members: 86 | :undoc-members: 87 | :show-inheritance: 88 | -------------------------------------------------------------------------------- /doc/introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | This package provides various forms of regression. The aim of these modules is 5 | to achieve clarity of implementation with a clear connection to the 6 | mathematical descriptions of the algorithms. The motivation for creating the 7 | package was the desire to learn about and explore the use of Principal 8 | Components Regression, Partial Least Squares regression and non-linear 9 | kernel-based Partial Least Squares regression. 10 | 11 | Python 3.10 and Numpy 1.10 or greater are required as the new '@' matrix 12 | multiplication operator is used. If SciPy is available some linear algebra 13 | routines may be used as they can sometimes be faster than the routines in 14 | Numpy - however SciPy is not required. Matplotlib is used by the examples to 15 | display the results. 16 | 17 | Overview of modules available 18 | ----------------------------- 19 | 20 | :py:mod:`regressions.mlr` 21 | Standard Multiple Linear Regression for data with homoskedastic and 22 | serially uncorrelated errors. 
23 | :py:mod:`regressions.cls` 24 | Classical Least Squares - equivalent to multiple linear regression but 25 | with the regression computed in reverse (X on Y) and then 26 | (pseudo-)inverted. 27 | :py:mod:`regressions.pcr` 28 | Principal Component Regression - based on extracting a limited number 29 | of components of the X data which best span the variance in X, and 30 | then regressing Y on only those components. Both iterative (NIPALS) 31 | and SVD approaches are implemented. 32 | :py:mod:`regressions.pls1` 33 | Partial Least Squares based on the PLS1 algorithm for use with only 34 | one Y variable but multiple X variables. Multiple Y variables are 35 | handled completely independently from each other, without using 36 | information about correlations. Uses an iterative approach. 37 | :py:mod:`regressions.pls2` 38 | Partial Least Squares based on the PLS2 algorithm for use with 39 | multiple X and Y variables simultaneously. Uses an iterative 40 | approach. 41 | :py:mod:`regressions.pls_sb` 42 | Partial Least Squares based on the PLS-SB algorithm. This sets up the 43 | problem in the same way as the PLS2 algorithm but then solves for the 44 | eigenvectors directly, with a non-iterative deterministic approach. 45 | :py:mod:`regressions.kernel_pls` 46 | Transforms the input X data into a higher-dimensional feature space 47 | using a provided kernel, and then applies the PLS2 algorithm. This 48 | allows non-linear problems to be addressed. 49 | :py:mod:`regressions.kernels` 50 | A collection of kernels to use with kernel_pls 51 | :py:mod:`regressions.fitstats` 52 | An implementation of statistical tests to help users choose an 53 | appropriate number of components to extract in PCR or PLS without 54 | over-fitting to the calibration data. 55 | -------------------------------------------------------------------------------- /examples/kpls_sinc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """An example of the use of non-linear kernel PLS regression on the output of 4 | a sinc function contaminated by noise. 5 | 6 | Reproduces some of the figures from "Kernel Partial Least Squares Regression 7 | in Reproducing Kernel Hilbert Space" by Roman Rosipal and Leonard J Trejo. 
8 | Journal of Machine Learning Research 2 (2001) 97-123""" 9 | 10 | # Copyright (c) 2015-2025, James Humphry - see LICENSE file for details 11 | 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | 15 | from regressions import kernel_pls, kernels 16 | 17 | # pyright: reportInvalidStringEscapeSequence=false 18 | # This is because pyright doesn't like the escape sequences used by matplotlib 19 | 20 | # Perform Kernel PLS on an uncontaminated sinc function to view the principal 21 | # components 22 | 23 | x_values = np.linspace(-10.0, 10.0, 100) 24 | 25 | pure_sinc = np.sin(np.abs(x_values)) / np.abs(x_values) 26 | pure_sinc -= pure_sinc.mean() 27 | 28 | pure_kpls = kernel_pls.Kernel_PLS(X=x_values, Y=pure_sinc, g=4, X_kernel=kernels.std_gaussian) 29 | 30 | # Contaminate the sinc function with some Gaussian noise and perform kernel 31 | # PLS on this revised version 32 | 33 | noisy_sinc = pure_sinc + np.random.normal(loc=0.0, scale=0.2, size=100) 34 | noisy_sinc -= noisy_sinc.mean() 35 | 36 | noisy_kpls = kernel_pls.Kernel_PLS(X=x_values, Y=noisy_sinc, g=1, X_kernel=kernels.std_gaussian) 37 | 38 | # Choose some test points and use the kernel PLS results to predict / 39 | # reconstruct the true output of the sinc function 40 | 41 | test_x = np.linspace(-10.0, 10.0, 80) 42 | test_y = np.sin(np.abs(test_x)) / np.abs(test_x) 43 | test_kpls_reconstruction = noisy_kpls.prediction(test_x) 44 | 45 | # Perform PLS on the calibration data with a range of numbers of components and 46 | # measure the mean squared error when the test points are predicted 47 | 48 | test_mse = np.empty((30,)) 49 | for i in range(1, 31): 50 | test_kpls = kernel_pls.Kernel_PLS(X=x_values, Y=noisy_sinc, g=i, X_kernel=kernels.std_gaussian) 51 | prediction = test_kpls.prediction(test_x) 52 | test_mse[i - 1] = ((prediction - test_y) ** 2).mean() 53 | 54 | # Plot the results of the above calculations 55 | 56 | fig = plt.figure('Kernel PLS applied to sinc function') 57 | fig.set_tight_layout(True) # type: ignore[reportAttributeAccessIssue] 58 | plt.subplot(3, 1, 1) 59 | plt.title('Principal components found by Gaussian kernel PLS') 60 | plt.plot(x_values, pure_sinc, 'k-', label=r'$sinc\, x$') 61 | plt.plot(x_values, pure_kpls.P[:, 0], 'r-.', label='1st PC') 62 | plt.plot(x_values, pure_kpls.P[:, 1], 'b--', label='2nd PC') 63 | plt.plot(x_values, pure_kpls.P[:, 2], 'g.', label='3rd PC') 64 | plt.legend() 65 | 66 | plt.subplot(3, 1, 2) 67 | plt.title('MSE versus number of components for kernel PLS') 68 | plt.plot(range(1, 31), test_mse, 'b-o') 69 | plt.autoscale(enable=True, axis='x', tight=True) 70 | 71 | plt.subplot(3, 1, 3) 72 | plt.title('Gaussian kernel PLS reconstruction') 73 | plt.plot(x_values, pure_sinc, 'k--', label=r'$sinc\, x$') 74 | plt.plot(x_values, noisy_sinc, 'k.', label=r'$sinc\, x$ with noise') 75 | plt.plot(test_x, test_kpls_reconstruction, 'b-', label='KPLS, 1 PC') 76 | plt.legend() 77 | 78 | plt.show() 79 | -------------------------------------------------------------------------------- /examples/kpls_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """An example of the use of non-linear kernel PLS regression on the output of 4 | a function z(x) = 4.26(exp (−x) − 4 exp (−2x) + 3 exp (−3x)) 5 | 6 | Reproduces figure 3 from "Overview and Recent Advances in Partial Least 7 | Squares" Roman Rosipal and Nicole Krämer SLSFS 2005, LNCS 3940, pp. 
34–51, 8 | 2006 and figure 3 from "Nonlinear Partial Least Squares: An Overview" Roman 9 | Rosipal""" 10 | 11 | # Copyright (c) 2015, James Humphry - see LICENSE file for details 12 | 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | 16 | from regressions import kernel_pls, kernels 17 | 18 | 19 | def z(x): 20 | """Example function""" 21 | return 4.26 * (np.exp(-x) - 4 * np.exp(-2.0 * x) + 3 * np.exp(-3.0 * x)) 22 | 23 | 24 | # Define the kernel 25 | 26 | kern = kernels.make_gaussian_kernel(width=1.8) 27 | 28 | # Create sample data 29 | 30 | x_values = np.linspace(0.0, 3.5, 100) 31 | 32 | z_pure = z(x_values) 33 | z_pure -= z_pure.mean(0) # Ensure z_pure is centered 34 | 35 | noise = np.random.normal(loc=0.0, scale=0.2, size=100) 36 | z_noisy = z_pure + noise 37 | z_noisy -= z_noisy.mean(0) # Ensure z_noisy is centered 38 | 39 | # Perform Kernel PLS 40 | 41 | kpls_1 = kernel_pls.Kernel_PLS(X=x_values, Y=z_noisy, g=1, X_kernel=kern) 42 | 43 | kpls_1_results = kpls_1.prediction(x_values) 44 | 45 | kpls_4 = kernel_pls.Kernel_PLS(X=x_values, Y=z_noisy, g=4, X_kernel=kern) 46 | 47 | kpls_4_results = kpls_4.prediction(x_values) 48 | 49 | kpls_8 = kernel_pls.Kernel_PLS(X=x_values, Y=z_noisy, g=8, X_kernel=kern) 50 | 51 | kpls_8_results = kpls_8.prediction(x_values) 52 | 53 | # Plot the results of the above calculations 54 | 55 | fig = plt.figure('An example of Kernel PLS regression') 56 | 57 | plt.title('An example of Kernel PLS regression') 58 | plt.plot(x_values, z_pure, 'k-', label='$z(.)$') 59 | plt.plot(x_values, z_noisy, 'k+', label='$z(.)$ with noise') 60 | plt.plot(x_values, kpls_1_results, 'k--', label='KPLS 1C') 61 | plt.plot(x_values, kpls_4_results, 'k:', label='KPLS 4C') 62 | plt.plot(x_values, kpls_8_results, 'k-.', label='KPLS 8C') 63 | plt.legend(loc=4) 64 | 65 | plt.show() 66 | fig.clear() 67 | 68 | # Plot some of the extracted components 69 | 70 | # These figures plot the underlying function based on 100 (xi, z(xi)) pairs 71 | # as a dotted line in the original problem space. The component extracted 72 | # is a single vector in the 100-dimensional transformed feature space. Each 73 | # dimension in feature space corresponds to a K(?, xi) kernel function. As 74 | # the kernel in this case is the Gaussian kernel which is spatially 75 | # localised, it is workable to map each K(?, xi) function to the 76 | # x-coordinate xi for display in this manner. In the general case, 77 | # meaningfully plotting the components in kernel space is likely to be 78 | # difficult. 79 | 80 | fig = plt.figure('Components found in Kernel PLS regression') 81 | 82 | fig.set_tight_layout(True) # type: ignore[reportAttributeAccessIssue] 83 | 84 | for i in range(0, 8): 85 | plt.subplot(4, 2, (i + 1)) 86 | plt.title('Kernel PLS component {}'.format((i + 1))) 87 | plt.plot(x_values, z_pure, 'k--') 88 | plt.plot(x_values, kpls_8.P[:, i], 'k-') 89 | plt.gca().set_ybound(lower=-1.5, upper=1.0) 90 | 91 | plt.show() 92 | fig.clear() 93 | -------------------------------------------------------------------------------- /regressions/mlr.py: -------------------------------------------------------------------------------- 1 | """A module which implements Multiple Linear Regression.""" 2 | 3 | from . import * 4 | 5 | 6 | class MLR(RegressionBase): 7 | """Multiple Linear Regression 8 | 9 | Standard multiple linear regression assumes the relationship between the 10 | variables (once the means have been subtracted to center both variables) 11 | is Y = X B + E, where E is a matrix of zero-mean noise terms.
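The coefficient estimate is the usual least-squares solution B = (X^T X)^-1 X^T Y, computed from the centred calibration data. As an illustrative sketch (the array values here are invented): ``mlr = MLR(X=np.random.randn(50, 3), Y=np.random.randn(50, 2))`` fits a model, and ``mlr.prediction(np.zeros(3))`` then returns a 1-D array of length 2.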
12 | 13 | Note : 14 | The regression matrix B is found using the pseudo-inverse. In 15 | order for this to be calculable, the number of calibration samples 16 | ``N`` has to be larger than the number of X variables ``n``, and 17 | there must not be any collinearities in the calibration X data. 18 | 19 | Args: 20 | X (ndarray N x n): X calibration data, one row per data sample 21 | Y (ndarray N x m): Y calibration data, one row per data sample 22 | 23 | Attributes: 24 | B (ndarray n x m): Resulting regression matrix 25 | 26 | """ 27 | 28 | B: np.ndarray[tuple[int, int], np.dtype[np.float64]] 29 | 30 | def __init__( 31 | self, 32 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 33 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 34 | ): 35 | Xc, Yc = super()._prepare_data(X, Y) 36 | 37 | if Xc.shape[0] <= Xc.shape[1]: 38 | raise ParameterError( 39 | 'MLR requires more rows (data samples) than input variables (columns of X data)' 40 | ) 41 | 42 | self.B = linalg.inv(Xc.T @ Xc) @ Xc.T @ Yc 43 | 44 | def prediction( 45 | self, Z: np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]] 46 | ) -> np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]]: 47 | """Predict the output resulting from a given input 48 | 49 | Args: 50 | Z (ndarray of floats): The input on which to make the 51 | prediction. Must either be a one dimensional array of the 52 | same length as the number of calibration X variables, or a 53 | two dimensional array with the same number of columns as 54 | the calibration X data and one row for each input row. 55 | 56 | Returns: 57 | Y (ndarray of floats) : The predicted output - either a one 58 | dimensional array of the same length as the number of 59 | calibration Y variables or a two dimensional array with the 60 | same number of columns as the calibration Y data and one row 61 | for each input row. 62 | """ 63 | 64 | if len(Z.shape) == 1: 65 | if Z.shape[0] != self.X_variables: 66 | raise ParameterError( 67 | 'Data provided does not have the same ' 68 | 'number of variables as the original X ' 69 | 'data' 70 | ) 71 | return self.Y_offset + (Z - self.X_offset) @ self.B 72 | else: 73 | if Z.shape[1] != self.X_variables: 74 | raise ParameterError( 75 | 'Data provided does not have the same ' 76 | 'number of variables as the original X ' 77 | 'data' 78 | ) 79 | result = np.empty((Z.shape[0], self.Y_variables)) 80 | for i in range(0, Z.shape[0]): 81 | result[i, :] = self.Y_offset + (Z[i, :] - self.X_offset) @ self.B 82 | return result 83 | -------------------------------------------------------------------------------- /regressions/cls.py: -------------------------------------------------------------------------------- 1 | """A module which implements Classical Least Squares Regression.""" 2 | 3 | from . import * 4 | 5 | 6 | class CLS(RegressionBase): 7 | """Classical Least Squares Regression 8 | 9 | The classical least squares regression approach is to initially swap the 10 | roles of the X and Y variables, perform linear regression and then to 11 | invert the result. It is useful when the number of X variables is larger 12 | than the number of calibration samples available, when conventional 13 | multiple linear regression would be unable to proceed. 14 | 15 | Note : 16 | The regression matrix A_pinv is found using the pseudo-inverse.
In 17 | order for this to be calculable, the number of calibration samples 18 | ``N`` has to be larger than the number of Y variables ``m``, the 19 | number of X variables ``n`` must at least equal the number of Y 20 | variables, there must not be any collinearities in the calibration Y 21 | data and Yt X must be non-singular. 22 | 23 | Args: 24 | X (ndarray N x n): X calibration data, one row per data sample 25 | Y (ndarray N x m): Y calibration data, one row per data sample 26 | 27 | Attributes: 28 | A (ndarray m x n): Resulting regression matrix of X on Y 29 | A_pinv (ndarray n x m): Pseudo-inverse of A 30 | 31 | """ 32 | 33 | A: np.ndarray[tuple[int, int], np.dtype[np.float64]] 34 | A_pinv: np.ndarray[tuple[int, int], np.dtype[np.float64]] 35 | 36 | def __init__( 37 | self, 38 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 39 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 40 | ): 41 | Xc, Yc = super()._prepare_data(X, Y) 42 | 43 | if Yc.shape[0] <= Yc.shape[1]: 44 | raise ParameterError( 45 | 'CLS requires more rows (data samples) than output variables (columns of Y data)' 46 | ) 47 | 48 | if Xc.shape[1] < Yc.shape[1]: 49 | raise ParameterError( 50 | 'CLS requires at least as many input variables ' 51 | '(columns of X data) as output variables ' 52 | '(columns of Y data)' 53 | ) 54 | 55 | self.A = linalg.inv(Yc.T @ Yc) @ Yc.T @ Xc 56 | self.A_pinv = self.A.T @ linalg.inv(self.A @ self.A.T) 57 | 58 | def prediction( 59 | self, Z: np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]] 60 | ) -> np.ndarray[tuple, np.dtype[np.float64]]: 61 | """Predict the output resulting from a given input 62 | 63 | Args: 64 | Z (ndarray of floats): The input on which to make the 65 | prediction. Must either be a one dimensional array of the 66 | same length as the number of calibration X variables, or a 67 | two dimensional array with the same number of columns as 68 | the calibration X data and one row for each input row. 69 | 70 | Returns: 71 | Y (ndarray of floats) : The predicted output - either a one 72 | dimensional array of the same length as the number of 73 | calibration Y variables or a two dimensional array with the 74 | same number of columns as the calibration Y data and one row 75 | for each input row. 76 | """ 77 | 78 | if len(Z.shape) == 1: 79 | if Z.shape[0] != self.X_variables: 80 | raise ParameterError( 81 | 'Data provided does not have the same ' 82 | 'number of variables as the original X ' 83 | 'data' 84 | ) 85 | return self.Y_offset + (Z - self.X_offset) @ self.A_pinv 86 | else: 87 | if Z.shape[1] != self.X_variables: 88 | raise ParameterError( 89 | 'Data provided does not have the same ' 90 | 'number of variables as the original X ' 91 | 'data' 92 | ) 93 | result = np.empty((Z.shape[0], self.Y_variables)) 94 | for i in range(0, Z.shape[0]): 95 | result[i, :] = self.Y_offset + (Z[i, :] - self.X_offset) @ self.A_pinv 96 | return result 97 | -------------------------------------------------------------------------------- /regressions/pls_sb.py: -------------------------------------------------------------------------------- 1 | """A module which implements the PLS-SB algorithm.""" 2 | 3 | from . import * 4 | 5 | 6 | class PLS_SB(RegressionBase): 7 | """Regression using the PLS-SB algorithm. 8 | 9 | The PLS-SB algorithm sets up the same mathematical problem as the PLS2 module, 10 | but then formulates the convergence criterion as an eigenvalue problem 11 | and solves it directly.
It is therefore a deterministic algorithm, but 12 | has the drawback that all components must be extracted at once, even 13 | if only a few are required. Note that the output of PLS-SB is not the 14 | same as PLS2. In the PLS2 each component found is removed from the 15 | working copies of the input matrices by a rank-1 operation so the next 16 | iterations will converge on a new component. In PLS-SB all components 17 | are found at once. 18 | 19 | Args: 20 | X (ndarray N x n): X calibration data, one row per data sample 21 | Y (ndarray N x m): Y calibration data, one row per data sample 22 | g (int): Number of components to extract 23 | 24 | Note: 25 | The attributes of the resulting class are exactly the same as for 26 | :py:class:`pls2.PLS2`. 27 | 28 | """ 29 | 30 | components: int 31 | P: np.ndarray[tuple[int, int], np.dtype[np.float64]] 32 | Q: np.ndarray[tuple[int, int], np.dtype[np.float64]] 33 | T: np.ndarray[tuple[int, int], np.dtype[np.float64]] 34 | U: np.ndarray[tuple[int, int], np.dtype[np.float64]] 35 | W: np.ndarray[tuple[int, int], np.dtype[np.float64]] 36 | C: np.ndarray[tuple[int, int], np.dtype[np.float64]] 37 | B: np.ndarray[tuple[int, int], np.dtype[np.float64]] 38 | 39 | def __init__( 40 | self, 41 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 42 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 43 | g: int, 44 | ): 45 | Xc, Yc = super()._prepare_data(X, Y) 46 | 47 | if g < 1 or g > self.max_rank: 48 | raise ParameterError('Number of required components specified is impossible.') 49 | self.components = g 50 | 51 | XtY = Xc.T @ Yc 52 | _, W = linalg.eigh(XtY @ XtY.T) 53 | 54 | self.W = W[:, : -g - 1 : -1].real 55 | 56 | self.T = Xc @ self.W 57 | self.Q = Yc.T @ self.T 58 | self.Q /= np.linalg.norm(self.Q, axis=0) 59 | self.U = Yc @ self.Q 60 | t_dot_t = (self.T.T @ self.T).diagonal() 61 | self.C = np.diag((self.T.T @ self.U).diagonal() / t_dot_t) 62 | self.P = (Xc.T @ self.T) / t_dot_t 63 | 64 | self.B = self.W @ linalg.inv(self.P.T @ self.W) @ self.C @ self.Q.T 65 | 66 | def prediction( 67 | self, Z: np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]] 68 | ) -> np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]]: 69 | """Predict the output resulting from a given input 70 | 71 | Args: 72 | Z (ndarray of floats): The input on which to make the 73 | prediction. Must either be a one dimensional array of the 74 | same length as the number of calibration X variables, or a 75 | two dimensional array with the same number of columns as 76 | the calibration X data and one row for each input row. 77 | 78 | Returns: 79 | Y (ndarray of floats) : The predicted output - either a one 80 | dimensional array of the same length as the number of 81 | calibration Y variables or a two dimensional array with the 82 | same number of columns as the calibration Y data and one row 83 | for each input row. 
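For example (an illustrative sketch of the shapes only): for a model calibrated with 5 X variables and 2 Y variables, ``prediction(np.zeros(5))`` returns a 1-D array of length 2, while ``prediction(np.zeros((10, 5)))`` returns a 10 x 2 array.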
84 | """ 85 | 86 | if len(Z.shape) == 1: 87 | if Z.shape[0] != self.X_variables: 88 | raise ParameterError( 89 | 'Data provided does not have the same ' 90 | 'number of variables as the original X ' 91 | 'data' 92 | ) 93 | return self.Y_offset + (Z - self.X_offset).T @ self.B 94 | else: 95 | if Z.shape[1] != self.X_variables: 96 | raise ParameterError( 97 | 'Data provided does not have the same ' 98 | 'number of variables as the original X ' 99 | 'data' 100 | ) 101 | result = np.empty((Z.shape[0], self.Y_variables)) 102 | for i in range(0, Z.shape[0]): 103 | result[i, :] = self.Y_offset + (Z[i, :] - self.X_offset).T @ self.B 104 | return result 105 | 106 | def prediction_iterative( 107 | self, Z: np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]] 108 | ) -> np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]]: 109 | """Predict the output resulting from a given input, iteratively 110 | 111 | This produces the same output as the one-step version ``prediction`` 112 | but works by applying each loading in turn to extract the latent 113 | variables corresponding to the input. 114 | 115 | Args: 116 | Z (ndarray of floats): The input on which to make the 117 | prediction. Must either be a one dimensional array of the 118 | same length as the number of calibration X variables, or a 119 | two dimensional array with the same number of columns as 120 | the calibration X data and one row for each input row. 121 | 122 | Returns: 123 | Y (ndarray of floats) : The predicted output - either a one 124 | dimensional array of the same length as the number of 125 | calibration Y variables or a two dimensional array with the 126 | same number of columns as the calibration Y data and one row 127 | for each input row. 128 | """ 129 | 130 | if len(Z.shape) == 1: 131 | if Z.shape[0] != self.X_variables: 132 | raise ParameterError( 133 | 'Data provided does not have the same ' 134 | 'number of variables as the original X ' 135 | 'data' 136 | ) 137 | 138 | x_j = Z - self.X_offset 139 | t = np.empty((self.components)) 140 | for j in range(0, self.components): 141 | t[j] = x_j @ self.W[:, j] 142 | x_j = x_j - t[j] * self.P[:, j] 143 | result = self.Y_offset + t @ self.C @ self.Q.T 144 | 145 | return result 146 | 147 | else: 148 | if Z.shape[1] != self.X_variables: 149 | raise ParameterError( 150 | 'Data provided does not have the same ' 151 | 'number of variables as the original X ' 152 | 'data' 153 | ) 154 | result = np.empty((Z.shape[0], self.Y_variables)) 155 | t = np.empty((self.components)) 156 | 157 | for k in range(0, Z.shape[0]): 158 | x_j = Z[k, :] - self.X_offset 159 | for j in range(0, self.components): 160 | t[j] = x_j @ self.W[:, j] 161 | x_j = x_j - t[j] * self.P[:, j] 162 | result[k, :] = self.Y_offset + t @ self.C @ self.Q.T 163 | 164 | return result 165 | -------------------------------------------------------------------------------- /regressions/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Regressions 3 | =========== 4 | 5 | Provides various forms of regression which are not all covered by other 6 | Python statistical packages. The aim is to achieve clarity of 7 | implementation with speed a secondary goal. Python 3.5 and Numpy 1.10 or 8 | greater are required as the new '@' matrix multiplication operator is 9 | used. 10 | 11 | All of the regressions require the X and Y data to be provided in the form 12 | of matrices, with one row per data sample and the same number of data 13 | samples in each. 
Currently missing data or NaN are not supported. 14 | 15 | """ 16 | 17 | # Copyright (c) 2015-2025, James Humphry - see LICENSE file for details 18 | 19 | import abc 20 | 21 | import numpy as np 22 | 23 | try: 24 | import scipy.linalg as linalg # noqa: F401 25 | 26 | _linalg_source = 'scipy' 27 | except ImportError: 28 | import numpy.linalg as linalg # noqa: F401 29 | 30 | _linalg_source = 'numpy' 31 | 32 | 33 | class ParameterError(Exception): 34 | """Parameters passed to a regression routine are unacceptable 35 | 36 | This is a generic exception used to indicate that the parameters 37 | passed are mis-matched, nonsensical or otherwise problematic. 38 | """ 39 | 40 | 41 | class ConvergenceError(Exception): 42 | """Iterative algorithm failed to converge. 43 | 44 | Many of the routines used for regressions are iterative and in some 45 | cases may not converge. This is mainly likely to happen if the data 46 | has pathological features, or if too many components of a data set 47 | have been extracted by an iterative process and the residue is 48 | becoming dominated by rounding or other errors. 49 | """ 50 | 51 | 52 | DEFAULT_MAX_ITERATIONS: int = 250 53 | """Default maximum number of iterations that iterative routines will 54 | attempt before raising a ConvergenceError.""" 55 | 56 | DEFAULT_EPSILON: float = 1.0e-6 57 | """A default epsilon value used in various places, such as to decide when 58 | iterations have converged sufficiently.""" 59 | 60 | 61 | class RegressionBase(metaclass=abc.ABCMeta): 62 | """Abstract base class for regressions 63 | 64 | All the various types of regression objects will have at least the 65 | attributes present here. 66 | 67 | Args: 68 | X (ndarray N x n): X calibration data, one row per data sample 69 | Y (ndarray N x m): Y calibration data, one row per data sample 70 | standardize_X (boolean, optional): Standardize the X data 71 | standardize_Y (boolean, optional): Standardize the Y data 72 | 73 | Attributes: 74 | data_samples (int): number of calibration data samples (=N) 75 | max_rank (int): maximum rank of calibration X-data (limits the 76 | number of components that can be found) 77 | X_variables (int): number of X variables (=n) 78 | Y_variables (int): number of Y variables (=m) 79 | X_offset (float): Offset of calibration X data from zero 80 | Y_offset (float): Offset of calibration Y data from zero 81 | standardized_X (boolean): whether X data had variance standardized 82 | standardized_Y (boolean): whether Y data had variance standardized 83 | X_rscaling (float): the reciprocal of the scaling factor used for X 84 | Y_scaling (float): the scaling factor used for Y 85 | """ 86 | 87 | # Type declarations for attributes: 88 | data_samples: int 89 | max_rank: int 90 | X_variables: int 91 | Y_variables: int 92 | X_offset: float 93 | Y_offset: float 94 | standardized_X: bool 95 | standardized_Y: bool 96 | X_rscaling: float 97 | Y_scaling: float 98 | 99 | @abc.abstractmethod 100 | def __init__( 101 | self, 102 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 103 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 104 | standardize_X: bool = False, 105 | standardize_Y: bool = False, 106 | ): 107 | pass 108 | 109 | def _prepare_data( 110 | self, 111 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 112 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 113 | standardize_X: bool = False, 114 | standardize_Y: bool = False, 115 | ) -> tuple[ 116 | np.ndarray[tuple[int, int], np.dtype[np.float64]], 117 | np.ndarray[tuple[int, int], 
np.dtype[np.float64]], 118 | ]: 119 | """A private method that conducts routine data preparation 120 | 121 | Sets all of the RegressionBase attributes on ``self`` and returns 122 | suitably centred and (where requested) variance standardized X and 123 | Y data. 124 | 125 | Args: 126 | X (ndarray N x n): X calibration data, one row per data sample 127 | Y (ndarray N x m): Y calibration data, one row per data sample 128 | standardize_X (boolean, optional): Standardize the X data 129 | standardize_Y (boolean, optional): Standardize the Y data 130 | 131 | Returns: 132 | Xc (ndarray N x n): Centralized and standardized X data 133 | Yc (ndarray N x m): Centralized and standardized Y data 134 | 135 | """ 136 | 137 | if X.shape[0] != Y.shape[0]: 138 | raise ParameterError('X and Y data must have the same number of rows (data samples)') 139 | 140 | # Change 1-D arrays into column vectors 141 | if len(X.shape) == 1: 142 | X = X.reshape((X.shape[0], 1)) # type: ignore[reportAssignmentType] 143 | 144 | if len(Y.shape) == 1: 145 | Y = Y.reshape((Y.shape[0], 1)) # type: ignore[reportAssignmentType] 146 | 147 | self.max_rank = min(X.shape) 148 | self.data_samples = X.shape[0] 149 | self.X_variables = X.shape[1] 150 | self.Y_variables = Y.shape[1] 151 | self.standardized_X = standardize_X 152 | self.standardized_Y = standardize_Y 153 | 154 | self.X_offset = X.mean(0) 155 | Xc = X - self.X_offset 156 | 157 | if standardize_X: 158 | # The reciprocals of the standard deviations of each column are 159 | # stored as these are what are needed for fast prediction 160 | self.X_rscaling = 1.0 / Xc.std(0, ddof=1) 161 | Xc *= self.X_rscaling 162 | else: 163 | self.X_rscaling = 1.0 164 | 165 | self.Y_offset = Y.mean(0) 166 | Yc = Y - self.Y_offset 167 | if standardize_Y: 168 | self.Y_scaling = Y.std(0, ddof=1) 169 | Yc /= self.Y_scaling 170 | else: 171 | self.Y_scaling = 1.0 172 | 173 | return Xc, Yc 174 | 175 | @abc.abstractmethod 176 | def prediction( 177 | self, Z: np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]] 178 | ) -> np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]]: 179 | """Predict the output resulting from a given input 180 | 181 | Args: 182 | Z (ndarray of floats): The input on which to make the 183 | prediction. Must either be a one dimensional array of the 184 | same length as the number of calibration X variables, or a 185 | two dimensional array with the same number of columns as 186 | the calibration X data and one row for each input row. 187 | 188 | Returns: 189 | Y (ndarray of floats) : The predicted output - either a one 190 | dimensional array of the same length as the number of 191 | calibration Y variables or a two dimensional array with the 192 | same number of columns as the calibration Y data and one row 193 | for each input row. 194 | """ 195 | pass 196 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = a4 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. 
Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/regressions.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/regressions.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/regressions" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/regressions" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 
167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 2> nul 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. 
Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\regressions.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\regressions.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 
185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /regressions/kernel_pls.py: -------------------------------------------------------------------------------- 1 | """A module which implements kernel PLS.""" 2 | 3 | import random 4 | 5 | from . import * 6 | from .kernels import Kernel_Function 7 | 8 | 9 | # pyright: reportUnboundVariable=false 10 | # There will always be at least one iteration so w_j, t_j and q_j are always 11 | # bound before they are used - however pyright cannot do this sort of analysis 12 | 13 | 14 | class Kernel_PLS(RegressionBase): 15 | """Non-linear Kernel PLS regression using the PLS2 algorithm 16 | 17 | This class implements kernel PLS regression by transforming the input 18 | X data into feature space by applying a kernel function between each 19 | pair of inputs. The kernel function provided will be called with two 20 | vectors and should return a float. Kernels should be symmetrical with 21 | regard to the order in which the vectors are supplied. The PLS2 22 | algorithm is then applied to the transformed data. The application of 23 | the kernel function means that non-linear transformations are 24 | possible. 
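A typical use, sketched along the lines of the bundled examples (``x_values``, ``y_values`` and ``new_x`` are placeholder names for calibration and test data): ``kern = kernels.make_gaussian_kernel(width=1.8)``, then ``kpls = Kernel_PLS(X=x_values, Y=y_values, g=4, X_kernel=kern)``, and finally ``kpls.prediction(new_x)``.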
25 | 26 | Note: 27 | If ``ignore_failures`` is ``True`` then the resulting object 28 | may have fewer components than requested if convergence does 29 | not succeed. 30 | 31 | Args: 32 | X (ndarray N x n): X calibration data, one row per data sample 33 | Y (ndarray N x m): Y calibration data, one row per data sample 34 | g (int): Number of components to extract 35 | X_kernel (function): Kernel function 36 | max_iterations (int, optional) : Maximum number of iterations of 37 | NIPALS to attempt 38 | iteration_convergence (float, optional): Difference in norm 39 | between two iterations at which point the iteration will be 40 | considered to have converged. 41 | ignore_failures (boolean, optional): Do not raise an error if 42 | iteration has to be abandoned before the requested number 43 | of components have been recovered 44 | 45 | Attributes: 46 | components (int): number of components extracted (=g) 47 | X_training_set (ndarray N x n): X calibration data (centred) 48 | K (ndarray N x N): X calibration data transformed into feature space 49 | P (ndarray N x g): Loadings on K (Components extracted from data) 50 | Q (ndarray m x g): Loadings on Y (Components extracted from data) 51 | T (ndarray N x g): Scores on K 52 | U (ndarray N x g): Scores on Y 53 | B_RHS (ndarray N x m): Partial regression matrix 54 | 55 | """ 56 | 57 | # Type declarations for attributes: 58 | components: int 59 | X_training_set: np.ndarray[tuple[int, int], np.dtype[np.float64]] 60 | K: np.ndarray[tuple[int, int], np.dtype[np.float64]] 61 | P: np.ndarray[tuple[int, int], np.dtype[np.float64]] 62 | Q: np.ndarray[tuple[int, int], np.dtype[np.float64]] 63 | T: np.ndarray[tuple[int, int], np.dtype[np.float64]] 64 | U: np.ndarray[tuple[int, int], np.dtype[np.float64]] 65 | B_RHS: np.ndarray[tuple[int, int], np.dtype[np.float64]] 66 | 67 | def __init__( 68 | self, 69 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 70 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 71 | g: int, 72 | X_kernel: Kernel_Function, 73 | max_iterations: int = DEFAULT_MAX_ITERATIONS, 74 | iteration_convergence: float = DEFAULT_EPSILON, 75 | ignore_failures: bool = True, 76 | ): 77 | if max_iterations < 1: 78 | raise ParameterError('At least one iteration is necessary') 79 | 80 | if iteration_convergence <= 0.0: 81 | raise ParameterError('Iteration convergence limit must be positive') 82 | 83 | Xc, Yc = super()._prepare_data(X, Y) 84 | 85 | self.X_training_set = Xc 86 | self.X_kernel = X_kernel 87 | 88 | K = np.empty((self.data_samples, self.data_samples)) 89 | for i in range(0, self.data_samples): 90 | for j in range(0, i): 91 | K[i, j] = X_kernel(Xc[i, :], Xc[j, :]) 92 | K[j, i] = K[i, j] 93 | K[i, i] = X_kernel(Xc[i, :], Xc[i, :]) 94 | 95 | centralizer = (np.identity(self.data_samples)) - (1.0 / self.data_samples) * np.ones( 96 | (self.data_samples, self.data_samples) 97 | ) 98 | K = centralizer @ K @ centralizer 99 | self.K = K 100 | 101 | T = np.empty((self.data_samples, g)) 102 | Q = np.empty((self.Y_variables, g)) 103 | U = np.empty((self.data_samples, g)) 104 | P = np.empty((self.data_samples, g)) 105 | 106 | self.components = 0 107 | K_j = K 108 | Y_j = Yc 109 | 110 | for j in range(0, g): 111 | u_j = Y_j[:, random.randint(0, self.Y_variables - 1)] 112 | 113 | iteration_count = 0 114 | iteration_change = iteration_convergence * 10.0 115 | 116 | w_j: np.ndarray 117 | t_j: np.ndarray 118 | q_j: np.ndarray 119 | 120 | while iteration_count < max_iterations and iteration_change > iteration_convergence: 121 | w_j = K_j @ u_j 122 |
t_j = w_j / np.linalg.norm(w_j, 2) 123 | 124 | q_j = Y_j.T @ t_j 125 | 126 | old_u_j = u_j 127 | u_j = Y_j @ q_j 128 | u_j /= np.linalg.norm(u_j, 2) 129 | iteration_change = linalg.norm(u_j - old_u_j) 130 | iteration_count += 1 131 | 132 | if iteration_count >= max_iterations: 133 | if ignore_failures: 134 | break 135 | else: 136 | raise ConvergenceError( 137 | 'PLS2 failed to converge for component: {}'.format(self.components + 1) 138 | ) 139 | 140 | T[:, j] = t_j 141 | Q[:, j] = q_j 142 | U[:, j] = u_j 143 | 144 | P[:, j] = (K_j.T @ w_j) / (w_j @ w_j) # type: ignore 145 | deflator = np.identity(self.data_samples) - np.outer(t_j.T, t_j) 146 | K_j = deflator @ K_j @ deflator 147 | Y_j = Y_j - np.outer(t_j, q_j.T) 148 | self.components += 1 149 | 150 | # If iteration stopped early because of failed convergence, only 151 | # the actual components will be copied 152 | 153 | self.T = T[:, 0 : self.components] 154 | self.Q = Q[:, 0 : self.components] 155 | self.U = U[:, 0 : self.components] 156 | self.P = P[:, 0 : self.components] 157 | 158 | self.B_RHS = self.U @ linalg.inv(self.T.T @ self.K @ self.U) @ self.Q.T 159 | 160 | def prediction( 161 | self, Z: np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]] 162 | ) -> np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]]: 163 | """Predict the output resulting from a given input 164 | 165 | Args: 166 | Z (ndarray of floats): The input on which to make the 167 | prediction. A one-dimensional array will be interpreted as 168 | a single multi-dimensional input unless the number of X 169 | variables in the calibration data was 1, in which case it 170 | will be interpreted as a set of inputs. A two-dimensional 171 | array will be interpreted as one multi-dimensional input 172 | per row. 173 | 174 | Returns: 175 | Y (ndarray of floats) : The predicted output - either a one 176 | dimensional array of the same length as the number of 177 | calibration Y variables or a two dimensional array with the 178 | same number of columns as the calibration Y data and one row 179 | for each input row. 
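Example:
    Input-shape handling in a sketch (``model`` and ``X`` as in the
    class example above). A one-dimensional array is treated as a
    single new sample here because the calibration X data had more
    than one variable:

    >>> single = model.prediction(X[0, :])     # one new sample
    >>> batch = model.prediction(X[:10, :])    # ten new samples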
180 | """ 181 | 182 | if len(Z.shape) == 1: 183 | if self.X_variables == 1: 184 | Z = Z.reshape((Z.shape[0], 1)) # type: ignore[reportAssignmentType] 185 | Kt = np.empty((Z.shape[0], self.data_samples)) 186 | else: 187 | if Z.shape[0] != self.X_variables: 188 | raise ParameterError( 189 | 'Data provided does not have the ' 190 | 'same number of variables as the ' 191 | 'original X data' 192 | ) 193 | Z = Z.reshape((1, Z.shape[0])) # type: ignore[reportAssignmentType] 194 | Kt = np.empty((1, self.data_samples)) 195 | else: 196 | if Z.shape[1] != self.X_variables: 197 | raise ParameterError( 198 | 'Data provided does not have the same ' 199 | 'number of variables as the original X ' 200 | 'data' 201 | ) 202 | Kt = np.empty((Z.shape[0], self.data_samples)) 203 | 204 | for i in range(0, Z.shape[0]): 205 | for j in range(0, self.data_samples): 206 | Kt[i, j] = self.X_kernel(Z[i, :] - self.X_offset, self.X_training_set[j, :]) 207 | 208 | centralizer = (1.0 / self.data_samples) * np.ones((Z.shape[0], self.data_samples)) 209 | 210 | Kt = (Kt - centralizer @ self.K) @ ( 211 | np.identity(self.data_samples) - (1.0 / self.data_samples) * np.ones(self.data_samples) 212 | ) 213 | 214 | # Fix centralisation - appears to be necessary but not usually 215 | # mentioned in papers 216 | 217 | Kt -= Kt.mean(0) 218 | 219 | return self.Y_offset + Kt @ self.B_RHS 220 | -------------------------------------------------------------------------------- /regressions/pls1.py: -------------------------------------------------------------------------------- 1 | """A module which implements the Partial Least Squares 1 algorithm.""" 2 | 3 | from . import * 4 | 5 | 6 | class PLS1(RegressionBase): 7 | """Regression using the PLS1 algorithm. 8 | 9 | The PLS1 algorithm forms a set of new latent variables from the 10 | provided X and Y data samples based on criteria that balance the need 11 | to explain the variance within X and Y and the covariance between X 12 | and Y. Regression is then performed on the latent variables. PLS1 only 13 | addresses the case of a single Y variable and if more than one output 14 | variable is required then PLS1 will be run multiple times. PLS1 is a 15 | deterministic algorithm that requires one iteration per component 16 | extracted. 17 | 18 | Note: 19 | If ``ignore_failures`` is ``True`` then the resulting object 20 | may have fewer components than requested if convergence does 21 | not succeed. 
22 | 23 | Args: 24 | X (ndarray N x n): X calibration data, one row per data sample 25 | Y (ndarray N x m): Y calibration data, one row per data sample 26 | g (int): Number of components to extract 27 | epsilon (float, optional): Value at which the components 28 | extracted will be considered to be too small to be stable 29 | and iteration will cease 30 | ignore_failures (boolean, optional): Do not raise an error if 31 | iteration has to be abandoned before the requested number 32 | of components have been recovered 33 | 34 | Attributes: 35 | components (int): number of components extracted (=g) 36 | W (ndarray m x n x g): Weight vectors 37 | P (ndarray m x n x g): Loadings (Components extracted from data) 38 | T (ndarray m x N x g): Scores 39 | c (ndarray m x g): Regression coefficients 40 | b (ndarray m x n): Resulting regression matrix 41 | 42 | """ 43 | 44 | # Type declarations for attributes: 45 | components: int 46 | W: np.ndarray[tuple[int, int, int], np.dtype[np.float64]] 47 | P: np.ndarray[tuple[int, int, int], np.dtype[np.float64]] 48 | T: np.ndarray[tuple[int, int, int], np.dtype[np.float64]] 49 | c: np.ndarray[tuple[int, int], np.dtype[np.float64]] 50 | b: np.ndarray[tuple[int, int], np.dtype[np.float64]] 51 | 52 | def __init__( 53 | self, 54 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 55 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 56 | g: int, 57 | epsilon: float = DEFAULT_EPSILON, 58 | ignore_failures: bool = False, 59 | ): 60 | if epsilon <= 0.0: 61 | raise ParameterError('Epsilon must be positive') 62 | 63 | Xc, Yc = super()._prepare_data(X, Y) 64 | 65 | if g < 1 or g > self.max_rank: 66 | raise ParameterError('Number of required components specified is impossible.') 67 | self.components = g 68 | 69 | W = np.empty((self.Y_variables, self.X_variables, g)) 70 | P = np.empty((self.Y_variables, self.X_variables, g)) 71 | T = np.empty((self.Y_variables, self.data_samples, g)) 72 | c = np.empty((self.Y_variables, g)) 73 | b = np.empty((self.Y_variables, self.X_variables)) 74 | 75 | for z in range(0, self.Y_variables): 76 | X_j = Xc 77 | y_j = Yc[:, z] 78 | 79 | for j in range(0, g): 80 | w_j = X_j.T @ y_j 81 | w_j /= linalg.norm(w_j, 2) 82 | 83 | t_j = X_j @ w_j 84 | tt_j = t_j.T @ t_j 85 | 86 | c_j = (t_j.T @ y_j) / tt_j 87 | if c_j < epsilon: 88 | if ignore_failures: 89 | if self.components > j: 90 | self.components = j # See comment below 91 | break 92 | else: 93 | raise ConvergenceError('PLS1 failed at iteration: g={}, j={}'.format(g, j)) 94 | 95 | p_j = (X_j.T @ t_j) / tt_j 96 | 97 | X_j = X_j - np.outer(t_j, p_j.T) # Reduce in rank 98 | y_j = y_j - t_j * c_j 99 | 100 | W[z, :, j] = w_j 101 | P[z, :, j] = p_j 102 | T[z, :, j] = t_j 103 | c[z, j] = c_j 104 | else: 105 | # N.B - don't try to find the regression matrix if the 106 | # iteration failed! Inversion won't work... 107 | b[z, :] = W[z, :, :] @ linalg.inv(P[z, :, :].T @ W[z, :, :]) @ c[z, :] 108 | 109 | # If one of the iterations fails due to c_j becoming too small, then 110 | # self.components will be reduced and the output will be cut down to 111 | # the lowest number of iterations achieved for any of the Y variables. 112 | # Of course, b may no longer be a particularly good regression vector 113 | # in this case. 
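# The loop's else clause above collapses the g components for each Y
# variable z into a single regression vector, b_z = W_z (P_z^T W_z)^-1 c_z,
# so that prediction() needs only one matrix product per sample instead
# of extracting each latent variable in turn.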
114 | self.W = W[:, :, 0 : self.components] 115 | self.P = P[:, :, 0 : self.components] 116 | self.T = T[:, :, 0 : self.components] 117 | self.c = c[:, 0 : self.components] 118 | self.b = b 119 | 120 | def prediction( 121 | self, Z: np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]] 122 | ) -> np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]]: 123 | """Predict the output resulting from a given input 124 | 125 | Args: 126 | Z (ndarray of floats): The input on which to make the 127 | prediction. Must either be a one dimensional array of the 128 | same length as the number of calibration X variables, or a 129 | two dimensional array with the same number of columns as 130 | the calibration X data and one row for each input row. 131 | 132 | Returns: 133 | Y (ndarray of floats) : The predicted output - either a one 134 | dimensional array of the same length as the number of 135 | calibration Y variables or a two dimensional array with the 136 | same number of columns as the calibration Y data and one row 137 | for each input row. 138 | """ 139 | 140 | if len(Z.shape) == 1: 141 | if Z.shape[0] != self.X_variables: 142 | raise ParameterError( 143 | 'Data provided does not have the same ' 144 | 'number of variables as the original X ' 145 | 'data' 146 | ) 147 | return self.Y_offset + (Z - self.X_offset).T @ self.b.T 148 | else: 149 | if Z.shape[1] != self.X_variables: 150 | raise ParameterError( 151 | 'Data provided does not have the same ' 152 | 'number of variables as the original X ' 153 | 'data' 154 | ) 155 | result = np.empty((Z.shape[0], self.Y_variables)) 156 | for i in range(0, Z.shape[0]): 157 | result[i, :] = self.Y_offset + (Z[i, :] - self.X_offset).T @ self.b.T 158 | return result 159 | 160 | def prediction_iterative( 161 | self, Z: np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]] 162 | ) -> np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]]: 163 | """Predict the output resulting from a given input, iteratively 164 | 165 | This produces the same output as the one-step version ``prediction`` 166 | but works by applying each loading in turn to extract the latent 167 | variables corresponding to the input. 168 | 169 | Args: 170 | Z (ndarray of floats): The input on which to make the 171 | prediction. Must either be a one dimensional array of the 172 | same length as the number of calibration X variables, or a 173 | two dimensional array with the same number of columns as 174 | the calibration X data and one row for each input row. 175 | 176 | Returns: 177 | Y (ndarray of floats) : The predicted output - either a one 178 | dimensional array of the same length as the number of 179 | calibration Y variables or a two dimensional array with the 180 | same number of columns as the calibration Y data and one row 181 | for each input row. 
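Example:
    The one-step and iterative prediction paths should agree to
    numerical precision (sketch; ``model`` and ``X`` as in the class
    example above):

    >>> Z = X[:5, :]
    >>> np.allclose(model.prediction(Z), model.prediction_iterative(Z))
    True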
182 | """ 183 | 184 | if len(Z.shape) == 1: 185 | if Z.shape[0] != self.X_variables: 186 | raise ParameterError( 187 | 'Data provided does not have the same ' 188 | 'number of variables as the original X ' 189 | 'data' 190 | ) 191 | result = np.empty((Z.shape[0], self.Y_variables)) 192 | result[:, :] = self.Y_offset 193 | for k in range(0, self.Y_variables): 194 | x_j = Z - self.X_offset 195 | t = np.empty((self.components)) 196 | for j in range(0, self.components): 197 | t[j] = x_j @ self.W[k, :, j] 198 | x_j = x_j - t[j] * self.P[k, :, j] 199 | result[k] += self.c[k, :] @ t 200 | 201 | return result 202 | 203 | else: 204 | if Z.shape[1] != self.X_variables: 205 | raise ParameterError( 206 | 'Data provided does not have the same ' 207 | 'number of variables as the original X ' 208 | 'data' 209 | ) 210 | result = np.empty((Z.shape[0], self.Y_variables)) 211 | result[:, :] = self.Y_offset 212 | for l in range(0, Z.shape[0]): # noqa: E741 # For consistency with std math usage 213 | for k in range(0, self.Y_variables): 214 | x_j = Z[l, :] - self.X_offset 215 | t = np.empty((self.components)) 216 | for j in range(0, self.components): 217 | t[j] = x_j @ self.W[k, :, j] 218 | x_j = x_j - t[j] * self.P[k, :, j] 219 | result[l, k] += self.c[k, :] @ t 220 | 221 | return result 222 | -------------------------------------------------------------------------------- /regressions/pls2.py: -------------------------------------------------------------------------------- 1 | """A module which implements the Partial Least Squares 2 algorithm.""" 2 | 3 | import random 4 | 5 | from . import * 6 | 7 | 8 | # pyright: reportUnboundVariable=false 9 | # There will always be at least one iteration so w_j, t_j and q_j are always 10 | # bound before they are used - however pyright cannot do this sort of analysis 11 | 12 | 13 | class PLS2(RegressionBase): 14 | """Regression using the PLS2 algorithm. 15 | 16 | The PLS2 algorithm forms a set of new latent variables from the 17 | provided X and Y data samples based on criteria that balance the need 18 | to explain the variance within X and Y and the covariance between X 19 | and Y. Regression is then performed on the latent variables. In 20 | contrast to PLS1, the PLS2 algorithm handles multi-dimensional Y in 21 | one pass, taking into account all of the Y variables at once. Due to 22 | the added complexity relative to PLS1, PLS2 is a non-deterministic 23 | iterative algorithm comparable to the NIPALS algorithm for PCR. 24 | 25 | Note: 26 | If ``ignore_failures`` is ``True`` then the resulting object 27 | may have fewer components than requested if convergence does 28 | not succeed. 29 | 30 | Args: 31 | X (ndarray N x n): X calibration data, one row per data sample 32 | Y (ndarray N x m): Y calibration data, one row per data sample 33 | g (int): Number of components to extract 34 | max_iterations (int, optional) : Maximum number of iterations of 35 | NIPALS to attempt 36 | iteration_convergence (float, optional): Difference in norm 37 | between two iterations at which point the iteration will be 38 | considered to have converged. 
39 | ignore_failures (boolean, optional): Do not raise an error if 40 | iteration has to be abandoned before the requested number 41 | of components have been recovered 42 | 43 | Attributes: 44 | components (int): number of components extracted (=g) 45 | P (ndarray n x g): Loadings on X (Components extracted from data) 46 | Q (ndarray m x g): Loadings on Y (Components extracted from data) 47 | T (ndarray N x g): Scores on X 48 | U (ndarray N x g): Scores on Y 49 | W (ndarray n x g): Weight vectors 50 | C (ndarray g x g): Diagonal matrix of regression coefficients 51 | B (ndarray n x m): Final regression matrix 52 | 53 | """ 54 | 55 | # Type declarations for attributes: 56 | components: int 57 | P: np.ndarray[tuple[int, int], np.dtype[np.float64]] 58 | Q: np.ndarray[tuple[int, int], np.dtype[np.float64]] 59 | T: np.ndarray[tuple[int, int], np.dtype[np.float64]] 60 | U: np.ndarray[tuple[int, int], np.dtype[np.float64]] 61 | W: np.ndarray[tuple[int, int], np.dtype[np.float64]] 62 | C: np.ndarray[tuple[int, int], np.dtype[np.float64]] 63 | B: np.ndarray[tuple[int, int], np.dtype[np.float64]] 64 | 65 | def __init__( 66 | self, 67 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 68 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 69 | g: int, 70 | max_iterations: int = DEFAULT_MAX_ITERATIONS, 71 | iteration_convergence: float = DEFAULT_EPSILON, 72 | ignore_failures: bool = True, 73 | ): 74 | if max_iterations < 1: 75 | raise ParameterError('At least one iteration is necessary') 76 | 77 | if iteration_convergence <= 0.0: 78 | raise ParameterError('Iteration convergence limit must be positive') 79 | 80 | Xc, Yc = super()._prepare_data(X, Y) 81 | 82 | if g < 1 or g > self.max_rank: 83 | raise ParameterError('Number of required components specified is impossible.') 84 | 85 | W = np.empty((self.X_variables, g)) 86 | T = np.empty((self.data_samples, g)) 87 | Q = np.empty((self.Y_variables, g)) 88 | U = np.empty((self.data_samples, g)) 89 | P = np.empty((self.X_variables, g)) 90 | c = np.empty((g,)) 91 | 92 | self.components = 0 93 | X_j = Xc 94 | Y_j = Yc 95 | 96 | for j in range(0, g): 97 | u_j = Y_j[:, random.randint(0, self.Y_variables - 1)] 98 | 99 | iteration_count = 0 100 | iteration_change = iteration_convergence * 10.0 101 | 102 | while iteration_count < max_iterations and iteration_change > iteration_convergence: 103 | w_j = X_j.T @ u_j 104 | w_j /= np.linalg.norm(w_j, 2) 105 | 106 | t_j = X_j @ w_j 107 | 108 | q_j = Y_j.T @ t_j 109 | q_j /= np.linalg.norm(q_j, 2) 110 | 111 | old_u_j = u_j 112 | u_j = Y_j @ q_j 113 | iteration_change = linalg.norm(u_j - old_u_j) 114 | iteration_count += 1 115 | 116 | if iteration_count >= max_iterations: 117 | if ignore_failures: 118 | break 119 | else: 120 | raise ConvergenceError( 121 | 'PLS2 failed to converge for component: {}'.format(self.components + 1) 122 | ) 123 | 124 | W[:, j] = w_j 125 | T[:, j] = t_j 126 | Q[:, j] = q_j 127 | U[:, j] = u_j 128 | 129 | t_dot_t = t_j.T @ t_j 130 | c[j] = (t_j.T @ u_j) / t_dot_t 131 | P[:, j] = (X_j.T @ t_j) / t_dot_t 132 | X_j = X_j - np.outer(t_j, P[:, j].T) 133 | Y_j = Y_j - c[j] * np.outer(t_j, q_j.T) 134 | self.components += 1 135 | 136 | # If iteration stopped early because of failed convergence, only 137 | # the actual components will be copied 138 | 139 | self.W = W[:, 0 : self.components] 140 | self.T = T[:, 0 : self.components] 141 | self.Q = Q[:, 0 : self.components] 142 | self.U = U[:, 0 : self.components] 143 | self.P = P[:, 0 : self.components] 144 | self.C = np.diag(c[0 : 
self.components]) 145 | 146 | self.B = self.W @ linalg.inv(self.P.T @ self.W) @ self.C @ self.Q.T 147 | 148 | def prediction( 149 | self, Z: np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]] 150 | ) -> np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]]: 151 | """Predict the output resulting from a given input 152 | 153 | Args: 154 | Z (ndarray of floats): The input on which to make the 155 | prediction. Must either be a one dimensional array of the 156 | same length as the number of calibration X variables, or a 157 | two dimensional array with the same number of columns as 158 | the calibration X data and one row for each input row. 159 | 160 | Returns: 161 | Y (ndarray of floats) : The predicted output - either a one 162 | dimensional array of the same length as the number of 163 | calibration Y variables or a two dimensional array with the 164 | same number of columns as the calibration Y data and one row 165 | for each input row. 166 | """ 167 | 168 | if len(Z.shape) == 1: 169 | if Z.shape[0] != self.X_variables: 170 | raise ParameterError( 171 | 'Data provided does not have the same ' 172 | 'number of variables as the original X ' 173 | 'data' 174 | ) 175 | return self.Y_offset + (Z - self.X_offset).T @ self.B 176 | else: 177 | if Z.shape[1] != self.X_variables: 178 | raise ParameterError( 179 | 'Data provided does not have the same ' 180 | 'number of variables as the original X ' 181 | 'data' 182 | ) 183 | result = np.empty((Z.shape[0], self.Y_variables)) 184 | for i in range(0, Z.shape[0]): 185 | result[i, :] = self.Y_offset + (Z[i, :] - self.X_offset).T @ self.B 186 | return result 187 | 188 | def prediction_iterative( 189 | self, Z: np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]] 190 | ) -> np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]]: 191 | """Predict the output resulting from a given input, iteratively 192 | 193 | This produces the same output as the one-step version ``prediction`` 194 | but works by applying each loading in turn to extract the latent 195 | variables corresponding to the input. 196 | 197 | Args: 198 | Z (ndarray of floats): The input on which to make the 199 | prediction. Must either be a one dimensional array of the 200 | same length as the number of calibration X variables, or a 201 | two dimensional array with the same number of columns as 202 | the calibration X data and one row for each input row. 203 | 204 | Returns: 205 | Y (ndarray of floats) : The predicted output - either a one 206 | dimensional array of the same length as the number of 207 | calibration Y variables or a two dimensional array with the 208 | same number of columns as the calibration Y data and one row 209 | for each input row. 
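Example:
    A sketch with two response variables, checking that the one-step
    and iterative predictions agree (synthetic data; names are
    illustrative):

    >>> import numpy as np
    >>> rng = np.random.default_rng(2)
    >>> X = rng.normal(size=(40, 5))
    >>> Y = X @ rng.normal(size=(5, 2)) + 0.05 * rng.normal(size=(40, 2))
    >>> model = PLS2(X, Y, g=3)
    >>> np.allclose(model.prediction(X), model.prediction_iterative(X))
    True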
210 | """ 211 | 212 | if len(Z.shape) == 1: 213 | if Z.shape[0] != self.X_variables: 214 | raise ParameterError( 215 | 'Data provided does not have the same ' 216 | 'number of variables as the original X ' 217 | 'data' 218 | ) 219 | 220 | x_j = Z - self.X_offset 221 | t = np.empty((self.components)) 222 | for j in range(0, self.components): 223 | t[j] = x_j @ self.W[:, j] 224 | x_j = x_j - t[j] * self.P[:, j] 225 | result = self.Y_offset + t @ self.C @ self.Q.T 226 | 227 | return result 228 | 229 | else: 230 | if Z.shape[1] != self.X_variables: 231 | raise ParameterError( 232 | 'Data provided does not have the same ' 233 | 'number of variables as the original X ' 234 | 'data' 235 | ) 236 | result = np.empty((Z.shape[0], self.Y_variables)) 237 | t = np.empty((self.components)) 238 | 239 | for k in range(0, Z.shape[0]): 240 | x_j = Z[k, :] - self.X_offset 241 | for j in range(0, self.components): 242 | t[j] = x_j @ self.W[:, j] 243 | x_j = x_j - t[j] * self.P[:, j] 244 | result[k, :] = self.Y_offset + t @ self.C @ self.Q.T 245 | 246 | return result 247 | -------------------------------------------------------------------------------- /regressions/fitstats.py: -------------------------------------------------------------------------------- 1 | """A module which implements goodness-of-fit statistics.""" 2 | 3 | try: 4 | import scipy.stats 5 | 6 | _stats_available = True 7 | except ImportError: 8 | _stats_available = False 9 | 10 | from . import * 11 | 12 | 13 | def SS(Y: np.ndarray[tuple[int, int], np.dtype[np.float64]]) -> float: 14 | """Implements the Sum of Squares 15 | 16 | This function calculates the sum of the squared input data. The input 17 | data is first centered by subtracting the mean. 18 | 19 | Args: 20 | Y (ndarray N x m): Y calibration data, one row per data sample 21 | 22 | Returns: 23 | SS (float): The sum of the squares of the input data. 24 | 25 | """ 26 | 27 | # Change 1-D array into column vector 28 | if len(Y.shape) == 1: 29 | Y = Y.reshape((Y.shape[0], 1)) # type: ignore[reportAssignmentType] 30 | 31 | Yc = Y - Y.mean(0) 32 | 33 | return (Yc**2.0).sum() 34 | 35 | 36 | def RESS( 37 | R: type[RegressionBase], 38 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 39 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 40 | others: dict | None = None, 41 | relative: bool = False, 42 | ) -> float: 43 | """Implements the Residual Error Sum of Squares 44 | 45 | This function calculates the RESS statistic for a given regression 46 | class and a set of calibration data. The regression function is 47 | trained on the X and Y data. The X data is then used to predict a set 48 | of Y data. The difference between these predictions and the true Y 49 | data is squared and summed to give the RESS statistic. Note that this 50 | statistic can be misleading if used on its own as it can reward 51 | routines which over-fit to the sample data and do not have good 52 | generalisation performance. Consider using in conjunction with the 53 | :py:func:`PRESS` statistic. 54 | 55 | Args: 56 | R (class): A regression class 57 | X (ndarray N x n): X calibration data, one row per data sample 58 | Y (ndarray N x m): Y calibration data, one row per data sample 59 | others (dict, optional): A dict of other parameters to send to the 60 | regression class constructor. 
61 | relative (boolean, optional): whether to divide the error by the 62 | true Y value before squaring and summing - where Y columns have 63 | different scales this may help to prevent the output being 64 | dominated by the column with the largest magnitude. 65 | 66 | Returns: 67 | RESS (float): The RESS statistic. 68 | 69 | """ 70 | 71 | if others is None: 72 | others = {} 73 | 74 | if X.shape[0] != Y.shape[0]: 75 | raise ParameterError('X and Y data must have the same number of rows (data samples)') 76 | 77 | # Change 1-D arrays into column vectors 78 | if len(X.shape) == 1: 79 | X = X.reshape((X.shape[0], 1)) # type: ignore[reportAssignmentType] 80 | 81 | if len(Y.shape) == 1: 82 | Y = Y.reshape((Y.shape[0], 1)) # type: ignore[reportAssignmentType] 83 | 84 | model: RegressionBase = R(X=X, Y=Y, **others) 85 | Yhat = model.prediction(Z=X) 86 | 87 | if relative: 88 | return (((Yhat - Y) / Y) ** 2).sum() 89 | else: 90 | return ((Yhat - Y) ** 2).sum() 91 | 92 | 93 | def R2( 94 | R: type[RegressionBase], 95 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 96 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 97 | others: dict | None = None, 98 | ) -> float: 99 | """Implements the R**2 statistic 100 | 101 | This function calculates the R**2 statistic for a given regression 102 | class and a set of calibration data. This is equal to (1-RESS/SS), 103 | which gives an indication of how much of the initial variation in the 104 | (centered) Y data is explained by the regression model after it has 105 | been trained on the same Y data. Note that an overfitted model can 106 | have a very large R**2 but poor generalisation performance. The 107 | :py:func:`Q2` statistic looks at how much variance in each part of the 108 | Y data is explained by the regression model trained on only the other 109 | parts of the Y data so is more robust against overfitting. 110 | 111 | Args: 112 | R (class): A regression class 113 | X (ndarray N x n): X calibration data, one row per data sample 114 | Y (ndarray N x m): Y calibration data, one row per data sample 115 | others (dict, optional): A dict of other parameters to send to the 116 | regression class constructor. 117 | 118 | Returns: 119 | R2 (float): The R2 statistic. 120 | 121 | """ 122 | 123 | return 1.0 - RESS(R, X, Y, others, relative=False) / SS(Y) 124 | 125 | 126 | def PRESS( 127 | R: type[RegressionBase], 128 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 129 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 130 | groups: int = 4, 131 | others: dict | None = None, 132 | relative: bool = False, 133 | ) -> float: 134 | """Implements the Predicted Residual Error Sum of Squares 135 | 136 | This function calculates the PRESS statistic for a given regression 137 | class and a set of calibration data. Each groups of samples in turn is 138 | removed from the data set, the regression model is trained on the 139 | remaining data, and then is used to predict the Y values of the 140 | samples that were removed. Once a full set of Y predictions has been 141 | produced, the sum of the squared difference between them and the true 142 | Y data is the PRESS statistic. 143 | 144 | Args: 145 | R (class): A regression class 146 | X (ndarray N x n): X calibration data, one row per data sample 147 | Y (ndarray N x m): Y calibration data, one row per data sample 148 | groups (int, optional): Number of cross-validation groups to use 149 | others (dict, optional): A dict of other parameters to send to the 150 | regression class constructor. 
151 | relative (boolean, optional): whether to divide the error by the 152 | true Y value before squaring and summing - where Y columns have 153 | different scales this may help to prevent the output being 154 | dominated by the column with the largest magnitude. 155 | 156 | Returns: 157 | PRESS (float): The PRESS statistic. 158 | 159 | """ 160 | 161 | if others is None: 162 | others = {} 163 | 164 | if X.shape[0] != Y.shape[0]: 165 | raise ParameterError('X and Y data must have the same number of rows (data samples)') 166 | 167 | data_samples = X.shape[0] 168 | 169 | if data_samples < 2: 170 | raise ParameterError( 171 | 'There must be at least two data samples to produce the PRESS statistic.' 172 | ) 173 | 174 | if data_samples < groups: 175 | raise ParameterError( 176 | 'There must be at least as many data samples as cross-validation groups' 177 | ) 178 | 179 | if groups < 2: 180 | raise ParameterError('There must be at least two cross-validation groups') 181 | 182 | group_size = data_samples // groups 183 | start_indexes = [x * group_size for x in range(0, groups)] 184 | end_indexes = [x * group_size for x in range(1, groups + 1)] 185 | end_indexes[-1] = data_samples # Last group may be bigger 186 | 187 | # Change 1-D arrays into column vectors 188 | if len(X.shape) == 1: 189 | X = X.reshape((X.shape[0], 1)) # type: ignore[reportAssignmentType] 190 | 191 | if len(Y.shape) == 1: 192 | Y = Y.reshape((Y.shape[0], 1)) # type: ignore[reportAssignmentType] 193 | 194 | Yhat = np.empty(Y.shape) 195 | 196 | for i in range(0, groups): 197 | samples_excluding_group = data_samples - (end_indexes[i] - start_indexes[i]) 198 | Xp = np.empty((samples_excluding_group, X.shape[1])) 199 | Yp = np.empty((samples_excluding_group, Y.shape[1])) 200 | 201 | Xp[0 : start_indexes[i], :] = X[0 : start_indexes[i], :] 202 | Xp[start_indexes[i] :, :] = X[end_indexes[i] :, :] 203 | Yp[0 : start_indexes[i], :] = Y[0 : start_indexes[i], :] 204 | Yp[start_indexes[i] :, :] = Y[end_indexes[i] :, :] 205 | 206 | model = R(X=Xp, Y=Yp, **others) 207 | Yhat[start_indexes[i] : end_indexes[i], :] = model.prediction( 208 | Z=X[start_indexes[i] : end_indexes[i], :] 209 | ) 210 | 211 | if relative: 212 | return (((Yhat - Y) / Y) ** 2).sum() 213 | else: 214 | return ((Yhat - Y) ** 2).sum() 215 | 216 | 217 | def Q2( 218 | R: type[RegressionBase], 219 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 220 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 221 | groups: int = 4, 222 | others: dict | None = None, 223 | ) -> float: 224 | """Implements the Q**2 statistic 225 | 226 | This function calculates the Q**2 statistic for a given regression 227 | class and a set of calibration data. This is equal to (1-PRESS/SS), 228 | which gives an indication of how much of the initial variation in each 229 | part of the (centered) Y data is explained by the regression model 230 | trained on the other parts of the Y data. This attempts to ensure that 231 | regression models with a tendency to over-fit training data are not 232 | favoured. 233 | 234 | Args: 235 | R (class): A regression class 236 | X (ndarray N x n): X calibration data, one row per data sample 237 | Y (ndarray N x m): Y calibration data, one row per data sample 238 | groups (int, optional): Number of cross-validation groups to use 239 | others (dict, optional): A dict of other parameters to send to the 240 | regression class constructor. 241 | 242 | Returns: 243 | Q2 (float): The Q2 statistic. 
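Example:
    A sketch comparing the in-sample and cross-validated statistics
    (illustrative only: ``X`` and ``Y`` are any calibration arrays,
    and ``PLS2``, imported from :py:mod:`regressions.pls2`, stands in
    for any regression class in this package):

    >>> r2 = R2(PLS2, X, Y, others={'g': 3})
    >>> q2 = Q2(PLS2, X, Y, groups=4, others={'g': 3})
    >>> # q2 typically sits below r2; a large gap suggests overfitting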
244 | 245 | """ 246 | 247 | return 1.0 - PRESS(R, X, Y, groups, others, relative=False) / SS(Y) 248 | 249 | 250 | def residuals_QQ( 251 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 252 | ) -> tuple[ 253 | np.ndarray[tuple[int], np.dtype[np.float64]], np.ndarray[tuple[int, int], np.dtype[np.float64]] 254 | ]: 255 | """Function for creating normal Q-Q probability plots of residuals 256 | 257 | This function is used to explore the residuals left over after a 258 | regression model has been fitted to some calibration data. The input 259 | is a matrix of residuals created by subtracting the true Y calibration 260 | values from the Y values predicted by the regression model when the X 261 | calibration values are input. Each column represents a variable of Y, 262 | and in turn each is centered, divided by the standard deviation of the 263 | values in the column and sorted. 264 | 265 | Theoretical quantiles from the normal distribution and the sample 266 | quantiles for each Y variable are returned. When the theoretical 267 | quantiles are plotted against the sample quantiles for any of the Y 268 | variables, a Q-Q plot is produced. If the residuals are normally 269 | distributed, the points should lie on a straight line through the 270 | origin. 271 | 272 | Requires 'SciPy' to be available. 273 | 274 | Args: 275 | Y (ndarray N x m): Matrix of residuals 276 | 277 | Returns: 278 | X, Y (tuple of ndarray N and ndarray N x m): The theoretical 279 | quantiles from the normal distribution and the sample quantiles 280 | of the standardized residuals 281 | 282 | Raises: 283 | NotImplementedError: SciPy is not available 284 | 285 | """ 286 | 287 | if not _stats_available: 288 | raise NotImplementedError('This function requires SciPy') 289 | 290 | # Change 1-D array into column vector 291 | if len(Y.shape) == 1: 292 | Y = Y.reshape((Y.shape[0], 1)) # type: ignore[reportAssignmentType] 293 | 294 | Yc = Y - Y.mean(0) 295 | Yc /= Yc.std(0) 296 | Yc.sort(0) 297 | 298 | samples = Y.shape[0] 299 | X = np.empty((samples)) 300 | for i in range(0, samples): 301 | X[i] = scipy.stats.norm.ppf(1.0 / (samples + 1) * (i + 1)) # type: ignore[reportPossiblyUnboundVariable] 302 | 303 | return X, Yc 304 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # regressions documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Nov 6 18:03:08 2015. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | 17 | # If extensions (or modules to document with autodoc) are in another directory, 18 | # add these directories to sys.path here. If the directory is relative to the 19 | # documentation root, use os.path.abspath to make it absolute, like shown here. 20 | # sys.path.insert(0, os.path.abspath('.')) 21 | 22 | # -- General configuration ------------------------------------------------ 23 | 24 | # If your documentation needs a minimal Sphinx version, state it here. 25 | # needs_sphinx = '1.0' 26 | 27 | # Add any Sphinx extension module names here, as strings.
They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.todo', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon'] 31 | 32 | # Add any paths that contain templates here, relative to this directory. 33 | templates_path = ['_templates'] 34 | 35 | # The suffix(es) of source filenames. 36 | # You can specify multiple suffixes as a list of strings: 37 | # source_suffix = ['.rst', '.md'] 38 | source_suffix = '.rst' 39 | 40 | # The encoding of source files. 41 | # source_encoding = 'utf-8-sig' 42 | 43 | # The master toctree document. 44 | master_doc = 'index' 45 | 46 | # General information about the project. 47 | project = 'regressions' 48 | copyright = '2015-2025, James Humphry' 49 | author = 'James Humphry' 50 | 51 | # The version info for the project you're documenting, acts as replacement for 52 | # |version| and |release|, also used in various other places throughout the 53 | # built documents. 54 | # 55 | # The short X.Y version. 56 | version = '0.1.0' 57 | # The full version, including alpha/beta/rc tags. 58 | release = '0.1.0' 59 | 60 | # The language for content autogenerated by Sphinx. Refer to documentation 61 | # for a list of supported languages. 62 | # 63 | # This is also used if you do content translation via gettext catalogs. 64 | # Usually you set "language" from the command line for these cases. 65 | language = 'en' 66 | 67 | # There are two options for replacing |today|: either, you set today to some 68 | # non-false value, then it is used: 69 | # today = '' 70 | # Else, today_fmt is used as the format for a strftime call. 71 | # today_fmt = '%B %d, %Y' 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | exclude_patterns = ['_build'] 76 | 77 | # The reST default role (used for this markup: `text`) to use for all 78 | # documents. 79 | # default_role = None 80 | 81 | # If true, '()' will be appended to :func: etc. cross-reference text. 82 | # add_function_parentheses = True 83 | 84 | # If true, the current module name will be prepended to all description 85 | # unit titles (such as .. function::). 86 | # add_module_names = True 87 | 88 | # If true, sectionauthor and moduleauthor directives will be shown in the 89 | # output. They are ignored by default. 90 | # show_authors = False 91 | 92 | # The name of the Pygments (syntax highlighting) style to use. 93 | pygments_style = 'sphinx' 94 | 95 | # A list of ignored prefixes for module index sorting. 96 | # modindex_common_prefix = [] 97 | 98 | # If true, keep warnings as "system message" paragraphs in the built documents. 99 | # keep_warnings = False 100 | 101 | # If true, `todo` and `todoList` produce output, else they produce nothing. 102 | todo_include_todos = True 103 | 104 | 105 | # -- Options for HTML output ---------------------------------------------- 106 | 107 | # The theme to use for HTML and HTML Help pages. See the documentation for 108 | # a list of builtin themes. 109 | html_theme = 'alabaster' 110 | 111 | # Theme options are theme-specific and customize the look and feel of a theme 112 | # further. For a list of options available for each theme, see the 113 | # documentation. 114 | # html_theme_options = {} 115 | 116 | # Add any paths that contain custom themes here, relative to this directory. 117 | # html_theme_path = [] 118 | 119 | # The name for this set of Sphinx documents. If None, it defaults to 120 | # " v documentation".
121 | # html_title = None 122 | 123 | # A shorter title for the navigation bar. Default is the same as html_title. 124 | # html_short_title = None 125 | 126 | # The name of an image file (relative to this directory) to place at the top 127 | # of the sidebar. 128 | # html_logo = None 129 | 130 | # The name of an image file (within the static path) to use as favicon of the 131 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 132 | # pixels large. 133 | # html_favicon = None 134 | 135 | # Add any paths that contain custom static files (such as style sheets) here, 136 | # relative to this directory. They are copied after the builtin static files, 137 | # so a file named "default.css" will overwrite the builtin "default.css". 138 | html_static_path = ['_static'] 139 | 140 | # Add any extra paths that contain custom files (such as robots.txt or 141 | # .htaccess) here, relative to this directory. These files are copied 142 | # directly to the root of the documentation. 143 | # html_extra_path = [] 144 | 145 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 146 | # using the given strftime format. 147 | # html_last_updated_fmt = '%b %d, %Y' 148 | 149 | # If true, SmartyPants will be used to convert quotes and dashes to 150 | # typographically correct entities. 151 | # html_use_smartypants = True 152 | 153 | # Custom sidebar templates, maps document names to template names. 154 | # html_sidebars = {} 155 | 156 | # Additional templates that should be rendered to pages, maps page names to 157 | # template names. 158 | # html_additional_pages = {} 159 | 160 | # If false, no module index is generated. 161 | # html_domain_indices = True 162 | 163 | # If false, no index is generated. 164 | # html_use_index = True 165 | 166 | # If true, the index is split into individual pages for each letter. 167 | # html_split_index = False 168 | 169 | # If true, links to the reST sources are added to the pages. 170 | # html_show_sourcelink = True 171 | 172 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 173 | # html_show_sphinx = True 174 | 175 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 176 | # html_show_copyright = True 177 | 178 | # If true, an OpenSearch description file will be output, and all pages will 179 | # contain a tag referring to it. The value of this option must be the 180 | # base URL from which the finished HTML is served. 181 | # html_use_opensearch = '' 182 | 183 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 184 | # html_file_suffix = None 185 | 186 | # Language to be used for generating the HTML full-text search index. 187 | # Sphinx supports the following languages: 188 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 189 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 190 | # html_search_language = 'en' 191 | 192 | # A dictionary with options for the search language support, empty by default. 193 | # Now only 'ja' uses this config value 194 | # html_search_options = {'type': 'default'} 195 | 196 | # The name of a javascript file (relative to the configuration directory) that 197 | # implements a search results scorer. If empty, the default will be used. 198 | # html_search_scorer = 'scorer.js' 199 | 200 | # Output file base name for HTML help builder.
201 | htmlhelp_basename = 'regressionsdoc' 202 | 203 | # -- Options for LaTeX output --------------------------------------------- 204 | 205 | latex_elements = { 206 | # The paper size ('letterpaper' or 'a4paper'). 207 | #'papersize': 'letterpaper', 208 | # The font size ('10pt', '11pt' or '12pt'). 209 | #'pointsize': '10pt', 210 | # Additional stuff for the LaTeX preamble. 211 | #'preamble': '', 212 | # Latex figure (float) alignment 213 | #'figure_align': 'htbp', 214 | } 215 | 216 | # Grouping the document tree into LaTeX files. List of tuples 217 | # (source start file, target name, title, 218 | # author, documentclass [howto, manual, or own class]). 219 | latex_documents = [ 220 | (master_doc, 'regressions.tex', 'regressions Documentation', 'James Humphry', 'manual'), 221 | ] 222 | 223 | # The name of an image file (relative to this directory) to place at the top of 224 | # the title page. 225 | # latex_logo = None 226 | 227 | # For "manual" documents, if this is true, then toplevel headings are parts, 228 | # not chapters. 229 | # latex_use_parts = False 230 | 231 | # If true, show page references after internal links. 232 | # latex_show_pagerefs = False 233 | 234 | # If true, show URL addresses after external links. 235 | # latex_show_urls = False 236 | 237 | # Documents to append as an appendix to all manuals. 238 | # latex_appendices = [] 239 | 240 | # If false, no module index is generated. 241 | # latex_domain_indices = True 242 | 243 | 244 | # -- Options for manual page output --------------------------------------- 245 | 246 | # One entry per manual page. List of tuples 247 | # (source start file, name, description, authors, manual section). 248 | man_pages = [(master_doc, 'regressions', 'regressions Documentation', [author], 1)] 249 | 250 | # If true, show URL addresses after external links. 251 | # man_show_urls = False 252 | 253 | 254 | # -- Options for Texinfo output ------------------------------------------- 255 | 256 | # Grouping the document tree into Texinfo files. List of tuples 257 | # (source start file, target name, title, author, 258 | # dir menu entry, description, category) 259 | texinfo_documents = [ 260 | ( 261 | master_doc, 262 | 'regressions', 263 | 'regressions Documentation', 264 | author, 265 | 'regressions', 266 | 'One line description of project.', 267 | 'Miscellaneous', 268 | ), 269 | ] 270 | 271 | # Documents to append as an appendix to all manuals. 272 | # texinfo_appendices = [] 273 | 274 | # If false, no module index is generated. 275 | # texinfo_domain_indices = True 276 | 277 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 278 | # texinfo_show_urls = 'footnote' 279 | 280 | # If true, do not generate a @detailmenu in the "Top" node's menu. 281 | # texinfo_no_detailmenu = False 282 | 283 | 284 | # -- Options for Epub output ---------------------------------------------- 285 | 286 | # Bibliographic Dublin Core info. 287 | epub_title = project 288 | epub_author = author 289 | epub_publisher = author 290 | epub_copyright = copyright 291 | 292 | # The basename for the epub file. It defaults to the project name. 293 | # epub_basename = project 294 | 295 | # The HTML theme for the epub output. Since the default themes are not optimized 296 | # for small screen space, using the same theme for HTML and epub output is 297 | # usually not wise. This defaults to 'epub', a theme designed to save visual 298 | # space. 299 | # epub_theme = 'epub' 300 | 301 | # The language of the text. 
It defaults to the language option 302 | # or 'en' if the language is not set. 303 | # epub_language = '' 304 | 305 | # The scheme of the identifier. Typical schemes are ISBN or URL. 306 | # epub_scheme = '' 307 | 308 | # The unique identifier of the text. This can be an ISBN number 309 | # or the project homepage. 310 | # epub_identifier = '' 311 | 312 | # A unique identification for the text. 313 | # epub_uid = '' 314 | 315 | # A tuple containing the cover image and cover page html template filenames. 316 | # epub_cover = () 317 | 318 | # A sequence of (type, uri, title) tuples for the guide element of content.opf. 319 | # epub_guide = () 320 | 321 | # HTML files that should be inserted before the pages created by sphinx. 322 | # The format is a list of tuples containing the path and title. 323 | # epub_pre_files = [] 324 | 325 | # HTML files that should be inserted after the pages created by sphinx. 326 | # The format is a list of tuples containing the path and title. 327 | # epub_post_files = [] 328 | 329 | # A list of files that should not be packed into the epub file. 330 | epub_exclude_files = ['search.html'] 331 | 332 | # The depth of the table of contents in toc.ncx. 333 | # epub_tocdepth = 3 334 | 335 | # Allow duplicate toc entries. 336 | # epub_tocdup = True 337 | 338 | # Choose between 'default' and 'includehidden'. 339 | # epub_tocscope = 'default' 340 | 341 | # Fix unsupported image types using Pillow. 342 | # epub_fix_images = False 343 | 344 | # Scale large images. 345 | # epub_max_image_width = 0 346 | 347 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 348 | # epub_show_urls = 'inline' 349 | 350 | # If false, no index is generated. 351 | # epub_use_index = True 352 | -------------------------------------------------------------------------------- /regressions/pcr.py: -------------------------------------------------------------------------------- 1 | """A module which implements Principal Component Regression.""" 2 | 3 | import random 4 | 5 | from . import * 6 | 7 | # pyright: reportUnboundVariable=false 8 | # There will always be at least one iteration so p_j is always 9 | # bound before it is used - however pyright cannot do this sort of analysis 10 | 11 | 12 | class PCR_NIPALS(RegressionBase): 13 | """Principal Components Regression using the NIPALS algorithm 14 | 15 | PCR forms a set of new latent variables from the provided X data 16 | samples which describe as much of the variance in the X data as 17 | possible. The latent variables are then regressed against the provided 18 | Y data. PCR is connected with Principal Components Analysis, where the 19 | latent variables are referred to as Principal Components. 20 | 21 | This class uses the Non-linear Iterative PArtial Least Squares 22 | algorithm to extract the components. Either a fixed number of 23 | components should be specified using the ``g`` argument, or a target 24 | proportion of variation explained by the components should be 25 | specified via ``variation_explained``. The variables of the X and Y 26 | data can have their variances standardized. This is useful if they are 27 | of heterogeneous types as otherwise the components extracted can be 28 | dominated by the effects of different measurement scales rather than 29 | by the actual data. 30 | 31 | Note: 32 | If ``ignore_failures`` is ``True`` then the resulting object 33 | may have fewer components than requested if convergence does 34 | not succeed.
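Example:
    A sketch requesting enough components to explain 95% of the
    variance in X (synthetic data; names are illustrative):

    >>> import numpy as np
    >>> rng = np.random.default_rng(3)
    >>> X = rng.normal(size=(60, 8))
    >>> Y = X[:, :2] @ np.array([[1.0], [-0.5]])
    >>> model = PCR_NIPALS(X, Y, variation_explained=0.95)
    >>> Yhat = model.prediction(X)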
35 | 36 | Args: 37 | X (ndarray N x n): X calibration data, one row per data sample 38 | Y (ndarray N x m): Y calibration data, one row per data sample 39 | g (int): Number of components to extract 40 | variation_explained (float): Proportion of variance in X 41 | calibration data that the components extracted should explain 42 | (from 0.001 - 0.999) 43 | standardize_X (boolean, optional): Standardize the X data 44 | standardize_Y (boolean, optional): Standardize the Y data 45 | max_iterations (int, optional) : Maximum number of iterations of 46 | NIPALS to attempt 47 | iteration_convergence (float, optional): Difference in norm 48 | between two iterations at which point the iteration will be 49 | considered to have converged. 50 | ignore_failures (boolean, optional): Do not raise an error if 51 | iteration has to be abandoned before the requested number 52 | of components, or the requested coverage, has been achieved. 53 | 54 | Attributes: 55 | components (int): number of components extracted (=g) 56 | T (ndarray N x g): Scores 57 | P (ndarray n x g): Loadings (Components extracted from data) 58 | eigenvalues (ndarray g): Eigenvalues extracted 59 | total_variation (float): Total variation in calibration X data 60 | C (ndarray g x m): Regression coefficients 61 | PgC (ndarray n x m): Precalculated matrix product of P (limited to 62 | g components) and C 63 | 64 | """ 65 | 66 | # Type declarations for attributes: 67 | components: int 68 | T: np.ndarray[tuple[int, int], np.dtype[np.float64]] 69 | P: np.ndarray[tuple[int, int], np.dtype[np.float64]] 70 | eigenvalues: np.ndarray[tuple[int], np.dtype[np.float64]] 71 | total_variation: float 72 | C: np.ndarray[tuple[int, int], np.dtype[np.float64]] 73 | PgC: np.ndarray[tuple[int, int], np.dtype[np.float64]] 74 | 75 | def __init__( 76 | self, 77 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 78 | Y: np.ndarray[tuple[int, int], np.dtype[np.float64]], 79 | g: int | None = None, 80 | variation_explained: float | None = None, 81 | standardize_X: bool = False, 82 | standardize_Y: bool = False, 83 | max_iterations: int = DEFAULT_MAX_ITERATIONS, 84 | iteration_convergence: float = DEFAULT_EPSILON, 85 | ignore_failures: bool = True, 86 | ): 87 | if max_iterations < 1: 88 | raise ParameterError('At least one iteration is necessary') 89 | 90 | if iteration_convergence <= 0.0: 91 | raise ParameterError('Iteration convergence limit must be positive') 92 | 93 | if (g is None) == (variation_explained is None): 94 | raise ParameterError( 95 | 'Must specify either the number of principal ' 96 | 'components g to use or the proportion of ' 97 | 'data variance that must be explained.' 98 | ) 99 | 100 | if variation_explained is not None: 101 | if variation_explained < 0.001 or variation_explained > 0.999: 102 | raise ParameterError( 103 | 'PCR will not reliably be able to use ' 104 | 'principal components that explain less ' 105 | 'than 0.1% or more than 99.9% of the ' 106 | 'variation in the data.'
107 | ) 108 | 109 | Xc, Yc = super()._prepare_data(X, Y, standardize_X, standardize_Y) 110 | 111 | if g is not None: 112 | if g < 1 or g > self.max_rank: 113 | raise ParameterError('Number of required components specified is impossible.') 114 | 115 | if standardize_X: 116 | self.total_variation = self.X_variables * (self.data_samples - 1.0) 117 | else: 118 | self.total_variation = (Xc @ Xc.T).trace() 119 | 120 | self._perform_pca( 121 | Xc, g, variation_explained, max_iterations, iteration_convergence, ignore_failures 122 | ) 123 | 124 | # Find regression parameters 125 | self.Y_offset = Y.mean(0) 126 | Yc = Y - self.Y_offset 127 | if standardize_Y: 128 | self.Y_scaling = Y.std(0, ddof=1) 129 | Yc /= self.Y_scaling 130 | else: 131 | self.Y_scaling = 1.0 132 | 133 | self.C = np.diag(1.0 / self.eigenvalues) @ self.T.T @ Yc 134 | self.PgC = self.P @ self.C 135 | 136 | def _perform_pca( 137 | self, 138 | X: np.ndarray[tuple[int, int], np.dtype[np.float64]], 139 | g: int | None = None, 140 | variation_explained: float | None = None, 141 | max_iterations: int = DEFAULT_MAX_ITERATIONS, 142 | iteration_convergence: float = DEFAULT_EPSILON, 143 | ignore_failures: bool = True, 144 | ): 145 | """A non-public routine that performs the PCA using an appropriate 146 | method and sets up self.T, self.P, self.eigenvalues and 147 | self.components.""" 148 | 149 | T = np.empty((self.data_samples, self.max_rank)) # Scores 150 | P = np.empty((self.X_variables, self.max_rank)) # Loadings 151 | eig = np.empty((self.max_rank,)) 152 | 153 | self.components = 0 154 | X_j = X 155 | 156 | while True: 157 | t_j = X_j[:, random.randint(0, self.X_variables - 1)] 158 | iteration_count = 0 159 | iteration_change = iteration_convergence * 10.0 160 | 161 | while iteration_count < max_iterations and iteration_change > iteration_convergence: 162 | p_j = X_j.T @ t_j 163 | p_j /= np.linalg.norm(p_j, 2) # Normalise p_j vectors 164 | 165 | old_t_j = t_j 166 | t_j = X_j @ p_j 167 | iteration_change = linalg.norm(t_j - old_t_j) 168 | iteration_count += 1 169 | 170 | if iteration_count >= max_iterations: 171 | if ignore_failures: 172 | break 173 | else: 174 | raise ConvergenceError( 175 | 'NIPALS PCA for PCR failed to converge for component: {}'.format( 176 | self.components + 1 177 | ) 178 | ) 179 | 180 | X_j = X_j - np.outer(t_j, p_j.T) # Reduce in rank 181 | T[:, self.components] = t_j 182 | P[:, self.components] = p_j 183 | eig[self.components] = t_j @ t_j 184 | self.components += 1 185 | 186 | if g is not None: 187 | if self.components == g: 188 | break 189 | 190 | if variation_explained is not None: 191 | if eig[0 : self.components].sum() >= variation_explained * self.total_variation: 192 | break 193 | 194 | # Only copy the components actually used 195 | self.T = T[:, 0 : self.components] 196 | self.P = P[:, 0 : self.components] 197 | 198 | self.eigenvalues = eig[0 : self.components] 199 | 200 | def variation_explained(self) -> float: 201 | """Return the proportion of variation explained 202 | 203 | Returns: 204 | variation_explained (float): Proportion of the total variation 205 | in the X data explained by the extracted principal components.
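Example:
    The returned proportion always lies in (0, 1] (sketch; ``model``
    as in the class example above):

    >>> 0.0 < model.variation_explained() <= 1.0
    True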
211 |     def prediction(
212 |         self, Z: np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]]
213 |     ) -> np.ndarray[tuple[int, int] | tuple[int], np.dtype[np.float64]]:
214 |         """Predict the output resulting from a given input
215 | 
216 |         Args:
217 |             Z (ndarray of floats): The input on which to make the
218 |                 prediction. Must either be a one dimensional array of the
219 |                 same length as the number of calibration X variables, or a
220 |                 two dimensional array with the same number of columns as
221 |                 the calibration X data and one row for each input row.
222 | 
223 |         Returns:
224 |             Y (ndarray of floats): The predicted output - either a one
225 |                 dimensional array of the same length as the number of
226 |                 calibration Y variables, or a two dimensional array with
227 |                 the same number of columns as the calibration Y data and
228 |                 one row for each input row.
229 |         """
230 | 
231 |         if len(Z.shape) == 1:
232 |             if Z.shape[0] != self.X_variables:
233 |                 raise ParameterError(
234 |                     'Data provided does not have the same '
235 |                     'number of variables as the original X '
236 |                     'data'
237 |                 )
238 |         elif Z.shape[1] != self.X_variables:
239 |             raise ParameterError(
240 |                 'Data provided does not have the same number of variables as the original X data'
241 |             )
242 | 
243 |         tmp = Z - self.X_offset  # centre the input as the calibration data was
244 |         if self.standardized_X:
245 |             tmp *= self.X_rscaling  # type: ignore
246 |         tmp = tmp @ self.PgC  # project onto the components and apply the regression
247 |         if self.standardized_Y:
248 |             tmp *= self.Y_scaling  # type: ignore
249 |         return self.Y_offset + tmp
250 | 
251 | 
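# PCR_SVD below is intended as a deterministic drop-in replacement for
# PCR_NIPALS. A hedged sketch of swapping solvers (hypothetical arrays as
# above; the attributes and methods are identical):
#
#     model = PCR_SVD(X_data, Y_data, variation_explained=0.95)
#     Y_hat = model.prediction(Z_data)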
252 | class PCR_SVD(PCR_NIPALS):
253 |     """Principal Components Regression using SVD
254 | 
255 |     This class implements PCR with the same mathematical goals as
256 |     :py:class:`PCR_NIPALS` but using a different method to extract the
257 |     principal components. The convergence criteria in the NIPALS algorithm
258 |     can be formulated into an eigenvalue problem and solved directly using
259 |     an existing SVD-based solver. This has the advantage of being entirely
260 |     deterministic, but the disadvantage that all components have to be
261 |     extracted each time, even if only a few are required to explain most
262 |     of the variance in X.
263 | 
264 |     Note:
265 |         The attributes of the resulting class are exactly the same as for
266 |         :py:class:`PCR_NIPALS`.
267 | 
268 |     Args:
269 |         X (ndarray N x n): X calibration data, one row per data sample
270 |         Y (ndarray N x m): Y calibration data, one row per data sample
271 |         g (int, optional): Number of components to extract. Exactly one
272 |             of g and variation_explained must be specified.
273 |         variation_explained (float, optional): Proportion of variance in
274 |             the X calibration data to be explained (from 0.001 to 0.999)
275 |         standardize_X (boolean, optional): Standardize the X data
276 |         standardize_Y (boolean, optional): Standardize the Y data
277 |         max_iterations: Not relevant for SVD
278 |         iteration_convergence: Not relevant for SVD
279 |         ignore_failures: Not relevant for SVD
280 | 
281 |     """
282 | 
283 |     def _perform_pca(
284 |         self,
285 |         X: np.ndarray[tuple[int, int], np.dtype[np.float64]],
286 |         g: int | None = None,
287 |         variation_explained: float | None = None,
288 |         max_iterations: int = DEFAULT_MAX_ITERATIONS,
289 |         iteration_convergence: float = DEFAULT_EPSILON,
290 |         ignore_failures: bool = True,
291 |     ):
292 |         """A non-public routine that performs the PCA using an SVD-based
293 |         solver and sets up self.T, self.P, self.eigenvalues and
294 |         self.components."""
295 | 
296 |         u, s, v = linalg.svd(X, full_matrices=False)  # v holds the right singular vectors as rows
297 | 
298 |         T = u @ np.diag(s)  # Scores
299 |         P = v.T  # Loadings
300 |         eig = (T.T @ T).diagonal()  # squared singular values, in decreasing order
301 | 
302 |         if g is not None:
303 |             self.T = T[:, 0:g]
304 |             self.P = P[:, 0:g]
305 |             self.eigenvalues = eig[0:g]
306 |             self.components = g
307 |         else:
308 |             cuml = eig.cumsum() / self.total_variation
309 |             self.components = cuml.searchsorted(variation_explained) + 1  # smallest count reaching the target coverage
310 |             self.T = T[:, 0 : self.components]
311 |             self.P = P[:, 0 : self.components]
312 |             self.eigenvalues = eig[0 : self.components]
313 | 
--------------------------------------------------------------------------------