├── MANIFEST.in ├── setup.cfg ├── img ├── AIG_b.png ├── AIG_CV.png ├── AIG_T12.png ├── AIG_caseweights.png ├── AIG_yyp_train.png └── AIG_yyp_train_test.png ├── docs ├── modules.rst ├── .vscode │ └── settings.json ├── setup.rst ├── sphinx_requirements.txt ├── rtd-environment.yaml ├── generated │ ├── direpack.dicomo.dicomo.dicomo.rst │ ├── direpack.sprm.snipls.snipls.rst │ ├── direpack.ppdire.ppdire.ppdire.rst │ ├── direpack.sudire.sudire.sudire.rst │ ├── direpack.sprm.sprm.sprm.rst │ ├── direpack.preprocessing.robcent.VersatileScaler.rst │ └── direpack.preprocessing.gsspp.GenSpatialSignPrePprocessor.rst ├── Makefile ├── make.bat ├── index.rst ├── Contributing.rst ├── dicomo.md ├── Cross-validation and plotting.rst ├── conf.py ├── Pre-processing.rst ├── sudire.rst ├── sprm.rst ├── sudire.md ├── ppdire.rst ├── ppdire.md └── sprm.md ├── dev-requirements.txt ├── src └── direpack │ ├── test │ ├── __init__.py │ ├── test_ppdire.py │ ├── test_sprm.py │ ├── test_dicomo.py │ └── test_sudire.py │ ├── sprm │ ├── __init__.py │ ├── _m_support_functions.py │ ├── rm.py │ └── snipls.py │ ├── ppdire │ ├── __init__.py │ ├── _ppdire_utils.py │ └── capi.py │ ├── preprocessing │ ├── __init__.py │ ├── _gsspp_utils.py │ ├── gsspp.py │ ├── _preproc_utilities.py │ └── robcent.py │ ├── utils │ ├── __init__.py │ └── utils.py │ ├── plot │ ├── __init__.py │ ├── sudire_plot.py │ └── ppdire_plot.py │ ├── cross_validation │ ├── __init__.py │ └── _cv_support_functions.py │ ├── sudire │ └── __init__.py │ ├── dicomo │ └── __init__.py │ ├── ipopt_temp │ ├── __init__.py │ ├── jacobian.py │ └── ipopt_wrapper.py │ └── __init__.py ├── direpack_Future_Dev.md ├── requirements.txt ├── .readthedocs.yaml ├── LICENSE ├── setup.py ├── .github └── workflows │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── direpack_Release_Notes.md └── README.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /img/AIG_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_b.png -------------------------------------------------------------------------------- /img/AIG_CV.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_CV.png -------------------------------------------------------------------------------- /img/AIG_T12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_T12.png -------------------------------------------------------------------------------- /img/AIG_caseweights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_caseweights.png -------------------------------------------------------------------------------- /img/AIG_yyp_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_yyp_train.png 
-------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | direpack 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | setup 8 | -------------------------------------------------------------------------------- /img/AIG_yyp_train_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_yyp_train_test.png -------------------------------------------------------------------------------- /docs/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "C:\\Workdir\\Programs\\envs\\mddsdr2\\python.exe" 3 | } -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest>=7.1.3 3 | prospector>=1.7.7 4 | bandit 5 | vulture 6 | coverage>=6.4.4 7 | -------------------------------------------------------------------------------- /docs/setup.rst: -------------------------------------------------------------------------------- 1 | setup module 2 | ============ 3 | 4 | .. automodule:: setup 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /src/direpack/test/__init__.py: -------------------------------------------------------------------------------- 1 | __name__ = "test" 2 | __author__ = "Emmanuel Jordy and Sven" 3 | __license__ = "MIT" 4 | __version__ = "0.0.4" 5 | __date__ = "2024-05-23" 6 | -------------------------------------------------------------------------------- /docs/sphinx_requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | ####### requirements for sphinx####### 3 | ###### Requirements without Version Specifiers ###### 4 | 5 | sphinx-math-dollar 6 | sklearn 7 | direpack 8 | Ball 9 | sympy 10 | -------------------------------------------------------------------------------- /direpack_Future_Dev.md: -------------------------------------------------------------------------------- 1 | Work to do 2 | ---------- 3 | - optimize alignment to `sklearn` 4 | - optimize for speed 5 | - extend to multivariate responses (open research topic for several of the options!) 6 | - extend backend to GPU compatibility 7 | - suggestions and contributions always welcome! 
-------------------------------------------------------------------------------- /src/direpack/sprm/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:17:17 2018 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "sprm" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.8.1" 13 | __date__ = "2024-05-23" 14 | -------------------------------------------------------------------------------- /src/direpack/ppdire/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 9 14:20:17 2019 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "ppdire" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.2.12" 13 | __date__ = "2022-10-22" 14 | -------------------------------------------------------------------------------- /docs/rtd-environment.yaml: -------------------------------------------------------------------------------- 1 | name: RobDimRed-rtd 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python =3.7 7 | - pip 8 | - cyipopt 9 | - sphinx-math-dollar 10 | - scikit-learn 11 | - numpy 12 | - matplotlib 13 | - pandas 14 | - statsmodels 15 | - dcor 16 | - sympy 17 | 18 | -------------------------------------------------------------------------------- /src/direpack/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:17:17 2018 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "preprocessing" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.8.0" 13 | __date__ = "2024-02-23" 14 | -------------------------------------------------------------------------------- /src/direpack/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 11 17:22:09 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | 9 | __name__ = "utils" 10 | __author__ = "Emmanuel Jordy Menvouta and Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.1.0" 13 | __date__ = "2024-02-23" 14 | -------------------------------------------------------------------------------- /src/direpack/plot/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:17:17 2018 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "plot" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.9.0" 13 | __date__ = "2020-04-18" 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/direpack/cross_validation/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:17:17 2018 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "cross_validation" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.7.0" 13 | __date__ = "2020-04-03" 14 | 15 | 16 | 
-------------------------------------------------------------------------------- /src/direpack/sudire/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 11 17:22:09 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | Edits by Sven Serneels. 7 | """ 8 | 9 | 10 | __name__ = "sudire" 11 | __author__ = "Emmanuel Jordy Menvouta" 12 | __license__ = "MIT" 13 | __version__ = "0.1.6" 14 | __date__ = "2022-10-09" 15 | -------------------------------------------------------------------------------- /src/direpack/dicomo/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 9 14:20:17 2019 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "dicomo" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "1.0.4" 13 | __date__ = "2022-10-08" 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/generated/direpack.dicomo.dicomo.dicomo.rst: -------------------------------------------------------------------------------- 1 | direpack.dicomo.dicomo.dicomo 2 | ============================= 3 | 4 | .. currentmodule:: direpack.dicomo.dicomo 5 | 6 | .. autoclass:: dicomo 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~dicomo.__init__ 17 | ~dicomo.fit 18 | ~dicomo.get_params 19 | ~dicomo.set_params 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/direpack/ipopt_temp/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Apr 12 2020 5 | 6 | This folder is temporary. It copies a fix to ipopt: 7 | https://github.com/matthias-k/optpy/blob/master/optpy/jacobian.py 8 | 9 | Folder will stay in direpack until the latter has been released. 10 | 11 | """ 12 | 13 | __name__ = "opt_temp" 14 | __author__ = "Sven Serneels" 15 | __license__ = "MIT" 16 | __version__ = "0.0.2" 17 | __date__ = "2021-04-15" 18 | 19 | 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | ####### sprm-requirements.txt ####### 3 | # 4 | ###### Requirements without Version Specifiers ###### 5 | numpy 6 | matplotlib 7 | scipy >= 1.9.0 8 | sklearn 9 | pandas 10 | statsmodels 11 | # uncheck these for DCOV-SDR and MDD-SDR options in sudire 12 | # Cython 13 | # ipopt 14 | dcor 15 | sympy 16 | scikit-learn 17 | pandas 18 | 19 | 20 | # 21 | ###### Requirements with Version Specifiers ###### 22 | # See https://www.python.org/dev/peps/pep-0440/#version-specifiers 23 | # python > 3.5 24 | 25 | -------------------------------------------------------------------------------- /docs/generated/direpack.sprm.snipls.snipls.rst: -------------------------------------------------------------------------------- 1 | direpack.sprm.snipls.snipls 2 | =========================== 3 | 4 | .. currentmodule:: direpack.sprm.snipls 5 | 6 | .. autoclass:: snipls 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. 
autosummary:: 15 | 16 | ~snipls.__init__ 17 | ~snipls.fit 18 | ~snipls.fit_transform 19 | ~snipls.get_params 20 | ~snipls.predict 21 | ~snipls.score 22 | ~snipls.set_params 23 | ~snipls.transform 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/generated/direpack.ppdire.ppdire.ppdire.rst: -------------------------------------------------------------------------------- 1 | direpack.ppdire.ppdire.ppdire 2 | ============================= 3 | 4 | .. currentmodule:: direpack.ppdire.ppdire 5 | 6 | .. autoclass:: ppdire 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~ppdire.__init__ 17 | ~ppdire.fit 18 | ~ppdire.fit_transform 19 | ~ppdire.get_params 20 | ~ppdire.predict 21 | ~ppdire.score 22 | ~ppdire.set_params 23 | ~ppdire.transform 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/generated/direpack.sudire.sudire.sudire.rst: -------------------------------------------------------------------------------- 1 | direpack.sudire.sudire.sudire 2 | ============================= 3 | 4 | .. currentmodule:: direpack.sudire.sudire 5 | 6 | .. autoclass:: sudire 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~sudire.__init__ 17 | ~sudire.fit 18 | ~sudire.fit_transform 19 | ~sudire.get_params 20 | ~sudire.predict 21 | ~sudire.score 22 | ~sudire.set_params 23 | ~sudire.transform 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/generated/direpack.sprm.sprm.sprm.rst: -------------------------------------------------------------------------------- 1 | direpack.sprm.sprm.sprm 2 | ======================= 3 | 4 | .. currentmodule:: direpack.sprm.sprm 5 | 6 | .. autoclass:: sprm 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~sprm.__init__ 17 | ~sprm.fit 18 | ~sprm.fit_transform 19 | ~sprm.get_params 20 | ~sprm.predict 21 | ~sprm.score 22 | ~sprm.set_params 23 | ~sprm.transform 24 | ~sprm.valscore 25 | ~sprm.weightnewx 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF 13 | formats: 14 | - pdf 15 | 16 | 17 | # Optionally set the version of Python and requirements required to build your docs 18 | python: 19 | version: 3.7 20 | install: 21 | - requirements: docs/sphinx_requirements.txt 22 | 23 | conda: 24 | environment: docs/rtd-environment.yaml 25 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/generated/direpack.preprocessing.robcent.VersatileScaler.rst: -------------------------------------------------------------------------------- 1 | direpack.preprocessing.robcent.VersatileScaler 2 | ============================================== 3 | 4 | .. currentmodule:: direpack.preprocessing.robcent 5 | 6 | .. autoclass:: VersatileScaler 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~VersatileScaler.__init__ 17 | ~VersatileScaler.fit 18 | ~VersatileScaler.fit_transform 19 | ~VersatileScaler.get_params 20 | ~VersatileScaler.inverse_transform 21 | ~VersatileScaler.predict 22 | ~VersatileScaler.set_params 23 | ~VersatileScaler.transform 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/generated/direpack.preprocessing.gsspp.GenSpatialSignPrePprocessor.rst: -------------------------------------------------------------------------------- 1 | direpack.preprocessing.gsspp.GenSpatialSignPrePprocessor 2 | ======================================================== 3 | 4 | .. currentmodule:: direpack.preprocessing.gsspp 5 | 6 | .. autoclass:: GenSpatialSignPrePprocessor 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~GenSpatialSignPrePprocessor.__init__ 17 | ~GenSpatialSignPrePprocessor.fit 18 | ~GenSpatialSignPrePprocessor.fit_transform 19 | ~GenSpatialSignPrePprocessor.get_params 20 | ~GenSpatialSignPrePprocessor.set_params 21 | ~GenSpatialSignPrePprocessor.transform 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Sven Serneels 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/direpack/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:17:17 2018 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "direpack" 10 | __author__ = "Emmanuel Jordy Menvouta, Sven Serneels, Tim Verdonck" 11 | __license__ = "MIT" 12 | __version__ = "1.1.3" 13 | __date__ = "2024-05-23" 14 | 15 | # The commented lines can be uncommented if IPOPT has been installed independently. 
16 | 17 | from .preprocessing.robcent import ( 18 | VersatileScaler, 19 | versatile_scale, 20 | Wrapper, 21 | wrap, 22 | ) 23 | from .preprocessing.gsspp import ( 24 | GenSpatialSignPreProcessor, 25 | gen_ss_pp, 26 | gen_ss_covmat, 27 | ) 28 | from .sprm.sprm import sprm 29 | from .sprm.snipls import snipls 30 | from .sprm.rm import rm 31 | from .cross_validation._cv_support_functions import robust_loss 32 | from .ppdire.ppdire import ppdire 33 | from .ppdire.capi import capi 34 | from .dicomo.dicomo import dicomo 35 | from .sudire.sudire import sudire, estimate_structural_dim 36 | from .plot.sudire_plot import sudire_plot 37 | from .plot.ppdire_plot import ppdire_plot 38 | from .plot.sprm_plot import sprm_plot, sprm_plot_cv 39 | from .ipopt_temp.ipopt_wrapper import minimize_ipopt 40 | from .ipopt_temp.jacobian import * 41 | -------------------------------------------------------------------------------- /src/direpack/sprm/_m_support_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Fri Jan 25 18:22:27 2019 3 | 4 | Functions called internally in M-estimation 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | import numpy as np 10 | import pandas as ps 11 | 12 | 13 | def Fair(x, probct, *args): 14 | return 1 / (1 + abs(x / (probct * 2))) ** 2 15 | 16 | 17 | def Huber(x, probct, *args): 18 | x[np.where(x <= probct)[0]] = 1 19 | x[np.where(x > probct)] = probct / abs(x[np.where(x > probct)]) 20 | return x 21 | 22 | 23 | def Hampel(x, probct, hampelb, hampelr): 24 | wx = x 25 | wx[np.where(x <= probct)[0]] = 1 26 | wx[np.where((x > probct) & (x <= hampelb))[0]] = probct / abs( 27 | x[np.where((x > probct) & (x <= hampelb))[0]] 28 | ) 29 | wx[np.where((x > hampelb) & (x <= hampelr))[0]] = np.divide( 30 | probct * (hampelr - (x[np.where((x > hampelb) & (x <= hampelr))[0]])), 31 | (hampelr - hampelb) * abs(x[np.where((x > hampelb) & (x <= hampelr))[0]]), 32 | ) 33 | wx[np.where(x > hampelr)[0]] = 0 34 | return wx 35 | 36 | 37 | def brokenstick(n_components): 38 | q = np.triu(np.ones((n_components, n_components))) 39 | r = np.empty((n_components, 1), float) 40 | r[0:n_components, 0] = range(1, n_components + 1) 41 | q = np.matmul(q, 1 / r) 42 | q /= n_components 43 | return q 44 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:18:53 2018 5 | 6 | @author: Sven serneels, Ponalytics 7 | """ 8 | 9 | from setuptools import setup, find_packages 10 | import re 11 | import sys 12 | import os 13 | 14 | SRC_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)),"./src") 15 | if SRC_DIR not in sys.path: 16 | sys.path.insert(0,SRC_DIR) 17 | from direpack import __version__, __author__, __license__ 18 | 19 | readme_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'README.md') 20 | try: 21 | from m2r import parse_from_file 22 | readme = parse_from_file(readme_file) 23 | except ImportError: 24 | # m2r may not be installed in user environment 25 | with open(readme_file) as f: 26 | readme = f.read() 27 | 28 | setup( 29 | name="direpack", 30 | version=__version__, 31 | author=__author__, 32 | author_email="svenserneels@gmail.com", 33 | description="A Python 3 Library for State-of-the-Art Statistical Dimension Reduction Methods", 34 | long_description=readme, 35 | 
long_description_content_type='text/markdown', 36 | url="https://github.com/SvenSerneels/direpack", 37 | classifiers=[ 38 | "Programming Language :: Python :: 3", 39 | "License :: OSI Approved :: MIT License", 40 | "Operating System :: OS Independent", 41 | ], 42 | packages=find_packages('src'), # include all packages under src 43 | package_dir={'':'src'}, # tell distutils packages are under src 44 | include_package_data = True, 45 | install_requires=[ 46 | 'numpy>=1.5.0', 47 | 'scipy>=0.8.0', 48 | 'matplotlib>=2.2.0', 49 | 'scikit-learn>=0.18.0', 50 | 'pandas>=0.19.0', 51 | 'statsmodels>=0.8.0', 52 | # 'ipopt>=0.1.5', 53 | 'dcor>=0.3' 54 | ] 55 | ) 56 | 57 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | 3 | Welcome to direpack's documentation! 4 | ==================================== 5 | The direpack package aims to establish a set of modern statistical dimension reduction techniques into the Python universe as a single, consistent package. 6 | The dimension reduction methods included resort into three categories: projection pursuit based dimension reduction, sufficient dimension reduction, and robust M estimators for dimension reduction. 7 | As a corollary, regularized regression estimators based on these reduced dimension spaces are provided as well, ranging from classical principal component regression up to sparse partial robust M regression. 8 | The package also contains a set of classical and robust pre-processing utilities, including generalized spatial signs, as well as dedicated plotting functionality and cross-validation utilities. 9 | Finally, direpack has been written consistent with the scikit-learn API, such that the estimators can flawlessly be included into (statistical and/or machine) learning pipelines in that framework. 10 | 11 | 12 | 13 | Installation 14 | ============ 15 | The package is distributed through PyPI, so use:: 16 | 17 | pip install direpack 18 | 19 | Examples 20 | =============== 21 | Example notebooks have been produced to showcase the use of direpack for statistical dimension reduction. These notebooks contain a `ppdire example `_ , `sprm example `_ and a `sudire example `_ . 22 | 23 | 24 | 25 | 26 | Contents 27 | ======== 28 | 29 | .. toctree:: 30 | :maxdepth: 2 31 | 32 | ppdire 33 | sudire 34 | sprm 35 | Pre-processing 36 | Cross-validation and plotting 37 | 38 | 39 | .. toctree:: 40 | :maxdepth: 1 41 | :caption: Other information 42 | 43 | Contributing 44 | 45 | 46 | 47 | 48 | Indices and tables 49 | ================== 50 | 51 | * :ref:`genindex` 52 | * :ref:`search` 53 | -------------------------------------------------------------------------------- /docs/Contributing.rst: -------------------------------------------------------------------------------- 1 | .. _Contributing: 2 | 3 | ################ 4 | Contributing 5 | ################ 6 | 7 | No package is complete and the authors would like to see direpack extend its functionality in the future. Some possible additions could be : 8 | 9 | - Cellwise robust dimension reduction methods : For instance, a cellwise robust version of the robust M regression method, included in sprm, has recently been published (Filzmoseret al.2020), and could be included in direpack. 10 | - Uncertainty quantification : The methods provided through direpack provide point estimates. In the future, the package could, e.g. 
be augmented with appropriate bootstrapping techniques, as was done for a related dimension reduction context. 11 | - GPU flexibility : There are many matrix manipulations in direpack, which can possibly be sped up by allowing GPU compatibility, which could be achieved by providing a TensorFlow or PyTorch back-end. However, this would be a major effort, since the present back-end integrally builds upon numpy. 12 | - More (and better) unit tests. 13 | 14 | Guidelines 15 | ============ 16 | 17 | Testing 18 | ------- 19 | Contributions should be accompanied by unit tests similar to those already available. Contributors can use the datasets presented in the example notebooks. 20 | 21 | Documentation 22 | ------------- 23 | We have followed `PEP8 `_ style when building this project and ask that contributors do so, 24 | for ease of maintainability. 25 | 26 | Article 27 | ================ 28 | An article with further information on the package is available. Menvouta, E.J., Serneels, S., Verdonck, T., 2023. direpack: A python 3 package for state-of-the-art statistical dimensionality reduction methods. SoftwareX 21, 101282. 29 | 30 | Contacts 31 | ================ 32 | 33 | * Dr Sven Serneels is co-founder at Gallop Data, Inc. and can be contacted at svenserneels (at) gmail.com. 34 | 35 | * Emmanuel Jordy Menvouta is a PhD researcher in Statistics and Data Science at KU Leuven and can be contacted at emmanueljordy.menvoutankpwele (at) kuleuven.be. 36 | 37 | * Prof Tim Verdonck is Professor of Statistics and Data Science at University of Antwerp and KU Leuven. He can be reached at tim.verdonck (at) uantwerp.be. -------------------------------------------------------------------------------- /src/direpack/cross_validation/_cv_support_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jan 27 11:42:23 2019 5 | 6 | Ancillary support functions for cross-validation of sprm and related estimators. 7 | 8 | # Deleted: ABLine2D class, was broken in Py 3.7 9 | cv_score_table (function): transform sklearn GridSearchCV 10 | results into Data Frame 11 | 12 | @author: Sven Serneels 13 | """ 14 | import numpy as np 15 | import pandas as ps 16 | from sklearn.metrics import mean_squared_error 17 | from scipy.stats import norm 18 | from ..sprm._m_support_functions import Fair, Huber, Hampel 19 | 20 | def cv_score_table(res_sprm_cv): 21 | 22 | """ 23 | Internal function reorganizing sklearn GridSearchCV results to pandas table. 24 | The function adds the cv score table to the object as cv_score_table_ 25 | """ 26 | 27 | n_settings = len(res_sprm_cv.cv_results_['params']) 28 | etas = [res_sprm_cv.cv_results_['params'][i]['eta'] for i in range(0,n_settings)] 29 | components = [res_sprm_cv.cv_results_['params'][i]['n_components'] for i in range(0,n_settings)] 30 | cv_score_table_ = ps.DataFrame({'etas':etas, 'n_components':components, 'score':res_sprm_cv.cv_results_['mean_test_score']}) 31 | return(cv_score_table_) 32 | 33 | def robust_loss(y,ypred,lfun=mean_squared_error,fun=Hampel,probct=norm.ppf(0.975),hampelb=norm.ppf(.99),hampelr=norm.ppf(.999)): 34 | 35 | """ 36 | Weighted loss function to be used in sklearn cross-validation 37 | Inputs: 38 | y: array or matrix, original predictand 39 | ypred, array or matrix, predicted values 40 | lfun, function: an sklearn loss metric that accepts caseweights, 41 | e.g. sklearn.metrics.mean_squared_error 42 | fun: function, weight function, 43 | e.g.
Fair, Huber or Hampel from sprm.sprm._m_support_functions 44 | probct, hampelb, hampelr: float, cutoffs for weight functions 45 | Output: 46 | loss, float 47 | """ 48 | 49 | if len(ypred.shape) > 1: 50 | ypred = np.array(ypred).reshape(-1) 51 | ypred = ypred.astype('float64') 52 | if len(y.shape) > 1: 53 | y = np.array(y).reshape(-1) 54 | y = y.astype('float64') 55 | r = y - ypred 56 | w = fun(r,probct,hampelb,hampelr) 57 | return(lfun(y,ypred,sample_weight=w)) 58 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [3.8, 3.9, "3.10"] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | # uses: actions/setup-python@v2 24 | uses: conda-incubator/setup-miniconda@v2 25 | with: 26 | auto-update-conda: true 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | $CONDA/bin/conda install -c conda-forge libstdcxx-ng 33 | $CONDA/bin/conda install -c conda-forge libgcc=5.2.0 34 | $CONDA/bin/conda install -c conda-forge scikit-learn 35 | $CONDA/bin/conda install -c conda-forge pandas 36 | $CONDA/bin/conda install -c conda-forge numpy 37 | $CONDA/bin/conda install -c conda-forge statsmodels 38 | $CONDA/bin/conda install -c conda-forge dcor 39 | $CONDA/bin/conda install -c conda-forge sympy 40 | $CONDA/bin/conda install -c conda-forge matplotlib 41 | sudo apt install gcc 42 | - name: Conda info 43 | shell: bash -l {0} 44 | run: conda info 45 | - name: Conda list 46 | shell: pwsh 47 | run: conda list 48 | - name: install ipopt 49 | run: | 50 | $CONDA/bin/conda install -c conda-forge cyipopt 51 | - name: Lint with flake8 52 | run: | 53 | $CONDA/bin/conda install flake8 54 | # stop the build if there are Python syntax errors or undefined names 55 | $CONDA/bin/flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 56 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 57 | $CONDA/bin/flake8 . 
--count --exit-zero --max-complexity=15 --max-line-length=127 --statistics 58 | - name: Test with pytest 59 | run: | 60 | conda install pytest 61 | $CONDA/bin/pytest 62 | -------------------------------------------------------------------------------- /src/direpack/ipopt_temp/jacobian.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Matthias Kuemmerer, 2014 3 | """ 4 | from __future__ import print_function, division, unicode_literals, absolute_import 5 | 6 | import sys 7 | import numpy as np 8 | 9 | 10 | class FunctionWithApproxJacobian(object): 11 | def __init__(self, func, epsilon, verbose=True): 12 | self._func = func 13 | self.epsilon = epsilon 14 | self.value_cache = {} 15 | self.verbose = verbose 16 | 17 | def __call__(self, x, *args, **kwargs): 18 | key = tuple(x) 19 | if not key in self.value_cache: 20 | self.log('.') 21 | value = self._func(x, *args, **kwargs) 22 | if np.any(np.isnan(value)): 23 | print("Warning! nan function value encountered at {0}".format(x)) 24 | self.value_cache[key] = value 25 | return self.value_cache[key] 26 | 27 | def func(self, x, *args, **kwargs): 28 | if self.verbose: 29 | print(x) 30 | return self(x, *args, **kwargs) 31 | 32 | def log(self, msg): 33 | if self.verbose: 34 | sys.stdout.write(msg) 35 | sys.stdout.flush() 36 | 37 | def jac(self, x, *args, **kwargs): 38 | self.log('G[') 39 | x0 = np.asfarray(x) 40 | #print x0 41 | dxs = np.zeros((len(x0), len(x0) + 1)) 42 | for i in range(len(x0)): 43 | dxs[i, i + 1] = self.epsilon 44 | results = [self(*(x0 + dxs[:, i], ) + args, **kwargs) for i in range(len(x0) + 1)] 45 | jac = np.zeros([len(x0), len(np.atleast_1d(results[0]))]) 46 | for i in range(len(x0)): 47 | jac[i] = (results[i + 1] - results[0]) / self.epsilon 48 | self.log(']') 49 | return jac.transpose() 50 | 51 | 52 | class FunctionWithApproxJacobianCentral(FunctionWithApproxJacobian): 53 | def jac(self, x, *args, **kwargs): 54 | self.log('G[') 55 | x0 = np.asfarray(x) 56 | #print x0 57 | dxs = np.zeros((len(x0), 2*len(x0))) 58 | for i in range(len(x0)): 59 | dxs[i, i] = -self.epsilon 60 | dxs[i, len(x0)+i] = self.epsilon 61 | results = [self(*(x0 + dxs[:, i], ) + args, **kwargs) for i in range(2*len(x0))] 62 | jac = np.zeros([len(x0), len(np.atleast_1d(results[0]))]) 63 | for i in range(len(x0)): 64 | jac[i] = (results[len(x0)+i] - results[i]) / (2*self.epsilon) 65 | self.log(']') 66 | return jac.transpose() -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | workflow_dispatch: 11 | inputs: 12 | version_bump: 13 | description: 'The verions portion to increment' 14 | required: true 15 | default: 'patch' 16 | type: choice 17 | options: 18 | - patch 19 | - minor 20 | - major 21 | workflow_call: 22 | inputs: 23 | version_bump: 24 | description: "The version portion to increment" 25 | required: true 26 | type: string 27 | push: 28 | branches: 29 | - master 30 | paths-ignore: 31 | - '.github/**' 32 | - 'README.md' 33 | - '.gitignore' 34 | - 'CHANGELOG.md' 35 | pull_request: 36 | branches: 37 | - master 38 | 39 | 
jobs: 40 | deploy: 41 | 42 | runs-on: ubuntu-latest 43 | 44 | steps: 45 | - uses: actions/checkout@v3 46 | - name: Set up Python 47 | uses: conda-incubator/setup-miniconda@v2 48 | with: 49 | auto-update-conda: true 50 | python-version: 3.9 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 55 | $CONDA/bin/conda install -c conda-forge libstdcxx-ng 56 | $CONDA/bin/conda install -c conda-forge libgcc=5.2.0 57 | $CONDA/bin/conda install -c conda-forge scikit-learn 58 | $CONDA/bin/conda install -c conda-forge pandas 59 | $CONDA/bin/conda install -c conda-forge numpy 60 | $CONDA/bin/conda install -c conda-forge statsmodels 61 | $CONDA/bin/conda install -c conda-forge dcor 62 | $CONDA/bin/conda install -c conda-forge sympy 63 | $CONDA/bin/conda install -c conda-forge matplotlib 64 | sudo apt install gcc 65 | - name: Conda info 66 | shell: bash -l {0} 67 | run: conda info 68 | - name: Conda list 69 | shell: pwsh 70 | run: conda list 71 | - name: install ipopt 72 | run: | 73 | $CONDA/bin/conda install -c conda-forge cyipopt 74 | - name: Build and publish 75 | env: 76 | TWINE_USERNAME: ${{ secrets.PYPIUID }} 77 | TWINE_PASSWORD: ${{ secrets.PYPIPWD }} 78 | run: | 79 | $CONDA/bin/python -m pip install setuptools wheel twine 80 | $CONDA/bin/python setup.py sdist bdist_wheel 81 | $CONDA/bin/twine upload dist/* 82 | -------------------------------------------------------------------------------- /src/direpack/utils/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Apr 13 16:08:22 2020 5 | 6 | @author: sven 7 | """ 8 | 9 | import pandas as ps 10 | import numpy as np 11 | 12 | 13 | class MyException(Exception): 14 | pass 15 | 16 | 17 | def convert_X_input(X): 18 | 19 | if type(X) == ps.core.frame.DataFrame: 20 | X = X.to_numpy().astype('float64') 21 | return(X) 22 | 23 | 24 | def convert_y_input(y): 25 | 26 | if type(y) in [ps.core.frame.DataFrame, ps.core.series.Series]: 27 | y = y.to_numpy().T.astype('float64') 28 | return(y) 29 | 30 | 31 | def const_xscale(beta, *args): 32 | X = args[0] 33 | h = args[1] 34 | i = args[2] 35 | j = args[3] 36 | beta = np.reshape(beta, (-1, h), order='F') 37 | covx = np.cov(X, rowvar=False) 38 | ans = np.matmul(np.matmul(beta.T, covx), beta) - np.identity(h) 39 | return(ans[i, j]) 40 | 41 | 42 | def const_zscale(beta, *args): 43 | X = args[0] 44 | h = args[1] 45 | i = args[2] 46 | j = args[3] 47 | beta = np.reshape(beta, (-1, h), order='F') 48 | covx = np.identity(X.shape[1]) 49 | ans = np.matmul(np.matmul(beta.T, covx), beta) - np.identity(h) 50 | return(ans[i, j]) 51 | 52 | 53 | def _predict_check_input(Xn): 54 | if type(Xn) == ps.core.series.Series: 55 | Xn = Xn.to_numpy() 56 | if Xn.ndim == 1: 57 | Xn = Xn.reshape((1, -1)) 58 | if type(Xn) == ps.core.frame.DataFrame: 59 | Xn = Xn.to_numpy() 60 | n, p = Xn.shape 61 | return (n, p, Xn) 62 | 63 | 64 | def _check_input(X): 65 | 66 | if(type(X) in (np.matrix, ps.core.frame.DataFrame, ps.core.series.Series)): 67 | X = np.array(X) 68 | 69 | if (X.dtype == np.dtype('O')): 70 | X = X.astype('float64') 71 | 72 | if X.ndim == 1: 73 | X = X.reshape((1, -1)) 74 | 75 | n, p = X.shape 76 | 77 | if n == 1: 78 | if p >= 2: 79 | X = X.reshape((-1, 1)) 80 | return(X) 81 | 82 | 83 | def nandot(X, y): 84 | 85 | p, n = X.shape 86 | assert n == len(y), "Number of rows in X and y needs to agree" 87 | if 
len(y.shape) > 1: 88 | y = y.reshape(-1) 89 | product = [np.nansum(np.multiply(X[i, :], y)) for i in range(p)] 90 | 91 | return np.array(product).reshape((-1, 1)) 92 | 93 | 94 | def nanmatdot(X, Y): 95 | 96 | p, n = X.shape 97 | if len(Y.shape) == 1: 98 | return nandot(X, Y) 99 | else: 100 | m, q = Y.shape 101 | assert n == m, "Matrix diomensions need to agree" 102 | if q == 1: 103 | return nandot(X, Y) 104 | else: 105 | product = [[np.nansum(np.multiply(X[i, :], Y[:, j])) 106 | for i in range(p)] for j in range(q)] 107 | 108 | return np.array(product) 109 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ -------------------------------------------------------------------------------- /docs/dicomo.md: -------------------------------------------------------------------------------- 1 | Diverse (co-)moment statistics 2 | ============================== 3 | 4 | This class implements (co-)moment statistics, covering both classical product-moment 5 | statistics, as well as more recently developed energy statistics. 6 | The `dicomo` class also serves as a plug-in into `capi` and `ppdire`. It has been written consistently with `ppdire` such that it provides a wide range of 7 | projection indices based on (co-)moments. 8 | 9 | Description 10 | ----------- 11 | 12 | The `dicomo` folder contains 13 | - The class object (`dicomo.py`) 14 | - Ancillary functions for (co-)moment estimation (`_dicomo_utils.py`) 15 | 16 | The `dicomo` class 17 | ================== 18 | 19 | Parameters 20 | ---------- 21 | - `est`, str: mode of estimation. The set of options are `'arithmetic'` (product-moment) or `'distance'` (energy statistics) 22 | - `mode`, str: type of moment. Options are: 23 | * `'mom'`: moment 24 | * `'var'`: variance 25 | * `'std'`: standard deviation 26 | * `'skew'`: skewness 27 | * `'kurt'`: kurtosis 28 | * `'com'`: co-moment 29 | * `'M3'`: shortcut for third order co-moment 30 | * `'cov'`: covariance 31 | * `'cos'`: co-skewness 32 | * `'cok'`: co-kurtosis 33 | * `'corr'`: correlation 34 | * `'continuum'`: continuum association 35 | * `'mdd'`: martingale difference divergence (requires `est = 'distance'`) 36 | * `'mdc'`: martingale difference correlation (requires `est = 'distance'`) 37 | * `'ballcov'`: ball covariance (requires installing `Ball` and uncommenting the `import` statement) 38 | - `center`: internal centring used in calculation. Options are `mean` or `median`. 39 | 40 | Attributes 41 | ---------- 42 | Attributes always provided 43 | - `moment_`: The resulting (co-)moment 44 | 45 | Depending on the options picked, intermediate results are stored as well, as `x_moment_`, `y_moment_` or `co_moment_` 46 | 47 | 48 | Methods 49 | -------- 50 | - `fit(X, *args, **kwargs)`: fit model 51 | 52 | The `fit` function takes several optional input arguments. These are options that 53 | apply to individual settings: 54 | - `biascorr`, Bool, when `True`, correct for bias. For classical product-moment statistics, this 55 | is the small sample correction. For energy statistics, this leads to estimates that are unbiased in high dimension 56 | (but not preferred in low dimension). 57 | - `alpha`, float, parameter for continuum association. Has no effect for other options. 58 | - `option`, int, determines which higher order co-moment to calculate, e.g. for co-skewness, `option=1` calculates CoS(x,x,y) 59 | - `order`, int, which order (co-)moment to calculate. Can be overruled by `mode`, e.g.
if `mode='var'`, `order` is set to 2. 60 | - `calcmode`, str, whether to use the efficient or the naive algorithm to calculate distance statistics. Defaults to `fast` when available. 61 | 62 | Examples 63 | -------- 64 | Check out the [dicomo examples notebook](https://github.com/SvenSerneels/direpack/blob/master/examples/dicomo_example.ipynb) -------------------------------------------------------------------------------- /docs/Cross-validation and plotting.rst: -------------------------------------------------------------------------------- 1 | .. _Cross-validation and plotting: 2 | 3 | ############################# 4 | Cross-validation and plotting 5 | ############################# 6 | 7 | Each of the sudire, ppdire and sprm subpackages in direpack are wrappers around a broad class of dimension reduction methods. 8 | Each of these methods will have at least one tunable hyperparameter; some have many more. The user will want to be able to find the optimal hyperparameters for the data at hand, which can be done through cross-validation or Bayesian optimization. 9 | It is not the aim of direpack to provide its own hyperparameter tuning algorithms, as ample cross-validation utilities are available in scikit-learn’s model selection subpackage and the direpack estimators have been written consistently with the scikit-learn API, 10 | such that these model selection tools from scikit-learn can directly be applied to them. However, some caution should be taken when training the robust methods. While all classical (non-robust) methods could just use scikit-learn’s default settings, when tuning a robust model, 11 | outliers are expected to be in the data, such that it becomes preferable to apply a robust cross-validation metric as well. To that end, it is possible to use scikit-learn’s median_absolute_error, which is an MAE (L1) scorer that is less affected by extreme values than the default mean_squared_error. 12 | However, particularly in the case of robust M estimators, a more model-consistent approach can be pursued. The robust M estimators provide a set of case weights, and these can be used to construct a weighted evaluation metric for cross-validation. Exactly this is provided in the robust_loss function that is a part of the direpack cross-validation utilities. 13 | 14 | Similar to hyperparameter tuning, direpack's mission is not to deliver a broad set of plotting utilities, but rather to focus on the dimension reduction statistics. However, some plots that many users would like to have in this context are provided for each of the methods. These are: 15 | 16 | * Projection plots. These plots visualize the scores $\mathbf{t}_i$ and a distinction can be made in the plots between cases that the model had been trained with, and test set cases. 17 | * Parity plots. For the regularized regressions based on the estimated scores, these visualize the predicted versus actual responses, with the same distinction as for the scores. 18 | 19 | For the special case of SPRM, the plots have enhanced functionality. Since SPRM provides case weights, which can also be calculated for new cases, the SPRM plots can flag outliers. In the sprm_plot function, this is set up with two cut-offs, based on the case weight values, and visualized as regular cases, moderate outliers or harsh outliers. 20 | For SPRM, there is an option as well to visualize the case weights themselves. 21 | 22 | 23 | Examples of direpack's plotting functionalities are available in the example notebooks of `ppdire `_, `sprm `_ and `sudire `_ .
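As a minimal sketch of the workflow described above, the robust_loss metric can be wrapped into a scikit-learn scorer and combined with GridSearchCV to tune an sprm estimator. The grid values below are purely illustrative, sprm() is assumed to work with its default constructor settings, and X and y are placeholders for the user's predictor matrix and response::

    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer
    from direpack import sprm, robust_loss

    # robust_loss is a case-weighted loss, so lower values are better
    robust_scorer = make_scorer(robust_loss, greater_is_better=False)

    # illustrative grid over eta and the number of components
    param_grid = {"eta": [0.2, 0.5, 0.8], "n_components": [1, 2, 3, 4]}

    res_sprm_cv = GridSearchCV(sprm(), param_grid, scoring=robust_scorer, cv=5)
    res_sprm_cv.fit(X, y)            # X: (n, p) predictors, y: (n,) response
    print(res_sprm_cv.best_params_)

Scoring with the case-weighted robust_loss keeps the evaluation consistent with the M-estimation itself, rather than falling back on a generic L1 metric such as median_absolute_error.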
24 | 25 | -------------------------------------------------------------------------------- /direpack_Release_Notes.md: -------------------------------------------------------------------------------- 1 | `sprm` Release notes (versions 0.0 through 0.7) 2 | ==================== 3 | 4 | Version 0.2.1 5 | ------------- 6 | - sprm now takes both numeric (n,1) np matrices and (n,) np.arrays as input 7 | 8 | 9 | Version 0.2.0 10 | ------------- 11 | Changes compared to version 0.1: 12 | - All functionalities can now be loaded in a modular way, e.g. to use plotting functions, now source the plot function separately: 13 | 14 | from sprm import sprm_plot 15 | 16 | - The package now includes a robust M regression estimator (rm.py), which is a multiple regression only variant of sprm. 17 | It is based on the same iterative re-weighting scheme, but it does not perform dimension reduction, nor variable selection. 18 | - The robust preprocessing routine (robcent.py) has been re-written so as to be more consistent with sklearn. 19 | 20 | Version 0.3 21 | ----------- 22 | All three estimators provided as separate classes in module: 23 | 24 | from sprm import sprm 25 | from sprm import snipls 26 | from sprm import rm 27 | 28 | Also, sprm now includes a check for zero scales. It will remove zero scale variables from the input data, and only use 29 | columns corresponding to nonzero predictor scales in new data. This check has not yet been built in for snipls or rm 30 | separately. 31 | 32 | Plus some minor changes to make it consistent with the latest numpy and matplotlib versions. 33 | 34 | Version 0.4 35 | ----------- 36 | The preprocessing routine `robcent` has been refactored. Functionality has been 37 | added to centre the data nonparametrically by the L1 median. The ancillary functions 38 | for `robcent` have been moved into `_preproc_utilities.py`. 39 | 40 | Furthermore, `sprm`, `snipls` and `rm` have all three been modified such that 41 | they accept matrix, array or data frame input for both X and y. Also, the option 42 | to provide column names has been extended to automatic extraction from data frame 43 | input, or direct input as list, array or pandas Index. 44 | 45 | The license has been changed from GPL3 to MIT. 46 | 47 | 0.4.2. `'kstepLTS'` location estimator included. 48 | 49 | 50 | Version 0.5 51 | ----------- 52 | Pre-processing functions further refactored so as to be compatible with `sklearn` pipelines. 53 | Class now named `VersatileScaler`; the old `robcent` name still works, but will be sunset. 54 | 55 | Version 0.6 56 | ----------- 57 | Preprocessing files moved into separate folder. More preprocessing options. 58 | Examples moved into Jupyter notebook in separate examples section.
59 | 60 | `direpack` release notes (since version 0.8) 61 | ======================== 62 | 63 | Version 0.8 64 | ----------- 65 | `ppdire` merges in 66 | 67 | Version 0.9 68 | ----------- 69 | - `preprocessing` widely extended 70 | - `plot` functions adapted 71 | - documentation improved 72 | 73 | Version 1.0 74 | ----------- 75 | - `sudire` joins in 76 | - `plot` functions adapted 77 | - documentation provided for `dicomo` 78 | - 1.0.2: link to `direpack` publication added 79 | - 1.0.3: fixed rare division by zero in `l1median` 80 | - 1.0.4: unit tests included 81 | - 1.0.5: `sudire` notebook adapted 82 | - 1.0.9: function to calculate the martingale difference divergence matrix (MDDM) added in `_dicomo_utils.py` 83 | - 1.0.11: documentation updated to accommodate for go-live of readthedocs page 84 | - 1.0.13: fixed bug in option to use `Ball` in `sudire`. Adjusted readthedocs. 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | #import unittest.mock as mock 16 | sys.path.insert(0, os.path.abspath('..')) 17 | #sys.path.insert(0, os.path.abspath('../src/direpack/')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'direpack' 23 | copyright = '2021, Sven Serneels and Emmanuel Jordy Menvouta' 24 | author = 'Sven Serneels and Emmanuel Jordy Menvouta' 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = '1.0.10' 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage','sphinx.ext.autosummary', 'sphinx.ext.napoleon','sphinx.ext.imgmath', "sphinx.ext.viewcode", 'sphinx_math_dollar'] 36 | 37 | # Add autosummary 38 | autosummary_generate = True 39 | add_module_names = False 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # List of patterns, relative to source directory, that match files and 45 | # directories to ignore when looking for source files. 46 | # This pattern also affects html_static_path and html_extra_path. 47 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 48 | 49 | 50 | 51 | # MOCK_MODULES = ['numpy', 'pandas', 'matplotlib','scikit-learn'] 52 | # for mod_name in MOCK_MODULES: 53 | # sys.modules[mod_name] = mock.Mock() 54 | 55 | 56 | # -- Options for HTML output ------------------------------------------------- 57 | 58 | # The theme to use for HTML and HTML Help pages. See the documentation for 59 | # a list of builtin themes. 
60 | # 61 | html_theme = 'sphinx_rtd_theme' 62 | 63 | # Add any paths that contain custom static files (such as style sheets) here, 64 | # relative to this directory. They are copied after the builtin static files, 65 | # so a file named "default.css" will overwrite the builtin "default.css". 66 | html_static_path = ['_static'] 67 | 68 | 69 | imgmath_latex_preamble = r''' 70 | \usepackage{lineno} 71 | \usepackage{amsmath} 72 | \usepackage{graphicx,psfrag,epsf} 73 | \usepackage{enumerate} 74 | \usepackage{amsmath,amsfonts,amssymb,graphicx,multirow} 75 | \usepackage{mdsymbol} 76 | \usepackage{booktabs} 77 | \usepackage{amsthm} 78 | \usepackage{bbm} 79 | \usepackage{algorithm} 80 | \newcommand{\argmax}{\mathop{\mbox{argmax}}} 81 | \usepackage[noend]{algpseudocode} 82 | \usepackage{rotating} 83 | \modulolinenumbers[5] 84 | \def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}} 85 | \def\spacingset#1{\renewcommand{\baselinestretch}% 86 | {#1}\small\normalsize} \spacingset{1} 87 | \newcommand{\norm}[1]{\left\lVert#1\right\rVert} 88 | ''' 89 | 90 | #imgmath_image_format = 'svg' 91 | -------------------------------------------------------------------------------- /src/direpack/test/test_ppdire.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jun 30 13:17:46 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | import unittest 9 | import pandas as ps 10 | import numpy as np 11 | from ..preprocessing.robcent import VersatileScaler 12 | import sklearn.decomposition as skd 13 | from ..dicomo.dicomo import dicomo 14 | from ..ppdire.ppdire import ppdire 15 | import sklearn.cross_decomposition as skc 16 | 17 | 18 | class Testppdire(unittest.TestCase): 19 | """Test some methods in the ppdire class""" 20 | 21 | @classmethod 22 | def setUpClass(self): 23 | print("setupClass") 24 | 25 | @classmethod 26 | def tearDownClass(self): 27 | print("teardownClass") 28 | 29 | def setUp(self): 30 | self.data = ps.read_csv("./data/Returns_shares.csv") 31 | self.datav = self.data.values[:, 2:8].astype("float64") 32 | self.x = self.datav[:, 1:5] 33 | self.y = self.datav[:, 0] 34 | self.n = self.data.shape[0] 35 | self.p = self.data.shape[1] 36 | self.centring = VersatileScaler() 37 | self.Xs = self.centring.fit_transform(self.x) 38 | 39 | def tearDown(self): 40 | del self.x 41 | del self.y 42 | del self.n 43 | del self.p 44 | del self.Xs 45 | del self.centring 46 | 47 | def test_pca(self): 48 | """tests the exactness of ppdire's pca""" 49 | 50 | pppca = ppdire( 51 | projection_index=dicomo, 52 | pi_arguments={"mode": "var"}, 53 | n_components=4, 54 | optimizer="SLSQP", 55 | ) 56 | pppca.fit(self.x) 57 | skpca = skd.PCA(n_components=4) 58 | skpca.fit(self.Xs) 59 | np.testing.assert_almost_equal( 60 | np.abs(pppca.x_loadings_), np.abs(skpca.components_.T), decimal=3 61 | ) 62 | 63 | def test_pls(self): 64 | """tests the exactness of ppdire's pls""" 65 | 66 | skpls = skc.PLSRegression(n_components=4) 67 | skpls.fit(self.Xs, (self.y - np.mean(self.y)) / np.std(self.y)) 68 | pppls = ppdire( 69 | projection_index=dicomo, 70 | pi_arguments={"mode": "cov"}, 71 | n_components=4, 72 | square_pi=True, 73 | optimizer="SLSQP", 74 | optimizer_options={"maxiter": 500}, 75 | ) 76 | pppls.fit(self.x, self.y) 77 | np.testing.assert_almost_equal( 78 | np.abs( 79 | np.matmul(self.Xs, skpls.coef_.reshape(-1)) * np.std(self.y) 80 | + np.mean(self.y) 81 | ), 82 | np.abs(pppls.fitted_.ravel()), 83 | decimal=3, 84 | ) 85 | 86 | 87 | # def 
test_robust(self): 88 | # lcpca = ppdire(projection_index = dicomo, pi_arguments = {'mode' : 'var', 'center': 'median'}, n_components=4, optimizer='grid',optimizer_options={'ndir':1000,'maxiter':10}) 89 | # lcpca.fit(self.x) 90 | # test_ans=np.array([[ 0.6324543 , -0.00651997, -0.35820225, 0.6438448 ], 91 | # [ 0.44750274, -0.67228343, 0.4950862 , -0.21806968], 92 | # [ 0.53378114, 0.28794634, -0.46650197, -0.72699245], 93 | # [ 0.35432068, 0.68524337, 0.64350842, 0.09692107]]) 94 | # np.testing.assert_almost_equal(np.abs(test_ans),np.abs(lcpca.x_loadings_),decimal=3) 95 | 96 | 97 | if __name__ == "__main__": 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /src/direpack/test/test_sprm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jun 30 13:17:46 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | import unittest 9 | import pandas as ps 10 | import numpy as np 11 | from ..sprm.sprm import sprm 12 | from ..sprm.snipls import snipls 13 | from ..sprm.rm import rm 14 | 15 | 16 | class Testsprm(unittest.TestCase): 17 | """Test some methods in the sprm class""" 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | print("setupClass") 22 | 23 | @classmethod 24 | def tearDownClass(cls): 25 | print("teardownClass") 26 | 27 | def setUp(self): 28 | self.data = ps.read_csv("./data/Returns_shares.csv") 29 | self.datav = np.matrix(self.data.values[:, 2:8].astype("float64")) 30 | self.x = self.datav[:, 0:5] 31 | self.y = self.datav[:, 5] 32 | self.n = self.data.shape[0] 33 | self.p = self.data.shape[1] 34 | self.x0 = self.x.astype("float") 35 | self.y0 = self.y.astype("float") 36 | self.columns = self.data.columns[2:8] 37 | 38 | def tearDown(self): 39 | del self.x 40 | del self.y 41 | del self.n 42 | del self.p 43 | del self.x0 44 | del self.y0 45 | del self.data 46 | del self.datav 47 | 48 | def test_sprm(self): 49 | """Test the functioning of the sprm object""" 50 | 51 | res_sprm = sprm( 52 | 2, 53 | 0.8, 54 | "Hampel", 55 | 0.95, 56 | 0.975, 57 | 0.999, 58 | "kstepLTS", 59 | "scaleTau2", 60 | True, 61 | 100, 62 | 0.01, 63 | "ally", 64 | "xonly", 65 | self.columns, 66 | True, 67 | ) 68 | res_sprm.fit(self.x0[:2666], self.y0[:2666]) 69 | test_ans = 28.40453479240838 70 | np.testing.assert_almost_equal( 71 | np.linalg.norm(res_sprm.weightnewx(self.x0[2666:])), 72 | test_ans, 73 | decimal=4, 74 | ) 75 | 76 | def test_rm(self): 77 | """Test the functioning of the rm object""" 78 | 79 | res_rm = rm( 80 | "Hampel", 81 | 0.95, 82 | 0.975, 83 | 0.999, 84 | "median", 85 | "mad", 86 | "specific", 87 | True, 88 | 100, 89 | 0.01, 90 | True, 91 | ) 92 | res_rm.fit(self.x0[:2666], self.y0[:2666]) 93 | test_ans = 28.62510008113666 94 | np.testing.assert_almost_equal( 95 | np.linalg.norm(res_rm.predict(self.x0[2666:])), test_ans, decimal=4 96 | ) 97 | 98 | def test_snipls(self): 99 | """Test the functioning of the snipls object""" 100 | res_snipls = snipls(n_components=4, eta=0.5) 101 | res_snipls.fit(self.x0[:2666], self.y0[:2666]) 102 | test_ans = 38.6183244001568 103 | np.testing.assert_almost_equal( 104 | np.linalg.norm(res_snipls.predict(self.x0[2666:])), 105 | test_ans, 106 | decimal=4, 107 | ) 108 | self.x[0, 0] = np.nan 109 | res_snipls.fit(self.x0[:2666], self.y0[:2666]) 110 | np.testing.assert_almost_equal( 111 | np.linalg.norm(res_snipls.predict(self.x0[2666:])), 112 | test_ans, 113 | decimal=4, 114 | ) 115 | 116 | 117 | if __name__ == "__main__": 
"__main__":
118 | unittest.main() 119 | -------------------------------------------------------------------------------- /src/direpack/test/test_dicomo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jun 30 13:17:46 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | import unittest 9 | from ..dicomo.dicomo import dicomo 10 | import pandas as ps 11 | import numpy as np 12 | import statsmodels.robust as srs 13 | import scipy.stats as sps 14 | import dcor as dc 15 | 16 | class Testdicomo(unittest.TestCase): 17 | """ Test methods in the dicomo class""" 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | print('...setupClass') 22 | 23 | 24 | 25 | @classmethod 26 | def tearDownClass(cls): 27 | print('...teardownClass') 28 | 29 | 30 | @classmethod 31 | def setUp(self): 32 | self.data=ps.read_csv("./data/Returns_shares.csv") 33 | self.datav = np.array(self.data.values[:,2:8].astype('float64')) 34 | self.est = dicomo() 35 | self.x = self.datav[:,1] 36 | self.y = self.datav[:,0] 37 | self.n=self.data.shape[0] 38 | self.p = self.data.shape[1] 39 | 40 | 41 | 42 | @classmethod 43 | def tearDown(self): 44 | del self.est 45 | del self.x 46 | del self.y 47 | del self.n 48 | del self.p 49 | 50 | 51 | 52 | def test_mom(self): 53 | """ Tests functions to compute moments""" 54 | 55 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=False),np.var(self.x))# biased var 56 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=True),np.var(self.x)*self.n/(self.n-1))#unbiased var 57 | self.est.set_params(center='median') 58 | self.assertAlmostEquals(self.est.fit(self.x),srs.mad(self.x),places=4) 59 | self.est.set_params(center='mean') 60 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=False,order=3),sps.moment(self.x,3))#third moment 61 | self.est.set_params(mode='skew') 62 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=False),sps.skew(self.x))# skew without small sample corr 63 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=True),sps.skew(self.x,bias=False)) 64 | 65 | 66 | 67 | 68 | 69 | 70 | def test_como(self): 71 | """ Tests function to compute comomennts""" 72 | 73 | self.est.set_params(mode='com') 74 | self.assertAlmostEquals(self.est.fit(self.x,y=self.y,biascorr=True),self.data.iloc[:,2:4].cov().values[0,1])#covariance 75 | self.assertAlmostEquals(self.est.fit(self.x,y=self.y,biascorr=True,option=1,order=3),0.39009,places=4)#third order comoment 76 | self.est.set_params(mode='corr') 77 | self.assertAlmostEquals(self.est.fit(self.x,y=self.y),self.data.iloc[:,2:4].corr().values[0,1])#correlation 78 | self.est.set_params(mode='continuum') 79 | self.assertAlmostEquals(np.sqrt(self.est.fit(self.x,y=self.y,alpha=1,biascorr=True)),self.data.iloc[:,2:4].cov().values[0,1])#continuum 80 | 81 | 82 | def test_energy(self): 83 | """ Tests function to compute energy statistics""" 84 | 85 | self.est.set_params(est='distance',mode='var') 86 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=False),dc.distance_stats(self.x,self.x).covariance_xy) 87 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=True),np.sqrt(dc.u_distance_stats_sqr(self.x,self.x).covariance_xy)) 88 | self.est.set_params(mode='com') 89 | self.assertAlmostEquals(self.est.fit(self.x,y=self.y,biascorr=False),dc.distance_covariance(self.x,self.y)) 90 | self.est.set_params(mode='mdd') 91 | self.assertAlmostEquals(self.est.fit(self.x,y=self.y,biascorr=False),0.352427150086) 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 
| 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | if __name__ =='__main__': 113 | unittest.main() 114 | 115 | 116 | -------------------------------------------------------------------------------- /src/direpack/preprocessing/_gsspp_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Mar 25 09:02:05 2020 5 | 6 | @author: Sven Serneels, Ponalytics. 7 | 8 | Code for radial transform functions largely adapted from 9 | R code by Jakob Raymaekers 10 | 11 | """ 12 | 13 | import numpy as np 14 | 15 | def quad(dd, p, n): 16 | """ 17 | Computes the quadratic radial function 18 | args: 19 | dd: vector of distances 20 | p: number of variables in original data 21 | n: number of rows in original data 22 | returns: 23 | xi: radial function 24 | """ 25 | d_hmed = np.sort(dd,axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 26 | idx = np.where(dd > d_hmed)[0] 27 | xi = np.ones((n,1)) 28 | xi[idx] = (1 / np.square(dd[idx])) * (d_hmed**2) 29 | return(xi) 30 | 31 | def ss(dd, p,*args,prec=1e-10): 32 | 33 | """ 34 | Computes the spatial sign radial function 35 | args: 36 | dd: vector of distances 37 | p: dimension of original data 38 | *args flag to be able to pass on n - has no effect 39 | returns: 40 | xi: radial function 41 | """ 42 | dd = np.maximum(dd,prec) 43 | xi = 1 / dd 44 | return(xi) 45 | 46 | def winsor(dd, p, n) : 47 | """ 48 | Computes the Winsor radial function 49 | args: 50 | dd: vector of distances 51 | p: number of variables in original data 52 | n: number of rows in original data 53 | returns: 54 | xi: radial function 55 | """ 56 | d_hmed = np.sort(dd,axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 57 | idx = np.where(dd > d_hmed)[0] 58 | xi = np.ones((n,1)) 59 | xi[idx] = (1 / dd[idx]) * d_hmed 60 | return(xi) 61 | 62 | def ball(dd, p, n): 63 | 64 | """ 65 | Computes the Ball radial function 66 | args: 67 | dd: vector of distances 68 | p: number of variables in original data 69 | n: number of rows in original data 70 | returns: 71 | xi: radial function 72 | """ 73 | 74 | dWH = np.power(dd,2/3) 75 | dWH_hmed = np.sort(dWH,axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 76 | d_hmed = np.power(dWH_hmed,3/2) 77 | idx = np.where(dd > d_hmed)[0] 78 | xi = np.ones((n,1)) 79 | xi[idx] = 0 80 | return(xi) 81 | 82 | 83 | def shell(dd, p, n) : 84 | """ 85 | Computes the Shell radial function 86 | args: 87 | dd: vector of distances 88 | p: number of variables in original data 89 | n: number of rows in original data 90 | returns: 91 | xi: radial function 92 | """ 93 | 94 | dWH = np.power(dd,2/3) 95 | dWH_hmed = np.sort(dWH,axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 96 | dWH_hmad = np.sort(np.abs(dWH - dWH_hmed),axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 97 | cutoff1 = np.power(np.maximum(0, dWH_hmed - dWH_hmad),3/2) 98 | cutoff2 = np.power(dWH_hmed + dWH_hmad,3/2) 99 | idxlow = np.where(dd < cutoff1)[0] 100 | idxhigh = np.where(dd > cutoff2)[0] 101 | xi = np.ones((n,1)) 102 | xi[idxlow] = 0 103 | xi[idxhigh] = 0 104 | return(xi) 105 | 106 | 107 | def linear_redescending(dd, p,n): 108 | """ 109 | # Computes the Linear redescending radial function 110 | args: 111 | dd: vector of distances 112 | p: number of variables in original data 113 | n: number of rows in original data 114 | returns: 115 | xi: radial function 116 | """ 117 | 118 | dWH = np.power(dd,2/3) 119 | dWH_hmed = np.sort(dWH,axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 120 | dWH_hmad = np.sort(np.abs(dWH - 
dWH_hmed),axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 121 | d_hmed = dWH_hmed**(3/2) 122 | cutoff = (dWH_hmed + 1.4826 * dWH_hmad)**(3/2) 123 | idxmid = np.where(np.logical_and(dd > d_hmed,dd <= cutoff))[0] 124 | idxhigh = np.where(dd > cutoff)[0] 125 | xi = np.ones((n,1)) 126 | xi[idxmid] = 1 - (dd[idxmid,:] - d_hmed) / (cutoff - d_hmed) 127 | xi[idxhigh] = 0 128 | return(xi) 129 | 130 | 131 | def _norms(X,**kwargs): 132 | """ 133 | Casewise norms of a matrix 134 | """ 135 | return(np.linalg.norm(X,axis=1,keepdims=True,**kwargs)) 136 | 137 | 138 | def _gsspp(X,p,n,fun=ss): 139 | """ 140 | Generalized Spatial Sign Pre-Processing for Centred Data 141 | """ 142 | return(np.multiply(X,fun(_norms(X),p,n))) 143 | 144 | def _spatial_sign(X,**kwargs): 145 | """ 146 | Spatial Sign Pre-Processing for Centred Data 147 | """ 148 | return(X/_norms(X)) 149 | 150 | 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /docs/Pre-processing.rst: -------------------------------------------------------------------------------- 1 | .. _Pre-processing: 2 | 3 | ################ 4 | Pre-processing 5 | ################ 6 | 7 | The first step in most meaningful data analytics projects will be to pre-process the data, hence direpack proposes a set of tools for data pre-processing. 8 | 9 | Data standardization 10 | ===================== 11 | 12 | A first, well accepted way to pre-process data is to center them and scale them to unit variance on a column wise basis. This corresponds to transforming a $\mathbf{x}$ variable into z-scores: 13 | 14 | .. math:: 15 | :nowrap: 16 | 17 | \begin{equation*} 18 | \mathbf{z} = \frac{\mathbf{x} - \hat{\boldsymbol{\mu}}}{\hat{\boldsymbol{\sigma}}} 19 | \end{equation*} 20 | 21 | where $\hat{\boldsymbol{\mu}}$ and $\hat{\boldsymbol{\sigma}}$ are estimates of location and scale, respectively. 22 | For normally distributed data, the appropriate way to accomplish this is by centering about the mean and dividing by the column wise standard deviation. 23 | However, when the marginal distributions in the data significantly deviate from the normal, outliers could throw the result of that data standardization off, and robust or nonparametric alternatives become a more reliable choice. 24 | Essentially, all robust statistics are subject to a trade-off between efficiency and robustness, which means that the variance of the estimates will increase as the estimator can resist a higher fraction of outliers. 25 | While scikit-learn provides highly robust nonparametric standardization in its RobustScaler, the estimators included therein are known to have a low statistical efficiency (these are the median for location and the interquartile range for scale). 26 | Since autoscaling the data is often an essential step, a few location and scale estimators have been implemented. For location, with increasing performance in terms of the robustness---efficiency trade-off, these are: the column wise median, the spatial median (also called $L_1$-median, although it minimizes an $L_2$ norm) and the $k$ step least trimmed squares (LTS, Rousseeuw and Leroy (1987)) estimator. 27 | For scale, the consistency corrected median absolute deviation (MAD) and the $\tau$ estimator of scale (Maronna and Zamar 2002) have been included. Generally, it holds true that the more statistically efficient the estimator in these lists is, the higher its computational cost. 
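As a quick illustration of why this matters, the following minimal sketch (using only ``numpy`` and ``scipy``, not direpack itself, on a hypothetical sample) standardizes data containing one gross outlier; the classical z-scores are distorted by that single case, while the median/MAD version is not:

.. code-block:: python

    import numpy as np
    from scipy.stats import median_abs_deviation

    rng = np.random.default_rng(0)
    # hypothetical univariate sample with one gross outlier appended
    x = np.append(rng.standard_normal(100), 50.0)

    # classical standardization: location = mean, scale = standard deviation
    z_classical = (x - x.mean()) / x.std()

    # robust standardization: location = median, scale = consistency-corrected MAD
    z_robust = (x - np.median(x)) / median_abs_deviation(x, scale="normal")

The contaminated case inflates the mean and the standard deviation, shrinking the classical z-scores of all regular observations, whereas the robust variant leaves them essentially untouched.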
In preprocessing, these estimators can be accessed through its VersatileScaler class, which takes the names of these estimators as strings, but it will also accept functions of location and scale estimators, should the user prefer to apply other ones. 28 | 29 | Spatial sign pre-processing 30 | ============================ 31 | Besides standardizing data, it can be beneficial to transform data to some sort of signs. The generalized spatial sign transformation consists of transforming a variable $\mathbf{x}$ into 32 | 33 | .. math:: 34 | :nowrap: 35 | 36 | \begin{equation*} 37 | \mathbf{u} = \left(\mathbf{x} - \hat{\boldsymbol{\mu}}\right) \times f\left(\mathbf{x} - \hat{\boldsymbol{\mu}}\right) 38 | \end{equation*} 39 | 40 | where the spatial sign is obtained by setting $f(x) = {\parallel x \parallel}^{-1}$ and $\parallel \cdot \parallel$ denotes the norm (in all published literature in this context, the $L_2$ norm). 41 | Since spatial sign pre-processing (SS-PP) consists of dividing the data by their Euclidean norm, it is also known as normalizing and as such, is available in scikit-learn's Normalizer. 42 | Spatial sign pre-processing has been shown to convey moderate robustness to multivariate estimators that are entirely based on covariance estimates, such as PCA or PLS (Serneels, De Nolf, and Van Espen 2006). 43 | Moderate robustness means in this case that the resulting estimator can resist up to 50% of outliers, but will have a sizeable bias even for small fractions of contamination. The reason why this happens 44 | is that the spatial sign transform projects all cases onto the unit sphere indiscriminately, which can drastically change data topology, and thereby introduce bias. Recently, the generalized spatial sign transform has been proposed (Raymaekers and Rousseeuw 2019). 45 | These authors examine a set of different functions that can be plugged into the expression for $\mathbf{u}$, some of which will only transform those cases in the data that exceed a certain eccentricity threshold. These functions are the quadratic radial, ball, shell, Winsor and linear redescending (LR) functions, all of which can be accessed through direpack’s GenSpatialSignPreprocessor. 46 | 47 | 48 | Usage 49 | ========= 50 | 51 | 52 | 53 | .. currentmodule:: direpack.preprocessing.robcent 54 | 55 | .. autosummary:: 56 | :toctree: generated/ 57 | 58 | VersatileScaler 59 | 60 | .. currentmodule:: direpack.preprocessing.gsspp 61 | 62 | .. autosummary:: 63 | :toctree: generated/ 64 | 65 | GenSpatialSignPreProcessor 66 | 67 | 68 | 69 | 70 | 71 | 72 | References 73 | ============== 74 | 75 | 1. Maronna RA, Zamar RH (2002). “Robust estimates of location and dispersion for high-dimensional datasets.” Technometrics, 44(4), 307–317. 76 | 2. Rousseeuw PJ, Leroy AM (1987). Robust Regression and Outlier Detection. Wiley and Sons, New York 77 | 3. Raymaekers J, Rousseeuw PJ (2019). “A generalized spatial sign covariance matrix.” Journal of Multivariate Analysis, 171, 94–111. 78 | 4. Serneels S, De Nolf E, Van Espen PJ (2006). “Spatial Sign Preprocessing: A Simple Way ToImpart Moderate Robustness to Multivariate Estimators.” Journal of Chemical Information and Modeling, 46, 1402–1409. -------------------------------------------------------------------------------- /docs/sudire.rst: -------------------------------------------------------------------------------- 1 | .. 
_sudire: 2 | 3 | 4 | 5 | ################ 6 | sudire 7 | ################ 8 | 9 | Sufficient dimension reduction (SDR) is a recent take on dimension reduction, where one aims to estimate a set of latent variables 10 | that are linear combinations of the original variables :math:`\mathbf{T} = \mathbf{X}\mathbf{W}` in such a way that the subspace spanned by them contains all information 11 | relevant to the dependent variable in such a way that the subspace spanned by them contains all information relevant to the dependent variable: 12 | :math:`\mathbf{Y} \upvDash \mathbf{X}\ | \ \mathbf{T}.` Here, $\mathbf{X}$ is a sample of $n$ cases of a $p$ variate random variable and $\mathbf{Y}$ 13 | is a sample of the dependent variable, $\mathbf{W}$ is a $p \times q$ matrix with $q \leq p$, and $\upvDash$ denotes statistical independence. 14 | A lot of research has been done over the last thirty years investigating different approaches in 15 | terms of asymptotics and assumptions made in each of the approaches. A good textbook 16 | providing an overview of approaches to SDR is Li (2018). The subpackage sudire contains 17 | implementations of a broad set of these approaches. 18 | 19 | Generally speaking, SDR techniques roughly resort in three categories. At first, there is a 20 | successful set of approaches to SDR based on slicing the original space. Examples of these are 21 | sliced inverse regression (SIR, Li (1991)) and sliced-average variance estimation (SAVE, Cook 22 | (2000)). A second group of developments has involved selective focus on certain directions, 23 | which has resulted in, among others, directional regression (DR, Li (2007)), principal Hessian 24 | directions (PHD, Li (1992)) and the iterative Hessian transformations (IHT, Cook and Li 25 | (2002)). 26 | 27 | While all of the aforementioned methods are included in sudire and would merit a broader 28 | discussion, at this point we would like to highlight that sudire contains implementations of a 29 | more recent approach as well. The latter has, so far, resulted in three methods, all three of 30 | which share the following advantages: they do not require conditions of linearity or constant 31 | covariance, nor do they need distributional assumptions, yet they may be computationally 32 | more demanding. This third group of SDR algorithms estimates a basis of the central subspace as: 33 | 34 | .. math:: 35 | :nowrap: 36 | 37 | \begin{equation*} 38 | \begin{aligned} 39 | & \mathbf{W}_h = \argmax_{\mathbf{B}} & & \mathfrak{P}^2\left(\mathbf{X}\mathbf{B},\mathbf{Y}\right) \\ 40 | & \text{subject to} & & \mathbf{B}^T\mathbf{X}^T\mathbf{X}\mathbf{B} = \mathbf{I}_h,\\ 41 | \end{aligned} 42 | \end{equation*} 43 | 44 | 45 | 46 | where $\mathbf{B}$ is an arbitrary $p \times h$ matrix, $h \in [1,\min(n,p)]$. Here, $\mathfrak{P}$ can be any statistic, that estimate a subspace whose complement 47 | is independent of $\mathbf{Y}$. Currently implemented $\mathfrak{P}$ statistics are : 48 | 49 | * distance covariance (Székely, Rizzo, and Bakirov 2007), leading to option dcov-sdr (Sheng and Yin 2016); 50 | * martingale difference divergence (Shao and Zhang 2014), leading to option mdd-sdr (Zhang, Liu, Wu, and Fang 2019); 51 | * ball covariance (Pan, Wang, Xiao, and Zhu 2019), leading to option bcov-sdr (Zhang and Chen 2019) 52 | 53 | 54 | 55 | 56 | 57 | 58 | Usage 59 | =========== 60 | 61 | .. currentmodule:: direpack.sudire.sudire 62 | 63 | .. 
autosummary:: 64 | :toctree: generated/ 65 | :caption: Sudire 66 | 67 | sudire 68 | 69 | 70 | 71 | 72 | 73 | 74 | Dependencies 75 | ================ 76 | 77 | 78 | - From `sklearn.base`: `BaseEstimator`,`TransformerMixin`,`RegressorMixin` 79 | - From `sklearn.utils`: `_BaseComposition` 80 | - `copy` 81 | - From `scipy.stats` : `trim_mean` 82 | - From `scipy.linalg`: `inv`, `sqrtm` 83 | - `cython` 84 | - From `ipopt` : `minimize_ipopt` 85 | - `numpy` 86 | - From `statsmodels.regression.linear_model`: `OLS` 87 | - `statsmodels.robust` 88 | 89 | References 90 | ========== 91 | 1. Wenhui Sheng and Xiangrong Yin Sufficient Dimension Reduction via Distance Covariance, in: Journal of Computational and Graphical Statistics (2016), 25, issue 1, pages 91-104. 92 | 2. Yu Zhang, Jicai Liu, Yuesong Wu and Xiangzhong Fang, A martingale-difference-divergence-based estimation of central mean subspace, in: Statistics and Its Interface (2019), 12, number 3, pages 489-501. 93 | 3. Li K-C, Sliced Inverse Regression for Dimension Reduction, Journal of the American Statistical Association (1991), 86, 316-327. 94 | 4. R.D. Cook, and Sanford Weisberg, Sliced Inverse Regression for Dimension Reduction: Comment, Journal of the American Statistical Association (1991), 86, 328-332. 95 | 5. B. Li and S.Wang, On directional regression for dimension reduction, Journal of the American Statistical Association (2007), 102:997–1008. 96 | 6. K.-C. Li., On principal hessian directions for data visualization and dimension reduction:Another application of stein’s lemma, Journal of the American Statistical Association(1992)., 87,1025–1039. 97 | 7. R. D. Cook and B. Li., Dimension Reduction for Conditional Mean in Regression, The Annals of Statistics(2002)30(2):455–474. 98 | 8. Jia Zhang and Xin Chen, Robust Sufficient Dimension Reduction Via Ball Covariance Computational Statistics and Data Analysis 140 (2019) 144–154 99 | 9. Li B, Sufficient Dimension Reduction: Methods and Applications with R. (2018) Chapman& Hall /CRC, Monographs on Statistics and Applied Probability, New York 100 | -------------------------------------------------------------------------------- /src/direpack/preprocessing/gsspp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Created on Wed Mar 25 09:01:53 2020 5 | 6 | # @author: Sven Serneels, Ponalytics, Mar 2020. 7 | 8 | 9 | __all__ = ['GenSpatialSignPrePprocessor','gen_ss_pp','gen_ss_covmat'] 10 | 11 | from sklearn.base import BaseEstimator, TransformerMixin 12 | from sklearn.utils.validation import check_is_fitted 13 | from .robcent import VersatileScaler, versatile_scale 14 | from ._preproc_utilities import * 15 | from ..utils.utils import _check_input 16 | from ._gsspp_utils import * 17 | from ._gsspp_utils import _norms, _gsspp 18 | 19 | __all__ = ['GenSpatialSignPreProcessor', 'gen_ss_covmat', 'gen_ss_pp'] 20 | 21 | class GenSpatialSignPreProcessor(TransformerMixin,BaseEstimator): 22 | 23 | """ 24 | GenSpatialSignPreProcessor Generalized Spatial Sign Pre-Processing as a scikit-learn compatible object 25 | that can be used in ML pipelines. 
26 | 27 | Parameters 28 | ---------- 29 | center: str or function, 30 | location estimator for centring.str options: 'mean', 'median', 'l1median', 'kstepLTS', 'None' 31 | 32 | fun: str or function, 33 | radial transformation function, str options: 'ss' (the non-generalized spatial sign, equivalent to sklearn's Normalizer), 'ball', 'shell', 'quad' (quadratic), 'winsor', or 'linear_redescending' 34 | Methods: sklearn API: `fit(X)`, `transform(X)` and `fit_transform(X)` with 35 | 36 | 37 | Attributes 38 | ---------- 39 | Attributes always provided : 40 | 41 | - `gss_` : the generalized spatial signs 42 | - `Xm_` : the centred data 43 | - `centring_` : VersatileScaler centring object 44 | - `X_gss_pp_` : Data preprocessed by Generalized Spatial Sign 45 | """ 46 | 47 | def __init__(self,center='l1median',fun='linear_redescending'): 48 | 49 | self.center = center 50 | self.fun = fun 51 | 52 | def fit(self,X): 53 | 54 | """ 55 | Calculate and store generalized spatial signs 56 | """ 57 | 58 | X = _check_input(X) 59 | n,p = X.shape 60 | if type(self.fun) is str: 61 | fun = eval(self.fun) 62 | else: 63 | fun = self.fun 64 | vs = VersatileScaler(center=self.center,scale='None') 65 | Xm = vs.fit_transform(X) 66 | gss_ = fun(_norms(Xm),p,n) 67 | setattr(self,'gss_',gss_) 68 | setattr(self,'Xm_',Xm) 69 | setattr(self,'centring_',vs) 70 | 71 | def transform(self,X): 72 | 73 | """ 74 | Calculate Generalized Spatial Sign Pre-Pprocessed Data 75 | """ 76 | 77 | check_is_fitted(self,('gss_','Xm_')) 78 | Xgss = np.multiply(self.Xm_,self.gss_) 79 | setattr(self,'X_gsspp_',Xgss) 80 | return(Xgss) 81 | 82 | def fit_transform(self,X): 83 | 84 | self.fit(X) 85 | self.transform(X) 86 | return(self.X_gsspp_) 87 | 88 | 89 | 90 | 91 | def gen_ss_pp(X,center='l1median',fun='linear_redescending'): 92 | 93 | """ 94 | Generalized Spatial Sign Pre-Processing as a one pass function 95 | Inputs: 96 | X: Data matrix 97 | center: str or function, location estimator for centring. 98 | str options: 'mean', 'median', 'l1median', 'kstepLTS', 'None' 99 | fun: str or function, radial transformation function, 100 | str options: 'ss' (the non-generalized spatial sign, equivalent 101 | to sklearn's Normalizer), 'ball', 'shell', 'quad' (quadratic), 102 | 'winsor', or 'linear_redescending' 103 | Outputs: the pre-processed data 104 | """ 105 | 106 | if type(center) is str: 107 | center = eval(center) 108 | 109 | if type(fun) is str: 110 | fun = eval(fun) 111 | 112 | X = _check_input(X) 113 | n = X.shape 114 | if len(n) > 1: 115 | p = n[1] 116 | else: 117 | p = 1 118 | n = n[0] 119 | 120 | if center != 'None': 121 | X = versatile_scale(X,center=center,scale='None') 122 | 123 | return(_gsspp(X,p,n,fun=fun)) 124 | 125 | 126 | def gen_ss_covmat(X,center='kstepLTS',fun=linear_redescending): 127 | 128 | """ 129 | Generalized Spatial Sign Covariance Matrix 130 | Is equivalent to the covariance matrix of generalized spatial sign 131 | pre-processed data. 132 | 133 | First published in: 134 | A generalized spatial sign covariance matrix, 135 | Jakob Raymaekers, Peter Rousseeuw, 136 | Journal of Multivariate Analysis, 171 (2019), 94–111. 137 | 138 | Inputs: 139 | X: Data matrix 140 | center: str or function, location estimator for centring. 
141 | str options: 'mean', 'median', 'l1median', 'kstepLTS', 'None' 142 | fun: str or function, radial transformation function, 143 | str options: 'ss' (the non-generalized spatial sign, equivalent 144 | to sklearn's Normalizer), 'ball', 'shell', 'quad' (quadratic), 145 | 'winsor', or 'linear_redescending' 146 | 147 | Outputs: the generalized spatial sign covariance matrix 148 | """ 149 | 150 | X = _check_input(X) 151 | rc = VersatileScaler(center=center, scale='None') 152 | n,p = X.shape 153 | Xm = rc.fit_transform(X) 154 | Xgss = _gsspp(Xm,p,n,fun=fun) 155 | return(Xgss.T*Xgss/n) 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /docs/sprm.rst: -------------------------------------------------------------------------------- 1 | .. _sprm: 2 | 3 | ################ 4 | sprm 5 | ################ 6 | 7 | Sparse partial robust M regression (SPRM) is a sparse and robust alternative to PLS that can be calculated efficiently (Hoffmann, Serneels, Filzmoser,and Croux 2015). 8 | The subpackage is organized slightly differently from the other two mainsubpackages. Because SPRM combines the virtues of robust regression with sparse dimension reduction, 9 | besides the SPRM estimators itself, each of these building blocks are provided themselves as class objects that can be deployed in sklearn pipelines. 10 | The class objects rm, snipls and sprm are sourced by default when importing direpack. 11 | 12 | Robust M regression 13 | ===================== 14 | 15 | M regression is a generalization of least squares regression in the sense that it minimizes a more general objective that allows to tune the estimator's robustness. 16 | In M regression, the vector of regression coefficients is defined as: 17 | 18 | .. math:: 19 | :label: optim_rm 20 | :nowrap: 21 | 22 | \begin{equation*} 23 | \hat{\boldsymbol{\beta}} = \mathop{\mbox{argmin}}_{\boldsymbol{\beta}}\sum_i \rho\left(\frac{r_i(\boldsymbol{\beta})}{\hat{\sigma}}\right) 24 | \end{equation*} 25 | 26 | where $r_i$ are the casewise regression residuals and $\hat{\sigma}$ is a robust scale estimator thereof. The $\rho$ function defines the properties of the estimator. 27 | Identity to the least squares estimator is obtained if $\rho(r) = r^2$, but robustness can be introduced by taking a different function, 28 | for instance a function that is approximately quadratic for small (absolute) $r$, but increases more slowly than $r^2$ for larger values of $r$. 29 | Objective :eq:`optim_rm` can be solved numerically, but it is well known that its solution can equivalently be obtained through an iteratively reweighting least squares (IRLS), 30 | which is how it is implemented in sprm. In the package, the Fair, Huber or Hampel reweighting functions can be picked, which will lead to different robustness properties. 31 | 32 | 33 | 34 | Sparse NIPALS 35 | ===================== 36 | 37 | A second building block in the package is the SNIPLS algorithm. It is a sparse version of the NIPALS algorithm for PLS and as such, essentially a computationally efficient implementation of univariate sparse PLS. 38 | Again, the SNIPLS components are linear combinations of the original variables through a set of weighting vectors $\mathbf{w}_i$ that maximize: 39 | 40 | .. 
math:: 41 | :label: optim_snipls 42 | :nowrap: 43 | 44 | \begin{equation*} 45 | \begin{aligned} 46 | & \mathbf{w}_i &= \argmax_{\mathbf{a}} \mathop{\mbox{cov}^2}\left(\mathbf{a}^T\mathbf{X},\mathbf{y}\right) + \lambda \parallel\mathbf{a}\parallel_1 \\ 47 | & \text{subject to} & \mathbf{w}_i^T\mathbf{X}^T\mathbf{X}\mathbf{w}_j = 0 \mbox{ and } \parallel \mathbf{w}_i\parallel_2 = 1\\ 48 | \end{aligned} 49 | \end{equation*} 50 | 51 | 52 | which in sparse PLS is typically maximized through a surrogate formulation. However, in this case, the exact solution to Criterion :eq:`optim_snipls` can be obtained, 53 | which is what the SNIPLS algorithm builds upon. For details on the algorithm, the reader is referred to Hoffmann, Filzmoser, Serneels, and Varmuza (2016). 54 | At this point, remark that the SNIPLS algorithm has also become a key building block to analyze outlyingness (Debruyne, Höppner, Serneels,and Verdonck 2019). 55 | 56 | 57 | 58 | 59 | 60 | Sparse partial robust M 61 | ========================= 62 | 63 | Sparse partial robust M dimension reduction unites the benefits of SNIPLS and robust M estimation: it yields an efficient sparse PLS dimension reduction, while at the same time, 64 | it is robust against both leverage points and virtual outliers through robust M estimation. It is defined similarly as in :eq:`optim_snipls` but instead maximizing a weighted covariance, with case weights that depend on the data. 65 | Consistent with robust M estimation, it can be calculated through iteratively reweighting SNIPLS. SPRM improves upon the original reweighted PLS proposal by (i) yielding a sparse estimate, (ii) having a reweighting scheme as well as starting values that weight both in the score and residual spaces and (iii) by allowing different weight functions, the most tuneable one being the Hampel function. 66 | 67 | Usage 68 | =========== 69 | 70 | .. currentmodule:: direpack.sprm.sprm 71 | 72 | .. autosummary:: 73 | :toctree: generated/ 74 | :caption: SPRM 75 | 76 | sprm 77 | 78 | 79 | .. currentmodule:: direpack.sprm.snipls 80 | 81 | .. autosummary:: 82 | :toctree: generated/ 83 | :caption: SNIPLS 84 | 85 | snipls 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | Dependencies 96 | ================ 97 | 98 | - `pandas` 99 | - `numpy` 100 | 101 | 102 | References 103 | ================ 104 | 105 | 1. Irene Hoffmann, Sven Serneels, Peter Filzmoser, Christophe Croux, Sparse partial robust M regression, Chemometrics and Intelligent Laboratory Systems, 149 (2015), 50-59. 106 | 2. Sven Serneels, Christophe Croux, Peter Filzmoser, Pierre J. Van Espen, Partial robust M regression, Chemometrics and Intelligent Laboratory Systems, 79 (2005), 55-64. 107 | 3. Hoffmann I., P. Filzmoser, S. Serneels, K. Varmuza, Sparse and robust PLS for binary classification, Journal of Chemometrics, 30 (2016), 153-162. 108 | 4. Filzmoser P, Höppner S, Ortner I, Serneels S, Verdonck T. Cellwise robust M regression. Computational Statistics and Data Analysis,147 (2020). 
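Example
================

As a closing illustration of the estimators documented on this page, the sketch below fits the SNIPLS building block on synthetic data. The data are hypothetical; the keyword names ``n_components`` and ``eta`` follow the package's unit tests, and the import path mirrors the module names used in the Usage section above. SPRM exposes the same sklearn-style ``fit``/``predict`` interface, with additional arguments controlling the reweighting function and the robust centring and scaling.

.. code-block:: python

    import numpy as np
    from direpack.sprm.snipls import snipls

    rng = np.random.default_rng(1)
    X = rng.standard_normal((100, 8))                       # hypothetical predictors
    y = X[:, 0] - 2 * X[:, 1] + 0.1 * rng.standard_normal(100)

    # sparse NIPALS: two latent components, sparsity parameter eta
    sni = snipls(n_components=2, eta=0.5)
    sni.fit(X, y)
    y_hat = sni.predict(X)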
109 | -------------------------------------------------------------------------------- /src/direpack/test/test_sudire.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jun 30 13:17:46 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | import unittest 9 | import numpy as np 10 | import pandas as pd 11 | from direpack.sudire._sudire_utils import * 12 | from direpack import sudire 13 | from sklearn.model_selection import train_test_split 14 | 15 | 16 | class Testsudire(unittest.TestCase): 17 | """Test some methods in the sudire class""" 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | print("setupClass") 22 | 23 | @classmethod 24 | def tearDownClass(cls): 25 | print("teardownClass") 26 | 27 | def setUp(self): 28 | self.data = pd.read_csv("./data/boston_housing.csv") 29 | self.x = self.data 30 | self.y = self.x["MEDV"] 31 | self.x.drop("MEDV", axis=1, inplace=True) 32 | self.n = self.x.shape[0] 33 | self.p = self.x.shape[1] 34 | self.struct_dim = 2 35 | 36 | def tearDown(self): 37 | del self.x 38 | del self.y 39 | del self.n 40 | del self.p 41 | del self.data 42 | del self.struct_dim 43 | 44 | # def test_estimdim(self): 45 | # """ Tests the estimation of the central subspace via Bootstrap """ 46 | # 47 | # central_dim, diff_vec = estimate_structural_dim('dr',self.x_train.values,self.y_train.values , B=100, n_slices=4) 48 | # np.testing.assert_equal(central_dim,6) 49 | 50 | def test_sir(self): 51 | """Tests Sliced Inverse Regression""" 52 | 53 | # mod_auto = sudire('sir', center_data= True, scale_data=True,n_components=self.struct_dim) 54 | # mod_auto.fit(self.x_train.values, self.y_train.values) 55 | res_sir = SIR( 56 | self.x.values, 57 | self.y.values, 58 | 6, 59 | self.struct_dim, 60 | "continuous", 61 | True, 62 | True, 63 | ) 64 | test_ans = 3.9759408796493894 65 | np.testing.assert_almost_equal( 66 | np.linalg.norm(res_sir), test_ans, decimal=8 67 | ) 68 | 69 | def test_save(self): 70 | """Tests Sliced Average Variance Estimation""" 71 | 72 | # mod_auto = sudire('save', center_data= True, scale_data=True,n_components=self.struct_dim) 73 | # mod_auto.fit(self.x_train.values, self.y_train.values) 74 | res_save = SAVE( 75 | self.x.values, 76 | self.y.values, 77 | 6, 78 | self.struct_dim, 79 | "continuous", 80 | True, 81 | True, 82 | ) 83 | test_ans = 6.347895320407837 84 | np.testing.assert_almost_equal( 85 | np.linalg.norm(res_save), test_ans, decimal=8 86 | ) 87 | 88 | def test_dr(self): 89 | """Tests Directional Regression""" 90 | 91 | # mod_auto = sudire('dr', center_data= True, scale_data=True,n_components=self.struct_dim) 92 | # mod_auto.fit(self.x_train.values, self.y_train.values) 93 | res_dr = DR( 94 | self.x.values, 95 | self.y.values, 96 | 6, 97 | self.struct_dim, 98 | "continuous", 99 | True, 100 | True, 101 | ) 102 | test_ans = 4.013789664544885 103 | np.testing.assert_almost_equal( 104 | np.linalg.norm(res_dr), test_ans, decimal=8 105 | ) 106 | 107 | def test_iht(self): 108 | """Tests Iterative Hessian Transformations""" 109 | 110 | # mod_auto = sudire('iht', center_data= True, scale_data=True,n_components=self.struct_dim) 111 | # mod_auto.fit(self.x_train.values, self.y_train.values) 112 | res_iht = IHT( 113 | self.x.values, self.y.values, self.struct_dim, True, True 114 | ) 115 | # local linux -- resolve platform sensitivity!! 
116 | # test_ans = 0.22443355 117 | test_ans = 1.68656340 118 | np.testing.assert_almost_equal( 119 | np.linalg.norm(res_iht), test_ans, decimal=8 120 | ) 121 | 122 | def test_phd(self): 123 | """Tests Principal Hessian Directions""" 124 | 125 | # mod_auto = sudire('phd', center_data= True, scale_data=True,n_components=self.struct_dim) 126 | # mod_auto.fit(self.x_train.values, self.y_train.values) 127 | res_phd = PHD( 128 | self.x.values, self.y.values, self.struct_dim, True, True 129 | ) 130 | test_ans = 3.2904239864118763 131 | np.testing.assert_almost_equal( 132 | np.linalg.norm(res_phd), test_ans, decimal=8 133 | ) 134 | 135 | def test_dcov(self): 136 | """Test DCOV based SDR""" 137 | 138 | mod_auto = sudire( 139 | "dcov-sdr", 140 | center_data=True, 141 | scale_data=True, 142 | n_components=self.struct_dim, 143 | ) 144 | mod_auto.fit(self.x.values, self.y.values) 145 | test_ans = 1.4628980331787338 146 | np.testing.assert_almost_equal( 147 | np.linalg.norm(mod_auto.x_loadings_), test_ans, decimal=5 148 | ) 149 | 150 | def test_mdd(self): 151 | 152 | """Test MDD based SDR""" 153 | mod_auto = sudire( 154 | "mdd-sdr", 155 | center_data=True, 156 | scale_data=True, 157 | n_components=self.struct_dim, 158 | ) 159 | mod_auto.fit(self.x.values, self.y.values) 160 | test_ans = 3.5752717342726803 161 | np.testing.assert_almost_equal( 162 | np.linalg.norm(mod_auto.x_loadings_), test_ans, decimal=5 163 | ) 164 | 165 | 166 | if __name__ == "__main__": 167 | unittest.main() 168 | -------------------------------------------------------------------------------- /docs/sudire.md: -------------------------------------------------------------------------------- 1 | Sufficient Dimension Reduction 2 | ====================================== 3 | 4 | A `scikit-learn` compatible Python 3 package for Sufficient Dimension Reduction. 5 | This class implements a set of methods to perform Sufficient Dimension Reduction . 6 | 7 | Description 8 | ----------- 9 | 10 | Sufficient Dimension Reduction(SDR) is a general framework which aims to capture all the relevant information in high dimensional data. This capture of information is based on the notion that a combination of the predictors provides all the relevant information on the response, so that the rest of the predictors can be ignored. 11 | 12 | The different SDR methods implemented in this class are : 13 | - `dcov-sdr` : Sufficient Dimension Reduction via Distance Covariance 14 | - `mdd-sdr ` : Sufficient Dimension Reduction via Martingale Difference Divergence 15 | - `sir` : Sliced Inverse Regression 16 | - `save`: Sliced Average Variance Estimation 17 | - ` dr` : Directional Regression 18 | - ` phd ` : Principal Hessian Directions 19 | - `iht` : Iterative Hessian Transformation 20 | 21 | User defined functions can also be maximised by the method explained in \[1\]. For more details on how to use the implemented SDR methods and how to use user defined functions, have a look at the [sudire example notebook]() 22 | 23 | The `sudire` class also allows for estimation of the central subspace by optimizing an objective function . The optimization is performed using the Interior Point Optimizer (IPOPT) which is part of the [COIN-OR project](https://coin-or.github.io/Ipopt/) 24 | 25 | Remarks: 26 | - all the methods contained in this package have been designed for continuous data. Categorical or textual data first needs to be one hot encoded or embedded. 
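For a quick start, a minimal usage sketch is given below (the data are hypothetical; the options shown mirror the package's unit tests, and fitting the `dcov-sdr` or `mdd-sdr` objectives requires the IPOPT dependency listed further down):

```python
import numpy as np
from direpack import sudire

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 6))                       # hypothetical predictors
y = (X[:, 0] + X[:, 1]) ** 2 + 0.1 * rng.standard_normal(200)

# estimate a 2-dimensional central subspace via distance covariance
sd = sudire("dcov-sdr", center_data=True, scale_data=True, n_components=2)
sd.fit(X, y)

B = sd.x_loadings_        # estimated basis of the central subspace
T = sd.x_scores_          # projected (training) data
T_new = sd.transform(X)   # project cases onto the estimated subspace
```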
27 | 28 | The code is aligned to `scikit-learn`, such that modules such as `GridSearchCV` can flawlessly be applied to it. 29 | 30 | The `sudire` folder contains 31 | - The estimator (`sudire.py`) 32 | - Plotting functions for fitted sudire objects (`sudire_plot.py`) 33 | - Ancillary functions for sufficient dimension reduction (`_sudire_utils.py`) 34 | 35 | The sudire class 36 | ================ 37 | 38 | Dependencies 39 | ------------ 40 | - From `sklearn.base`: `BaseEstimator`,`TransformerMixin`,`RegressorMixin` 41 | - From `sklearn.utils`: `_BaseComposition` 42 | - `copy` 43 | - From `scipy.stats` : `trim_mean` 44 | - From `scipy.linalg`: `inv`, `sqrtm` 45 | - `cython` 46 | - From `ipopt` : `minimize_ipopt` 47 | - `numpy` 48 | - From `statsmodels.regression.linear_model`: `OLS` 49 | - `statsmodels.robust` 50 | 51 | 52 | 53 | Parameters 54 | ---------- 55 | - `sufdirmeth`, function or string. one of the elements in the list of implemented SDR methods 56 | or user defined function. 57 | - `n_components`, int. dimension of the central subspace. 58 | - `trimming`, float. trimming percentage for projection index, to be entered as pct/100 59 | - `optimizer_options`: dict with options to pass on to the ipopt optimizer. 60 | * `tol`: int: relative convergence tolerance. 61 | * `max_iter`: int. Maximal number of iterations. 62 | * `constr_viol_tol` : Desired threshold for the constraint violation. 63 | - `optimizer_constraints`: dict or list of dicts, further constraints to be 64 | passed on to the optimizer function. 65 | - `center`, str. How to center the data. options accepted are options from 66 | `direpack`'s `VersatileScaler`. 67 | - `center_data`, bool. 68 | - `scale_data`, bool. Note: if set to `False`, convergence to correct optimum 69 | is not a given. Will throw a warning. 70 | - `whiten_data`, bool. 71 | - `compression`, bool. If `True`, an internal SVD compression step is used for 72 | flat data tables (p > n). Speds up the calculations. 73 | - `copy`, bool. Whether to make a deep copy of the input data or not. 74 | - `verbose`, bool. Set to `True` prints the iteration number. 75 | - `return_scaling_object`, bool. 76 | Note: parameters concerning the data can also be passed to the `fit` method. 77 | 78 | Attributes 79 | ---------- 80 | Attributes always provided 81 | - `x_loadings_`: Estimated basis of the central subsapce 82 | - `x_scores_`: The projected X data. 83 | - `x_loc_`: location estimate for X 84 | - `x_sca_`: scale estimate for X 85 | - ` ols_obj` : fitted OLS objected 86 | - `y_loc_`: y location estimate 87 | - `y_sca_`: y scale estimate 88 | 89 | Attributes created only when corresponding input flags are `True`: 90 | - `whitening_`: whitened data matrix (usually denoted K) 91 | - `scaling_object_`: scaling object from `VersatileScaler` 92 | 93 | 94 | Methods 95 | -------- 96 | - `fit(X, *args, **kwargs)`: fit model 97 | - `predict(X)`: make predictions based on fit 98 | - `transform(X)`: project X onto latent space 99 | - `getattr()`: get list of attributes 100 | - `setattr(*kwargs)`: set individual attribute of sprm object 101 | 102 | The `fit` function takes several optional input arguments for user defined objective functions. 103 | 104 | 105 | 106 | 107 | 108 | References 109 | ---------- 110 | 1. [Sufficient Dimension Reduction via Distance Covariance](https://doi.org/10.1080/10618600.2015.1026601), Wenhui Sheng and Xiangrong Yin in: Journal of Computational and Graphical Statistics (2016), 25, issue 1, pages 91-104. 111 | 2. 
[A martingale-difference-divergence-based estimation of central mean subspace](https://dx.doi.org/10.4310/19-SII562), Yu Zhang, Jicai Liu, Yuesong Wu and Xiangzhong Fang, in: Statistics and Its Interface (2019), 12, number 3, pages 489-501. 112 | 3. [Sliced Inverse Regression for Dimension Reduction](https://www.tandfonline.com/doi/abs/10.1080/01621459.1991.10475035) Li K-C, Journal of the American Statistical Association (1991), 86, 316-327. 113 | 4. [Sliced Inverse Regression for Dimension Reduction: Comment](https://www.jstor.org/stable/2290564?seq=1#metadata_info_tab_contents), R.D. Cook, and Sanford Weisberg, Journal of the American Statistical Association (1991), 86, 328-332. 114 | 5. [On directional regression for dimension reduction](https://doi.org/10.1198/016214507000000536) , B. Li and S.Wang, Journal of the American Statistical Association (2007), 102:997–1008. 115 | 6. [On principal hessian directions for data visualization and dimension reduction:Another application of stein’s lemma](https://www.tandfonline.com/doi/abs/10.1080/01621459.1992.10476258), K.-C. Li. , Journal of the American Statistical Association(1992)., 87,1025–1039. 116 | 7. [Dimension Reduction for Conditional Mean in Regression](https://pdfs.semanticscholar.org/fd99/4f0cd554790eb8e0449440a59dcd47cf3396.pdf), R. D. Cook and B. Li., The Annals of Statistics(2002)30(2):455–474. 117 | 8. [Robust Sufficient Dimension Reduction Via Ball Covariance](https://www.sciencedirect.com/science/article/pii/S0167947319301380) Jia Zhang and Xin Chen, Computational Statistics and Data Analysis 140 (2019) 144–154 118 | -------------------------------------------------------------------------------- /src/direpack/plot/sudire_plot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 11 17:25:42 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | from __future__ import absolute_import, division, print_function 9 | from __future__ import unicode_literals 10 | 11 | 12 | 13 | from ..sudire.sudire import sudire 14 | from ..utils.utils import MyException 15 | import matplotlib.pyplot as pp 16 | import numpy as np 17 | 18 | 19 | class sudire_plot(sudire): 20 | 21 | def __init__(self,res_sudire,colors,markers=['o','d','v'],*args): 22 | """ 23 | Initialize with 24 | res_sudire, a sudire class object 25 | 26 | Only mandatory input is colors, a list of colors for 27 | [0] borders of pane 28 | [1] plot background 29 | [2] marker fill 30 | [3] diagonal line 31 | [4] marker contour, if different from fill 32 | [5] marker color for new cases, if applicable 33 | 34 | """ 35 | if not(isinstance(res_sudire,sudire)): 36 | raise(MyException("Object supplied to sudireplot needs to be a sudire object")) 37 | self.res_sudire = res_sudire 38 | self.colors = colors 39 | self.markers = markers 40 | 41 | def plot_yyp(self,ytruev=[],Xn=[],label=[],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False): 42 | """ 43 | plot_yyp will plot y vs y predicted for sudire M opbjects 44 | Optional inputs: 45 | ytruev: array (new_cases,) of predictands 46 | Xn: array (new_cases,variables) of predictors 47 | If these arguments are supplied, sudire predictions for ytrue will be 48 | made from Xn through res_sudire.predict() 49 | label: string: name of variable to be plotted. Will show in legend. 
50 | names: list or tuple of strings, casenames from training set 51 | namesv: list or tuple of strings, casenames from test set 52 | title: String containing plot title 53 | legend_pos: string containing legend position 54 | onlyval: boolean: only plot validation cases 55 | """ 56 | 57 | if len(label)==0: 58 | label = 'none' 59 | fig = pp.figure() 60 | fig.set_facecolor(self.colors[0]) 61 | pp.rcParams['axes.facecolor'] = self.colors[1] 62 | ax1 = fig.add_subplot(111) 63 | if (not(onlyval)): 64 | ytruec = self.res_sudire.y0 65 | if len(ytruec.shape) >1: 66 | ytruec = np.array(ytruec).reshape(-1).astype('float64') 67 | ypredc = np.array(self.res_sudire.ols_obj_.fittedvalues).T.reshape(-1) 68 | 69 | ax1.scatter(ytruec, ypredc, c=self.colors[2], label=label, 70 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 71 | pp.xlabel("y-true") 72 | pp.ylabel("y-pred") 73 | 74 | else: 75 | if (len(Xn)==0): 76 | ValueError('In onlyval=True mode, new cases Xn need to be provided') 77 | if not(len(Xn)==0): 78 | if len(ytruev.shape) >1: 79 | ytruev = np.array(ytruev).reshape(-1).astype('float64') 80 | ypredv = self.res_sudire.predict(Xn) 81 | ypredv = np.array(ypredv).reshape(-1).astype('float64') 82 | 83 | ax1.scatter(ytruev,ypredv,c=self.colors[5],label=label, 84 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 85 | pp.xlabel("y-true") 86 | pp.ylabel("y-pred") 87 | 88 | x_abline = np.array(ax1.get_xbound()) 89 | ax1.add_line(pp.Line2D(x_abline,x_abline,color=self.colors[3])) 90 | if len(label)==0: 91 | ax1.legend_.remove() 92 | else: 93 | pp.legend(loc=legend_pos) 94 | if len(names)>0: 95 | if not(onlyval): 96 | for i in range(0,len(names)-1): 97 | ax1.annotate(names[i], (ytruec[i],ypredc[i])) 98 | if len(namesv)>0: 99 | for i in range(0,len(namesv)-1): 100 | ax1.annotate(namesv[i], (ytruev[i],ypredv[i])) 101 | if len(title)>0: 102 | pp.title(title) 103 | pp.show() 104 | 105 | def plot_projections(self,Xn=[],label=[],components = [0,1],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False): 106 | 107 | """ 108 | plot_projections will plot the score space 109 | Optional inputs: 110 | Xn: array (new_cases,variables) of predictors 111 | If supplied, sudire projections for new cases will be 112 | made from Xn through res_sudire.transform() 113 | label: string: name of variable to be plotted. Will show in legend. 
114 | names: list or tuple of strings, casenames from training set 115 | namesv: list or tuple of strings, casenames from test set 116 | title: String containing plot title 117 | legend_pos: string containing legend position 118 | onlyval: boolean: only plot validation cases 119 | """ 120 | 121 | if len(label)==0: 122 | label = 'none' 123 | fig = pp.figure() 124 | fig.set_facecolor(self.colors[0]) 125 | pp.rcParams['axes.facecolor'] = self.colors[1] 126 | ax1 = fig.add_subplot(111) 127 | if (not(onlyval)): 128 | Tc = np.array(self.res_sudire.x_scores_) 129 | ax1.scatter(Tc[:,components[0]], Tc[:,components[1]], c=self.colors[2], label=label, 130 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 131 | else: 132 | if (len(Xn)==0): 133 | ValueError('In onlyval=True mode, new cases Xn need to be provided') 134 | if not(len(Xn)==0): 135 | Tv = np.array(self.res_sudire.transform(Xn)) 136 | ax1.scatter(Tv[:,components[0]], Tv[:,components[1]],c=self.colors[5],label=label, 137 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 138 | if len(label)==0: 139 | ax1.legend_.remove() 140 | else: 141 | pp.legend(loc=legend_pos) 142 | if len(names)>0: 143 | if not(onlyval): 144 | for i in range(0,len(names)-1): 145 | ax1.annotate(names[i], (Tc[i,components[0]], Tc[i,components[1]])) 146 | if len(namesv)>0: 147 | for i in range(0,len(namesv)-1): 148 | ax1.annotate(namesv[i], (Tv[i,components[0]], Tv[i,components[1]])) 149 | if len(title)>0: 150 | pp.title(title) 151 | pp.show() 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /src/direpack/ppdire/_ppdire_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Jan 2 2020 5 | 6 | @author: Sven Serneels, Ponalytics. 7 | """ 8 | 9 | import numpy as np 10 | import pandas as ps 11 | 12 | 13 | def pp_objective(x,est,X,opt_args): 14 | 15 | """ 16 | Optimization objective for ppdire 17 | 18 | """ 19 | 20 | n = len(x) 21 | x = np.array(x).reshape((n,1)) 22 | return(-est.fit(np.matmul(X,x),**opt_args)) 23 | 24 | def gridplane(X,most,pi_arguments={},**kwargs): 25 | 26 | """ 27 | Function for grid search in a plane in two dimensions 28 | 29 | Required: X, np.array(n,2), data, 30 | most, class object, projection index. Designed for 31 | dicomo or capi classes. 32 | Optional: pi_arguments, dict: arguments to pass on to projection index, 33 | plus a few local arguments such as optrange and square_pi 34 | (see ppdire for explanation) 35 | 36 | y, np.array(n,1), second block of data 37 | biascorr, to apply bias correction at normal distribution 38 | alphamat, np.array: matrix of alpha angles to be scanned. 
39 | 40 | 41 | Values: 42 | wi, np.array(p,1): optimal direction 43 | maximo, float: optimal value of projection index 44 | 45 | Note: this function is written exclusively to be called from within the ppdire class 46 | 47 | """ 48 | 49 | 50 | if (('biascorr' not in kwargs) and ('biascorr' not in pi_arguments)): 51 | biascorr = False 52 | else: 53 | biascorr = kwargs.get('biascorr') 54 | 55 | if len(pi_arguments) == 0: 56 | 57 | pi_arguments = { 58 | 'alpha': 0, 59 | 'ndir': 1000, 60 | 'trimming': 0, 61 | 'biascorr': biascorr, 62 | 'dmetric' : 'euclidean', 63 | 'alphamat': None, 64 | 'optrange': (-1,1), 65 | 'square_pi': False 66 | } 67 | 68 | if ('y' in kwargs): 69 | y = kwargs.pop('y') 70 | pi_arguments['y'] = y 71 | 72 | optrange = pi_arguments['optrange'] 73 | optmax = optrange[1] 74 | 75 | alphamat = kwargs.pop('alphamat',pi_arguments['alphamat']) 76 | if (alphamat != None): 77 | optrange = np.sign(optrange) 78 | stop0s = np.arcsin(optrange[0]) 79 | stop1s = np.arcsin(optrange[1]) 80 | stop1c = np.arccos(optrange[0]) 81 | stop0c = np.arccos(optrange[1]) 82 | anglestart = max(stop0c,stop0s) 83 | anglestop = max(stop1c,stop1s) 84 | nangle = np.linspace(anglestart,anglestop,pi_arguments['ndir'],endpoint=False) 85 | alphamat = np.array([np.cos(nangle), np.sin(nangle)]) 86 | if optmax != 1: 87 | alphamat *= optmax 88 | 89 | tj = np.matmul(X,alphamat) 90 | if pi_arguments['square_pi']: 91 | meas = [most.fit(tj[:,i],**pi_arguments)**2 92 | for i in np.arange(0,pi_arguments['ndir'])] 93 | else: 94 | meas = [most.fit(tj[:,i],**pi_arguments) 95 | for i in np.arange(0,pi_arguments['ndir'])] 96 | 97 | maximo = np.max(meas) 98 | indmax = np.where(meas == maximo)[0] 99 | if len(indmax)>0: 100 | indmax = indmax[0] 101 | wi = np.array(alphamat[:,indmax]).reshape((2,1)) 102 | 103 | return(wi,maximo) 104 | 105 | 106 | 107 | def gridplane_2(X,most,q,div,pi_arguments={},**kwargs): 108 | 109 | """ 110 | Function for refining a grid search in a plane in two dimensions 111 | 112 | Required: X, np.array(n,2), data 113 | most, class object, projection index. Designed for 114 | dicomo or capi classes. 115 | q, np.array(1,1), last obtained suboptimal direction component 116 | div, float, number of subsegments to divide angle into 117 | 118 | Optional: pi_arguments, dict: arguments to pass on to projection index, 119 | plus a few local arguments such as optrange and square_pi 120 | (see ppdire for explanation) 121 | 122 | y, np.array(n,1), second block of data 123 | biascorr, to apply bias correction at normal distribution 124 | alphamat, np.array: matrix of alpha angles to be scanned. 
125 | 126 | pi_arguments is a dict of arguments passed on to the projection index 127 | 128 | Values: 129 | wi, np.array(p,1): optimal direction 130 | maximo, float: optimal value of projection index 131 | 132 | Note: this function is written to be called from within the ppdire class 133 | 134 | """ 135 | 136 | if (('biascorr' not in kwargs) and ('biascorr' not in pi_arguments)): 137 | biascorr = False 138 | else: 139 | biascorr = kwargs.get('biascorr') 140 | 141 | if len(pi_arguments) == 0: 142 | 143 | pi_arguments = { 144 | 'alpha': 0, 145 | 'ndir': 1000, 146 | 'trimming': 0, 147 | 'biascorr': biascorr, 148 | 'dmetric' : 'euclidean', 149 | 'alphamat': None, 150 | 'optrange': (-1,1), 151 | 'square_pi': False 152 | } 153 | 154 | 155 | if 'y' in kwargs: 156 | y = kwargs.pop('y') 157 | pi_arguments['y'] = y 158 | 159 | optrange = pi_arguments['optrange'] 160 | optmax = optrange[1] 161 | 162 | alphamat = kwargs.pop('alphamat',pi_arguments['alphamat']) 163 | if (alphamat != None).any(): 164 | anglestart = min(pi_arguments['_stop0c'],pi_arguments['_stop0s']) 165 | anglestop = min(pi_arguments['_stop1c'],pi_arguments['_stop1s']) 166 | nangle = np.linspace(anglestart,anglestop,pi_arguments['ndir'],endpoint=True) 167 | alphamat = np.array([np.cos(nangle), np.sin(nangle)]) 168 | if optmax != 1: 169 | alphamat *= optmax 170 | alpha1 = alphamat 171 | divisor = np.sqrt(1 + 2*np.multiply(alphamat[0,:].reshape(1,-1),alphamat[1,:].reshape(1,-1))*q[0]) 172 | alpha1 = np.divide(alphamat,np.repeat(divisor,2,0)) 173 | tj = np.dot(X,alpha1) 174 | 175 | if pi_arguments['square_pi']: 176 | meas = [most.fit(tj[:,i],**pi_arguments)**2 177 | for i in np.arange(0,pi_arguments['ndir'])] 178 | else: 179 | meas = [most.fit(tj[:,i],**pi_arguments) 180 | for i in np.arange(0,pi_arguments['ndir'])] 181 | 182 | maximo = np.max(meas) 183 | indmax = np.where(meas == maximo)[0] 184 | if len(indmax)>0: 185 | indmax = indmax[0] 186 | wi = np.array(alpha1[:,indmax]).reshape((2,1)) 187 | 188 | return(wi,maximo) 189 | -------------------------------------------------------------------------------- /src/direpack/plot/ppdire_plot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 11 17:25:42 2020 4 | 5 | @author: Emmanuel Jordy Menvouta & Sven Serneels 6 | """ 7 | 8 | from __future__ import absolute_import, division, print_function 9 | from __future__ import unicode_literals 10 | 11 | 12 | 13 | from ..ppdire.ppdire import ppdire 14 | from ..utils.utils import MyException 15 | import matplotlib.pyplot as pp 16 | import numpy as np 17 | 18 | 19 | class ppdire_plot(ppdire): 20 | 21 | def __init__(self,res_ppdire,colors,markers=['o','d','v'],*args): 22 | """ 23 | Initialize with 24 | res_ppdire, a ppdire class object 25 | 26 | Only mandatory input is colors, a list of colors for 27 | [0] borders of pane 28 | [1] plot background 29 | [2] marker fill 30 | [3] diagonal line 31 | [4] marker contour, if different from fill 32 | [5] marker color for new cases, if applicable 33 | 34 | """ 35 | if not(isinstance(res_ppdire,ppdire)): 36 | raise(MyException("Object supplied to ppdireplot needs to be a ppdire object")) 37 | self.res_ppdire = res_ppdire 38 | self.colors = colors 39 | self.markers = markers 40 | 41 | def plot_yyp(self,ytruev=[],Xn=[],label=[],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False): 42 | """ 43 | plot_yyp will plot y vs y predicted for ppdire M opbjects 44 | Optional inputs: 45 | ytruev: array (new_cases,) of 
predictands 46 | Xn: array (new_cases,variables) of predictors 47 | If these arguments are supplied, ppdire predictions for ytrue will be 48 | made from Xn through res_ppdire.predict() 49 | label: string: name of variable to be plotted. Will show in legend. 50 | names: list or tuple of strings, casenames from training set 51 | namesv: list or tuple of strings, casenames from test set 52 | title: String containing plot title 53 | legend_pos: string containing legend position 54 | onlyval: boolean: only plot validation cases 55 | """ 56 | 57 | if len(label)==0: 58 | label = 'none' 59 | fig = pp.figure() 60 | fig.set_facecolor(self.colors[0]) 61 | pp.rcParams['axes.facecolor'] = self.colors[1] 62 | ax1 = fig.add_subplot(111) 63 | if (not(onlyval)): 64 | ytruec = self.res_ppdire.y0 65 | if len(ytruec.shape) >1: 66 | ytruec = np.array(ytruec).reshape(-1).astype('float64') 67 | ypredc = np.array(self.res_ppdire.fitted_).T.reshape(-1) 68 | labelcr = label + ' Training' 69 | ax1.scatter(ytruec, ypredc, c=self.colors[2], label=labelcr, 70 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 71 | pp.xlabel("y-true") 72 | pp.ylabel("y-pred") 73 | 74 | else: 75 | if (len(Xn)==0): 76 | ValueError('In onlyval=True mode, new cases Xn need to be provided') 77 | if not(len(Xn)==0): 78 | if len(ytruev.shape) >1: 79 | ytruev = np.array(ytruev).reshape(-1).astype('float64') 80 | ypredv = self.res_ppdire.predict(Xn) 81 | ypredv = np.array(ypredv).reshape(-1).astype('float64') 82 | labelvr = label + ' Test' 83 | ax1.scatter(ytruev,ypredv,c=self.colors[5],label=labelvr, 84 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 85 | pp.xlabel("y-true") 86 | pp.ylabel("y-pred") 87 | 88 | x_abline = np.array(ax1.get_xbound()) 89 | ax1.add_line(pp.Line2D(x_abline,x_abline,color=self.colors[3])) 90 | if len(label)==0: 91 | ax1.legend_.remove() 92 | else: 93 | pp.legend(loc=legend_pos) 94 | if len(names)>0: 95 | if not(onlyval): 96 | for i in range(0,len(names)-1): 97 | ax1.annotate(names[i], (ytruec[i],ypredc[i])) 98 | if len(namesv)>0: 99 | for i in range(0,len(namesv)-1): 100 | ax1.annotate(namesv[i], (ytruev[i],ypredv[i])) 101 | if len(title)>0: 102 | pp.title(title) 103 | pp.show() 104 | 105 | def plot_projections(self,Xn=[],label=[],components = [0,1],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False): 106 | 107 | """ 108 | plot_projections will plot the score space 109 | Optional inputs: 110 | Xn: array (new_cases,variables) of predictors 111 | If supplied, ppdire projections for new cases will be 112 | made from Xn through res_ppdire.transform() 113 | label: string: name of variable to be plotted. Will show in legend. 
114 | names: list or tuple of strings, casenames from training set 115 | namesv: list or tuple of strings, casenames from test set 116 | title: String containing plot title 117 | legend_pos: string containing legend position 118 | onlyval: boolean: only plot validation cases 119 | """ 120 | 121 | if len(label)==0: 122 | label = 'none' 123 | fig = pp.figure() 124 | fig.set_facecolor(self.colors[0]) 125 | pp.rcParams['axes.facecolor'] = self.colors[1] 126 | ax1 = fig.add_subplot(111) 127 | if (not(onlyval)): 128 | Tc = np.array(self.res_ppdire.x_scores_) 129 | labelcr = label + ' Training' 130 | ax1.scatter(Tc[:,components[0]], Tc[:,components[1]], c=self.colors[2], label=labelcr, 131 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 132 | else: 133 | if (len(Xn)==0): 134 | ValueError('In onlyval=True mode, new cases Xn need to be provided') 135 | if not(len(Xn)==0): 136 | Tv = np.array(self.res_ppdire.transform(Xn)) 137 | labelvr = label + ' Test' 138 | ax1.scatter(Tv[:,components[0]], Tv[:,components[1]],c=self.colors[5],label=labelvr, 139 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 140 | if len(label)==0: 141 | ax1.legend_.remove() 142 | else: 143 | pp.legend(loc=legend_pos) 144 | if len(names)>0: 145 | if not(onlyval): 146 | for i in range(0,len(names)-1): 147 | ax1.annotate(names[i], (Tc[i,components[0]], Tc[i,components[1]])) 148 | if len(namesv)>0: 149 | for i in range(0,len(namesv)-1): 150 | ax1.annotate(namesv[i], (Tv[i,components[0]], Tv[i,components[1]])) 151 | if len(title)>0: 152 | pp.title(title) 153 | pp.show() 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /docs/ppdire.rst: -------------------------------------------------------------------------------- 1 | .. _ppdire: 2 | 3 | 4 | ################ 5 | ppdire 6 | ################ 7 | 8 | Beyond discussion, the class of dimension reduction with the longest standing history accessible through direpack, is projection pursuit (PP) dimension reduction. 9 | Let $\mathbf{X}$ be a data matrix that is a sample of $n$ cases of a $p$ variate random variable and $\mathbf{y}$ be a sample of a corresponding depending variable, when applicable. 10 | The set of projection pursuit scores $\mathbf{t}_i$ that span the columns of $\mathbf{T}$ are defined as linear combinations of the original variables: $\mathbf{T} = \mathbf{X}\mathbf{W}$, where the $\mathbf{w}_i$ are 11 | the solution to the optimization problem: 12 | 13 | .. math:: 14 | :label: optim_ppdire 15 | :nowrap: 16 | 17 | \begin{equation*} 18 | \begin{aligned} 19 | & \underset{\mathbf{a}}{\text{maximise}} & & \mathfrak{P}\left(\mathbb{S}\left(\mathbf{a}^T\mathbf{X}\right)\right) \\ 20 | & \text{subject to} & & \mathbf{w}_i^T\mathbf{X}^T\mathbf{X}\mathbf{w}_j = 0 \mbox{ and } \parallel \mathbf{w}_i\parallel_2 = 1,\\ 21 | \end{aligned} 22 | \end{equation*} 23 | 24 | 25 | 26 | where $i,j \in [1,\min(n,p)]$, $j > i$ and the set $\mathbb{S} = \{\mathbf{X},\mathbf{y}\}$ if data for a dependent variable $Y$ exist and is a singleton containing $\mathbf{X}$ otherwise. 27 | Maximization of this criterion is very flexible and the properties of the dimension reduction accomplished according to it can vary widely, mainly dependent on the presence or absence of dependent 28 | variable data, as well as on $\mathfrak{P}$, which in the PP literature is referred to as the projection index. 29 | 30 | dicomo 31 | =========== 32 | 33 | The projection index determines which method is being calculated. 
34 | In direpack, projection pursuit can be called through the ppdire subpackage and class object, which allows the user to pass any function of appropriate dimensionality as a projection index. 35 | However, a set of popular projection indices derived from (co-)moments is provided as well through the dicomo subpackage. For several of these, plugging them in leads to well-established methods. They comprise: 36 | 37 | * Moment statistics: variance (PCA), higher order moments 38 | * Co-moment statistics: covariance (PLS), higher order co-moments 39 | * Standardized moments: skewness (ICA), kurtosis (ICA) 40 | * Standardized co-moments: correlation coefficient (CCA), co-skewness, co-kurtosis 41 | * Linear combinations of (standardized co-) moments. Here, the capi.py file in the ppdire subpackage delivers the co-moment analysis projection index (Serneels, 2019). 42 | * Products of (co-)moments. In particular, the continuum association measure has been provided, which is given by $\mathop{\mbox{cont}}(\mathbf{X},\mathbf{y}) = \mathop{\mbox{cov}}(\mathbf{X},\mathbf{y})\mathop{\mbox{var}}(\mathbf{X})^{\alpha-1}$. Using this continuum measure produces continuum regression (CR; Stone and Brooks, 1990). CR is equivalent to PLS for $\alpha = 1$ and approaches PCA as $\alpha \rightarrow\infty$. 43 | 44 | 45 | 46 | pp optimizers 47 | ============== 48 | 49 | The early ideas behind PP were to scan all directions in search of those maximizing the projection index, as denoted in :eq:`optim_ppdire`. This essentially corresponds to a brute-force optimization technique, which can be computationally very demanding. 50 | Not every method requires it: both PCA and PLS, for instance, can be solved analytically, leading to efficient algorithms that do not directly optimize :eq:`optim_ppdire`. Whenever the plugged-in projection index leads to a convex optimization problem, it is advisable to apply an efficient numerical optimization technique. For that purpose, ppdire has the option to use scipy.optimize’s sequential least squares quadratic programming optimization (SLSQP). However, for projection indices based on ordering or ranking data, such as medians or trimmed (co-)moments, the problem is no longer convex and cannot be solved through SLSQP. 51 | For those purposes, the grid algorithm is included, which was originally developed to compute RCR (Filzmoser, Serneels, Croux, and Van Espen, 2006). 52 | 53 | Regularized regression 54 | ======================= 55 | 56 | While the main focus of direpack is dimension reduction, all dimension reduction techniques offer a bridge to regularized regression. 57 | This can be achieved by regressing the dependent variable onto the estimated dimension-reduced space. The latter provides regularization of the covariance matrix, 58 | due to the constraints in :eq:`optim_ppdire`, and allows regression for an undersampled $\mathbf{X}$. The classical option is to predict $\mathbf{y}$ through least squares regression: 59 | 60 | .. math:: 61 | :nowrap: 62 | 63 | \begin{equation*} 64 | \hat{\mathbf{y}} = \hat{\mathbf{T}} \hat{\mathbf{T}}^T\mathbf{y} 65 | \end{equation*} 66 | 67 | which again leads to well-established methods such as principal component regression (PCR), PLS regression, etc. 68 | 69 | 70 | 71 | Usage 72 | =========== 73 | .. currentmodule:: direpack.ppdire.ppdire 74 | 75 | .. autosummary:: 76 | :toctree: generated/ 77 | :caption: PPDIRE 78 | 79 | ppdire 80 | 81 | 82 | .. currentmodule:: direpack.dicomo.dicomo 83 | 
84 | .. autosummary:: 85 | :toctree: generated/ 86 | :caption: DICOMO 87 | 88 | dicomo 89 | 90 | 91 | 92 | 93 | 94 | 95 | Dependencies 96 | ================ 97 | 98 | - From `sklearn.base`: `BaseEstimator`,`TransformerMixin`,`RegressorMixin` 99 | - From `sklearn.utils`: `_BaseComposition` 100 | - `copy` 101 | - `scipy.stats` 102 | - From `scipy.linalg`: `pinv2` 103 | - From `scipy.optimize`: `minimize` 104 | - `numpy` 105 | - From `statsmodels.regression.quantile_regression`: `QuantReg` 106 | - From `sklearn.utils.extmath`: `svd_flip` 107 | 108 | 109 | References 110 | ========== 111 | 1. Peter Filzmoser, Sven Serneels, Christophe Croux and Pierre J. Van Espen, Robust Multivariate Methods: The Projection Pursuit Approach, in: From Data and Information Analysis to Knowledge Engineering, Spiliopoulou, M., Kruse, R., Borgelt, C., Nuernberger, A. and Gaul, W., eds., Springer Verlag, Berlin, Germany, 2006, pages 270--277. 112 | 113 | 2. Sven Serneels, Projection pursuit based generalized betas accounting for higher order co-moment effects in financial market analysis, in: JSM Proceedings, Business and Economic Statistics Section. Alexandria, VA: American Statistical Association, 2019, 3009-3035. 114 | 115 | 3. Chen, Z. and Li, G., Robust principal components and dispersion matrices via projection pursuit, Research Report, Department of Statistics, Harvard University, 1981. 116 | 117 | 4. Sven Serneels, Peter Filzmoser, Christophe Croux, Pierre J. Van Espen, Robust Continuum Regression, Chemometrics and Intelligent Laboratory Systems, 76 (2005), 197-204. 118 | 119 | 5. Stone M, Brooks RJ (1990). “Continuum Regression: Cross-Validated Sequentially Constructed Prediction Embracing Ordinary Least Squares, Partial Least Squares and Principal Components Regression.” Journal of the Royal Statistical Society. Series B (Methodological), 52, 237–269. 120 | -------------------------------------------------------------------------------- /src/direpack/ppdire/capi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun May 12 10:03:05 2019 5 | 6 | @author: Sven Serneels, Ponalytics. 7 | """ 8 | 9 | from sklearn.base import BaseEstimator 10 | from sklearn.utils.metaestimators import _BaseComposition 11 | from collections import defaultdict 12 | import inspect 13 | from ..dicomo.dicomo import dicomo 14 | from ..dicomo._dicomo_utils import * 15 | 16 | 17 | class capi(_BaseComposition, BaseEstimator): 18 | 19 | """ 20 | CAPI Co-moment analysis projection index 21 | 22 | The CAPI projection index to estimate generalized betas was first introduced 23 | in: 24 | 25 | S. Serneels, Projection pursuit based generalized betas accounting for 26 | higher order co-moment effects in financial market analysis, in: 27 | JSM Proceedings, Business and Economic Statistics Section. 28 | Alexandria, VA: American Statistical Association, 2019, 3009-3035. 29 | 30 | Class arguments 31 | 32 | max_degree, int: maximal degree of co-moments to be used. In [2,3,4]. 33 | 34 | projection_index, class object: class used to calculate co-moments. 35 | Written to work with the dicomo class, yet other plugins could be written. 36 | 37 | pi_arguments, dict: dict of arguments to pass on to projection_index 38 | 39 | weights, list of float: weights to be used in the linear combination of co-moments. 
40 | 41 | centring, bool 42 | 43 | scaling, bool whether to calculate CAPI based on scaled higher co-moments 44 | (co-skewness, co-kurtosis) or raw higher co-moments 45 | 46 | options, either a list of co-moment options to be included, or 'all' (e.g. 47 | option=i calculates M3,i and M4,i etc.) 48 | 49 | After intializing the object, call object.fit(x,y,**kwargs) to evaluate. 50 | CAPI takes no direct kwargs, yet passes all kwargs on to the fit method of 51 | the projection index. 52 | 53 | """ 54 | 55 | def __init__( 56 | self, 57 | max_degree=2, 58 | projection_index=dicomo, 59 | pi_arguments={}, 60 | weights=[1, 1, 1, -1, -1, -1], 61 | centring=False, 62 | scaling=True, 63 | options="all", 64 | ): 65 | self.max_degree = max_degree 66 | self.projection_index = projection_index 67 | self.pi_arguments = pi_arguments 68 | self.weights = weights 69 | self.most = self.projection_index(**self.pi_arguments) 70 | self.scaling = scaling 71 | self.options = options 72 | self.capi_index_ = None 73 | if self.max_degree > 4: 74 | raise (ValueError("Maximal degree is 4.")) 75 | 76 | def fit(self, x, y, **kwargs): 77 | 78 | if self.scaling: 79 | order_kwargs = ["cov", "cos", "cok"] 80 | else: 81 | order_kwargs = ["com", "com", "com"] 82 | 83 | if self.max_degree < 2: 84 | raise (ValueError("capi not meaningful for max_degree < 2")) 85 | if self.options == "all": 86 | options = np.arange(1, 4) 87 | else: 88 | options = np.array(self.options, ndmin=1) 89 | moments = np.zeros(6) 90 | fit_arguments = {"order": 0, "y": y} 91 | fit_arguments = {**kwargs, **fit_arguments} 92 | init_moment_calc = 2 93 | k = 0 94 | for i in range(init_moment_calc, self.max_degree + 1): 95 | fit_arguments["order"] = i 96 | self.most.set_params(mode=order_kwargs[i - 2]) 97 | l = min(i - 1, len(options)) 98 | for j in options[np.arange(0, l)]: 99 | fit_arguments["option"] = j 100 | moments[i - 3 + j + k] = self.most.fit(x, **fit_arguments) 101 | if i == 3: 102 | k += 1 103 | capi_index_ = np.dot(self.weights, moments) 104 | self.capi_index_ = capi_index_ 105 | self.moments_ = moments 106 | return capi_index_ 107 | 108 | @classmethod 109 | def _get_param_names(cls): 110 | """Get parameter names for the estimator""" 111 | # fetch the constructor or the original constructor before 112 | # deprecation wrapping if any 113 | init = getattr(cls.__init__, "deprecated_original", cls.__init__) 114 | if init is object.__init__: 115 | # No explicit constructor to introspect 116 | return [] 117 | 118 | # introspect the constructor arguments to find the model parameters 119 | # to represent 120 | init_signature = inspect.signature(init) 121 | # Consider the constructor parameters excluding 'self' 122 | parameters = [ 123 | p 124 | for p in init_signature.parameters.values() 125 | if p.name != "self" and p.kind != p.VAR_KEYWORD 126 | ] 127 | for p in parameters: 128 | if p.kind == p.VAR_POSITIONAL: 129 | raise RuntimeError( 130 | "scikit-learn estimators should always " 131 | "specify their parameters in the signature" 132 | " of their __init__ (no varargs)." 133 | " %s with constructor %s doesn't " 134 | " follow this convention." % (cls, init_signature) 135 | ) 136 | # Extract and sort argument names excluding 'self' 137 | return sorted([p.name for p in parameters]) 138 | 139 | def get_params(self, deep=False): 140 | """Get parameters for this estimator. 141 | Parameters 142 | ---------- 143 | deep : boolean, optional 144 | If True, will return the parameters for this estimator and 145 | contained subobjects that are estimators. 
146 | Returns 147 | ------- 148 | params : mapping of string to any 149 | Parameter names mapped to their values. 150 | ------ 151 | Copied from ScikitLlearn instead of imported to avoid 'deep=True' 152 | """ 153 | out = dict() 154 | for key in self._get_param_names(): 155 | value = getattr(self, key, None) 156 | if deep and hasattr(value, "get_params"): 157 | deep_items = value.get_params().items() 158 | out.update((key + "__" + k, val) for k, val in deep_items) 159 | out[key] = value 160 | return out 161 | 162 | def set_params(self, **params): 163 | """Set the parameters of this estimator. 164 | Copied from ScikitLearn, adapted to avoid calling 'deep=True' 165 | Returns 166 | ------- 167 | self 168 | ------ 169 | Copied from ScikitLlearn instead of imported to avoid 'deep=True' 170 | """ 171 | if not params: 172 | # Simple optimization to gain speed (inspect is slow) 173 | return self 174 | valid_params = self.get_params() 175 | 176 | nested_params = defaultdict(dict) # grouped by prefix 177 | for key, value in params.items(): 178 | key, delim, sub_key = key.partition("__") 179 | if key not in valid_params: 180 | raise ValueError( 181 | "Invalid parameter %s for estimator %s. " 182 | "Check the list of available parameters " 183 | "with `estimator.get_params().keys()`." % (key, self) 184 | ) 185 | 186 | if delim: 187 | nested_params[key][sub_key] = value 188 | else: 189 | setattr(self, key, value) 190 | valid_params[key] = value 191 | 192 | for key, sub_params in nested_params.items(): 193 | valid_params[key].set_params(**sub_params) 194 | 195 | return self 196 | -------------------------------------------------------------------------------- /src/direpack/ipopt_temp/ipopt_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | cyipopt: Python wrapper for the Ipopt optimization package, written in Cython. 5 | 6 | Copyright (C) 2012-2015 Amit Aides 7 | Copyright (C) 2015-2018 Matthias Kümmerer 8 | 9 | Author: Matthias Kümmerer 10 | (original Author: Amit Aides ) 11 | URL: https://github.com/matthias-k/cyipopt 12 | License: EPL 1.0 13 | 14 | This section is copied from ipopt until the fix in jacobians.py gets included into 15 | that package. 
16 | 17 | """ 18 | 19 | from __future__ import absolute_import, unicode_literals 20 | import sys 21 | 22 | from builtins import bytes # from the future package 23 | import numpy as np 24 | try: 25 | import scipy 26 | except ImportError: # scipy is not installed 27 | SCIPY_INSTALLED = False 28 | else: 29 | SCIPY_INSTALLED = True 30 | del scipy 31 | from scipy.optimize import approx_fprime 32 | try: 33 | from scipy.optimize import OptimizeResult 34 | except ImportError: 35 | # in scipy 0.14 Result was renamed to OptimzeResult 36 | from scipy.optimize import Result 37 | OptimizeResult = Result 38 | 39 | import cyipopt 40 | from .jacobian import FunctionWithApproxJacobianCentral,FunctionWithApproxJacobian 41 | 42 | 43 | class IpoptProblemWrapper(object): 44 | def __init__(self, fun, args=(), kwargs=None, jac=None, hess=None, hessp=None, 45 | constraints=(), eps=1e-8): 46 | if not SCIPY_INSTALLED: 47 | raise ImportError('Install SciPy to use the `IpoptProblemWrapper` class.') 48 | self.fun_with_jac = None 49 | self.last_x = None 50 | if hess is not None or hessp is not None: 51 | raise NotImplementedError('Using hessian matrixes is not yet implemented!') 52 | if jac is None: 53 | #fun = FunctionWithApproxJacobian(fun, epsilon=eps, verbose=False) 54 | jac = lambda x0, *args, **kwargs: approx_fprime(x0, fun, eps, *args, **kwargs) 55 | elif jac is True: 56 | self.fun_with_jac = fun 57 | elif not callable(jac): 58 | raise NotImplementedError('jac has to be bool or a function') 59 | self.fun = fun 60 | self.jac = jac 61 | self.args = args 62 | self.kwargs = kwargs or {} 63 | self._constraint_funs = [] 64 | self._constraint_jacs = [] 65 | self._constraint_args = [] 66 | if isinstance(constraints, dict): 67 | constraints = (constraints, ) 68 | for con in constraints: 69 | con_fun = con['fun'] 70 | con_jac = con.get('jac', None) 71 | if con_jac is None: 72 | con_fun = FunctionWithApproxJacobian(con_fun, epsilon=eps, verbose=False) 73 | con_jac = con_fun.jac 74 | con_args = con.get('args', []) 75 | self._constraint_funs.append(con_fun) 76 | self._constraint_jacs.append(con_jac) 77 | self._constraint_args.append(con_args) 78 | # Set up evaluation counts 79 | self.nfev = 0 80 | self.njev = 0 81 | self.nit = 0 82 | 83 | def evaluate_fun_with_grad(self, x): 84 | if self.last_x is None or not np.all(self.last_x == x): 85 | self.last_x = x 86 | self.nfev += 1 87 | self.last_value = self.fun(x, *self.args, **self.kwargs) 88 | return self.last_value 89 | 90 | def objective(self, x): 91 | if self.fun_with_jac: 92 | return self.evaluate_fun_with_grad(x)[0] 93 | 94 | self.nfev += 1 95 | return self.fun(x, *self.args, **self.kwargs) 96 | 97 | def gradient(self, x, **kwargs): 98 | if self.fun_with_jac: 99 | return self.evaluate_fun_with_grad(x)[1] 100 | 101 | self.njev += 1 102 | return self.jac(x, *self.args, **self.kwargs) # .T 103 | 104 | def constraints(self, x): 105 | con_values = [] 106 | for fun, args in zip(self._constraint_funs, self._constraint_args): 107 | con_values.append(fun(x, *args)) 108 | return np.hstack(con_values) 109 | 110 | def jacobian(self, x): 111 | con_values = [] 112 | for fun, args in zip(self._constraint_jacs, self._constraint_args): 113 | con_values.append(fun(x, *args)) 114 | return np.vstack(con_values) 115 | 116 | def intermediate( 117 | self, 118 | alg_mod, 119 | iter_count, 120 | obj_value, 121 | inf_pr, 122 | inf_du, 123 | mu, 124 | d_norm, 125 | regularization_size, 126 | alpha_du, 127 | alpha_pr, 128 | ls_trials 129 | ): 130 | 131 | self.nit = iter_count 132 | 133 | 134 | def 
get_bounds(bounds): 135 | if bounds is None: 136 | return None, None 137 | else: 138 | lb = [b[0] for b in bounds] 139 | ub = [b[1] for b in bounds] 140 | return lb, ub 141 | 142 | 143 | def get_constraint_bounds(constraints, x0, INF=1e19): 144 | if isinstance(constraints, dict): 145 | constraints = (constraints, ) 146 | cl = [] 147 | cu = [] 148 | if isinstance(constraints, dict): 149 | constraints = (constraints, ) 150 | for con in constraints: 151 | m = len(np.atleast_1d(con['fun'](x0, *con.get('args', [])))) 152 | cl.extend(np.zeros(m)) 153 | if con['type'] == 'eq': 154 | cu.extend(np.zeros(m)) 155 | elif con['type'] == 'ineq': 156 | cu.extend(INF*np.ones(m)) 157 | else: 158 | raise ValueError(con['type']) 159 | cl = np.array(cl) 160 | cu = np.array(cu) 161 | 162 | return cl, cu 163 | 164 | 165 | def replace_option(options, oldname, newname): 166 | if oldname in options: 167 | if newname not in options: 168 | options[newname] = options.pop(oldname) 169 | 170 | def convert_to_bytes(options): 171 | if sys.version_info >= (3, 0): 172 | for key in list(options.keys()): 173 | try: 174 | if bytes(key, 'utf-8') != key: 175 | options[bytes(key, 'utf-8')] = options[key] 176 | options.pop(key) 177 | except TypeError: 178 | pass 179 | 180 | def minimize_ipopt(fun, x0, args=(), kwargs=None, method=None, jac=None, hess=None, hessp=None, 181 | bounds=None, constraints=(), tol=None, callback=None, options=None): 182 | """ 183 | Minimize a function using ipopt. The call signature is exactly like for 184 | `scipy.optimize.mimize`. In options, all options are directly passed to 185 | ipopt. Check [http://www.coin-or.org/Ipopt/documentation/node39.html] for 186 | details. 187 | The options `disp` and `maxiter` are automatically mapped to their 188 | ipopt-equivalents `print_level` and `max_iter`. 
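Example (a minimal sketch; the quadratic objective, the constraint and the option values below are purely illustrative, and `cyipopt` needs to be installed):

    import numpy as np

    fun = lambda x: np.sum((x - 1.0) ** 2)                    # smooth objective
    cons = ({'type': 'ineq', 'fun': lambda x: x[0] - 0.5},)   # enforce x[0] >= 0.5
    res = minimize_ipopt(fun, x0=np.zeros(2), constraints=cons,
                         bounds=[(-2.0, 2.0), (-2.0, 2.0)],
                         options={'disp': 0, 'maxiter': 200})
    print(res.x, res.fun)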
189 | """ 190 | if not SCIPY_INSTALLED: 191 | raise ImportError('Install SciPy to use the `minimize_ipopt` function.') 192 | 193 | _x0 = np.atleast_1d(x0) 194 | problem = IpoptProblemWrapper(fun, args=args, kwargs=kwargs, jac=jac, hess=hess, 195 | hessp=hessp, constraints=constraints) 196 | lb, ub = get_bounds(bounds) 197 | 198 | cl, cu = get_constraint_bounds(constraints, x0) 199 | 200 | if options is None: 201 | options = {} 202 | 203 | nlp = cyipopt.problem(n = len(_x0), 204 | m = len(cl), 205 | problem_obj=problem, 206 | lb=lb, 207 | ub=ub, 208 | cl=cl, 209 | cu=cu) 210 | 211 | # python3 compatibility 212 | convert_to_bytes(options) 213 | 214 | # Rename some default scipy options 215 | replace_option(options, b'disp', b'print_level') 216 | replace_option(options, b'maxiter', b'max_iter') 217 | if b'print_level' not in options: 218 | options[b'print_level'] = 0 219 | if b'tol' not in options: 220 | options[b'tol'] = tol or 1e-8 221 | if b'mu_strategy' not in options: 222 | options[b'mu_strategy'] = b'adaptive' 223 | if b'hessian_approximation' not in options: 224 | if hess is None and hessp is None: 225 | options[b'hessian_approximation'] = b'limited-memory' 226 | for option, value in options.items(): 227 | try: 228 | nlp.addOption(option, value) 229 | except TypeError as e: 230 | raise TypeError('Invalid option for IPOPT: {0}: {1} (Original message: "{2}")'.format(option, value, e)) 231 | 232 | x, info = nlp.solve(_x0) 233 | 234 | if np.asarray(x0).shape == (): 235 | x = x[0] 236 | 237 | return OptimizeResult(x=x, success=info['status'] == 0, status=info['status'], 238 | message=info['status_msg'], 239 | fun=info['obj_val'], 240 | info=info, 241 | nfev=problem.nfev, 242 | njev=problem.njev, 243 | nit=problem.nit) 244 | -------------------------------------------------------------------------------- /docs/ppdire.md: -------------------------------------------------------------------------------- 1 | Projection Pursuit Dimension Reduction 2 | ====================================== 3 | 4 | A `scikit-learn` compatible Python 3 package for Projection Pursuit Dimension Reduction. 5 | This class implements a very general framweork for projection pursuit, giving access to 6 | methods ranging from PP-PCA to CAPI generalized betas. 7 | 8 | Description 9 | ----------- 10 | 11 | Projection pursuit (PP) provides a very general framework for dimension reduction and regression. The 12 | `ppdire` class provides a framework to calculate PP estimates based on a wide variety of projection 13 | indices. 14 | 15 | While the class will also work with user-defined projection indices, a set of projection indices are 16 | included into the `direpack` package as two ancillary classes: 17 | - `dicomo` class for (co-)moment statistics (separate folder), cf the [dicomo Documentation file](https://github.com/SvenSerneels/direpack/blob/master/docs/dicomo.md) 18 | - `capi` specifically for analyzing financial market returns based on a linear combination of co-moments \[2\] 19 | 20 | When using the `dicomo` class as a plugin, several well-known multivariate dimension reduction techniques 21 | are accessible, as well as robust alternatives thereto. For more details, have a look at the [ppdire examples notebook](https://github.com/SvenSerneels/direpack/blob/master/examples/ppdire_example.ipynb). 22 | 23 | The `ppdire` class allows for calculation of the projection pursuit optimization either 24 | through `scipy.optimize` or through the native grid\[1\] algorithm. 
Optimization through 25 | `scipy.optimize` is much more efficient, yet it will only provide correct results 26 | for classical projection indices. The native grid algorithm should be used when 27 | the projection index involves order statistics of any kind, such as ranks, trimming, 28 | winsorizing, or empirical quantiles. 29 | 30 | Remarks: 31 | - all the methods contained in this package have been designed for continuous data. They do not work correctly for categorical or textual data. 32 | - this package focuses on projection pursuit dimension reduction. Regression methods that involve a dimension reduction step can be accessed through it 33 | (e.g. PCR, PLS, RCR, ...), yet the package does not provide an implementation for projection pursuit regression (PPR). To access PPR, we refer to 34 | the `projection-pursuit` package, also distributed through PIP. 35 | 36 | The code is aligned to `scikit-learn`, such that modules such as `GridSearchCV` can flawlessly be applied to it. 37 | 38 | The `ppdire` folder contains 39 | - The estimator (`ppdire.py`) 40 | - A class for the co-moment analysis projection index (`capi.py`) 41 | - Ancillary functions for projection pursuit (`_ppdire_utils.py`) 42 | 43 | The ppdire class 44 | ================ 45 | 46 | Dependencies 47 | ------------ 48 | - From `sklearn.base`: `BaseEstimator`,`TransformerMixin`,`RegressorMixin` 49 | - From `sklearn.utils`: `_BaseComposition` 50 | - `copy` 51 | - `scipy.stats` 52 | - From `scipy.linalg`: `pinv2` 53 | - From `scipy.optimize`: `minimize` 54 | - `numpy` 55 | - From `statsmodels.regression.quantile_regression`: `QuantReg` 56 | - From `sklearn.utils.extmath`: `svd_flip` 57 | 58 | 59 | Parameters 60 | ---------- 61 | - `projection_index`, function or class. `dicomo` and `capi` supplied in this 62 | package can both be used, but user defined projection indices can 63 | be processed 64 | - `pi_arguments`, dict. Dict of arguments to be passed on to `projection index` 65 | - `n_components`, int. number of components to be estimated 66 | - `trimming`, float. trimming percentage for projection index, to be entered as pct/100 67 | - `alpha`, float. Continuum coefficient. Only relevant if `ppdire` is used to 68 | estimate (classical or robust) continuum regression. 69 | - `optimizer`: str. Presently: either `'grid'` (native optimizer) or 70 | any of the options in `scipy-optimize` (e.g. `'SLSQP'`) 71 | - `optimizer_options`: dict with options to pass on to the optimizer. 72 | If `optimizer == 'grid'`, 73 | * `ndir`: int: Number of directions to calculate per iteration. 74 | * `maxiter`: int. Maximal number of iterations. 75 | - `optimizer_constraints`: dict or list of dicts, further constraints to be 76 | passed on to the optimizer function. 77 | - `regopt`, str. Regression option for regression step y~T. Can be set 78 | to `'OLS'` (default), `'robust'` (will run `sprm.rm`) or `'quantile'` 79 | (`statsmodels.regression.quantreg`). 80 | - `center`, str. How to center the data. options accepted are options from 81 | `direpack`'s `VersatileScaler`. 82 | - `center_data`, bool. 83 | - `scale_data`, bool. Note: if set to `False`, convergence to correct optimum 84 | is not a given. Will throw a warning. 85 | - `whiten_data`, bool. Typically used for ICA (kurtosis as PI) 86 | - `square_pi`, bool. Whether to square the projection index upon evaluation. 87 | - `compression`, bool. If `True`, an internal SVD compression step is used for 88 | flat data tables (p > n). Speds up the calculations. 89 | - `copy`, bool. 
Whether to make a deep copy of the input data or not. 90 | - `verbose`, bool. Set to `True` prints the iteration number. 91 | - `return_scaling_object`, bool. 92 | Note: several interesting parameters can also be passed to the `fit` method. 93 | 94 | Attributes 95 | ---------- 96 | Attributes always provided 97 | - `x_weights_`: X block PPDIRE weighting vectors (usually denoted W) 98 | - `x_loadings_`: X block PPDIRE loading vectors (usually denoted P) 99 | - `x_scores_`: X block PPDIRE score vectors (usually denoted T) 100 | - `x_ev_`: X block explained variance per component 101 | - `x_Rweights_`: X block SIMPLS style weighting vectors (usually denoted R) 102 | - `x_loc_`: X block location estimate 103 | - `x_sca_`: X block scale estimate 104 | - `crit_values_`: vector of evaluated values for the optimization objective. 105 | - `Maxobjf_`: vector containing the optimized objective per component. 106 | 107 | Attributes created when more than one block of data is provided: 108 | - `C_`: vector of inner relationship between response and latent variables block 109 | - `coef_`: vector of regression coefficients, if second data block provided 110 | - `intercept_`: intercept 111 | - `coef_scaled_`: vector of scaled regression coeeficients (when scaling option used) 112 | - `intercept_scaled_`: scaled intercept 113 | - `residuals_`: vector of regression residuals 114 | - `y_ev_`: y block explained variance 115 | - `fitted_`: fitted response 116 | - `y_loc_`: y location estimate 117 | - `y_sca_`: y scale estimate 118 | 119 | Attributes created only when corresponding input flags are `True`: 120 | - `whitening_`: whitened data matrix (usually denoted K) 121 | - `mixing_`: mixing matrix estimate 122 | - `scaling_object_`: scaling object from `VersatileScaler` 123 | 124 | 125 | Methods 126 | -------- 127 | - `fit(X, *args, **kwargs)`: fit model 128 | - `predict(X)`: make predictions based on fit 129 | - `transform(X)`: project X onto latent space 130 | - `getattr()`: get list of attributes 131 | - `setattr(*kwargs)`: set individual attribute of sprm object 132 | 133 | The `fit` function takes several optional input arguments. These are flags that 134 | typically would not need to be cross-validated. They are: 135 | - `y`, numpy vector or 1D matrix, either as `arg` directly or as `kwarg` 136 | - `h`, int. Overrides `n_components` for an individual call to `fit`. Use with caution. 137 | - `dmetric`, str. Distance metric used internally. Defaults to `'euclidean'` 138 | - `mixing`, bool. Return mixing matrix? 139 | - Further parameters to the regression methods can be passed on here 140 | as additional `kwargs`. 141 | 142 | 143 | Ancillary functions 144 | ------------------- 145 | - `dicomo` (class): (co-)moments 146 | - `capi` (class): co-moment analysis projection index 147 | 148 | 149 | References 150 | ---------- 151 | 1. [Robust Multivariate Methods: The Projection Pursuit Approach](https://link.springer.com/chapter/10.1007/3-540-31314-1_32), Peter Filzmoser, Sven Serneels, Christophe Croux and Pierre J. Van Espen, in: From Data and Information Analysis to Knowledge Engineering, 152 | Spiliopoulou, M., Kruse, R., Borgelt, C., Nuernberger, A. and Gaul, W., eds., 153 | Springer Verlag, Berlin, Germany, 154 | 2006, pages 270--277. 155 | 2. [Projection pursuit based generalized betas accounting for higher order co-moment effects in financial market analysis](https://arxiv.org/pdf/1908.00141.pdf), Sven Serneels, in: 156 | JSM Proceedings, Business and Economic Statistics Section. 
Alexandria, VA: American Statistical Association, 2019, 3009-3035. 157 | 3. Robust principal components and dispersion matrices via projection pursuit, Chen, Z. and Li, G., Research Report, Department of Statistics, Harvard University, 1981. 158 | 4. [Robust Continuum Regression](https://www.sciencedirect.com/science/article/abs/pii/S0169743904002667), Sven Serneels, Peter Filzmoser, Christophe Croux, Pierre J. Van Espen, Chemometrics and Intelligent Laboratory Systems, 76 (2005), 197-204. 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | `direpack`: a Python 3 library for state-of-the-art statistical dimension reduction techniques 2 | ============================================================================================== 3 | 4 | This package delivers a `scikit-learn` compatible Python 3 package for sundry state-of-the art multivariate statistical methods, with 5 | a focus on dimension reduction. 6 | 7 | The categories of methods delivered in this package, are: 8 | - Projection pursuit dimension reduction (`ppdire`) 9 | - Sufficient dimension reduction (`sudire`) 10 | - Robust M-estimators for dimension reduction (`sprm`) 11 | each of which are presented as `scikit-learn` compatible objects in the corresponding folders. 12 | 13 | We hope that this package leads to scientific success. If it does so, we kindly ask to cite the [official `direpack` publication](https://www.sciencedirect.com/science/article/pii/S235271102200200X) \[0\], as well as the original publication of the corresponding method. 14 | 15 | The package also contains a set of tools for pre- and postprocessing: 16 | - The `preprocessing` folder provides classical and robust centring and scaling, as well as spatial sign transforms \[4\] and the robbustness inducing wrapping transformation \[15\]. 17 | - The `dicomo` folder contains a versatile class to access a wide variety of moment and co-moment statistics, and statistics derived from those. Check out the [dicomo Documentation file](https://github.com/SvenSerneels/direpack/blob/master/docs/dicomo.md) and the [dicomo Examples Notebook](https://github.com/SvenSerneels/direpack/blob/master/examples/dicomo_example.ipynb). 18 | - Plotting utilities in the `plot` folder 19 | - Cross-validation utilities in the `cross-validation` folder 20 | 21 | ![AIG sprm score space](https://github.com/SvenSerneels/direpack/blob/master/img/AIG_T12.png "AIG SPRM score space") 22 | 23 | 24 | Methods in the `sprm` folder 25 | ---------------------------- 26 | - The estimator (`sprm.py`) \[1\] 27 | - The Sparse NIPALS (SNIPLS) estimator \[3\](`snipls.py`) 28 | - Robust M regression estimator (`rm.py`) 29 | - Ancillary functions for M-estimation (`_m_support_functions.py`) 30 | 31 | Methods in the `ppdire` folder 32 | ------------------------------ 33 | The `ppdire` class will give access to a wide range of projection pursuit dimension reduction techniques. 34 | These include slower approximate estimates for well-established methods such as PCA, PLS and continuum regression. 35 | However, the class provides unique access to a set of robust options, such as robust continuum regression (RCR) \[5\], through its native `grid` optimization algorithm, first 36 | published for RCR as well \[6\]. Moreover, `ppdire` is also a great gateway to calculate generalized betas, using the CAPI projection index \[7\]. 
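
As a sketch, a generalized beta could be estimated along the following lines (the data and parameter values are purely illustrative; see the [ppdire example notebook](https://github.com/SvenSerneels/direpack/blob/master/examples/ppdire_example.ipynb) for worked examples):

    import numpy as np
    from direpack.ppdire.ppdire import ppdire
    from direpack.ppdire.capi import capi

    X = np.random.randn(250, 5)   # illustrative explanatory return series
    y = np.random.randn(250, 1)   # illustrative asset return series

    est = ppdire(projection_index=capi, pi_arguments={'max_degree': 3},
                 n_components=1, optimizer='grid')
    est.fit(X, y=y)
    generalized_beta = est.x_weights_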
37 | 38 | The code is orghanized in 39 | - `ppdire.py` - the main PP dimension reduction class 40 | - `capi.py` - the co-moment analysis projection index. 41 | 42 | Methods in the `sudire` folder 43 | ------------------------------ 44 | The `sudire` folder gives access to an extensive set of methods that resort under the umbrella of sufficient dimension reduction. 45 | These range from meanwhile long-standing, well-accepted approaches, such as sliced inverse regression (SIR) and the closely related SAVE \[8,9\], 46 | through methods such as directional regression \[10\] and principal Hessian directions \[11\], and more. However, the package also contains some 47 | of the most recently developed, state-of-the-art sufficient dimension reduction techniques, that require no distributional assumptions. 48 | The options provided in this category are based on energy statistics (distance covariance \[12\] or martingale difference divergence \[13\]) and 49 | ball statistics (ball covariance) \[14\]. All of these options can be called by setting the corresponding parameters in the `sudire` class, cf. [the docs](https://github.com/SvenSerneels/direpack/blob/master/docs/sudire.md). 50 | Note: the ball covariance option will require some lines to be uncommented as indicated. We decided not to make that option generally available, 51 | since it depends on the `Ball` package that seems to be difficult to install on certain architectures. 52 | 53 | How to install 54 | -------------- 55 | The package is distributed through PyPI, so install through: 56 | 57 | pip install direpack 58 | 59 | Note that some of the key methods in the `sudire` subpackage rely on the IPOPT 60 | optimization package, which according to their recommendation, can best be installed 61 | directly as: 62 | 63 | conda install -c conda-forge cyipopt 64 | 65 | Documentation 66 | ============= 67 | 68 | - Detailed documentation can be found in the [ReadTheDocs page](https://direpack.readthedocs.io/en/latest/index.html). 69 | - A more extensive description on the background is presented in the [official `direpack` publication](https://www.sciencedirect.com/science/article/pii/S235271102200200X). 70 | - Examples on how to use each of the `dicomo`, `ppdire`, `sprm` and `sudire` classes are presented as Jupyter notebooks in the [examples](https://github.com/SvenSerneels/direpack/blob/master/examples) folder 71 | - Furthemore, the [docs](https://github.com/SvenSerneels/direpack/blob/master/docs) folder contains a few markdown files on usage of the classes. 72 | 73 | 74 | 75 | References 76 | ========== 77 | 0. [`direpack`: A Python 3 package for state-of-the-art statistical dimensionality reduction methods](https://www.sciencedirect.com/science/article/pii/S235271102200200X), Emmanuel Jordy Menvouta, Sven Serneels, Tim Verdonck, SoftwareX, 21 (2023), 101282. 78 | 1. [Sparse partial robust M regression](https://www.sciencedirect.com/science/article/abs/pii/S0169743915002440), Irene Hoffmann, Sven Serneels, Peter Filzmoser, Christophe Croux, Chemometrics and Intelligent Laboratory Systems, 149 (2015), 50-59. 79 | 2. [Partial robust M regression](https://doi.org/10.1016/j.chemolab.2005.04.007), Sven Serneels, Christophe Croux, Peter Filzmoser, Pierre J. Van Espen, Chemometrics and Intelligent Laboratory Systems, 79 (2005), 55-64. 80 | 3. [Sparse and robust PLS for binary classification](https://onlinelibrary.wiley.com/doi/abs/10.1002/cem.2775), I. Hoffmann, P. Filzmoser, S. Serneels, K. Varmuza, Journal of Chemometrics, 30 (2016), 153-162. 81 | 4. 
[Spatial Sign Preprocessing:  A Simple Way To Impart Moderate Robustness to Multivariate Estimators](https://pubs.acs.org/doi/abs/10.1021/ci050498u), Sven Serneels, Evert De Nolf, Pierre J. Van Espen, Journal of Chemical Information and Modeling, 46 (2006), 1402-1409. 82 | 5. [Robust Continuum Regression](https://www.sciencedirect.com/science/article/abs/pii/S0169743904002667), Sven Serneels, Peter Filzmoser, Christophe Croux, Pierre J. Van Espen, Chemometrics and Intelligent Laboratory Systems, 76 (2005), 197-204. 83 | 6. [Robust Multivariate Methods: The Projection Pursuit Approach](https://link.springer.com/chapter/10.1007/3-540-31314-1_32), Peter Filzmoser, Sven Serneels, Christophe Croux and Pierre J. Van Espen, in: From Data and Information Analysis to Knowledge Engineering, Spiliopoulou, M., Kruse, R., Borgelt, C., Nuernberger, A. and Gaul, W., eds., Springer Verlag, Berlin, Germany, 2006, pages 270--277. 84 | 7. [Projection pursuit based generalized betas accounting for higher order co-moment effects in financial market analysis](https://arxiv.org/pdf/1908.00141.pdf), Sven Serneels, in: JSM Proceedings, Business and Economic Statistics Section. Alexandria, VA: American Statistical Association, 2019, 3009-3035. 85 | 8. [Sliced Inverse Regression for Dimension Reduction](https://www.tandfonline.com/doi/abs/10.1080/01621459.1991.10475035) Li K-C, Journal of the American Statistical Association (1991), 86, 316-327. 86 | 9. [Sliced Inverse Regression for Dimension Reduction: Comment](https://www.jstor.org/stable/2290564?seq=1#metadata_info_tab_contents), R.D. Cook, and Sanford Weisberg, Journal of the American Statistical Association (1991), 86, 328-332. 87 | 10. [On directional regression for dimension reduction](https://doi.org/10.1198/016214507000000536) , B. Li and S.Wang, Journal of the American Statistical Association (2007), 102:997–1008. 88 | 11. [On principal hessian directions for data visualization and dimension reduction:Another application of stein’s lemma](https://www.tandfonline.com/doi/abs/10.1080/01621459.1992.10476258), K.-C. Li. , Journal of the American Statistical Association(1992)., 87,1025–1039. 89 | 12. [Sufficient Dimension Reduction via Distance Covariance](https://doi.org/10.1080/10618600.2015.1026601), Wenhui Sheng and Xiangrong Yin in: Journal of Computational and Graphical Statistics (2016), 25, issue 1, pages 91-104. 90 | 13. [A martingale-difference-divergence-based estimation of central mean subspace](https://dx.doi.org/10.4310/19-SII562), Yu Zhang, Jicai Liu, Yuesong Wu and Xiangzhong Fang, in: Statistics and Its Interface (2019), 12, number 3, pages 489-501. 91 | 14. [Robust Sufficient Dimension Reduction Via Ball Covariance](https://www.sciencedirect.com/science/article/pii/S0167947319301380) Jia Zhang and Xin Chen, Computational Statistics and Data Analysis 140 (2019) 144–154 92 | 15. [Fast Robust Correlation for High-Dimensional Data](https://www.tandfonline.com/doi/full/10.1080/00401706.2019.1677270) Jakob Raymaekers and Peter J. Rousseeuw, Technometrics, 63 (2021), 184-198. 93 | 94 | 95 | [Release Notes](https://github.com/SvenSerneels/direpack/blob/master/direpack_Release_Notes.md) can be checked out in the repository. 96 | 97 | [A list of possible topics for further development](https://github.com/SvenSerneels/direpack/blob/master/direpack_Future_Dev.md) is provided as well. Additions and comments are welcome! 
-------------------------------------------------------------------------------- /src/direpack/preprocessing/_preproc_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Dec 21 10:55:24 2019 5 | 6 | Set of help functions for robust centring and scaling 7 | 8 | @author: Sven Serneels, Ponalytics 9 | """ 10 | 11 | import numpy as np 12 | import pandas as ps 13 | import scipy.stats as sps 14 | import scipy.optimize as spo 15 | from statsmodels import robust as srs 16 | import copy 17 | 18 | 19 | def _handle_zeros_in_scale(scale, copy=True): 20 | """ 21 | Makes sure that whenever scale is zero, we handle it correctly. 22 | This happens in most scalers when we have constant features. 23 | Taken from ScikitLearn.preprocesssing""" 24 | 25 | # if we are fitting on 1D arrays, scale might be a scalar 26 | if np.isscalar(scale): 27 | if scale == 0.0: 28 | scale = 1.0 29 | return scale 30 | elif isinstance(scale, np.ndarray): 31 | if copy: 32 | # New array to avoid side-effects 33 | scale = scale.copy() 34 | scale[scale == 0.0] = 1.0 35 | return scale 36 | 37 | 38 | def _check_trimming(t): 39 | 40 | if (t > 0.99) or (t < 0): 41 | raise (ValueError("Trimming fraction must be in [0,1)")) 42 | 43 | 44 | def mad(X, c=0.6744897501960817, **kwargs): 45 | """ 46 | Column-wise median absolute deviation. **kwargs included to allow 47 | general function call in scale_data. 48 | """ 49 | 50 | s = median(np.abs(X - median(X, axis=0)), axis=0) / c 51 | s = np.array(s).reshape(-1) 52 | # statsmodels.robust.mad is not as flexible toward matrix input, 53 | # sometimes throws a value error in ufunc 54 | return s 55 | 56 | 57 | def median(X, **kwargs): 58 | """ 59 | Column-wise median. **kwargs included to allow 60 | general function call in scale_data. 61 | """ 62 | 63 | if np.isnan(X).any(): 64 | m = np.nanmedian(X, axis=0) 65 | else: 66 | m = np.median(X, axis=0) 67 | m = np.array(m).reshape(-1) 68 | 69 | return m 70 | 71 | 72 | def mean(X, trimming=0): 73 | """ 74 | Column-wise mean or trimmed mean. Trimming to be entered as fraction. 75 | """ 76 | 77 | if trimming == 0: 78 | if np.isnan(X).any(): 79 | m = np.nanmean(X, axis=0) 80 | else: 81 | m = np.mean(X, axis=0) 82 | else: 83 | # Returns all NaN if missings in X 84 | m = sps.trim_mean(X, trimming, 0) 85 | 86 | return m 87 | 88 | 89 | def std(X, trimming=0): 90 | """ 91 | Column-wise standard devaition or trimmed std. 92 | Trimming to be entered as fraction. 
93 | """ 94 | 95 | if trimming == 0: 96 | if np.isnan(X).any(): 97 | s = np.power(np.nanvar(X, axis=0), 0.5) 98 | else: 99 | s = np.power(np.var(X, axis=0), 0.5) 100 | s = np.array(s).reshape(-1) 101 | else: 102 | var = sps.trim_mean( 103 | np.square(X - sps.trim_mean(X, trimming, 0)), trimming, 0 104 | ) 105 | s = np.sqrt(var) 106 | return s 107 | 108 | 109 | def _euclidnorm(x): 110 | """ 111 | Euclidean norm of a vector 112 | """ 113 | 114 | if np.isnan(x).any(): 115 | return np.sqrt(np.nansum(np.square(x))) 116 | else: 117 | return np.sqrt(np.sum(np.square(x))) 118 | 119 | 120 | def _diffmat_objective(a, X): 121 | """ 122 | Utility to l1median, matrix of differences 123 | """ 124 | 125 | (n, p) = X.shape 126 | return X - np.tile(a, (n, 1)) 127 | 128 | 129 | def _l1m_objective(a, X, *args): 130 | """ 131 | Optimization objective for l1median 132 | """ 133 | 134 | if np.isnan(X).any(): 135 | return np.nansum( 136 | np.apply_along_axis(_euclidnorm, 1, _diffmat_objective(a, X)) 137 | ) 138 | else: 139 | return np.sum( 140 | np.apply_along_axis(_euclidnorm, 1, _diffmat_objective(a, X)) 141 | ) 142 | 143 | 144 | def _l1m_jacobian(a, X): 145 | """ 146 | Jacobian for l1median 147 | """ 148 | 149 | (n, p) = X.shape 150 | dX = _diffmat_objective(a, X) 151 | dists = np.apply_along_axis(_euclidnorm, 1, dX) 152 | dists = _handle_zeros_in_scale(dists) 153 | dX /= np.tile(np.array(dists).reshape(n, 1), (1, p)) 154 | if np.isnan(X).any(): 155 | return -np.nansum(dX, axis=0) 156 | else: 157 | return -np.sum(dX, axis=0) 158 | 159 | 160 | def _l1median( 161 | X, x0, method="SLSQP", tol=1e-8, options={"maxiter": 2000}, **kwargs 162 | ): 163 | """ 164 | Optimization for l1median 165 | """ 166 | 167 | mu = spo.minimize( 168 | _l1m_objective, 169 | x0, 170 | args=(X), 171 | jac=_l1m_jacobian, 172 | tol=tol, 173 | options=options, 174 | method=method, 175 | ) 176 | return mu 177 | 178 | 179 | def l1median(X, **kwargs): 180 | """ 181 | l1median wrapper to generically convert matrices as some of the scipy 182 | optimization options will crash when provided matrix input. 183 | """ 184 | 185 | if "x0" not in kwargs: 186 | x0 = median(X) 187 | 188 | if type(X) == np.matrix: 189 | X = np.array(X) 190 | 191 | if len(X.shape) == 2: 192 | (n, p) = X.shape 193 | else: 194 | p = 1 195 | 196 | if p < 2: 197 | return median(X) 198 | else: 199 | return _l1median(X, x0, **kwargs).x 200 | 201 | 202 | def kstepLTS(X, maxit=5, tol=1e-10, **kwargs): 203 | """ 204 | Computes the K-step LTS estimator of location 205 | It uses the spatial median as a starting value, and yields an 206 | estimator with improved statistical efficiency, but at a higher 207 | computational cost. 
208 | Inputs: 209 | X: data matrix 210 | maxit: maximum number of iterations 211 | tol: convergence tolerance 212 | Outputs: 213 | m2: location estimate 214 | """ 215 | n, p = X.shape 216 | m1 = l1median(X) # initial estimate 217 | m2 = copy.deepcopy(m1) 218 | iteration = 0 219 | unconverged = True 220 | while unconverged and (iteration < maxit): 221 | if np.isnan(X).any(): 222 | dists = np.nansum(np.square(X - m1), axis=1) 223 | else: 224 | dists = np.sum(np.square(X - m1), axis=1) 225 | cutdist = np.sort(dists, axis=0)[int(np.floor((n + 1) / 2)) - 1] 226 | hsubset = np.where(dists <= cutdist)[0] 227 | m2 = np.array(mean(X[hsubset, :])).reshape((p,)) 228 | unconverged = max(abs(m1 - m2)) > tol 229 | iteration += 1 230 | m1 = copy.deepcopy(m2) 231 | 232 | return m2 233 | 234 | 235 | def scaleTau2(x0, c1=4.5, c2=3, consistency=True, **kwargs): 236 | """ 237 | Tau estimator of scale 238 | Inputs: 239 | x0: array or matrix, data 240 | c1: consistency factor for initial estimate 241 | c2: consistency factor for final estimate 242 | consistency: str or bool, 243 | False, True, or "finiteSample" 244 | Output: 245 | the scale estimate 246 | """ 247 | 248 | x = copy.deepcopy(x0) 249 | n, p = x.shape 250 | if np.isnan(x).any(): 251 | summ = np.nansum 252 | else: 253 | summ = np.sum 254 | medx = median(x) 255 | xc = abs(x - medx) 256 | sigma0 = median(xc) 257 | if c1 > 0: 258 | xc /= sigma0 * c1 259 | w = 1 - np.square(xc) 260 | w = np.square((abs(w) + w) / 2) 261 | mu = summ(np.multiply(x, w)) / summ(w) 262 | else: 263 | mu = medx 264 | x -= mu 265 | x /= sigma0 266 | rho = np.square(x) 267 | rho[np.where(rho > c2**2)[0]] = c2**2 268 | if consistency: 269 | 270 | def Erho(b): 271 | return ( 272 | 2 273 | * ( 274 | (1 - b**2) * sps.norm.cdf(b) 275 | - b * sps.norm.pdf(b) 276 | + b**2 277 | ) 278 | - 1 279 | ) 280 | 281 | def Es2(c2): 282 | return Erho(c2 * sps.norm.ppf(3 / 4)) 283 | 284 | if consistency == "finiteSample": 285 | nEs2 = (n - 2) * Es2(c2) 286 | else: 287 | nEs2 = n * Es2(c2) 288 | else: 289 | nEs2 = n 290 | return np.array(sigma0 * np.sqrt(summ(rho) / nEs2)).reshape((p,)) 291 | 292 | 293 | def scale_data(X, m, s): 294 | """ 295 | Column-wise data scaling on location and scale estimates. 296 | 297 | """ 298 | 299 | n = X.shape 300 | if len(n) > 1: 301 | p = n[1] 302 | else: 303 | p = 1 304 | n = n[0] 305 | 306 | s = _handle_zeros_in_scale(s) 307 | 308 | if p == 1: 309 | Xm = X - float(m) 310 | Xs = Xm / s 311 | else: 312 | Xm = X - np.array([m for i in range(1, n + 1)]) 313 | Xs = Xm / np.array([s for i in range(1, n + 1)]) 314 | return Xs 315 | 316 | 317 | def wrap_univ(dd, scale=False, locX=None, scaleX=None): 318 | """ 319 | # Computes the univariate wrapping transformation 320 | # Reference: Jakob Raymaekers & Peter J. Rousseeuw (2021) 321 | # Fast Robust Correlation for High-Dimensional Data, 322 | # Technometrics, 63:2, 184-198. 
323 | args: 324 | dd, np.array: vector of distances 325 | scale, bool: if True, will scale data about med/mad 326 | returns: 327 | xi, np.array: wrapped vector 328 | """ 329 | b = 1.5 330 | c = 4 331 | q1 = 1.540793 332 | q2 = 0.8622731 333 | if dd.dtype == "O": 334 | dd = dd.astype("float") 335 | if scale: 336 | locX = median(dd) 337 | scaleX = mad(dd) 338 | xi = (dd - locX) / scaleX 339 | else: 340 | xi = np.array(dd) 341 | indMid = np.where((np.abs(xi) < c) & (np.abs(xi) >= b))[0] 342 | indHigh = np.where(np.abs(xi) >= c)[0] 343 | xi[indMid] = ( 344 | q1 345 | * np.tanh(q2 * (c - np.abs(xi[indMid]))) 346 | * np.abs(xi[indMid]) 347 | / xi[indMid] 348 | ) 349 | xi[indHigh] = 0 350 | xi = xi * scaleX + locX 351 | 352 | return xi 353 | 354 | 355 | def wrap(X, locX, scaleX): 356 | """ 357 | wrap - wrap matrix column wise 358 | """ 359 | 360 | if len(X.shape) == 1: 361 | X = X.reshape(-1, 1) 362 | 363 | return np.array( 364 | [ 365 | wrap_univ(X[:, i], locX=locX[i], scale=True, scaleX=scaleX[i]) 366 | for i in range(X.shape[1]) 367 | ] 368 | ).transpose() 369 | -------------------------------------------------------------------------------- /src/direpack/preprocessing/robcent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # @author: Sven Serneels, Ponalytics 4 | # Created on Sun Feb 4 2018 5 | # Updated on Sun Dec 16 2018 6 | # Refactored on Sat Dec 21 2019 7 | # Refactored on Sat Mar 28 2020 8 | 9 | 10 | # Class for classical and robust centering and scaling of input data for 11 | # regression and machine learning 12 | 13 | # Version 2.0: Code entirely restructured compared to version 1.0. 14 | # Code made consistent with sklearn logic: fit(data,params) yields results. 15 | # Code makes more effciient use of numpy builtin estimators. 16 | # Version 3.0: 17 | # Code now takes strings or functions as input to centring and scaling. 18 | # Utility functions have been moved to _preproc_utilities.py 19 | # Code now supplied for l1median cetring, with options to use different 20 | # scipy.optimize optimization algorithms 21 | # Version 4.0: 22 | # Made the API compatible for ScikitLearn pipelines. However, some nonstandard 23 | # functions and output remain for backwards compatibility. Functionality for 24 | # sparse matrices still has to be implemented. 25 | 26 | 27 | # Ancillary functions in _preproc_utilities.py: 28 | 29 | # - `scale_data(X,m,s)`: centers and scales X on center m (as vector) and scale s (as vector). 30 | # - `mean(X,trimming)`: Column-wise mean. 31 | # - `median(X)`: Column-wise median. 32 | # - `l1median(X)`: L1 or spatial median. Optional arguments: 33 | # - `x0`: starting point for optimization, defaults to column wise median 34 | # - `method`: optimization algorithm, defaults to 'SLSQP' 35 | # - `tol`: tolerance, defaults to 1e-8 36 | # - `options`: list of options for `scipy.optimize.minimize` 37 | # - `kstepLTS(X): k-step LTS estimator of location. 38 | # - `maxit`: int, number of iterations to compute maximally 39 | # - `tol`: float, tolerance for convergence 40 | # - `std(X,trimming)`: Column-wise std. 41 | # - `mad(X,c)`: Column-wise median absolute deviation, with consistency factor c. 
42 | # - `scaleTau2(x0, c1 = 4.5, c2 = 3, consistency = True)`: Tau estimator of scale 43 | # with consistency parameters c1 and c2 and option for consistency correction 44 | # (True, False or 'finiteSample') 45 | 46 | 47 | from __future__ import absolute_import, division, print_function 48 | from __future__ import unicode_literals 49 | 50 | from sklearn.base import BaseEstimator, TransformerMixin 51 | from sklearn.utils.metaestimators import _BaseComposition 52 | from sklearn.utils.validation import check_is_fitted 53 | import numpy as np 54 | from ..utils.utils import ( 55 | MyException, 56 | convert_X_input, 57 | convert_y_input, 58 | _check_input, 59 | ) 60 | from ._preproc_utilities import * 61 | from ._preproc_utilities import _check_trimming, wrap_univ 62 | 63 | __all__ = ["VersatileScaler", "robcent", "versatile_scale", "Wrapper", "wrap"] 64 | 65 | 66 | class VersatileScaler(_BaseComposition, TransformerMixin, BaseEstimator): 67 | 68 | """ 69 | VersatileScaler Center and Scale data about classical or robust location and scale estimates 70 | 71 | Parameters 72 | ---------- 73 | `center`: str or callable, location estimator. String has to be name of the 74 | function to be used, or 'None'. 75 | `scale`: str or callable, scale estimator 76 | `trimming`: trimming percentage to be used in location and scale estimation. 77 | 78 | 79 | Attributes 80 | ---------- 81 | Arguments for methods: 82 | - `X`: array-like, n x p, the data. 83 | - `trimming`: float, fraction to be trimmed (must be in (0,1)). 84 | 85 | 86 | 87 | 88 | Remarks 89 | ------- 90 | Options for classical estimators 'mean' and 'std' also give access to robust 91 | trimmed versions. 92 | 93 | """ 94 | 95 | def __init__(self, center="mean", scale="std", trimming=0): 96 | """ 97 | Initialize values. Check if correct options provided. 98 | """ 99 | 100 | self.center = center 101 | self.scale = scale 102 | self.trimming = trimming 103 | 104 | def fit(self, X): 105 | """ 106 | Estimate location and scale, store these in the class object. 107 | Trimming fraction can be provided as keyword argument. 108 | """ 109 | 110 | X = _check_input(X) 111 | 112 | _check_trimming(self.trimming) 113 | 114 | if type(self.center) is str: 115 | center = eval(self.center) 116 | else: 117 | center = self.center 118 | 119 | if type(self.scale) is str: 120 | scale = eval(self.scale) 121 | else: 122 | scale = self.scale 123 | 124 | n = X.shape 125 | if len(n) > 1: 126 | p = n[1] 127 | else: 128 | p = 1 129 | n = n[0] 130 | 131 | if self.center == "None": 132 | m = np.repeat(0, p) 133 | else: 134 | m = center(X, trimming=self.trimming) 135 | 136 | # Keeping col_loc_ for older version compatibility 137 | setattr(self, "col_loc_", m) 138 | # sklearn standard 139 | setattr(self, "center_", m) 140 | 141 | if self.scale == "None": 142 | s = np.repeat(1, p) 143 | else: 144 | s = scale(X, trimming=self.trimming) 145 | 146 | # Keeping col_sca_ for older version compatibility 147 | setattr(self, "col_sca_", s) 148 | # sklearn standard 149 | setattr(self, "scale_", s) 150 | 151 | def transform(self, X): 152 | """ 153 | Center and/or scale training data to pre-estimated location and scale 154 | """ 155 | 156 | X = _check_input(X) 157 | check_is_fitted(self, ["center_", "scale_"]) 158 | 159 | Xs = scale_data(X, self.center_, self.scale_) 160 | setattr(self, "datas_", Xs) 161 | 162 | return Xs 163 | 164 | def predict(self, Xn): 165 | """ 166 | Standardize new data on previously estimated location and scale. 167 | Number of columns needs to match. 
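Illustrative usage sketch (hypothetical data; any numeric arrays with matching numbers of columns will do):

    >>> import numpy as np
    >>> X = np.random.standard_normal((50, 4))
    >>> Xn = np.random.standard_normal((10, 4))
    >>> vs = VersatileScaler(center="median", scale="mad")
    >>> Xs = vs.fit_transform(X)
    >>> vs.predict(Xn).shape
    (10, 4)
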
168 | """ 169 | 170 | Xn = _check_input(Xn) 171 | Xns = scale_data(Xn, self.col_loc_, self.col_sca_) 172 | setattr(self, "datans_", Xns) 173 | 174 | return Xns 175 | 176 | def fit_transform(self, X): 177 | """ 178 | Estimate center and scale for training data and scale these data 179 | """ 180 | 181 | self.fit(X) 182 | self.transform(X) 183 | 184 | return self.datas_ 185 | 186 | def inverse_transform(self, Xs=None): 187 | """ 188 | Transform scaled data back to their original scale 189 | """ 190 | 191 | check_is_fitted(self, ["center_", "scale_"]) 192 | if Xs is not None: 193 | Xs = _check_input(Xs) 194 | else: 195 | Xs = self.datas_ 196 | 197 | return np.multiply(Xs, self.scale_) + self.center_ 198 | 199 | 200 | # For backwards compatibility 201 | robcent = VersatileScaler 202 | 203 | 204 | def versatile_scale(X, center="l1median", scale="mad", trimming=0): 205 | """ 206 | Wrapper to scale based on present robcent implementation that uses 207 | `fit` instead of `transform` 208 | """ 209 | 210 | rc = VersatileScaler(center=center, scale=scale, trimming=trimming) 211 | return rc.fit_transform(X) 212 | 213 | 214 | class Wrapper(_BaseComposition, TransformerMixin, BaseEstimator): 215 | 216 | """ 217 | Wrapper Perform robustness inducing 'wrapping' transformation using 218 | optimal plugins and parameters from the literature 219 | 220 | Parameters 221 | ---------- 222 | 223 | 224 | Attributes 225 | ---------- 226 | Arguments for methods: 227 | - `X`: array-like, n x p, the data. 228 | 229 | Reference 230 | --------- 231 | Jakob Raymaekers & Peter J. Rousseeuw (2021), Fast Robust Correlation for 232 | High-Dimensional Data, Technometrics, 63:2, 184-198. 233 | 234 | """ 235 | 236 | def __init__(self): 237 | """ 238 | Initialize values. Check if correct options provided. 239 | """ 240 | 241 | self.center = "median" 242 | self.scale = "mad" 243 | self.trimming = 0 244 | 245 | def fit(self, X): 246 | """ 247 | Estimate location and scale, store these in the class object. 248 | Trimming fraction can be provided as keyword argument. 249 | """ 250 | 251 | X = _check_input(X) 252 | 253 | _check_trimming(self.trimming) 254 | 255 | if type(self.center) is str: 256 | center = eval(self.center) 257 | else: 258 | center = self.center 259 | 260 | if type(self.scale) is str: 261 | scale = eval(self.scale) 262 | else: 263 | scale = self.scale 264 | 265 | n = X.shape 266 | if len(n) > 1: 267 | p = n[1] 268 | else: 269 | p = 1 270 | n = n[0] 271 | 272 | if self.center == "None": 273 | m = np.repeat(0, p) 274 | else: 275 | m = center(X, trimming=self.trimming) 276 | 277 | # Keeping col_loc_ for older version compatibility 278 | setattr(self, "col_loc_", m) 279 | # sklearn standard 280 | setattr(self, "center_", m) 281 | 282 | if self.scale == "None": 283 | s = np.repeat(1, p) 284 | else: 285 | s = scale(X, trimming=self.trimming) 286 | 287 | # Keeping col_sca_ for older version compatibility 288 | setattr(self, "col_sca_", s) 289 | # sklearn standard 290 | setattr(self, "scale_", s) 291 | 292 | def transform(self, X): 293 | """ 294 | Project data points to their wrapped counterparts 295 | """ 296 | 297 | X = _check_input(X) 298 | check_is_fitted(self, ["center_", "scale_"]) 299 | 300 | Xw = wrap(X, self.center_, self.scale_) 301 | setattr(self, "dataw_", Xw) 302 | 303 | return Xw 304 | 305 | def predict(self, Xn): 306 | """ 307 | Wrap new data using previously estimated location and scale. 308 | Number of columns needs to match. 
309 | """ 310 | 311 | Xn = _check_input(Xn) 312 | Xnw = wrap(Xn, self.col_loc_, self.col_sca_) 313 | setattr(self, "datanw_", Xnw) 314 | 315 | return Xnw 316 | 317 | def fit_transform(self, X): 318 | """ 319 | Estimate center and scale for training data wrap these data 320 | """ 321 | 322 | self.fit(X) 323 | self.transform(X) 324 | 325 | return self.dataw_ 326 | -------------------------------------------------------------------------------- /docs/sprm.md: -------------------------------------------------------------------------------- 1 | Sparse partial robust M regression 2 | ================================== 3 | 4 | Description 5 | ----------- 6 | 7 | The `sprm` module in `direpack` comprises code for Sparse Partial Robust M-regeression, as 8 | well as a few closely related estimators: the Sparse NIPALS estimator (a non-robust option 9 | for sparse PLS) and the Robust M-regression estoimator (multiple regression based on the same 10 | re-weighting priciple as SPRM, yet without dimension reduction). 11 | 12 | The SPRM method performs four tasks at the same time in a single, consistent estimate: 13 | - *regression*: yields regression coefficients and predicts responses 14 | - *dimension reduction*: calculates interpretable PLS-like components maximizing covariance to the predictand in a robust way 15 | - *variable selection*: depending on the paramter settings, can yield highly sparse regression coefficients that contain exact zero elements 16 | - *outlier detection and compensation*: yields a set of case weights in \[0,1\]. The lower the weight, the more outlying a case is. The estimate itself is outlier robust. 17 | 18 | Note: all the methods contained in this package have been designed for continuous data. They do not work correctly for caetgorical or textual data. 19 | 20 | The code is aligned to ScikitLearn, such that modules such as `GridSearchCV` can flawlessly be applied to it. 21 | 22 | The repository contains 23 | - The estimator (`sprm.py`) 24 | 25 | - Options for data pre-processing (`robcent.py`) 26 | - The Sparse NIPALS (SNIPLS) estimator \[3\](`snipls.py`) 27 | - Robust M regression estimator (`rm.py`) 28 | - Ancillary functions for M-estimation (`_m_support_functions.py`) 29 | 30 | Note that the `plot` folder contains some plotting functionality specific to SPRM (`sprm_plot.py`). 31 | 32 | 33 | 34 | 1\. The SPRM estimator 35 | ====================== 36 | 37 | The main SPRM implementation yields a class with the following structure: 38 | 39 | 1\.1\. Dependencies 40 | ----------------- 41 | - From ``: `BaseEstimator, TransformerMixin, RegressorMixin` 42 | - From ``: `_BaseComposition` 43 | - `copy` 44 | - From ``: `norm, chi2` 45 | - `numpy` 46 | - From ``: `pyplot`. 47 | - From ``: `robust`. 48 | 49 | 1\.2\. Parameters 50 | --------------- 51 | - `eta`: float. Sparsity parameter in \[0,1). Note that `eta=0` returns the non-sparse, yet robust, partial robust M-regression (PRM) \[2\]. 52 | - `n_components`: int > 1. Note that if applied on data, `n_components` shall take a value <= min(x_data.shape) 53 | - `fun`: str, downweighting function. `'Hampel'` (recommended), `'Fair'` or `'Huber'` 54 | - `probp1`: float, probability cutoff for start of downweighting (e.g. 0.95) 55 | - `probp2`: float, probability cutoff for start of steep downweighting (e.g. 0.975, only relevant if `fun='Hampel'`) 56 | - `probp3`: float, probability cutoff for start of outlier omission (e.g. 
0.999, only relevant if `fun='Hampel'`) 57 | - `centring`: str, type of centring (`'mean'`, `'median'`,`'l1median'` or `'kstepLTS'`) 58 | - `scaling`: str, type of scaling (`'std'`,`'mad'`, `'scaleTau2'`, the latter recommended, or `'None'`) 59 | - `verbose`: boolean, specifying verbose mode 60 | - `maxit`: int, maximal number of iterations in M algorithm 61 | - `tol`: float, tolerance for convergence in M algorithm 62 | - `start_cutoff_mode`: str, value `'specific'` will set starting value cutoffs specific to X and y (preferred); any other value will set X and y stating cutoffs identically. The non-specific setting yields identical results to the SPRM R implementation available from [CRAN](https://cran.r-project.org/web/packages/sprm/index.html). 63 | - `start_X_init`: str, values `'pcapp'` will include a PCA/broken stick projection to calculate the initial predictor block caseweights; any other value will just calculate initial predictor block case weights based on Euclidian distances within that block. The is less stable for very flat data (p >> n). 64 | - `colums` (def `False`): Either bool, list, numpy array or pandas Index 65 | if `False`, no column names supplied 66 | if `True`, 67 | if X data are supplied as a pandas DataFrame, will extract column 68 | names from the frame 69 | else throws an error 70 | if a list, array or Index (will only take length x_data.shape[1]), 71 | the column names of the x_data supplied in this list, 72 | will be printed in verbose mode 73 | - `copy` (def `True`): boolean, whether to create deep copy of the data in the calculation process 74 | 75 | 1\.3\. Attributes 76 | --------------- 77 | - `x_weights_`: X block PLS weighting vectors (usually denoted W) 78 | - `x_loadings_`: X block PLS loading vectors (usually denoted P) 79 | - `C_`: vector of inner relationship between response and latent variablesblock re 80 | - `x_scores_`: X block PLS score vectors (usually denoted T) 81 | - `coef_`: vector of regression coefficients 82 | - `intercept_`: intercept 83 | - `coef_scaled_`: vector of scaled regression coeeficients (when scaling option used) 84 | - `intercept_scaled_`: scaled intercept 85 | - `residuals_`: vector of regression residuals 86 | - `x_ev_`: X block explained variance per component 87 | - `y_ev_`: y block explained variance 88 | - `fitted_`: fitted response 89 | - `x_Rweights_`: X block SIMPLS style weighting vectors (usually denoted R) 90 | - `x_caseweights_`: X block case weights 91 | - `y_caseweights_`: y block case weights 92 | - `caseweights_`: combined case weights 93 | - `colret_`: names of variables retained in the sparse model 94 | - `x_loc_`: X block location estimate 95 | - `y_loc_`: y location estimate 96 | - `x_sca_`: X block scale estimate 97 | - `y_sca_`: y scale estimate 98 | - `non_zero_scale_vars_`: indicator vector of variables in X with nonzero scale 99 | 100 | 1\.4\. Methods 101 | ------------ 102 | - `fit(X,y)`: fit model 103 | - `predict(X)`: make predictions based on fit 104 | - `transform(X)`: project X onto latent space 105 | - `weightnewx(X)`: calculate X case weights 106 | - `getattr()`: get list of attributes 107 | - `setattr(**kwargs)`: set individual attribute of sprm object 108 | - `valscore(X,y,scoring)`: option to use weighted scoring function in cross-validation if scoring=weighted 109 | 110 | 1\.5\. 
Ancillary functions 111 | ------------------------ 112 | - `snipls` (class): sparse NIPALS regression (first described in: \[3\]) 113 | - `Hampel`: Hampel weight function 114 | - `Huber`: Huber weight function 115 | - `Fair`: Fair weight function 116 | - `brokenstick`: broken stick rule to estimate number of relevant principal components 117 | - `VersatileScaler` (class): centring and scaling data, with several robust options beyond `sklearn`'s `RobustScaler` 118 | - `sprm_plot` (class): plotting SPRM results 119 | - `sprm_plot_cv` (class): plotting SPRM cross-validation results 120 | 121 | 122 | 2\. The Robust M (RM) estimator 123 | ============================== 124 | 125 | RM has been implemented to be consistent with SPRM. It takes the same arguments, except for `eta`, `n_components` and `columns`, 126 | because it does not perform dimension reduction nor variable selection. For the same reasons, the outputs are limited to regression 127 | outputs. Therefore, dimension reduction outputs like `x_scores_`, `x_loadings_`, etc. are not provided. For R adepts, note that a 128 | [cellwise robust](https://github.com/SebastiaanHoppner/CRM) version of RM has recently been introduced. 129 | 130 | 131 | 3\. The Sparse NIPALS (SNIPLS) estimator 132 | ======================================= 133 | 134 | SNIPLS is the non-robust sparse univariate PLS algorithm \[3\]. SNIPLS has been implemented to be consistent with SPRM. It takes the same arguments, except for `'fun'` and `'probp1'` through `'probp3'`, since these are robustness parameters. For the same reasons, the outputs are limited to sparse dimension reduction and regression outputs. Robustness related outputs like `x_caseweights_` cannot be provided. 135 | 136 | 137 | 4\. Plotting functionality 138 | ========================= 139 | 140 | The file `sprm_plot.py` contains a set of plot functions based on Matplotlib. The class sprm_plot contains plots for sprm objects, wheras the class sprm_plot_cv contains a plot for cross-validation. 141 | 142 | 4\.1\. Dependencies 143 | ----------------- 144 | - `pandas` 145 | - `numpy` 146 | - `matplotlib.pyplot` 147 | - for plotting cross-validation results: `sklearn.model_selection.GridSearchCV` 148 | 149 | 4\.2\. Paramaters 150 | --------------- 151 | - `res_sprm`, sprm. An sprm class object that has been fit. 152 | - `colors`, list of str entries. Only mandatory input. Elements determine colors as: 153 | - \[0\]: borders of pane 154 | - \[1\]: plot background 155 | - \[2\]: marker fill 156 | - \[3\]: diagonal line 157 | - \[4\]: marker contour, if different from fill 158 | - \[5\]: marker color for new cases, if applicable 159 | - \[6\]: marker color for harsh calibration outliers 160 | - \[7\]: marker color for harsh prediction outliers 161 | - `markers`, a list of str entries. Elements determkine markers for: 162 | - \[0\]: regular cases 163 | - \[1\]: moderate outliers 164 | - \[2\]: harsh outliers 165 | 166 | 4\.3\. Methods 167 | ------------ 168 | - `plot_coeffs(entity="coef_",truncation=0,columns=[],title=[])`: Plot regression coefficients, loadings, etc. with the option only to plot the x% smallest and largets coefficients (truncation) 169 | - `plot_yyp(ytruev=[],Xn=[],label=[],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False)`: Plot y vs y predicted. 170 | - `plot_projections(Xn=[],label=[],components = [0,1],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False)`: Plot score space. 
171 | - `plot_caseweights(Xn=[],label=[],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False,mode='overall')`: Plot caseweights, with the option to plot `'x'`, `'y'` or `'overall'` case weights for cases used to train the model. For new cases, only `'x'` weights can be plotted.
172 |
173 | 4\.4\. Remark
174 | -----------
175 | The latter 3 methods will work both for cases that the model has been trained with (no additional input) and for new cases (requires Xn and, in the case of plot_yyp, ytruev), with the option to plot only the latter (option onlyval = True). All three functions have the option to plot case names if supplied as a list.
176 |
177 | 4\.5\. Ancillary classes
178 | ----------------------
179 | - `sprm_plot_cv` has method `eta_ncomp_contour(title)` to plot sklearn GridSearchCV results
180 |
181 |
182 | References
183 | ==========
184 | 1. [Sparse partial robust M regression](https://www.sciencedirect.com/science/article/abs/pii/S0169743915002440), Irene Hoffmann, Sven Serneels, Peter Filzmoser, Christophe Croux, Chemometrics and Intelligent Laboratory Systems, 149 (2015), 50-59.
185 | 2. [Partial robust M regression](https://doi.org/10.1016/j.chemolab.2005.04.007), Sven Serneels, Christophe Croux, Peter Filzmoser, Pierre J. Van Espen, Chemometrics and Intelligent Laboratory Systems, 79 (2005), 55-64.
186 | 3. [Sparse and robust PLS for binary classification](https://onlinelibrary.wiley.com/doi/abs/10.1002/cem.2775), I. Hoffmann, P. Filzmoser, S. Serneels, K. Varmuza, Journal of Chemometrics, 30 (2016), 153-162.
187 |
--------------------------------------------------------------------------------
/src/direpack/sprm/rm.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Thu Jan 24 2019
3 |
4 | Module containing:
5 |
6 | Estimators
7 | ----------
8 | Robust M Regression (RM)
9 |
10 | Depends on robcent class for robustly centering and scaling data, as well as on
11 | the functions in _m_support_functions.
12 |
13 | @author: Sven Serneels, Ponalytics
14 | """
15 | from __future__ import absolute_import, division, print_function
16 | from __future__ import unicode_literals
17 | from sklearn.base import RegressorMixin, BaseEstimator
18 | from sklearn.utils.metaestimators import _BaseComposition
19 | import copy
20 | import numpy as np
21 | import pandas as ps
22 | from scipy.stats import norm, chi2
23 | from ..preprocessing.robcent import VersatileScaler
24 | from ..utils.utils import MyException, _predict_check_input, _check_input
25 | from ._m_support_functions import *
26 |
27 |
28 | class rm(_BaseComposition, BaseEstimator, RegressorMixin):
29 |
30 | """
31 | Robust M Regression
32 |
33 | Parameters:
34 | -----------
35 | fun: str, downweighting function. 'Hampel' (recommended), 'Fair' or
36 | 'Huber'
37 | probp1: float, probability cutoff for start of downweighting
38 | (e.g. 0.95)
39 | probp2: float, probability cutoff for start of steep downweighting
40 | (e.g. 0.975, only relevant if fun='Hampel')
41 | probp3: float, probability cutoff for start of outlier omission
42 | (e.g.
0.999, only relevant if fun='Hampel') 43 | centre: str, type of centring (`'mean'`, `'median'` or `'l1median'`, 44 | the latter recommended statistically, if too slow, switch to `'median'`) 45 | scale: str, type of scaling ('std','mad' [recommended] or 'None') 46 | verbose: boolean, specifying verbose mode 47 | maxit: int, maximal number of iterations in M algorithm 48 | tol: float, tolerance for convergence in M algorithm 49 | start_cutoff_mode: str, values: 50 | 'specific' will set starting value cutoffs specific to X and y (preferred); 51 | any other value will set X and y stating cutoffs identically. 52 | The latter yields identical results to the SPRM R implementation available from 53 | CRAN. 54 | copy (def True): boolean, whether to copy data 55 | Note: copy not yet aligned with sklearn def 56 | 57 | """ 58 | 59 | def __init__( 60 | self, 61 | fun="Hampel", 62 | probp1=0.95, 63 | probp2=0.975, 64 | probp3=0.999, 65 | centre="median", 66 | scale="mad", 67 | start_cutoff_mode="specific", 68 | verbose=True, 69 | maxit=100, 70 | tol=0.01, 71 | copy=True, 72 | ): 73 | self.fun = fun 74 | self.probp1 = probp1 75 | self.probp2 = probp2 76 | self.probp3 = probp3 77 | self.centre = centre 78 | self.scale = scale 79 | self.start_cutoff_mode = start_cutoff_mode 80 | self.verbose = verbose 81 | self.maxit = maxit 82 | self.tol = tol 83 | self.copy = copy 84 | self.probctx_ = "irrelevant" 85 | self.probcty_ = "irrelevant" 86 | self.hampelbx_ = "irrelevant" 87 | self.hampelby__ = "irrelevant" 88 | self.hampelrx_ = "irrelevant" 89 | self.hampelry_ = "irrelevant" 90 | 91 | def fit(self, X, y): 92 | if self.copy: 93 | self.X = copy.deepcopy(X) 94 | self.y = copy.deepcopy(y) 95 | (n, p) = X.shape 96 | if not (self.fun in ("Hampel", "Huber", "Fair")): 97 | raise MyException( 98 | "Invalid weighting function. Choose Hampel, Huber or Fair for parameter fun." 99 | ) 100 | if (self.probp1 > 1) | (self.probp1 <= 0): 101 | raise MyException("probp1 is a probability. Choose a value between 0 and 1") 102 | if self.fun == "Hampel": 103 | if not ( 104 | (self.probp1 < self.probp2) 105 | & (self.probp2 < self.probp3) 106 | & (self.probp3 <= 1) 107 | ): 108 | raise MyException( 109 | "Wrong choise of parameters for Hampel function. 
Use 0 1: 119 | y = np.array(y).reshape(-1).astype("float64") 120 | ny = y.shape[0] 121 | if ny != n: 122 | raise MyException("Number of cases in y and X must be identical.") 123 | 124 | scaling = VersatileScaler(center=self.centre, scale=self.scale) 125 | Xs = scaling.fit_transform(X).astype("float64") 126 | mX = scaling.col_loc_ 127 | sX = scaling.col_sca_ 128 | ys = scaling.fit_transform(y).astype("float64") 129 | my = scaling.col_loc_ 130 | sy = scaling.col_sca_ 131 | ys = np.array(ys).reshape(-1) 132 | 133 | wx = np.sqrt(np.array(np.sum(np.square(Xs), 1), dtype=np.float64)) 134 | wx = wx / np.median(wx) 135 | if [self.centre, self.scale] == ["median", "mad"]: 136 | wy = np.array(abs(ys), dtype=np.float64) 137 | else: 138 | wy = (y - np.median(y)) / (1.4826 * np.median(abs(y - np.median(y)))) 139 | self.probcty_ = norm.ppf(self.probp1) 140 | if self.start_cutoff_mode == "specific": 141 | self.probctx_ = chi2.ppf(self.probp1, p) 142 | else: 143 | self.probctx_ = self.probcty_ 144 | if self.fun == "Fair": 145 | wx = Fair(wx, self.probctx_) 146 | wy = Fair(wy, self.probcty_) 147 | if self.fun == "Huber": 148 | wx = Huber(wx, self.probctx_) 149 | wy = Huber(wy, self.probcty_) 150 | if self.fun == "Hampel": 151 | self.hampelby_ = norm.ppf(self.probp2) 152 | self.hampelry_ = norm.ppf(self.probp3) 153 | if self.start_cutoff_mode == "specific": 154 | self.hampelbx_ = chi2.ppf(self.probp2, p) 155 | self.hampelrx_ = chi2.ppf(self.probp3, p) 156 | else: 157 | self.hampelbx_ = self.hampelby_ 158 | self.hampelrx_ = self.hampelry_ 159 | wx = Hampel(wx, self.probctx_, self.hampelbx_, self.hampelrx_) 160 | wy = Hampel(wy, self.probcty_, self.hampelby_, self.hampelry_) 161 | wx = np.array(wx).reshape(-1) 162 | w = (wx * wy).astype("float64") 163 | if (w < 1e-06).any(): 164 | w0 = np.where(w < 1e-06)[0] 165 | w[w0] = 1e-06 166 | we = np.array(w, dtype=np.float64) 167 | else: 168 | we = np.array(w, dtype=np.float64) 169 | wye = wy 170 | WEmat = np.array([np.sqrt(we) for i in range(1, p + 1)], ndmin=1).T 171 | Xw = np.multiply(Xs, WEmat).astype("float64") 172 | yw = ys * np.sqrt(we) 173 | loops = 1 174 | rold = 1e-5 175 | difference = 1 176 | 177 | while (difference > self.tol) & (loops < self.maxit): 178 | b = np.linalg.lstsq(Xw, yw, rcond=None) 179 | b = np.array(b[0]).reshape(-1, 1) 180 | yp = np.dot(Xs, b).reshape(-1) 181 | r = ys - yp 182 | if len(r) / 2 > np.sum(r == 0): 183 | r = abs(r) / (1.4826 * np.median(abs(r))) 184 | else: 185 | r = abs(r) / (1.4826 * np.median(abs(r[r != 0]))) 186 | wye = r 187 | if self.fun == "Fair": 188 | wye = Fair(wye, self.probcty_) 189 | if self.fun == "Huber": 190 | wye = Huber(wye, self.probcty_) 191 | if self.fun == "Hampel": 192 | wye = Hampel(wye, self.probcty_, self.hampelby_, self.hampelry_) 193 | b2sum = np.sum(np.square(b)) 194 | difference = abs(b2sum - rold) / rold 195 | rold = b2sum 196 | we = (wye * wx).astype("float64") 197 | w0 = [] 198 | if any(we < 1e-06): 199 | w0 = np.where(we < 1e-06)[0] 200 | we[w0] = 1e-06 201 | we = np.array(we, dtype=np.float64) 202 | if len(w0) >= (n / 2): 203 | break 204 | WEmat = np.array([np.sqrt(we) for i in range(1, p + 1)], ndmin=1).T 205 | Xw = np.multiply(Xs, WEmat).astype("float64") 206 | yw = ys * np.sqrt(we) 207 | loops += 1 208 | if difference > self.maxit: 209 | print( 210 | "Warning: Method did not converge. 
The scaled difference between norms of the coefficient vectors is " 211 | + str(round(difference, 4)) 212 | ) 213 | plotprec = False 214 | if plotprec: 215 | print(str(loops - 1)) 216 | w = we 217 | w[w0] = 0 218 | wx[w0] = 0 219 | wy = wye 220 | wy[w0] = 0 221 | Xrw = np.array(np.multiply(Xs, np.sqrt(WEmat)).astype("float64")) 222 | scaling.set_params(scale="None") 223 | Xrw = scaling.fit_transform(Xrw) 224 | b_rescaled = np.multiply(np.reshape(sy / sX, (p, 1)), b) 225 | yp_rescaled = np.matmul(X, b_rescaled).reshape(-1) 226 | if self.centre == "mean": 227 | intercept = np.mean(y - yp_rescaled) 228 | else: 229 | intercept = np.median(y - yp_rescaled) 230 | yfit = yp_rescaled + intercept 231 | if self.scale != "None": 232 | if self.centre == "mean": 233 | b0 = np.mean(ys.astype("float64") - np.matmul(Xs.astype("float64"), b)) 234 | else: 235 | b0 = np.median( 236 | np.array(ys.astype("float64") - np.matmul(Xs.astype("float64"), b)) 237 | ) 238 | else: 239 | if self.centre == "mean": 240 | ytil = np.array(np.matmul(X, b)).reshape(-1) 241 | intercept = np.mean(y - ytil) 242 | else: 243 | intercept = np.median(y - ytil) 244 | b0 = intercept 245 | r = y - yfit 246 | setattr(self, "coef_", b_rescaled) 247 | setattr(self, "intercept_", intercept) 248 | setattr(self, "coef_scaled_", b) 249 | setattr(self, "intercept_scaled_", b0) 250 | setattr(self, "residuals_", r) 251 | setattr(self, "fitted_", yfit) 252 | setattr(self, "x_caseweights_", wx) 253 | setattr(self, "y_caseweights_", wy) 254 | setattr(self, "caseweights_", w) 255 | setattr(self, "x_loc_", mX) 256 | setattr(self, "y_loc_", my) 257 | setattr(self, "x_sca_", sX) 258 | setattr(self, "y_sca_", sy) 259 | setattr(self, "scaling_", scaling) 260 | return self 261 | pass 262 | 263 | def predict(self, Xn): 264 | n, p, Xn = _predict_check_input(Xn) 265 | if p != self.X.shape[1]: 266 | raise ( 267 | ValueError( 268 | "New data must have seame number of columns as the ones the model has been trained with" 269 | ) 270 | ) 271 | return np.matmul(Xn, self.coef_) + self.intercept_ 272 | -------------------------------------------------------------------------------- /src/direpack/sprm/snipls.py: -------------------------------------------------------------------------------- 1 | # Created on Fri Apr 26 19:27:52 2019 2 | 3 | # @author: sven 4 | 5 | 6 | from __future__ import absolute_import, division, print_function 7 | from __future__ import unicode_literals 8 | from sklearn.base import RegressorMixin, BaseEstimator, TransformerMixin 9 | from sklearn.utils.metaestimators import _BaseComposition 10 | import copy 11 | import numpy as np 12 | import pandas as ps 13 | from ..preprocessing.robcent import VersatileScaler 14 | from ..utils.utils import MyException, _predict_check_input, _check_input, nandot, nanmatdot 15 | from ..preprocessing._preproc_utilities import scale_data 16 | 17 | 18 | class snipls(_BaseComposition, BaseEstimator, TransformerMixin, RegressorMixin): 19 | """ 20 | SNIPLS Sparse Nipals Algorithm 21 | 22 | Algorithm first outlined in: 23 | Sparse and robust PLS for binary classification, 24 | I. Hoffmann, P. Filzmoser, S. Serneels, K. Varmuza, 25 | Journal of Chemometrics, 30 (2016), 153-162. 26 | 27 | As of driepack-1.1.2, snipls works when there are missing data in the inputs 28 | 29 | Parameters 30 | ----------- 31 | 32 | eta : float. 33 | Sparsity parameter in [0,1) 34 | 35 | n_components : int, 36 | min 1. 
Note that if applied on data, n_components shall take a value <= min(x_data.shape) 37 | 38 | verbose: Boolean (def true) 39 | to print intermediate set of columns retained 40 | 41 | columns : Either boolean, list, numpy array or pandas Index (def false) 42 | if False, no column names supplied; if True, if X data are supplied as a pandas data frame, will extract column names from the frame throws an error for other data input types if a list, array or Index (will only take length x_data.shape[1]), the column names of the x_data supplied in this list, will be printed in verbose mode. 43 | 44 | centre : str, 45 | type of centring (`'mean'` [recommended], `'median'` or `'l1median'`), 46 | 47 | scale : str, 48 | type of scaling ('std','mad' or 'None') 49 | 50 | copy : (def True): boolean, 51 | whether to copy data. Note : copy not yet aligned with sklearn def - we always copy 52 | 53 | 54 | Attributes 55 | ------------ 56 | Attributes always provided: 57 | 58 | - `x_weights_`: X block PLS weighting vectors (usually denoted W) 59 | - `x_loadings_`: X block PLS loading vectors (usually denoted P) 60 | - `C_`: vector of inner relationship between response and latent variablesblock re 61 | - `x_scores_`: X block PLS score vectors (usually denoted T) 62 | - `coef_`: vector of regression coefficients 63 | - `intercept_`: intercept 64 | - `coef_scaled_`: vector of scaled regression coeeficients (when scaling option used) 65 | - `intercept_scaled_`: scaled intercept 66 | - `residuals_`: vector of regression residuals 67 | - `x_ev_`: X block explained variance per component 68 | - `y_ev_`: y block explained variance 69 | - `fitted_`: fitted response 70 | - `x_Rweights_`: X block SIMPLS style weighting vectors (usually denoted R) 71 | - `colret_`: names of variables retained in the sparse model 72 | - `x_loc_`: X block location estimate 73 | - `y_loc_`: y location estimate 74 | - `x_sca_`: X block scale estimate 75 | - `y_sca_`: y scale estimate 76 | - `centring_`: scaling object used internally (from `VersatileScaler`) 77 | 78 | """ 79 | 80 | def __init__( 81 | self, 82 | eta=0.5, 83 | n_components=1, 84 | verbose=True, 85 | columns=False, 86 | centre="mean", 87 | scale="None", 88 | copy=True, 89 | ): 90 | assert eta >= 0 and eta < 1, "eta needs to be in [0,1)" 91 | assert isinstance( 92 | n_components, int) and n_components > 0, "number of components needs to be positive integer" 93 | self.eta = eta 94 | self.n_components = n_components 95 | self.verbose = verbose 96 | self.columns = columns 97 | self.centre = centre 98 | self.scale = scale 99 | self.copy = copy 100 | 101 | def fit(self, X, y): 102 | """ 103 | Fit a SNIPLS model. 104 | 105 | Parameters 106 | ------------ 107 | 108 | X : numpy array 109 | Input data. 
110 | 111 | y : vector or 1D matrix 112 | Response data 113 | 114 | """ 115 | if type(self.columns) is list: 116 | self.columns = np.array(self.columns) 117 | elif type(self.columns) is bool: 118 | if type(X) != ps.core.frame.DataFrame and self.columns: 119 | raise ( 120 | MyException( 121 | "Columns set to true can only extract column names for data frame input" 122 | ) 123 | ) 124 | if type(X) == ps.core.frame.DataFrame: 125 | if type(self.columns) is bool and self.columns: 126 | self.columns = X.columns 127 | X = X.to_numpy() 128 | (n, p) = X.shape 129 | if type(y) in [ps.core.frame.DataFrame, ps.core.series.Series]: 130 | y = y.to_numpy() 131 | X = _check_input(X) 132 | y = _check_input(y) 133 | ny = y.shape[0] 134 | if ny != n: 135 | if y.ndim == 2: 136 | y = y.T 137 | else: 138 | raise (MyException("Number of cases in X and y needs to agree")) 139 | y = y.astype("float64") 140 | if self.copy: 141 | X0 = copy.deepcopy(X) 142 | y0 = copy.deepcopy(y) 143 | else: 144 | X0 = X 145 | y0 = y 146 | self.X = X0 147 | self.y = y0 148 | X0 = X0.astype("float64") 149 | centring = VersatileScaler(center=self.centre, scale=self.scale) 150 | X0 = centring.fit_transform(X0).astype("float64") 151 | mX = centring.col_loc_ 152 | sX = centring.col_sca_ 153 | y0 = centring.fit_transform(y0).astype("float64") 154 | my = centring.col_loc_ 155 | sy = centring.col_sca_ 156 | if np.isnan(X0).any() or np.isnan(y0).any(): 157 | S = nanmatdot(X0.T, X0) 158 | dot = nandot 159 | else: 160 | S = np.matmul(X0.T, X0) 161 | dot = np.dot 162 | s0 = dot(X0.T, y0) 163 | T = np.empty((n, self.n_components), float) 164 | W = np.empty((p, self.n_components), float) 165 | P = np.empty((p, self.n_components), float) 166 | C = np.empty((self.n_components, 1), float) 167 | Xev = np.empty((self.n_components, 1), float) 168 | yev = np.empty((self.n_components, 1), float) 169 | B = np.empty((p, 1), float) 170 | oldgoodies = np.array([]) 171 | Xi = X0 172 | yi = y0 173 | for i in range(1, self.n_components + 1): 174 | wh = dot(Xi.T, yi) 175 | wh = wh / np.linalg.norm(wh, "fro") 176 | # goodies = abs(wh)-llambda/2 lambda definition 177 | goodies = abs(wh) - self.eta * max(abs(wh)) 178 | wh = np.multiply(goodies, np.sign(wh)) 179 | goodies = np.where((goodies > 0))[0] 180 | goodies = np.union1d(oldgoodies, goodies) 181 | oldgoodies = goodies 182 | if len(goodies) == 0: 183 | colret = None 184 | print( 185 | "No variables retained at" 186 | + str(i) 187 | + "latent variables" 188 | + "and lambda = " 189 | + str(self.eta) 190 | + ", try lower lambda" 191 | ) 192 | break 193 | elimvars = np.setdiff1d(range(0, p), goodies) 194 | wh[elimvars] = 0 195 | th = dot(Xi, wh) 196 | nth = np.linalg.norm(th, "fro") 197 | ch = dot(yi.T, th) / (nth ** 2) 198 | ph = dot(Xi.T, dot(Xi, wh)) / (nth ** 2) 199 | Xi = Xi - np.dot(th, ph.T) 200 | yi = yi - np.dot(th, ch) 201 | ph[elimvars] = 0 202 | W[:, i - 1] = np.reshape(wh, p) 203 | P[:, i - 1] = np.reshape(ph, p) 204 | C[i - 1] = ch 205 | T[:, i - 1] = np.reshape(th, n) 206 | Xev[i - 1] = ( 207 | (nth ** 2 * np.linalg.norm(ph, "fro") ** 2) 208 | / np.nansum(np.square(X0)) 209 | * 100 210 | ) 211 | yev[i - 1] = np.nansum(nth ** 2 * (ch ** 2)) / \ 212 | np.nansum(np.power(y0, 2)) * 100 213 | if type(self.columns) == bool: 214 | colret = goodies 215 | else: 216 | colret = self.columns[np.setdiff1d(range(0, p), elimvars)] 217 | if self.verbose: 218 | print( 219 | "Variables retained for " 220 | + str(i) 221 | + " latent variable(s):" 222 | + "\n" 223 | + str(colret) 224 | + ".\n" 225 | ) 226 | if 
len(goodies) > 0: 227 | R = np.matmul( 228 | W[:, range(0, i)], 229 | np.linalg.inv( 230 | np.matmul(P[:, range(0, i)].T, W[:, range(0, i)])), 231 | ) 232 | B = np.matmul( 233 | W[:, range(0, i)], 234 | np.matmul( 235 | np.linalg.inv( 236 | np.matmul( 237 | np.matmul(W[:, range(0, i)].T, S), 238 | W[:, range(0, i)], 239 | ) 240 | ), 241 | np.matmul(W[:, range(0, i)].T, s0), 242 | ), 243 | ) 244 | else: 245 | B = np.empty((p, 1)) 246 | B.fill(0) 247 | R = B 248 | T = np.empty((n, self.n_components)) 249 | T.fill(0) 250 | B_rescaled = np.multiply(np.array(sy / sX).reshape((p, 1)), B) 251 | yp_rescaled = dot(X, B_rescaled) 252 | if self.centre == "mean": 253 | intercept = np.nanmean(y - yp_rescaled) 254 | elif self.centre == "None": 255 | intercept = 0 256 | else: 257 | intercept = np.nanmedian(y - yp_rescaled) 258 | yfit = yp_rescaled + intercept 259 | yfit = yfit.reshape(-1) 260 | r = y.ravel() - yfit 261 | setattr(self, "x_weights_", W) 262 | setattr(self, "x_loadings_", P) 263 | setattr(self, "C_", C) 264 | setattr(self, "x_scores_", T) 265 | setattr(self, "coef_", B_rescaled) 266 | setattr(self, "coef_scaled_", B) 267 | setattr(self, "intercept_", intercept) 268 | setattr(self, "x_ev_", Xev) 269 | setattr(self, "y_ev_", yev) 270 | setattr(self, "fitted_", yfit) 271 | setattr(self, "residuals_", r) 272 | setattr(self, "x_Rweights_", R) 273 | setattr(self, "colret_", colret) 274 | setattr(self, "x_loc_", mX) 275 | setattr(self, "y_loc_", my) 276 | setattr(self, "x_sca_", sX) 277 | setattr(self, "y_sca_", sy) 278 | setattr(self, "centring_", centring) 279 | return self 280 | 281 | def predict(self, Xn): 282 | """ 283 | Predict using a SNIPLS model. 284 | 285 | Parameters 286 | ------------ 287 | 288 | Xn : numpy array or data frame 289 | Input data. 290 | 291 | """ 292 | n, p, Xn = _predict_check_input(Xn) 293 | if p != self.X.shape[1]: 294 | raise ( 295 | ValueError( 296 | "New data must have same number of columns as the ones the model has been trained with" 297 | ) 298 | ) 299 | return np.matmul(Xn, self.coef_) + self.intercept_ 300 | 301 | def transform(self, Xn): 302 | """ 303 | Transform input data. 304 | 305 | 306 | Parameters 307 | ------------ 308 | 309 | Xn : numpy array or data frame 310 | Input data. 311 | 312 | """ 313 | n, p, Xn = _predict_check_input(Xn) 314 | if p != self.X.shape[1]: 315 | raise ( 316 | ValueError( 317 | "New data must have seame number of columns as the ones the model has been trained with" 318 | ) 319 | ) 320 | Xnc = scale_data(Xn, self.x_loc_, self.x_sca_) 321 | return np.dot(Xnc, self.x_Rweights_) 322 | --------------------------------------------------------------------------------