├── MANIFEST.in ├── setup.cfg ├── img ├── AIG_b.png ├── AIG_CV.png ├── AIG_T12.png ├── AIG_caseweights.png ├── AIG_yyp_train.png └── AIG_yyp_train_test.png ├── docs ├── modules.rst ├── .vscode │ └── settings.json ├── setup.rst ├── sphinx_requirements.txt ├── rtd-environment.yaml ├── generated │ ├── direpack.dicomo.dicomo.dicomo.rst │ ├── direpack.sprm.snipls.snipls.rst │ ├── direpack.ppdire.ppdire.ppdire.rst │ ├── direpack.sudire.sudire.sudire.rst │ ├── direpack.sprm.sprm.sprm.rst │ ├── direpack.preprocessing.robcent.VersatileScaler.rst │ └── direpack.preprocessing.gsspp.GenSpatialSignPrePprocessor.rst ├── Makefile ├── make.bat ├── index.rst ├── Contributing.rst ├── dicomo.md ├── Cross-validation and plotting.rst ├── conf.py ├── Pre-processing.rst ├── sudire.rst ├── sprm.rst ├── sudire.md ├── ppdire.rst ├── ppdire.md └── sprm.md ├── dev-requirements.txt ├── src └── direpack │ ├── test │ ├── __init__.py │ ├── test_ppdire.py │ ├── test_sprm.py │ ├── test_dicomo.py │ └── test_sudire.py │ ├── sprm │ ├── __init__.py │ ├── _m_support_functions.py │ ├── rm.py │ └── snipls.py │ ├── ppdire │ ├── __init__.py │ ├── _ppdire_utils.py │ └── capi.py │ ├── preprocessing │ ├── __init__.py │ ├── _gsspp_utils.py │ ├── gsspp.py │ ├── _preproc_utilities.py │ └── robcent.py │ ├── utils │ ├── __init__.py │ └── utils.py │ ├── plot │ ├── __init__.py │ ├── sudire_plot.py │ └── ppdire_plot.py │ ├── cross_validation │ ├── __init__.py │ └── _cv_support_functions.py │ ├── sudire │ └── __init__.py │ ├── dicomo │ └── __init__.py │ ├── ipopt_temp │ ├── __init__.py │ ├── jacobian.py │ └── ipopt_wrapper.py │ └── __init__.py ├── direpack_Future_Dev.md ├── requirements.txt ├── .readthedocs.yaml ├── LICENSE ├── setup.py ├── .github └── workflows │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── direpack_Release_Notes.md └── README.md /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /img/AIG_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_b.png -------------------------------------------------------------------------------- /img/AIG_CV.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_CV.png -------------------------------------------------------------------------------- /img/AIG_T12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_T12.png -------------------------------------------------------------------------------- /img/AIG_caseweights.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_caseweights.png -------------------------------------------------------------------------------- /img/AIG_yyp_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_yyp_train.png 
-------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | direpack 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | setup 8 | -------------------------------------------------------------------------------- /img/AIG_yyp_train_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SvenSerneels/direpack/HEAD/img/AIG_yyp_train_test.png -------------------------------------------------------------------------------- /docs/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "C:\\Workdir\\Programs\\envs\\mddsdr2\\python.exe" 3 | } -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest>=7.1.3 3 | prospector>=1.7.7 4 | bandit 5 | vulture 6 | coverage>=6.4.4 7 | -------------------------------------------------------------------------------- /docs/setup.rst: -------------------------------------------------------------------------------- 1 | setup module 2 | ============ 3 | 4 | .. automodule:: setup 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /src/direpack/test/__init__.py: -------------------------------------------------------------------------------- 1 | __name__ = "test" 2 | __author__ = "Emmanuel Jordy and Sven" 3 | __license__ = "MIT" 4 | __version__ = "0.0.4" 5 | __date__ = "2024-05-23" 6 | -------------------------------------------------------------------------------- /docs/sphinx_requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | ####### requirements for sphinx####### 3 | ###### Requirements without Version Specifiers ###### 4 | 5 | sphinx-math-dollar 6 | sklearn 7 | direpack 8 | Ball 9 | sympy 10 | -------------------------------------------------------------------------------- /direpack_Future_Dev.md: -------------------------------------------------------------------------------- 1 | Work to do 2 | ---------- 3 | - optimize alignment to `sklearn` 4 | - optimize for speed 5 | - extend to multivariate responses (open research topic for several of the options!) 6 | - extend backend to GPU compatibility 7 | - suggestions and contributions always welcome! 
-------------------------------------------------------------------------------- /src/direpack/sprm/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:17:17 2018 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "sprm" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.8.1" 13 | __date__ = "2024-05-23" 14 | -------------------------------------------------------------------------------- /src/direpack/ppdire/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 9 14:20:17 2019 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "ppdire" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.2.12" 13 | __date__ = "2022-10-22" 14 | -------------------------------------------------------------------------------- /docs/rtd-environment.yaml: -------------------------------------------------------------------------------- 1 | name: RobDimRed-rtd 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python =3.7 7 | - pip 8 | - cyipopt 9 | - sphinx-math-dollar 10 | - scikit-learn 11 | - numpy 12 | - matplotlib 13 | - pandas 14 | - statsmodels 15 | - dcor 16 | - sympy 17 | 18 | -------------------------------------------------------------------------------- /src/direpack/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:17:17 2018 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "preprocessing" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.8.0" 13 | __date__ = "2024-02-23" 14 | -------------------------------------------------------------------------------- /src/direpack/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 11 17:22:09 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | 9 | __name__ = "utils" 10 | __author__ = "Emmanuel Jordy Menvouta and Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.1.0" 13 | __date__ = "2024-02-23" 14 | -------------------------------------------------------------------------------- /src/direpack/plot/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:17:17 2018 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "plot" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.9.0" 13 | __date__ = "2020-04-18" 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/direpack/cross_validation/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:17:17 2018 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "cross_validation" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "0.7.0" 13 | __date__ = "2020-04-03" 14 | 15 | 16 | 
-------------------------------------------------------------------------------- /src/direpack/sudire/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 11 17:22:09 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | Edits by Sven Serneels. 7 | """ 8 | 9 | 10 | __name__ = "sudire" 11 | __author__ = "Emmanuel Jordy Menvouta" 12 | __license__ = "MIT" 13 | __version__ = "0.1.6" 14 | __date__ = "2022-10-09" 15 | -------------------------------------------------------------------------------- /src/direpack/dicomo/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 9 14:20:17 2019 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "dicomo" 10 | __author__ = "Sven Serneels" 11 | __license__ = "MIT" 12 | __version__ = "1.0.4" 13 | __date__ = "2022-10-08" 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /docs/generated/direpack.dicomo.dicomo.dicomo.rst: -------------------------------------------------------------------------------- 1 | direpack.dicomo.dicomo.dicomo 2 | ============================= 3 | 4 | .. currentmodule:: direpack.dicomo.dicomo 5 | 6 | .. autoclass:: dicomo 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~dicomo.__init__ 17 | ~dicomo.fit 18 | ~dicomo.get_params 19 | ~dicomo.set_params 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/direpack/ipopt_temp/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Apr 12 2020 5 | 6 | This folder is temporary. It copies a fix to ipopt: 7 | https://github.com/matthias-k/optpy/blob/master/optpy/jacobian.py 8 | 9 | Folder will stay in direpack until the latter has been released. 10 | 11 | """ 12 | 13 | __name__ = "opt_temp" 14 | __author__ = "Sven Serneels" 15 | __license__ = "MIT" 16 | __version__ = "0.0.2" 17 | __date__ = "2021-04-15" 18 | 19 | 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | ####### sprm-requirements.txt ####### 3 | # 4 | ###### Requirements without Version Specifiers ###### 5 | numpy 6 | matplotlib 7 | scipy >= 1.9.0 8 | sklearn 9 | pandas 10 | statsmodels 11 | # uncheck these for DCOV-SDR and MDD-SDR options in sudire 12 | # Cython 13 | # ipopt 14 | dcor 15 | sympy 16 | scikit-learn 17 | pandas 18 | 19 | 20 | # 21 | ###### Requirements with Version Specifiers ###### 22 | # See https://www.python.org/dev/peps/pep-0440/#version-specifiers 23 | # python > 3.5 24 | 25 | -------------------------------------------------------------------------------- /docs/generated/direpack.sprm.snipls.snipls.rst: -------------------------------------------------------------------------------- 1 | direpack.sprm.snipls.snipls 2 | =========================== 3 | 4 | .. currentmodule:: direpack.sprm.snipls 5 | 6 | .. autoclass:: snipls 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. 
autosummary:: 15 | 16 | ~snipls.__init__ 17 | ~snipls.fit 18 | ~snipls.fit_transform 19 | ~snipls.get_params 20 | ~snipls.predict 21 | ~snipls.score 22 | ~snipls.set_params 23 | ~snipls.transform 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/generated/direpack.ppdire.ppdire.ppdire.rst: -------------------------------------------------------------------------------- 1 | direpack.ppdire.ppdire.ppdire 2 | ============================= 3 | 4 | .. currentmodule:: direpack.ppdire.ppdire 5 | 6 | .. autoclass:: ppdire 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~ppdire.__init__ 17 | ~ppdire.fit 18 | ~ppdire.fit_transform 19 | ~ppdire.get_params 20 | ~ppdire.predict 21 | ~ppdire.score 22 | ~ppdire.set_params 23 | ~ppdire.transform 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/generated/direpack.sudire.sudire.sudire.rst: -------------------------------------------------------------------------------- 1 | direpack.sudire.sudire.sudire 2 | ============================= 3 | 4 | .. currentmodule:: direpack.sudire.sudire 5 | 6 | .. autoclass:: sudire 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~sudire.__init__ 17 | ~sudire.fit 18 | ~sudire.fit_transform 19 | ~sudire.get_params 20 | ~sudire.predict 21 | ~sudire.score 22 | ~sudire.set_params 23 | ~sudire.transform 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/generated/direpack.sprm.sprm.sprm.rst: -------------------------------------------------------------------------------- 1 | direpack.sprm.sprm.sprm 2 | ======================= 3 | 4 | .. currentmodule:: direpack.sprm.sprm 5 | 6 | .. autoclass:: sprm 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~sprm.__init__ 17 | ~sprm.fit 18 | ~sprm.fit_transform 19 | ~sprm.get_params 20 | ~sprm.predict 21 | ~sprm.score 22 | ~sprm.set_params 23 | ~sprm.transform 24 | ~sprm.valscore 25 | ~sprm.weightnewx 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Optionally build your docs in additional formats such as PDF 13 | formats: 14 | - pdf 15 | 16 | 17 | # Optionally set the version of Python and requirements required to build your docs 18 | python: 19 | version: 3.7 20 | install: 21 | - requirements: docs/sphinx_requirements.txt 22 | 23 | conda: 24 | environment: docs/rtd-environment.yaml 25 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/generated/direpack.preprocessing.robcent.VersatileScaler.rst: -------------------------------------------------------------------------------- 1 | direpack.preprocessing.robcent.VersatileScaler 2 | ============================================== 3 | 4 | .. currentmodule:: direpack.preprocessing.robcent 5 | 6 | .. autoclass:: VersatileScaler 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~VersatileScaler.__init__ 17 | ~VersatileScaler.fit 18 | ~VersatileScaler.fit_transform 19 | ~VersatileScaler.get_params 20 | ~VersatileScaler.inverse_transform 21 | ~VersatileScaler.predict 22 | ~VersatileScaler.set_params 23 | ~VersatileScaler.transform 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /docs/generated/direpack.preprocessing.gsspp.GenSpatialSignPrePprocessor.rst: -------------------------------------------------------------------------------- 1 | direpack.preprocessing.gsspp.GenSpatialSignPrePprocessor 2 | ======================================================== 3 | 4 | .. currentmodule:: direpack.preprocessing.gsspp 5 | 6 | .. autoclass:: GenSpatialSignPrePprocessor 7 | 8 | 9 | .. automethod:: __init__ 10 | 11 | 12 | .. rubric:: Methods 13 | 14 | .. autosummary:: 15 | 16 | ~GenSpatialSignPrePprocessor.__init__ 17 | ~GenSpatialSignPrePprocessor.fit 18 | ~GenSpatialSignPrePprocessor.fit_transform 19 | ~GenSpatialSignPrePprocessor.get_params 20 | ~GenSpatialSignPrePprocessor.set_params 21 | ~GenSpatialSignPrePprocessor.transform 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Sven Serneels 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/direpack/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:17:17 2018 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | __name__ = "direpack" 10 | __author__ = "Emmanuel Jordy Menvouta, Sven Serneels, Tim Verdonck" 11 | __license__ = "MIT" 12 | __version__ = "1.1.3" 13 | __date__ = "2024-05-23" 14 | 15 | # The commented lines can be uncommented if IPOPT has been installed independently. 
16 | 17 | from .preprocessing.robcent import ( 18 | VersatileScaler, 19 | versatile_scale, 20 | Wrapper, 21 | wrap, 22 | ) 23 | from .preprocessing.gsspp import ( 24 | GenSpatialSignPreProcessor, 25 | gen_ss_pp, 26 | gen_ss_covmat, 27 | ) 28 | from .sprm.sprm import sprm 29 | from .sprm.snipls import snipls 30 | from .sprm.rm import rm 31 | from .cross_validation._cv_support_functions import robust_loss 32 | from .ppdire.ppdire import ppdire 33 | from .ppdire.capi import capi 34 | from .dicomo.dicomo import dicomo 35 | from .sudire.sudire import sudire, estimate_structural_dim 36 | from .plot.sudire_plot import sudire_plot 37 | from .plot.ppdire_plot import ppdire_plot 38 | from .plot.sprm_plot import sprm_plot, sprm_plot_cv 39 | from .ipopt_temp.ipopt_wrapper import minimize_ipopt 40 | from .ipopt_temp.jacobian import * 41 | -------------------------------------------------------------------------------- /src/direpack/sprm/_m_support_functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Fri Jan 25 18:22:27 2019 3 | 4 | Functions called internally in M-estimation 5 | 6 | @author: Sven Serneels, Ponalytics 7 | """ 8 | 9 | import numpy as np 10 | import pandas as ps 11 | 12 | 13 | def Fair(x, probct, *args): 14 | return 1 / (1 + abs(x / (probct * 2))) ** 2 15 | 16 | 17 | def Huber(x, probct, *args): 18 | x[np.where(x <= probct)[0]] = 1 19 | x[np.where(x > probct)] = probct / abs(x[np.where(x > probct)]) 20 | return x 21 | 22 | 23 | def Hampel(x, probct, hampelb, hampelr): 24 | wx = x 25 | wx[np.where(x <= probct)[0]] = 1 26 | wx[np.where((x > probct) & (x <= hampelb))[0]] = probct / abs( 27 | x[np.where((x > probct) & (x <= hampelb))[0]] 28 | ) 29 | wx[np.where((x > hampelb) & (x <= hampelr))[0]] = np.divide( 30 | probct * (hampelr - (x[np.where((x > hampelb) & (x <= hampelr))[0]])), 31 | (hampelr - hampelb) * abs(x[np.where((x > hampelb) & (x <= hampelr))[0]]), 32 | ) 33 | wx[np.where(x > hampelr)[0]] = 0 34 | return wx 35 | 36 | 37 | def brokenstick(n_components): 38 | q = np.triu(np.ones((n_components, n_components))) 39 | r = np.empty((n_components, 1), float) 40 | r[0:n_components, 0] = range(1, n_components + 1) 41 | q = np.matmul(q, 1 / r) 42 | q /= n_components 43 | return q 44 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 22 12:18:53 2018 5 | 6 | @author: Sven serneels, Ponalytics 7 | """ 8 | 9 | from setuptools import setup, find_packages 10 | import re 11 | import sys 12 | import os 13 | 14 | SRC_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)),"./src") 15 | if SRC_DIR not in sys.path: 16 | sys.path.insert(0,SRC_DIR) 17 | from direpack import __version__, __author__, __license__ 18 | 19 | readme_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'README.md') 20 | try: 21 | from m2r import parse_from_file 22 | readme = parse_from_file(readme_file) 23 | except ImportError: 24 | # m2r may not be installed in user environment 25 | with open(readme_file) as f: 26 | readme = f.read() 27 | 28 | setup( 29 | name="direpack", 30 | version=__version__, 31 | author=__author__, 32 | author_email="svenserneels@gmail.com", 33 | description="A Python 3 Library for State-of-the-Art Statistical Dimension Reduction Methods", 34 | long_description=readme, 35 | 
long_description_content_type='text/markdown', 36 | url="https://github.com/SvenSerneels/direpack", 37 | classifiers=[ 38 | "Programming Language :: Python :: 3", 39 | "License :: OSI Approved :: MIT License", 40 | "Operating System :: OS Independent", 41 | ], 42 | packages=find_packages('src'), # include all packages under src 43 | package_dir={'':'src'}, # tell distutils packages are under src 44 | include_package_data = True, 45 | install_requires=[ 46 | 'numpy>=1.5.0', 47 | 'scipy>=0.8.0', 48 | 'matplotlib>=2.2.0', 49 | 'scikit-learn>=0.18.0', 50 | 'pandas>=0.19.0', 51 | 'statsmodels>=0.8.0', 52 | # 'ipopt>=0.1.5', 53 | 'dcor>=0.3' 54 | ] 55 | ) 56 | 57 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | 3 | Welcome to direpack's documentation! 4 | ==================================== 5 | The direpack package aims to establish a set of modern statistical dimension reduction techniques into the Python universe as a single, consistent package. 6 | The dimension reduction methods included resort into three categories: projection pursuit based dimension reduction, sufficient dimension reduction, and robust M estimators for dimension reduction. 7 | As a corollary, regularized regression estimators based on these reduced dimension spaces are provided as well, ranging from classical principal component regression up to sparse partial robust M regression. 8 | The package also contains a set of classical and robust pre-processing utilities, including generalized spatial signs, as well as dedicated plotting functionality and cross-validation utilities. 9 | Finally, direpack has been written consistent with the scikit-learn API, such that the estimators can flawlessly be included into (statistical and/or machine) learning pipelines in that framework. 10 | 11 | 12 | 13 | Installation 14 | ============ 15 | The package is distributed through PyPI, so use:: 16 | 17 | pip install direpack 18 | 19 | Examples 20 | =============== 21 | Example notebooks have been produced to showcase the use of direpack for statistical dimension reduction. These notebooks contain a `ppdire example `_ , `sprm example `_ and a `sudire example `_ . 22 | 23 | 24 | 25 | 26 | Contents 27 | ======== 28 | 29 | .. toctree:: 30 | :maxdepth: 2 31 | 32 | ppdire 33 | sudire 34 | sprm 35 | Pre-processing 36 | Cross-validation and plotting 37 | 38 | 39 | .. toctree:: 40 | :maxdepth: 1 41 | :caption: Other information 42 | 43 | Contributing 44 | 45 | 46 | 47 | 48 | Indices and tables 49 | ================== 50 | 51 | * :ref:`genindex` 52 | * :ref:`search` 53 | -------------------------------------------------------------------------------- /docs/Contributing.rst: -------------------------------------------------------------------------------- 1 | .. _Contributing: 2 | 3 | ################ 4 | Contributing 5 | ################ 6 | 7 | No package is complete and the authors would like to see direpack extend its functionality in the future. Some possible additions could be : 8 | 9 | - Cellwise robust dimension reduction methods : For instance, a cellwise robust version of the robust M regression method, included in sprm, has recently been published (Filzmoseret al.2020), and could be included in direpack. 10 | - Uncertainty quantification : The methods provided through direpack provide point estimates. In the future, the package could, e.g. 
be augmented with appropriate bootstrapping techniques, as was done for a related dimension reduction context. 11 | - GPU flexibility : There are many matrix manipulations in direpack, which can possibly be sped up by allowing GPU compatibility, which could be achieved by providing a TensorFlow or PyTorch back-end. However, this would be a major effort, since the present back-end integrally builds upon numpy. 12 | - More (and better) unit tests. 13 | 14 | Guidelines 15 | ============ 16 | 17 | Testing 18 | ------- 19 | Contributions should be accompanied by unit tests similar to those already available. Contributors can use the datasets presented in the example notebooks. 20 | 21 | Documentation 22 | ------------- 23 | We have followed `PEP8 `_ style when building this project and ask that contributors do so, 24 | for ease of maintainability. 25 | 26 | Article 27 | ================ 28 | An article with further information on the package is available. Menvouta, E.J., Serneels, S., Verdonck, T., 2023. direpack: A python 3 package for state-of-the-art statistical dimensionality reduction methods. SoftwareX 21, 101282. 29 | 30 | Contacts 31 | ================ 32 | 33 | * Dr Sven Serneels is co-founder at Gallop Data, Inc. and can be contacted at svenserneels (at) gmail.com. 34 | 35 | * Emmanuel Jordy Menvouta is a PhD researcher in Statistics and Data Science at KU Leuven and can be contacted at emmanueljordy.menvoutankpwele (at) kuleuven.be. 36 | 37 | * Prof Tim Verdonck is Professor of Statistics and Data Science at University of Antwerp and KU Leuven. He can be reached at tim.verdonck (at) uantwerp.be. -------------------------------------------------------------------------------- /src/direpack/cross_validation/_cv_support_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jan 27 11:42:23 2019 5 | 6 | Ancillary support functions for cross-validation of sprm and related estimators. 7 | 8 | # Deleted: ABLine2D class, was broken in Py 3.7 9 | cv_score_table (function): transform sklearn GridSearchCV 10 | results into Data Frame 11 | 12 | @author: Sven Serneels 13 | """ 14 | import numpy as np 15 | import pandas as ps 16 | from sklearn.metrics import mean_squared_error 17 | from scipy.stats import norm 18 | from ..sprm._m_support_functions import Fair, Huber, Hampel 19 | 20 | def cv_score_table(res_sprm_cv): 21 | 22 | """ 23 | Internal function reorganizing sklearn GridSearchCV results to pandas table. 24 | The function adds the cv score table to the object as cv_score_table_ 25 | """ 26 | 27 | n_settings = len(res_sprm_cv.cv_results_['params']) 28 | etas = [res_sprm_cv.cv_results_['params'][i]['eta'] for i in range(0,n_settings)] 29 | components = [res_sprm_cv.cv_results_['params'][i]['n_components'] for i in range(0,n_settings)] 30 | cv_score_table_ = ps.DataFrame({'etas':etas, 'n_components':components, 'score':res_sprm_cv.cv_results_['mean_test_score']}) 31 | return(cv_score_table_) 32 | 33 | def robust_loss(y,ypred,lfun=mean_squared_error,fun=Hampel,probct=norm.ppf(0.975),hampelb=norm.ppf(.99),hampelr=norm.ppf(.999)): 34 | 35 | """ 36 | Weighted loss function to be used in sklearn cross-validation 37 | Inputs: 38 | y: array or matrix, original predictand 39 | ypred, array or matrix, predicted values 40 | lfun, function: an sklearn loss metric that accepts caseweights, 41 | e.g. sklearn.metrics.mean_squared_error 42 | fun: function, weight function, 43 | e.g.
Fair, Huber or Hampel from sprm.sprm._m_support_functions 44 | probct, hampelb, hampelr: float, cutoffs for weight functions 45 | Output: 46 | loss, float 47 | """ 48 | 49 | if len(ypred.shape) > 1: 50 | ypred = np.array(ypred).reshape(-1) 51 | ypred = ypred.astype('float64') 52 | if len(y.shape) > 1: 53 | y = np.array(y).reshape(-1) 54 | y = y.astype('float64') 55 | r = y - ypred 56 | w = fun(r,probct,hampelb,hampelr) 57 | return(lfun(y,ypred,sample_weight=w)) 58 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [3.8, 3.9, "3.10"] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | # uses: actions/setup-python@v2 24 | uses: conda-incubator/setup-miniconda@v2 25 | with: 26 | auto-update-conda: true 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | $CONDA/bin/conda install -c conda-forge libstdcxx-ng 33 | $CONDA/bin/conda install -c conda-forge libgcc=5.2.0 34 | $CONDA/bin/conda install -c conda-forge scikit-learn 35 | $CONDA/bin/conda install -c conda-forge pandas 36 | $CONDA/bin/conda install -c conda-forge numpy 37 | $CONDA/bin/conda install -c conda-forge statsmodels 38 | $CONDA/bin/conda install -c conda-forge dcor 39 | $CONDA/bin/conda install -c conda-forge sympy 40 | $CONDA/bin/conda install -c conda-forge matplotlib 41 | sudo apt install gcc 42 | - name: Conda info 43 | shell: bash -l {0} 44 | run: conda info 45 | - name: Conda list 46 | shell: pwsh 47 | run: conda list 48 | - name: install ipopt 49 | run: | 50 | $CONDA/bin/conda install -c conda-forge cyipopt 51 | - name: Lint with flake8 52 | run: | 53 | $CONDA/bin/conda install flake8 54 | # stop the build if there are Python syntax errors or undefined names 55 | $CONDA/bin/flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 56 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 57 | $CONDA/bin/flake8 . 
--count --exit-zero --max-complexity=15 --max-line-length=127 --statistics 58 | - name: Test with pytest 59 | run: | 60 | conda install pytest 61 | $CONDA/bin/pytest 62 | -------------------------------------------------------------------------------- /src/direpack/ipopt_temp/jacobian.py: -------------------------------------------------------------------------------- 1 | """ 2 | Author: Matthias Kuemmerer, 2014 3 | """ 4 | from __future__ import print_function, division, unicode_literals, absolute_import 5 | 6 | import sys 7 | import numpy as np 8 | 9 | 10 | class FunctionWithApproxJacobian(object): 11 | def __init__(self, func, epsilon, verbose=True): 12 | self._func = func 13 | self.epsilon = epsilon 14 | self.value_cache = {} 15 | self.verbose = verbose 16 | 17 | def __call__(self, x, *args, **kwargs): 18 | key = tuple(x) 19 | if not key in self.value_cache: 20 | self.log('.') 21 | value = self._func(x, *args, **kwargs) 22 | if np.any(np.isnan(value)): 23 | print("Warning! nan function value encountered at {0}".format(x)) 24 | self.value_cache[key] = value 25 | return self.value_cache[key] 26 | 27 | def func(self, x, *args, **kwargs): 28 | if self.verbose: 29 | print(x) 30 | return self(x, *args, **kwargs) 31 | 32 | def log(self, msg): 33 | if self.verbose: 34 | sys.stdout.write(msg) 35 | sys.stdout.flush() 36 | 37 | def jac(self, x, *args, **kwargs): 38 | self.log('G[') 39 | x0 = np.asfarray(x) 40 | #print x0 41 | dxs = np.zeros((len(x0), len(x0) + 1)) 42 | for i in range(len(x0)): 43 | dxs[i, i + 1] = self.epsilon 44 | results = [self(*(x0 + dxs[:, i], ) + args, **kwargs) for i in range(len(x0) + 1)] 45 | jac = np.zeros([len(x0), len(np.atleast_1d(results[0]))]) 46 | for i in range(len(x0)): 47 | jac[i] = (results[i + 1] - results[0]) / self.epsilon 48 | self.log(']') 49 | return jac.transpose() 50 | 51 | 52 | class FunctionWithApproxJacobianCentral(FunctionWithApproxJacobian): 53 | def jac(self, x, *args, **kwargs): 54 | self.log('G[') 55 | x0 = np.asfarray(x) 56 | #print x0 57 | dxs = np.zeros((len(x0), 2*len(x0))) 58 | for i in range(len(x0)): 59 | dxs[i, i] = -self.epsilon 60 | dxs[i, len(x0)+i] = self.epsilon 61 | results = [self(*(x0 + dxs[:, i], ) + args, **kwargs) for i in range(2*len(x0))] 62 | jac = np.zeros([len(x0), len(np.atleast_1d(results[0]))]) 63 | for i in range(len(x0)): 64 | jac[i] = (results[len(x0)+i] - results[i]) / (2*self.epsilon) 65 | self.log(']') 66 | return jac.transpose() -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | workflow_dispatch: 11 | inputs: 12 | version_bump: 13 | description: 'The verions portion to increment' 14 | required: true 15 | default: 'patch' 16 | type: choice 17 | options: 18 | - patch 19 | - minor 20 | - major 21 | workflow_call: 22 | inputs: 23 | version_bump: 24 | description: "The version portion to increment" 25 | required: true 26 | type: string 27 | push: 28 | branches: 29 | - master 30 | paths-ignore: 31 | - '.github/**' 32 | - 'README.md' 33 | - '.gitignore' 34 | - 'CHANGELOG.md' 35 | pull_request: 36 | branches: 37 | - master 38 | 39 | 
jobs: 40 | deploy: 41 | 42 | runs-on: ubuntu-latest 43 | 44 | steps: 45 | - uses: actions/checkout@v3 46 | - name: Set up Python 47 | uses: conda-incubator/setup-miniconda@v2 48 | with: 49 | auto-update-conda: true 50 | python-version: 3.9 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 55 | $CONDA/bin/conda install -c conda-forge libstdcxx-ng 56 | $CONDA/bin/conda install -c conda-forge libgcc=5.2.0 57 | $CONDA/bin/conda install -c conda-forge scikit-learn 58 | $CONDA/bin/conda install -c conda-forge pandas 59 | $CONDA/bin/conda install -c conda-forge numpy 60 | $CONDA/bin/conda install -c conda-forge statsmodels 61 | $CONDA/bin/conda install -c conda-forge dcor 62 | $CONDA/bin/conda install -c conda-forge sympy 63 | $CONDA/bin/conda install -c conda-forge matplotlib 64 | sudo apt install gcc 65 | - name: Conda info 66 | shell: bash -l {0} 67 | run: conda info 68 | - name: Conda list 69 | shell: pwsh 70 | run: conda list 71 | - name: install ipopt 72 | run: | 73 | $CONDA/bin/conda install -c conda-forge cyipopt 74 | - name: Build and publish 75 | env: 76 | TWINE_USERNAME: ${{ secrets.PYPIUID }} 77 | TWINE_PASSWORD: ${{ secrets.PYPIPWD }} 78 | run: | 79 | $CONDA/bin/python -m pip install setuptools wheel twine 80 | $CONDA/bin/python setup.py sdist bdist_wheel 81 | $CONDA/bin/twine upload dist/* 82 | -------------------------------------------------------------------------------- /src/direpack/utils/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Apr 13 16:08:22 2020 5 | 6 | @author: sven 7 | """ 8 | 9 | import pandas as ps 10 | import numpy as np 11 | 12 | 13 | class MyException(Exception): 14 | pass 15 | 16 | 17 | def convert_X_input(X): 18 | 19 | if type(X) == ps.core.frame.DataFrame: 20 | X = X.to_numpy().astype('float64') 21 | return(X) 22 | 23 | 24 | def convert_y_input(y): 25 | 26 | if type(y) in [ps.core.frame.DataFrame, ps.core.series.Series]: 27 | y = y.to_numpy().T.astype('float64') 28 | return(y) 29 | 30 | 31 | def const_xscale(beta, *args): 32 | X = args[0] 33 | h = args[1] 34 | i = args[2] 35 | j = args[3] 36 | beta = np.reshape(beta, (-1, h), order='F') 37 | covx = np.cov(X, rowvar=False) 38 | ans = np.matmul(np.matmul(beta.T, covx), beta) - np.identity(h) 39 | return(ans[i, j]) 40 | 41 | 42 | def const_zscale(beta, *args): 43 | X = args[0] 44 | h = args[1] 45 | i = args[2] 46 | j = args[3] 47 | beta = np.reshape(beta, (-1, h), order='F') 48 | covx = np.identity(X.shape[1]) 49 | ans = np.matmul(np.matmul(beta.T, covx), beta) - np.identity(h) 50 | return(ans[i, j]) 51 | 52 | 53 | def _predict_check_input(Xn): 54 | if type(Xn) == ps.core.series.Series: 55 | Xn = Xn.to_numpy() 56 | if Xn.ndim == 1: 57 | Xn = Xn.reshape((1, -1)) 58 | if type(Xn) == ps.core.frame.DataFrame: 59 | Xn = Xn.to_numpy() 60 | n, p = Xn.shape 61 | return (n, p, Xn) 62 | 63 | 64 | def _check_input(X): 65 | 66 | if(type(X) in (np.matrix, ps.core.frame.DataFrame, ps.core.series.Series)): 67 | X = np.array(X) 68 | 69 | if (X.dtype == np.dtype('O')): 70 | X = X.astype('float64') 71 | 72 | if X.ndim == 1: 73 | X = X.reshape((1, -1)) 74 | 75 | n, p = X.shape 76 | 77 | if n == 1: 78 | if p >= 2: 79 | X = X.reshape((-1, 1)) 80 | return(X) 81 | 82 | 83 | def nandot(X, y): 84 | 85 | p, n = X.shape 86 | assert n == len(y), "Number of rows in X and y needs to agree" 87 | if 
len(y.shape) > 1: 88 | y = y.reshape(-1) 89 | product = [np.nansum(np.multiply(X[i, :], y)) for i in range(p)] 90 | 91 | return np.array(product).reshape((-1, 1)) 92 | 93 | 94 | def nanmatdot(X, Y): 95 | 96 | p, n = X.shape 97 | if len(Y.shape) == 1: 98 | return nandot(X, Y) 99 | else: 100 | m, q = Y.shape 101 | assert n == m, "Matrix diomensions need to agree" 102 | if q == 1: 103 | return nandot(X, Y) 104 | else: 105 | product = [[np.nansum(np.multiply(X[i, :], Y[:, j])) 106 | for i in range(p)] for j in range(q)] 107 | 108 | return np.array(product) 109 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ -------------------------------------------------------------------------------- /docs/dicomo.md: -------------------------------------------------------------------------------- 1 | Diverse (co-)moment statistics 2 | ============================== 3 | 4 | This class implements (co-)moment statistics, covering both classical product-moment 5 | statistics, as well as more recently developed energy statistics. 6 | The `dicomo` class also serves as a plug-in into `capi` and `ppdire`. It has been written consistently with `ppdire` such that it provides a wide range of 7 | projection indices based on (co-)moments. 8 | 9 | Description 10 | ----------- 11 | 12 | The `dicomo` folder contains 13 | - The class object (`dicomo.py`) 14 | - Ancillary functions for (co-)moment estimation (`_dicomo_utils.py`) 15 | 16 | The `dicomo` class 17 | ================== 18 | 19 | Parameters 20 | ---------- 21 | - `est`, str: mode of estimation. The set of options are `'arithmetic'` (product-moment) or `'distance'` (energy statistics) 22 | - `mode`, str: type of moment. Options are: 23 | * `'mom'`: moment 24 | * `'var'`: variance 25 | * `'std'`: standard deviation 26 | * `'skew'`: skewness 27 | * `'kurt'`: kurtosis 28 | * `'com'`: co-moment 29 | * `'M3'`: shortcut for third order co-moment 30 | * `'cov'`: covariance 31 | * `'cos'`: co-skewness 32 | * `'cok'`: co-kurtosis 33 | * `'corr'`: correlation 34 | * `'continuum'`: continuum association 35 | * `'mdd'`: martingale difference divergence (requires `est = 'distance'`) 36 | * `'mdc'`: martingale difference correlation (requires `est = 'distance'`) 37 | * `'ballcov'`: ball covariance (requires installing `Ball` and uncommenting the `import` statement) 38 | - `center`: internal centring used in calculation. Options are `mean` or `median`. 39 | 40 | Attributes 41 | ---------- 42 | Attributes always provided 43 | - `moment_`: The resulting (co-)moment 44 | 45 | Depending on the options picked, intermediate results are stored as well, as `x_moment_`, `y_moment_` or `co_moment_` 46 | 47 | 48 | Methods 49 | -------- 50 | - `fit(X, *args, **kwargs)`: fit model 51 | 52 | The `fit` function takes several optional input arguments. These are options that 53 | apply to individual settings: 54 | - `biascorr`, Bool, when `True`, correct for bias. For classical product-moment statistics, this 55 | is the small sample correction. For energy statistics, this leads to estimates that are unbiased in high dimension 56 | (but not preferred in low dimension). 57 | - `alpha`, float, parameter for continuum association. Has no effect for other options. 58 | - `option`, int, determines which higher order co-moment to calculate, e.g. for co-skewness, `option=1` calculates CoS(x,x,y) 59 | - `order`, int, which order (co-)moment to calculate. Can be overruled by `mode`, e.g.
if `mode='var'`, `order` is set to 2. 60 | - `calcmode`, str, whether to use the efficient or the naive algorithm to calculate distance statistics. Defaults to `fast` when available. 61 | 62 | Examples 63 | -------- 64 | Check out the [dicomo examples notebook](https://github.com/SvenSerneels/direpack/blob/master/examples/dicomo_example.ipynb) -------------------------------------------------------------------------------- /docs/Cross-validation and plotting.rst: -------------------------------------------------------------------------------- 1 | .. _Cross-validation and plotting: 2 | 3 | ############################# 4 | Cross-validation and plotting 5 | ############################# 6 | 7 | Each of the sudire, ppdire and sprm subpackages in direpack are wrappers around a broad class of dimension reduction methods. 8 | Each of these methods will have at least one tunable hyperparameter; some have many more. The user will want to be able to find the optimal hyperparameters for the data at hand, which can be done through cross-validation or Bayesian optimization. 9 | It is not the aim of direpack to provide its own hyperparameter tuning algorithms, as ample cross-validation utilities are available in scikit-learn’s model selection subpackage and the direpack estimators have been written consistently with the scikit-learn API, 10 | such that these model selection tools from scikit-learn can directly be applied to them. However, some caution should be taken when training the robust methods. While all classical (non-robust) methods could just use scikit-learn’s default settings, when tuning a robust model, 11 | outliers are expected to be in the data, such that it becomes preferable to apply a robust cross-validation metric as well. To that end, it is possible to use scikit-learn’s median_absolute_error, which is an MAE (L1) scorer that is less affected by extreme values than the default mean_squared_error. 12 | However, particularly in the case of robust M estimators, a more model-consistent approach can be pursued. The robust M estimators provide a set of case weights, and these can be used to construct a weighted evaluation metric for cross-validation. Exactly this is provided in the robust_loss function that is a part of the direpack cross-validation utilities. 13 | 14 | Similar to hyperparameter tuning, direpack's mission is not to deliver a broad set of plotting utilities, but rather to focus on the dimension reduction statistics. However, some plots that many users would like to have in this context are provided for each of the methods. These are: 15 | 16 | * Projection plots. These plots visualize the scores $\mathbf{t}_i$ and a distinction can be made in the plots between cases that the model had been trained with, and test set cases. 17 | * Parity plots. For the regularized regressions based on the estimated scores, these visualize the predicted versus actual responses, with the same distinction as for the scores. 18 | 19 | For the special case of SPRM, the plots have enhanced functionality. Since SPRM provides case weights, which can also be calculated for new cases, the SPRM plots can flag outliers. In the sprm_plot function, this is set up with two cut-offs, based on the case weight values, and visualized as regular cases, moderate outliers or harsh outliers. 20 | For SPRM, there is an option as well to visualize the case weights themselves. 21 | 22 | 23 | Examples of direpack's plotting functionalities are available in the example notebooks of `ppdire `_, `sprm `_ and `sudire `_ .
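As a minimal sketch of the workflow described above, the robust_loss metric can be wrapped into a scikit-learn scorer and combined with GridSearchCV to tune an sprm estimator. The grid values below are purely illustrative, sprm() is assumed to work with its default constructor settings, and X and y are placeholders for the user's predictor matrix and response::

    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import make_scorer
    from direpack import sprm, robust_loss

    # robust_loss is a case-weighted loss, so lower values are better
    robust_scorer = make_scorer(robust_loss, greater_is_better=False)

    # illustrative grid over eta and the number of components
    param_grid = {"eta": [0.2, 0.5, 0.8], "n_components": [1, 2, 3, 4]}

    res_sprm_cv = GridSearchCV(sprm(), param_grid, scoring=robust_scorer, cv=5)
    res_sprm_cv.fit(X, y)            # X: (n, p) predictors, y: (n,) response
    print(res_sprm_cv.best_params_)

Scoring with the case-weighted robust_loss keeps the evaluation consistent with the M-estimation itself, rather than falling back on a generic L1 metric such as median_absolute_error.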
24 | 25 | -------------------------------------------------------------------------------- /direpack_Release_Notes.md: -------------------------------------------------------------------------------- 1 | `sprm` Release notes (versions 0.0 through 0.7) 2 | ==================== 3 | 4 | Version 0.2.1 5 | ------------- 6 | - sprm now takes both numeric (n,1) np matrices and (n,) np.arrays as input 7 | 8 | 9 | Version 0.2.0 10 | ------------- 11 | Changes compared to version 0.1: 12 | - All functionalities can now be loaded in a modular way, e.g. to use plotting functions, now source the plot function separately: 13 | 14 | from sprm import sprm_plot 15 | 16 | - The package now includes a robust M regression estimator (rm.py), which is a multiple regression only variant of sprm. 17 | It is based on the same iterative re-weighting scheme, but it does not perform dimension reduction, nor variable selection. 18 | - The robust preprocessing routine (robcent.py) has been re-written so as to be more consistent with sklearn. 19 | 20 | Version 0.3 21 | ----------- 22 | All three estimators provided as separate classes in module: 23 | 24 | from sprm import sprm 25 | from sprm import snipls 26 | from sprm import rm 27 | 28 | Also, sprm now includes a check for zero scales. It will remove zero scale variables from the input data, and only use 29 | columns corresponding to nonzero predictor scales in new data. This check has not yet been built in for snipls or rm 30 | separately. 31 | 32 | Plus some minor changes to make it consistent with the latest numpy and matplotlib versions. 33 | 34 | Version 0.4 35 | ----------- 36 | The preprocessing routine `robcent` has been refactored. Functionality has been 37 | added to centre the data nonparametrically by the L1 median. The ancillary functions 38 | for `robcent` have been moved into `_preproc_utilities.py`. 39 | 40 | Furthermore, `sprm`, `snipls` and `rm` have all three been modified such that 41 | they accept matrix, array or data frame input for both X and y. Also, the option 42 | to provide column names has been extended to automatic extraction from data frame 43 | input, or direct input as list, array or pandas Index. 44 | 45 | The license has been changed from GPL3 to MIT. 46 | 47 | 0.4.2. `'kstepLTS'` location estimator included. 48 | 49 | 50 | Version 0.5 51 | ----------- 52 | Pre-processing functions further refactored so as to be compatible with `sklearn` pipelines. 53 | Class now named `VersatileScaler`; the old `robcent` name still works, but will be sunset. 54 | 55 | Version 0.6 56 | ----------- 57 | Preprocessing files moved into separate folder. More preprocessing options. 58 | Examples moved into Jupyter notebook in separate examples section.
59 | 60 | `direpack` release notes (since version 0.8) 61 | ======================== 62 | 63 | Version 0.8 64 | ----------- 65 | `ppdire` merges in 66 | 67 | Version 0.9 68 | ----------- 69 | - `preprocessing` widely extended 70 | - `plot` functions adapted 71 | - documentation improved 72 | 73 | Version 1.0 74 | ----------- 75 | - `sudire` joins in 76 | - `plot` functions adapted 77 | - documentation provided for `dicomo` 78 | - 1.0.2: link to `direpack` publication added 79 | - 1.0.3: fixed rare division by zero in `l1median` 80 | - 1.0.4: unit tests included 81 | - 1.0.5: `sudire` notebook adapted 82 | - 1.0.9: function to calculate the martingale difference divergence matrix (MDDM) added in `_dicomo_utils.py` 83 | - 1.0.11: documentation updated to accommodate for go-live of readthedocs page 84 | - 1.0.13: fixed bug in option to use `Ball` in `sudire`. Adjusted readthedocs. 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | #import unittest.mock as mock 16 | sys.path.insert(0, os.path.abspath('..')) 17 | #sys.path.insert(0, os.path.abspath('../src/direpack/')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'direpack' 23 | copyright = '2021, Sven Serneels and Emmanuel Jordy Menvouta' 24 | author = 'Sven Serneels and Emmanuel Jordy Menvouta' 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = '1.0.10' 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage','sphinx.ext.autosummary', 'sphinx.ext.napoleon','sphinx.ext.imgmath', "sphinx.ext.viewcode", 'sphinx_math_dollar'] 36 | 37 | # Add autosummary 38 | autosummary_generate = True 39 | add_module_names = False 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ['_templates'] 43 | 44 | # List of patterns, relative to source directory, that match files and 45 | # directories to ignore when looking for source files. 46 | # This pattern also affects html_static_path and html_extra_path. 47 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 48 | 49 | 50 | 51 | # MOCK_MODULES = ['numpy', 'pandas', 'matplotlib','scikit-learn'] 52 | # for mod_name in MOCK_MODULES: 53 | # sys.modules[mod_name] = mock.Mock() 54 | 55 | 56 | # -- Options for HTML output ------------------------------------------------- 57 | 58 | # The theme to use for HTML and HTML Help pages. See the documentation for 59 | # a list of builtin themes. 
60 | # 61 | html_theme = 'sphinx_rtd_theme' 62 | 63 | # Add any paths that contain custom static files (such as style sheets) here, 64 | # relative to this directory. They are copied after the builtin static files, 65 | # so a file named "default.css" will overwrite the builtin "default.css". 66 | html_static_path = ['_static'] 67 | 68 | 69 | imgmath_latex_preamble = r''' 70 | \usepackage{lineno} 71 | \usepackage{amsmath} 72 | \usepackage{graphicx,psfrag,epsf} 73 | \usepackage{enumerate} 74 | \usepackage{amsmath,amsfonts,amssymb,graphicx,multirow} 75 | \usepackage{mdsymbol} 76 | \usepackage{booktabs} 77 | \usepackage{amsthm} 78 | \usepackage{bbm} 79 | \usepackage{algorithm} 80 | \newcommand{\argmax}{\mathop{\mbox{argmax}}} 81 | \usepackage[noend]{algpseudocode} 82 | \usepackage{rotating} 83 | \modulolinenumbers[5] 84 | \def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}} 85 | \def\spacingset#1{\renewcommand{\baselinestretch}% 86 | {#1}\small\normalsize} \spacingset{1} 87 | \newcommand{\norm}[1]{\left\lVert#1\right\rVert} 88 | ''' 89 | 90 | #imgmath_image_format = 'svg' 91 | -------------------------------------------------------------------------------- /src/direpack/test/test_ppdire.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jun 30 13:17:46 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | import unittest 9 | import pandas as ps 10 | import numpy as np 11 | from ..preprocessing.robcent import VersatileScaler 12 | import sklearn.decomposition as skd 13 | from ..dicomo.dicomo import dicomo 14 | from ..ppdire.ppdire import ppdire 15 | import sklearn.cross_decomposition as skc 16 | 17 | 18 | class Testppdire(unittest.TestCase): 19 | """Test some methods in the ppdire class""" 20 | 21 | @classmethod 22 | def setUpClass(self): 23 | print("setupClass") 24 | 25 | @classmethod 26 | def tearDownClass(self): 27 | print("teardownClass") 28 | 29 | def setUp(self): 30 | self.data = ps.read_csv("./data/Returns_shares.csv") 31 | self.datav = self.data.values[:, 2:8].astype("float64") 32 | self.x = self.datav[:, 1:5] 33 | self.y = self.datav[:, 0] 34 | self.n = self.data.shape[0] 35 | self.p = self.data.shape[1] 36 | self.centring = VersatileScaler() 37 | self.Xs = self.centring.fit_transform(self.x) 38 | 39 | def tearDown(self): 40 | del self.x 41 | del self.y 42 | del self.n 43 | del self.p 44 | del self.Xs 45 | del self.centring 46 | 47 | def test_pca(self): 48 | """tests the exactness of ppdire's pca""" 49 | 50 | pppca = ppdire( 51 | projection_index=dicomo, 52 | pi_arguments={"mode": "var"}, 53 | n_components=4, 54 | optimizer="SLSQP", 55 | ) 56 | pppca.fit(self.x) 57 | skpca = skd.PCA(n_components=4) 58 | skpca.fit(self.Xs) 59 | np.testing.assert_almost_equal( 60 | np.abs(pppca.x_loadings_), np.abs(skpca.components_.T), decimal=3 61 | ) 62 | 63 | def test_pls(self): 64 | """tests the exactness of ppdire's pls""" 65 | 66 | skpls = skc.PLSRegression(n_components=4) 67 | skpls.fit(self.Xs, (self.y - np.mean(self.y)) / np.std(self.y)) 68 | pppls = ppdire( 69 | projection_index=dicomo, 70 | pi_arguments={"mode": "cov"}, 71 | n_components=4, 72 | square_pi=True, 73 | optimizer="SLSQP", 74 | optimizer_options={"maxiter": 500}, 75 | ) 76 | pppls.fit(self.x, self.y) 77 | np.testing.assert_almost_equal( 78 | np.abs( 79 | np.matmul(self.Xs, skpls.coef_.reshape(-1)) * np.std(self.y) 80 | + np.mean(self.y) 81 | ), 82 | np.abs(pppls.fitted_.ravel()), 83 | decimal=3, 84 | ) 85 | 86 | 87 | # def 
test_robust(self): 88 | # lcpca = ppdire(projection_index = dicomo, pi_arguments = {'mode' : 'var', 'center': 'median'}, n_components=4, optimizer='grid',optimizer_options={'ndir':1000,'maxiter':10}) 89 | # lcpca.fit(self.x) 90 | # test_ans=np.array([[ 0.6324543 , -0.00651997, -0.35820225, 0.6438448 ], 91 | # [ 0.44750274, -0.67228343, 0.4950862 , -0.21806968], 92 | # [ 0.53378114, 0.28794634, -0.46650197, -0.72699245], 93 | # [ 0.35432068, 0.68524337, 0.64350842, 0.09692107]]) 94 | # np.testing.assert_almost_equal(np.abs(test_ans),np.abs(lcpca.x_loadings_),decimal=3) 95 | 96 | 97 | if __name__ == "__main__": 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /src/direpack/test/test_sprm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jun 30 13:17:46 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | import unittest 9 | import pandas as ps 10 | import numpy as np 11 | from ..sprm.sprm import sprm 12 | from ..sprm.snipls import snipls 13 | from ..sprm.rm import rm 14 | 15 | 16 | class Testsprm(unittest.TestCase): 17 | """Test some methods in the sprm class""" 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | print("setupClass") 22 | 23 | @classmethod 24 | def tearDownClass(cls): 25 | print("teardownClass") 26 | 27 | def setUp(self): 28 | self.data = ps.read_csv("./data/Returns_shares.csv") 29 | self.datav = np.matrix(self.data.values[:, 2:8].astype("float64")) 30 | self.x = self.datav[:, 0:5] 31 | self.y = self.datav[:, 5] 32 | self.n = self.data.shape[0] 33 | self.p = self.data.shape[1] 34 | self.x0 = self.x.astype("float") 35 | self.y0 = self.y.astype("float") 36 | self.columns = self.data.columns[2:8] 37 | 38 | def tearDown(self): 39 | del self.x 40 | del self.y 41 | del self.n 42 | del self.p 43 | del self.x0 44 | del self.y0 45 | del self.data 46 | del self.datav 47 | 48 | def test_sprm(self): 49 | """Test the functioning of the sprm object""" 50 | 51 | res_sprm = sprm( 52 | 2, 53 | 0.8, 54 | "Hampel", 55 | 0.95, 56 | 0.975, 57 | 0.999, 58 | "kstepLTS", 59 | "scaleTau2", 60 | True, 61 | 100, 62 | 0.01, 63 | "ally", 64 | "xonly", 65 | self.columns, 66 | True, 67 | ) 68 | res_sprm.fit(self.x0[:2666], self.y0[:2666]) 69 | test_ans = 28.40453479240838 70 | np.testing.assert_almost_equal( 71 | np.linalg.norm(res_sprm.weightnewx(self.x0[2666:])), 72 | test_ans, 73 | decimal=4, 74 | ) 75 | 76 | def test_rm(self): 77 | """Test the functioning of the rm object""" 78 | 79 | res_rm = rm( 80 | "Hampel", 81 | 0.95, 82 | 0.975, 83 | 0.999, 84 | "median", 85 | "mad", 86 | "specific", 87 | True, 88 | 100, 89 | 0.01, 90 | True, 91 | ) 92 | res_rm.fit(self.x0[:2666], self.y0[:2666]) 93 | test_ans = 28.62510008113666 94 | np.testing.assert_almost_equal( 95 | np.linalg.norm(res_rm.predict(self.x0[2666:])), test_ans, decimal=4 96 | ) 97 | 98 | def test_snipls(self): 99 | """Test the functioning of the snipls object""" 100 | res_snipls = snipls(n_components=4, eta=0.5) 101 | res_snipls.fit(self.x0[:2666], self.y0[:2666]) 102 | test_ans = 38.6183244001568 103 | np.testing.assert_almost_equal( 104 | np.linalg.norm(res_snipls.predict(self.x0[2666:])), 105 | test_ans, 106 | decimal=4, 107 | ) 108 | self.x[0, 0] = np.nan 109 | res_snipls.fit(self.x0[:2666], self.y0[:2666]) 110 | np.testing.assert_almost_equal( 111 | np.linalg.norm(res_snipls.predict(self.x0[2666:])), 112 | test_ans, 113 | decimal=4, 114 | ) 115 | 116 | 117 | if __name__ == "__main__": 
"__main__":
118 | unittest.main() 119 | -------------------------------------------------------------------------------- /src/direpack/test/test_dicomo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jun 30 13:17:46 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | import unittest 9 | from ..dicomo.dicomo import dicomo 10 | import pandas as ps 11 | import numpy as np 12 | import statsmodels.robust as srs 13 | import scipy.stats as sps 14 | import dcor as dc 15 | 16 | class Testdicomo(unittest.TestCase): 17 | """ Test methods in the dicomo class""" 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | print('...setupClass') 22 | 23 | 24 | 25 | @classmethod 26 | def tearDownClass(cls): 27 | print('...teardownClass') 28 | 29 | 30 | @classmethod 31 | def setUp(self): 32 | self.data=ps.read_csv("./data/Returns_shares.csv") 33 | self.datav = np.array(self.data.values[:,2:8].astype('float64')) 34 | self.est = dicomo() 35 | self.x = self.datav[:,1] 36 | self.y = self.datav[:,0] 37 | self.n=self.data.shape[0] 38 | self.p = self.data.shape[1] 39 | 40 | 41 | 42 | @classmethod 43 | def tearDown(self): 44 | del self.est 45 | del self.x 46 | del self.y 47 | del self.n 48 | del self.p 49 | 50 | 51 | 52 | def test_mom(self): 53 | """ Tests functions to compute moments""" 54 | 55 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=False),np.var(self.x))# biased var 56 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=True),np.var(self.x)*self.n/(self.n-1))#unbiased var 57 | self.est.set_params(center='median') 58 | self.assertAlmostEquals(self.est.fit(self.x),srs.mad(self.x),places=4) 59 | self.est.set_params(center='mean') 60 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=False,order=3),sps.moment(self.x,3))#third moment 61 | self.est.set_params(mode='skew') 62 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=False),sps.skew(self.x))# skew without small sample corr 63 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=True),sps.skew(self.x,bias=False)) 64 | 65 | 66 | 67 | 68 | 69 | 70 | def test_como(self): 71 | """ Tests function to compute comomennts""" 72 | 73 | self.est.set_params(mode='com') 74 | self.assertAlmostEquals(self.est.fit(self.x,y=self.y,biascorr=True),self.data.iloc[:,2:4].cov().values[0,1])#covariance 75 | self.assertAlmostEquals(self.est.fit(self.x,y=self.y,biascorr=True,option=1,order=3),0.39009,places=4)#third order comoment 76 | self.est.set_params(mode='corr') 77 | self.assertAlmostEquals(self.est.fit(self.x,y=self.y),self.data.iloc[:,2:4].corr().values[0,1])#correlation 78 | self.est.set_params(mode='continuum') 79 | self.assertAlmostEquals(np.sqrt(self.est.fit(self.x,y=self.y,alpha=1,biascorr=True)),self.data.iloc[:,2:4].cov().values[0,1])#continuum 80 | 81 | 82 | def test_energy(self): 83 | """ Tests function to compute energy statistics""" 84 | 85 | self.est.set_params(est='distance',mode='var') 86 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=False),dc.distance_stats(self.x,self.x).covariance_xy) 87 | self.assertAlmostEquals(self.est.fit(self.x,biascorr=True),np.sqrt(dc.u_distance_stats_sqr(self.x,self.x).covariance_xy)) 88 | self.est.set_params(mode='com') 89 | self.assertAlmostEquals(self.est.fit(self.x,y=self.y,biascorr=False),dc.distance_covariance(self.x,self.y)) 90 | self.est.set_params(mode='mdd') 91 | self.assertAlmostEquals(self.est.fit(self.x,y=self.y,biascorr=False),0.352427150086) 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 
| 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | if __name__ =='__main__': 113 | unittest.main() 114 | 115 | 116 | -------------------------------------------------------------------------------- /src/direpack/preprocessing/_gsspp_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Mar 25 09:02:05 2020 5 | 6 | @author: Sven Serneels, Ponalytics. 7 | 8 | Code for radial transform functions largely adapted from 9 | R code by Jakob Raymaekers 10 | 11 | """ 12 | 13 | import numpy as np 14 | 15 | def quad(dd, p, n): 16 | """ 17 | Computes the quadratic radial function 18 | args: 19 | dd: vector of distances 20 | p: number of variables in original data 21 | n: number of rows in original data 22 | returns: 23 | xi: radial function 24 | """ 25 | d_hmed = np.sort(dd,axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 26 | idx = np.where(dd > d_hmed)[0] 27 | xi = np.ones((n,1)) 28 | xi[idx] = (1 / np.square(dd[idx])) * (d_hmed**2) 29 | return(xi) 30 | 31 | def ss(dd, p,*args,prec=1e-10): 32 | 33 | """ 34 | Computes the spatial sign radial function 35 | args: 36 | dd: vector of distances 37 | p: dimension of original data 38 | *args flag to be able to pass on n - has no effect 39 | returns: 40 | xi: radial function 41 | """ 42 | dd = np.maximum(dd,prec) 43 | xi = 1 / dd 44 | return(xi) 45 | 46 | def winsor(dd, p, n) : 47 | """ 48 | Computes the Winsor radial function 49 | args: 50 | dd: vector of distances 51 | p: number of variables in original data 52 | n: number of rows in original data 53 | returns: 54 | xi: radial function 55 | """ 56 | d_hmed = np.sort(dd,axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 57 | idx = np.where(dd > d_hmed)[0] 58 | xi = np.ones((n,1)) 59 | xi[idx] = (1 / dd[idx]) * d_hmed 60 | return(xi) 61 | 62 | def ball(dd, p, n): 63 | 64 | """ 65 | Computes the Ball radial function 66 | args: 67 | dd: vector of distances 68 | p: number of variables in original data 69 | n: number of rows in original data 70 | returns: 71 | xi: radial function 72 | """ 73 | 74 | dWH = np.power(dd,2/3) 75 | dWH_hmed = np.sort(dWH,axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 76 | d_hmed = np.power(dWH_hmed,3/2) 77 | idx = np.where(dd > d_hmed)[0] 78 | xi = np.ones((n,1)) 79 | xi[idx] = 0 80 | return(xi) 81 | 82 | 83 | def shell(dd, p, n) : 84 | """ 85 | Computes the Shell radial function 86 | args: 87 | dd: vector of distances 88 | p: number of variables in original data 89 | n: number of rows in original data 90 | returns: 91 | xi: radial function 92 | """ 93 | 94 | dWH = np.power(dd,2/3) 95 | dWH_hmed = np.sort(dWH,axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 96 | dWH_hmad = np.sort(np.abs(dWH - dWH_hmed),axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 97 | cutoff1 = np.power(np.maximum(0, dWH_hmed - dWH_hmad),3/2) 98 | cutoff2 = np.power(dWH_hmed + dWH_hmad,3/2) 99 | idxlow = np.where(dd < cutoff1)[0] 100 | idxhigh = np.where(dd > cutoff2)[0] 101 | xi = np.ones((n,1)) 102 | xi[idxlow] = 0 103 | xi[idxhigh] = 0 104 | return(xi) 105 | 106 | 107 | def linear_redescending(dd, p,n): 108 | """ 109 | # Computes the Linear redescending radial function 110 | args: 111 | dd: vector of distances 112 | p: number of variables in original data 113 | n: number of rows in original data 114 | returns: 115 | xi: radial function 116 | """ 117 | 118 | dWH = np.power(dd,2/3) 119 | dWH_hmed = np.sort(dWH,axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 120 | dWH_hmad = np.sort(np.abs(dWH - 
dWH_hmed),axis=0)[int(np.floor((n + p + 1) / 2))-1][0] 121 | d_hmed = dWH_hmed**(3/2) 122 | cutoff = (dWH_hmed + 1.4826 * dWH_hmad)**(3/2) 123 | idxmid = np.where(np.logical_and(dd > d_hmed,dd <= cutoff))[0] 124 | idxhigh = np.where(dd > cutoff)[0] 125 | xi = np.ones((n,1)) 126 | xi[idxmid] = 1 - (dd[idxmid,:] - d_hmed) / (cutoff - d_hmed) 127 | xi[idxhigh] = 0 128 | return(xi) 129 | 130 | 131 | def _norms(X,**kwargs): 132 | """ 133 | Casewise norms of a matrix 134 | """ 135 | return(np.linalg.norm(X,axis=1,keepdims=True,**kwargs)) 136 | 137 | 138 | def _gsspp(X,p,n,fun=ss): 139 | """ 140 | Generalized Spatial Sign Pre-Processing for Centred Data 141 | """ 142 | return(np.multiply(X,fun(_norms(X),p,n))) 143 | 144 | def _spatial_sign(X,**kwargs): 145 | """ 146 | Spatial Sign Pre-Processing for Centred Data 147 | """ 148 | return(X/_norms(X)) 149 | 150 | 151 | 152 | 153 | 154 | -------------------------------------------------------------------------------- /docs/Pre-processing.rst: -------------------------------------------------------------------------------- 1 | .. _Pre-processing: 2 | 3 | ################ 4 | Pre-processing 5 | ################ 6 | 7 | The first step in most meaningful data analytics projects will be to pre-process the data, hence direpack proposes a set of tools for data pre-processing. 8 | 9 | Data standardization 10 | ===================== 11 | 12 | A first, well accepted way to pre-process data is to center them and scale them to unit variance on a column wise basis. This corresponds to transforming a $\mathbf{x}$ variable into z-scores: 13 | 14 | .. math:: 15 | :nowrap: 16 | 17 | \begin{equation*} 18 | \mathbf{z} = \frac{\mathbf{x} - \hat{\boldsymbol{\mu}}}{\hat{\boldsymbol{\sigma}}} 19 | \end{equation*} 20 | 21 | where $\hat{\boldsymbol{\mu}}$ and $\hat{\boldsymbol{\sigma}}$ are estimates of location and scale, respectively. 22 | For normally distributed data, the appropriate way to accomplish this is by centering about the mean and dividing by the column wise standard deviation. 23 | However, when the marginal distributions in the data significantly deviate from the normal, outliers could throw the result of that data standardization off, and robust or nonparametric alternatives become a more reliable choice. 24 | Essentially, all robust statistics are subject to a trade-off between efficiency and robustness, which means that the variance of the estimates will increase as the estimator can resist a higher fraction of outliers. 25 | While scikit-learn provides highly robust nonparametric standardization in its RobustScaler, the estimators included therein are known to have a low statistical efficiency (these are the median for location and the interquartile range for scale). 26 | Since autoscaling the data is often an essential step, a few location and scale estimators have been implemented. For location, with increasing performance in terms of the robustness---efficiency trade-off, these are: the column wise median, the spatial median (also called $L_1$-median, although it minimizes an $L_2$ norm) and the $k$ step least trimmed squares (LTS, Rousseeuw and Leroy (1987)) estimator. 27 | For scale, the consistency corrected median absolute deviation (MAD) and the $\tau$ estimator of scale (Maronna and Zamar 2002) have been included. Generally, it holds true that the more statistically efficient the estimator in these lists is, the higher its computational cost. 
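As a quick illustration of why this matters, the following minimal sketch (using only ``numpy`` and ``scipy``, not direpack itself, on a hypothetical sample) standardizes data containing one gross outlier; the classical z-scores are distorted by that single case, while the median/MAD version is not:

.. code-block:: python

    import numpy as np
    from scipy.stats import median_abs_deviation

    rng = np.random.default_rng(0)
    # hypothetical univariate sample with one gross outlier appended
    x = np.append(rng.standard_normal(100), 50.0)

    # classical standardization: location = mean, scale = standard deviation
    z_classical = (x - x.mean()) / x.std()

    # robust standardization: location = median, scale = consistency-corrected MAD
    z_robust = (x - np.median(x)) / median_abs_deviation(x, scale="normal")

The contaminated case inflates the mean and the standard deviation, shrinking the classical z-scores of all regular observations, whereas the robust variant leaves them essentially untouched.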
In preprocessing, these estimators can be accessed through its VersatileScaler class, which takes the names of these estimators as strings, but it will also accept functions of location and scale estimators, should the user prefer to apply other ones. 28 | 29 | Spatial sign pre-processing 30 | ============================ 31 | Besides standardizing data, it can be beneficial to transform data to some sort of signs. The generalized spatial sign transformation consists of transforming a variable $\mathbf{x}$ into 32 | 33 | .. math:: 34 | :nowrap: 35 | 36 | \begin{equation*} 37 | \mathbf{u} = \left(\mathbf{x} - \hat{\boldsymbol{\mu}}\right) \times f\left(\mathbf{x} - \hat{\boldsymbol{\mu}}\right) 38 | \end{equation*} 39 | 40 | where the spatial sign is obtained by setting $f(x) = {\parallel x \parallel}^{-1}$ and $\parallel \cdot \parallel$ denotes the norm (in all published literature in this context, the $L_2$ norm). 41 | Since spatial sign pre-processing (SS-PP) consists of dividing the data by their Euclidean norm, it is also known as normalizing and as such, is available in scikit-learn's Normalizer. 42 | Spatial sign pre-processing has been shown to convey moderate robustness to multivariate estimators that are entirely based on covariance estimates, such as PCA or PLS (Serneels, De Nolf, and Van Espen 2006). 43 | Moderate robustness means in this case that the resulting estimator can resist up to 50% of outliers, but will have a sizeable bias even for small fractions of contamination. The reason why this happens 44 | is that the spatial sign transform projects all cases onto the unit sphere indiscriminately, which can drastically change data topology, and thereby introduce bias. Recently, the generalized spatial sign transform has been proposed (Raymaekers and Rousseeuw 2019). 45 | These authors examine a set of different functions that can be plugged into the expression for $\mathbf{u}$, some of which will only transform those cases in the data that exceed a certain eccentricity threshold. These functions are the quadratic radial, ball, shell, Winsor and linear redescending (LR) functions, all of which can be accessed through direpack’s GenSpatialSignPreprocessor. 46 | 47 | 48 | Usage 49 | ========= 50 | 51 | 52 | 53 | .. currentmodule:: direpack.preprocessing.robcent 54 | 55 | .. autosummary:: 56 | :toctree: generated/ 57 | 58 | VersatileScaler 59 | 60 | .. currentmodule:: direpack.preprocessing.gsspp 61 | 62 | .. autosummary:: 63 | :toctree: generated/ 64 | 65 | GenSpatialSignPreProcessor 66 | 67 | 68 | 69 | 70 | 71 | 72 | References 73 | ============== 74 | 75 | 1. Maronna RA, Zamar RH (2002). “Robust estimates of location and dispersion for high-dimensional datasets.” Technometrics, 44(4), 307–317. 76 | 2. Rousseeuw PJ, Leroy AM (1987). Robust Regression and Outlier Detection. Wiley and Sons, New York 77 | 3. Raymaekers J, Rousseeuw PJ (2019). “A generalized spatial sign covariance matrix.” Journal of Multivariate Analysis, 171, 94–111. 78 | 4. Serneels S, De Nolf E, Van Espen PJ (2006). “Spatial Sign Preprocessing: A Simple Way ToImpart Moderate Robustness to Multivariate Estimators.” Journal of Chemical Information and Modeling, 46, 1402–1409. -------------------------------------------------------------------------------- /docs/sudire.rst: -------------------------------------------------------------------------------- 1 | .. 
_sudire: 2 | 3 | 4 | 5 | ################ 6 | sudire 7 | ################ 8 | 9 | Sufficient dimension reduction (SDR) is a recent take on dimension reduction, where one aims to estimate a set of latent variables 10 | that are linear combinations of the original variables :math:`\mathbf{T} = \mathbf{X}\mathbf{W}` in such a way that the subspace spanned by them contains all information 11 | relevant to the dependent variable in such a way that the subspace spanned by them contains all information relevant to the dependent variable: 12 | :math:`\mathbf{Y} \upvDash \mathbf{X}\ | \ \mathbf{T}.` Here, $\mathbf{X}$ is a sample of $n$ cases of a $p$ variate random variable and $\mathbf{Y}$ 13 | is a sample of the dependent variable, $\mathbf{W}$ is a $p \times q$ matrix with $q \leq p$, and $\upvDash$ denotes statistical independence. 14 | A lot of research has been done over the last thirty years investigating different approaches in 15 | terms of asymptotics and assumptions made in each of the approaches. A good textbook 16 | providing an overview of approaches to SDR is Li (2018). The subpackage sudire contains 17 | implementations of a broad set of these approaches. 18 | 19 | Generally speaking, SDR techniques roughly resort in three categories. At first, there is a 20 | successful set of approaches to SDR based on slicing the original space. Examples of these are 21 | sliced inverse regression (SIR, Li (1991)) and sliced-average variance estimation (SAVE, Cook 22 | (2000)). A second group of developments has involved selective focus on certain directions, 23 | which has resulted in, among others, directional regression (DR, Li (2007)), principal Hessian 24 | directions (PHD, Li (1992)) and the iterative Hessian transformations (IHT, Cook and Li 25 | (2002)). 26 | 27 | While all of the aforementioned methods are included in sudire and would merit a broader 28 | discussion, at this point we would like to highlight that sudire contains implementations of a 29 | more recent approach as well. The latter has, so far, resulted in three methods, all three of 30 | which share the following advantages: they do not require conditions of linearity or constant 31 | covariance, nor do they need distributional assumptions, yet they may be computationally 32 | more demanding. This third group of SDR algorithms estimates a basis of the central subspace as: 33 | 34 | .. math:: 35 | :nowrap: 36 | 37 | \begin{equation*} 38 | \begin{aligned} 39 | & \mathbf{W}_h = \argmax_{\mathbf{B}} & & \mathfrak{P}^2\left(\mathbf{X}\mathbf{B},\mathbf{Y}\right) \\ 40 | & \text{subject to} & & \mathbf{B}^T\mathbf{X}^T\mathbf{X}\mathbf{B} = \mathbf{I}_h,\\ 41 | \end{aligned} 42 | \end{equation*} 43 | 44 | 45 | 46 | where $\mathbf{B}$ is an arbitrary $p \times h$ matrix, $h \in [1,\min(n,p)]$. Here, $\mathfrak{P}$ can be any statistic, that estimate a subspace whose complement 47 | is independent of $\mathbf{Y}$. Currently implemented $\mathfrak{P}$ statistics are : 48 | 49 | * distance covariance (Székely, Rizzo, and Bakirov 2007), leading to option dcov-sdr (Sheng and Yin 2016); 50 | * martingale difference divergence (Shao and Zhang 2014), leading to option mdd-sdr (Zhang, Liu, Wu, and Fang 2019); 51 | * ball covariance (Pan, Wang, Xiao, and Zhu 2019), leading to option bcov-sdr (Zhang and Chen 2019) 52 | 53 | 54 | 55 | 56 | 57 | 58 | Usage 59 | =========== 60 | 61 | .. currentmodule:: direpack.sudire.sudire 62 | 63 | .. 
autosummary:: 64 | :toctree: generated/ 65 | :caption: Sudire 66 | 67 | sudire 68 | 69 | 70 | 71 | 72 | 73 | 74 | Dependencies 75 | ================ 76 | 77 | 78 | - From `sklearn.base`: `BaseEstimator`,`TransformerMixin`,`RegressorMixin` 79 | - From `sklearn.utils`: `_BaseComposition` 80 | - `copy` 81 | - From `scipy.stats` : `trim_mean` 82 | - From `scipy.linalg`: `inv`, `sqrtm` 83 | - `cython` 84 | - From `ipopt` : `minimize_ipopt` 85 | - `numpy` 86 | - From `statsmodels.regression.linear_model`: `OLS` 87 | - `statsmodels.robust` 88 | 89 | References 90 | ========== 91 | 1. Wenhui Sheng and Xiangrong Yin Sufficient Dimension Reduction via Distance Covariance, in: Journal of Computational and Graphical Statistics (2016), 25, issue 1, pages 91-104. 92 | 2. Yu Zhang, Jicai Liu, Yuesong Wu and Xiangzhong Fang, A martingale-difference-divergence-based estimation of central mean subspace, in: Statistics and Its Interface (2019), 12, number 3, pages 489-501. 93 | 3. Li K-C, Sliced Inverse Regression for Dimension Reduction, Journal of the American Statistical Association (1991), 86, 316-327. 94 | 4. R.D. Cook, and Sanford Weisberg, Sliced Inverse Regression for Dimension Reduction: Comment, Journal of the American Statistical Association (1991), 86, 328-332. 95 | 5. B. Li and S.Wang, On directional regression for dimension reduction, Journal of the American Statistical Association (2007), 102:997–1008. 96 | 6. K.-C. Li., On principal hessian directions for data visualization and dimension reduction:Another application of stein’s lemma, Journal of the American Statistical Association(1992)., 87,1025–1039. 97 | 7. R. D. Cook and B. Li., Dimension Reduction for Conditional Mean in Regression, The Annals of Statistics(2002)30(2):455–474. 98 | 8. Jia Zhang and Xin Chen, Robust Sufficient Dimension Reduction Via Ball Covariance Computational Statistics and Data Analysis 140 (2019) 144–154 99 | 9. Li B, Sufficient Dimension Reduction: Methods and Applications with R. (2018) Chapman& Hall /CRC, Monographs on Statistics and Applied Probability, New York 100 | -------------------------------------------------------------------------------- /src/direpack/preprocessing/gsspp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # Created on Wed Mar 25 09:01:53 2020 5 | 6 | # @author: Sven Serneels, Ponalytics, Mar 2020. 7 | 8 | 9 | __all__ = ['GenSpatialSignPrePprocessor','gen_ss_pp','gen_ss_covmat'] 10 | 11 | from sklearn.base import BaseEstimator, TransformerMixin 12 | from sklearn.utils.validation import check_is_fitted 13 | from .robcent import VersatileScaler, versatile_scale 14 | from ._preproc_utilities import * 15 | from ..utils.utils import _check_input 16 | from ._gsspp_utils import * 17 | from ._gsspp_utils import _norms, _gsspp 18 | 19 | __all__ = ['GenSpatialSignPreProcessor', 'gen_ss_covmat', 'gen_ss_pp'] 20 | 21 | class GenSpatialSignPreProcessor(TransformerMixin,BaseEstimator): 22 | 23 | """ 24 | GenSpatialSignPreProcessor Generalized Spatial Sign Pre-Processing as a scikit-learn compatible object 25 | that can be used in ML pipelines. 
26 | 27 | Parameters 28 | ---------- 29 | center: str or function, 30 | location estimator for centring.str options: 'mean', 'median', 'l1median', 'kstepLTS', 'None' 31 | 32 | fun: str or function, 33 | radial transformation function, str options: 'ss' (the non-generalized spatial sign, equivalent to sklearn's Normalizer), 'ball', 'shell', 'quad' (quadratic), 'winsor', or 'linear_redescending' 34 | Methods: sklearn API: `fit(X)`, `transform(X)` and `fit_transform(X)` with 35 | 36 | 37 | Attributes 38 | ---------- 39 | Attributes always provided : 40 | 41 | - `gss_` : the generalized spatial signs 42 | - `Xm_` : the centred data 43 | - `centring_` : VersatileScaler centring object 44 | - `X_gss_pp_` : Data preprocessed by Generalized Spatial Sign 45 | """ 46 | 47 | def __init__(self,center='l1median',fun='linear_redescending'): 48 | 49 | self.center = center 50 | self.fun = fun 51 | 52 | def fit(self,X): 53 | 54 | """ 55 | Calculate and store generalized spatial signs 56 | """ 57 | 58 | X = _check_input(X) 59 | n,p = X.shape 60 | if type(self.fun) is str: 61 | fun = eval(self.fun) 62 | else: 63 | fun = self.fun 64 | vs = VersatileScaler(center=self.center,scale='None') 65 | Xm = vs.fit_transform(X) 66 | gss_ = fun(_norms(Xm),p,n) 67 | setattr(self,'gss_',gss_) 68 | setattr(self,'Xm_',Xm) 69 | setattr(self,'centring_',vs) 70 | 71 | def transform(self,X): 72 | 73 | """ 74 | Calculate Generalized Spatial Sign Pre-Pprocessed Data 75 | """ 76 | 77 | check_is_fitted(self,('gss_','Xm_')) 78 | Xgss = np.multiply(self.Xm_,self.gss_) 79 | setattr(self,'X_gsspp_',Xgss) 80 | return(Xgss) 81 | 82 | def fit_transform(self,X): 83 | 84 | self.fit(X) 85 | self.transform(X) 86 | return(self.X_gsspp_) 87 | 88 | 89 | 90 | 91 | def gen_ss_pp(X,center='l1median',fun='linear_redescending'): 92 | 93 | """ 94 | Generalized Spatial Sign Pre-Processing as a one pass function 95 | Inputs: 96 | X: Data matrix 97 | center: str or function, location estimator for centring. 98 | str options: 'mean', 'median', 'l1median', 'kstepLTS', 'None' 99 | fun: str or function, radial transformation function, 100 | str options: 'ss' (the non-generalized spatial sign, equivalent 101 | to sklearn's Normalizer), 'ball', 'shell', 'quad' (quadratic), 102 | 'winsor', or 'linear_redescending' 103 | Outputs: the pre-processed data 104 | """ 105 | 106 | if type(center) is str: 107 | center = eval(center) 108 | 109 | if type(fun) is str: 110 | fun = eval(fun) 111 | 112 | X = _check_input(X) 113 | n = X.shape 114 | if len(n) > 1: 115 | p = n[1] 116 | else: 117 | p = 1 118 | n = n[0] 119 | 120 | if center != 'None': 121 | X = versatile_scale(X,center=center,scale='None') 122 | 123 | return(_gsspp(X,p,n,fun=fun)) 124 | 125 | 126 | def gen_ss_covmat(X,center='kstepLTS',fun=linear_redescending): 127 | 128 | """ 129 | Generalized Spatial Sign Covariance Matrix 130 | Is equivalent to the covariance matrix of generalized spatial sign 131 | pre-processed data. 132 | 133 | First published in: 134 | A generalized spatial sign covariance matrix, 135 | Jakob Raymaekers, Peter Rousseeuw, 136 | Journal of Multivariate Analysis, 171 (2019), 94–111. 137 | 138 | Inputs: 139 | X: Data matrix 140 | center: str or function, location estimator for centring. 
141 | str options: 'mean', 'median', 'l1median', 'kstepLTS', 'None' 142 | fun: str or function, radial transformation function, 143 | str options: 'ss' (the non-generalized spatial sign, equivalent 144 | to sklearn's Normalizer), 'ball', 'shell', 'quad' (quadratic), 145 | 'winsor', or 'linear_redescending' 146 | 147 | Outputs: the generalized spatial sign covariance matrix 148 | """ 149 | 150 | X = _check_input(X) 151 | rc = VersatileScaler(center=center, scale='None') 152 | n,p = X.shape 153 | Xm = rc.fit_transform(X) 154 | Xgss = _gsspp(Xm,p,n,fun=fun) 155 | return(Xgss.T*Xgss/n) 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /docs/sprm.rst: -------------------------------------------------------------------------------- 1 | .. _sprm: 2 | 3 | ################ 4 | sprm 5 | ################ 6 | 7 | Sparse partial robust M regression (SPRM) is a sparse and robust alternative to PLS that can be calculated efficiently (Hoffmann, Serneels, Filzmoser,and Croux 2015). 8 | The subpackage is organized slightly differently from the other two mainsubpackages. Because SPRM combines the virtues of robust regression with sparse dimension reduction, 9 | besides the SPRM estimators itself, each of these building blocks are provided themselves as class objects that can be deployed in sklearn pipelines. 10 | The class objects rm, snipls and sprm are sourced by default when importing direpack. 11 | 12 | Robust M regression 13 | ===================== 14 | 15 | M regression is a generalization of least squares regression in the sense that it minimizes a more general objective that allows to tune the estimator's robustness. 16 | In M regression, the vector of regression coefficients is defined as: 17 | 18 | .. math:: 19 | :label: optim_rm 20 | :nowrap: 21 | 22 | \begin{equation*} 23 | \hat{\boldsymbol{\beta}} = \mathop{\mbox{argmin}}_{\boldsymbol{\beta}}\sum_i \rho\left(\frac{r_i(\boldsymbol{\beta})}{\hat{\sigma}}\right) 24 | \end{equation*} 25 | 26 | where $r_i$ are the casewise regression residuals and $\hat{\sigma}$ is a robust scale estimator thereof. The $\rho$ function defines the properties of the estimator. 27 | Identity to the least squares estimator is obtained if $\rho(r) = r^2$, but robustness can be introduced by taking a different function, 28 | for instance a function that is approximately quadratic for small (absolute) $r$, but increases more slowly than $r^2$ for larger values of $r$. 29 | Objective :eq:`optim_rm` can be solved numerically, but it is well known that its solution can equivalently be obtained through an iteratively reweighting least squares (IRLS), 30 | which is how it is implemented in sprm. In the package, the Fair, Huber or Hampel reweighting functions can be picked, which will lead to different robustness properties. 31 | 32 | 33 | 34 | Sparse NIPALS 35 | ===================== 36 | 37 | A second building block in the package is the SNIPLS algorithm. It is a sparse version of the NIPALS algorithm for PLS and as such, essentially a computationally efficient implementation of univariate sparse PLS. 38 | Again, the SNIPLS components are linear combinations of the original variables through a set of weighting vectors $\mathbf{w}_i$ that maximize: 39 | 40 | .. 
math:: 41 | :label: optim_snipls 42 | :nowrap: 43 | 44 | \begin{equation*} 45 | \begin{aligned} 46 | & \mathbf{w}_i &= \argmax_{\mathbf{a}} \mathop{\mbox{cov}^2}\left(\mathbf{a}^T\mathbf{X},\mathbf{y}\right) + \lambda \parallel\mathbf{a}\parallel_1 \\ 47 | & \text{subject to} & \mathbf{w}_i^T\mathbf{X}^T\mathbf{X}\mathbf{w}_j = 0 \mbox{ and } \parallel \mathbf{w}_i\parallel_2 = 1\\ 48 | \end{aligned} 49 | \end{equation*} 50 | 51 | 52 | which in sparse PLS is typically maximized through a surrogate formulation. However, in this case, the exact solution to Criterion :eq:`optim_snipls` can be obtained, 53 | which is what the SNIPLS algorithm builds upon. For details on the algorithm, the reader is referred to Hoffmann, Filzmoser, Serneels, and Varmuza (2016). 54 | At this point, remark that the SNIPLS algorithm has also become a key building block to analyze outlyingness (Debruyne, Höppner, Serneels,and Verdonck 2019). 55 | 56 | 57 | 58 | 59 | 60 | Sparse partial robust M 61 | ========================= 62 | 63 | Sparse partial robust M dimension reduction unites the benefits of SNIPLS and robust M estimation: it yields an efficient sparse PLS dimension reduction, while at the same time, 64 | it is robust against both leverage points and virtual outliers through robust M estimation. It is defined similarly as in :eq:`optim_snipls` but instead maximizing a weighted covariance, with case weights that depend on the data. 65 | Consistent with robust M estimation, it can be calculated through iteratively reweighting SNIPLS. SPRM improves upon the original reweighted PLS proposal by (i) yielding a sparse estimate, (ii) having a reweighting scheme as well as starting values that weight both in the score and residual spaces and (iii) by allowing different weight functions, the most tuneable one being the Hampel function. 66 | 67 | Usage 68 | =========== 69 | 70 | .. currentmodule:: direpack.sprm.sprm 71 | 72 | .. autosummary:: 73 | :toctree: generated/ 74 | :caption: SPRM 75 | 76 | sprm 77 | 78 | 79 | .. currentmodule:: direpack.sprm.snipls 80 | 81 | .. autosummary:: 82 | :toctree: generated/ 83 | :caption: SNIPLS 84 | 85 | snipls 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | Dependencies 96 | ================ 97 | 98 | - `pandas` 99 | - `numpy` 100 | 101 | 102 | References 103 | ================ 104 | 105 | 1. Irene Hoffmann, Sven Serneels, Peter Filzmoser, Christophe Croux, Sparse partial robust M regression, Chemometrics and Intelligent Laboratory Systems, 149 (2015), 50-59. 106 | 2. Sven Serneels, Christophe Croux, Peter Filzmoser, Pierre J. Van Espen, Partial robust M regression, Chemometrics and Intelligent Laboratory Systems, 79 (2005), 55-64. 107 | 3. Hoffmann I., P. Filzmoser, S. Serneels, K. Varmuza, Sparse and robust PLS for binary classification, Journal of Chemometrics, 30 (2016), 153-162. 108 | 4. Filzmoser P, Höppner S, Ortner I, Serneels S, Verdonck T. Cellwise robust M regression. Computational Statistics and Data Analysis,147 (2020). 
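Example
================

As a closing illustration of the estimators documented on this page, the sketch below fits the SNIPLS building block on synthetic data. The data are hypothetical; the keyword names ``n_components`` and ``eta`` follow the package's unit tests, and the import path mirrors the module names used in the Usage section above. SPRM exposes the same sklearn-style ``fit``/``predict`` interface, with additional arguments controlling the reweighting function and the robust centring and scaling.

.. code-block:: python

    import numpy as np
    from direpack.sprm.snipls import snipls

    rng = np.random.default_rng(1)
    X = rng.standard_normal((100, 8))                       # hypothetical predictors
    y = X[:, 0] - 2 * X[:, 1] + 0.1 * rng.standard_normal(100)

    # sparse NIPALS: two latent components, sparsity parameter eta
    sni = snipls(n_components=2, eta=0.5)
    sni.fit(X, y)
    y_hat = sni.predict(X)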
109 | -------------------------------------------------------------------------------- /src/direpack/test/test_sudire.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jun 30 13:17:46 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | import unittest 9 | import numpy as np 10 | import pandas as pd 11 | from direpack.sudire._sudire_utils import * 12 | from direpack import sudire 13 | from sklearn.model_selection import train_test_split 14 | 15 | 16 | class Testsudire(unittest.TestCase): 17 | """Test some methods in the sudire class""" 18 | 19 | @classmethod 20 | def setUpClass(cls): 21 | print("setupClass") 22 | 23 | @classmethod 24 | def tearDownClass(cls): 25 | print("teardownClass") 26 | 27 | def setUp(self): 28 | self.data = pd.read_csv("./data/boston_housing.csv") 29 | self.x = self.data 30 | self.y = self.x["MEDV"] 31 | self.x.drop("MEDV", axis=1, inplace=True) 32 | self.n = self.x.shape[0] 33 | self.p = self.x.shape[1] 34 | self.struct_dim = 2 35 | 36 | def tearDown(self): 37 | del self.x 38 | del self.y 39 | del self.n 40 | del self.p 41 | del self.data 42 | del self.struct_dim 43 | 44 | # def test_estimdim(self): 45 | # """ Tests the estimation of the central subspace via Bootstrap """ 46 | # 47 | # central_dim, diff_vec = estimate_structural_dim('dr',self.x_train.values,self.y_train.values , B=100, n_slices=4) 48 | # np.testing.assert_equal(central_dim,6) 49 | 50 | def test_sir(self): 51 | """Tests Sliced Inverse Regression""" 52 | 53 | # mod_auto = sudire('sir', center_data= True, scale_data=True,n_components=self.struct_dim) 54 | # mod_auto.fit(self.x_train.values, self.y_train.values) 55 | res_sir = SIR( 56 | self.x.values, 57 | self.y.values, 58 | 6, 59 | self.struct_dim, 60 | "continuous", 61 | True, 62 | True, 63 | ) 64 | test_ans = 3.9759408796493894 65 | np.testing.assert_almost_equal( 66 | np.linalg.norm(res_sir), test_ans, decimal=8 67 | ) 68 | 69 | def test_save(self): 70 | """Tests Sliced Average Variance Estimation""" 71 | 72 | # mod_auto = sudire('save', center_data= True, scale_data=True,n_components=self.struct_dim) 73 | # mod_auto.fit(self.x_train.values, self.y_train.values) 74 | res_save = SAVE( 75 | self.x.values, 76 | self.y.values, 77 | 6, 78 | self.struct_dim, 79 | "continuous", 80 | True, 81 | True, 82 | ) 83 | test_ans = 6.347895320407837 84 | np.testing.assert_almost_equal( 85 | np.linalg.norm(res_save), test_ans, decimal=8 86 | ) 87 | 88 | def test_dr(self): 89 | """Tests Directional Regression""" 90 | 91 | # mod_auto = sudire('dr', center_data= True, scale_data=True,n_components=self.struct_dim) 92 | # mod_auto.fit(self.x_train.values, self.y_train.values) 93 | res_dr = DR( 94 | self.x.values, 95 | self.y.values, 96 | 6, 97 | self.struct_dim, 98 | "continuous", 99 | True, 100 | True, 101 | ) 102 | test_ans = 4.013789664544885 103 | np.testing.assert_almost_equal( 104 | np.linalg.norm(res_dr), test_ans, decimal=8 105 | ) 106 | 107 | def test_iht(self): 108 | """Tests Iterative Hessian Transformations""" 109 | 110 | # mod_auto = sudire('iht', center_data= True, scale_data=True,n_components=self.struct_dim) 111 | # mod_auto.fit(self.x_train.values, self.y_train.values) 112 | res_iht = IHT( 113 | self.x.values, self.y.values, self.struct_dim, True, True 114 | ) 115 | # local linux -- resolve platform sensitivity!! 
116 | # test_ans = 0.22443355 117 | test_ans = 1.68656340 118 | np.testing.assert_almost_equal( 119 | np.linalg.norm(res_iht), test_ans, decimal=8 120 | ) 121 | 122 | def test_phd(self): 123 | """Tests Principal Hessian Directions""" 124 | 125 | # mod_auto = sudire('phd', center_data= True, scale_data=True,n_components=self.struct_dim) 126 | # mod_auto.fit(self.x_train.values, self.y_train.values) 127 | res_phd = PHD( 128 | self.x.values, self.y.values, self.struct_dim, True, True 129 | ) 130 | test_ans = 3.2904239864118763 131 | np.testing.assert_almost_equal( 132 | np.linalg.norm(res_phd), test_ans, decimal=8 133 | ) 134 | 135 | def test_dcov(self): 136 | """Test DCOV based SDR""" 137 | 138 | mod_auto = sudire( 139 | "dcov-sdr", 140 | center_data=True, 141 | scale_data=True, 142 | n_components=self.struct_dim, 143 | ) 144 | mod_auto.fit(self.x.values, self.y.values) 145 | test_ans = 1.4628980331787338 146 | np.testing.assert_almost_equal( 147 | np.linalg.norm(mod_auto.x_loadings_), test_ans, decimal=5 148 | ) 149 | 150 | def test_mdd(self): 151 | 152 | """Test MDD based SDR""" 153 | mod_auto = sudire( 154 | "mdd-sdr", 155 | center_data=True, 156 | scale_data=True, 157 | n_components=self.struct_dim, 158 | ) 159 | mod_auto.fit(self.x.values, self.y.values) 160 | test_ans = 3.5752717342726803 161 | np.testing.assert_almost_equal( 162 | np.linalg.norm(mod_auto.x_loadings_), test_ans, decimal=5 163 | ) 164 | 165 | 166 | if __name__ == "__main__": 167 | unittest.main() 168 | -------------------------------------------------------------------------------- /docs/sudire.md: -------------------------------------------------------------------------------- 1 | Sufficient Dimension Reduction 2 | ====================================== 3 | 4 | A `scikit-learn` compatible Python 3 package for Sufficient Dimension Reduction. 5 | This class implements a set of methods to perform Sufficient Dimension Reduction . 6 | 7 | Description 8 | ----------- 9 | 10 | Sufficient Dimension Reduction(SDR) is a general framework which aims to capture all the relevant information in high dimensional data. This capture of information is based on the notion that a combination of the predictors provides all the relevant information on the response, so that the rest of the predictors can be ignored. 11 | 12 | The different SDR methods implemented in this class are : 13 | - `dcov-sdr` : Sufficient Dimension Reduction via Distance Covariance 14 | - `mdd-sdr ` : Sufficient Dimension Reduction via Martingale Difference Divergence 15 | - `sir` : Sliced Inverse Regression 16 | - `save`: Sliced Average Variance Estimation 17 | - ` dr` : Directional Regression 18 | - ` phd ` : Principal Hessian Directions 19 | - `iht` : Iterative Hessian Transformation 20 | 21 | User defined functions can also be maximised by the method explained in \[1\]. For more details on how to use the implemented SDR methods and how to use user defined functions, have a look at the [sudire example notebook]() 22 | 23 | The `sudire` class also allows for estimation of the central subspace by optimizing an objective function . The optimization is performed using the Interior Point Optimizer (IPOPT) which is part of the [COIN-OR project](https://coin-or.github.io/Ipopt/) 24 | 25 | Remarks: 26 | - all the methods contained in this package have been designed for continuous data. Categorical or textual data first needs to be one hot encoded or embedded. 
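For a quick start, a minimal usage sketch is given below (the data are hypothetical; the options shown mirror the package's unit tests, and fitting the `dcov-sdr` or `mdd-sdr` objectives requires the IPOPT dependency listed further down):

```python
import numpy as np
from direpack import sudire

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 6))                       # hypothetical predictors
y = (X[:, 0] + X[:, 1]) ** 2 + 0.1 * rng.standard_normal(200)

# estimate a 2-dimensional central subspace via distance covariance
sd = sudire("dcov-sdr", center_data=True, scale_data=True, n_components=2)
sd.fit(X, y)

B = sd.x_loadings_        # estimated basis of the central subspace
T = sd.x_scores_          # projected (training) data
T_new = sd.transform(X)   # project cases onto the estimated subspace
```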
27 | 28 | The code is aligned to `scikit-learn`, such that modules such as `GridSearchCV` can flawlessly be applied to it. 29 | 30 | The `sudire` folder contains 31 | - The estimator (`sudire.py`) 32 | - Plotting functions for fitted sudire objects (`sudire_plot.py`) 33 | - Ancillary functions for sufficient dimension reduction (`_sudire_utils.py`) 34 | 35 | The sudire class 36 | ================ 37 | 38 | Dependencies 39 | ------------ 40 | - From `sklearn.base`: `BaseEstimator`,`TransformerMixin`,`RegressorMixin` 41 | - From `sklearn.utils`: `_BaseComposition` 42 | - `copy` 43 | - From `scipy.stats` : `trim_mean` 44 | - From `scipy.linalg`: `inv`, `sqrtm` 45 | - `cython` 46 | - From `ipopt` : `minimize_ipopt` 47 | - `numpy` 48 | - From `statsmodels.regression.linear_model`: `OLS` 49 | - `statsmodels.robust` 50 | 51 | 52 | 53 | Parameters 54 | ---------- 55 | - `sufdirmeth`, function or string. one of the elements in the list of implemented SDR methods 56 | or user defined function. 57 | - `n_components`, int. dimension of the central subspace. 58 | - `trimming`, float. trimming percentage for projection index, to be entered as pct/100 59 | - `optimizer_options`: dict with options to pass on to the ipopt optimizer. 60 | * `tol`: int: relative convergence tolerance. 61 | * `max_iter`: int. Maximal number of iterations. 62 | * `constr_viol_tol` : Desired threshold for the constraint violation. 63 | - `optimizer_constraints`: dict or list of dicts, further constraints to be 64 | passed on to the optimizer function. 65 | - `center`, str. How to center the data. options accepted are options from 66 | `direpack`'s `VersatileScaler`. 67 | - `center_data`, bool. 68 | - `scale_data`, bool. Note: if set to `False`, convergence to correct optimum 69 | is not a given. Will throw a warning. 70 | - `whiten_data`, bool. 71 | - `compression`, bool. If `True`, an internal SVD compression step is used for 72 | flat data tables (p > n). Speds up the calculations. 73 | - `copy`, bool. Whether to make a deep copy of the input data or not. 74 | - `verbose`, bool. Set to `True` prints the iteration number. 75 | - `return_scaling_object`, bool. 76 | Note: parameters concerning the data can also be passed to the `fit` method. 77 | 78 | Attributes 79 | ---------- 80 | Attributes always provided 81 | - `x_loadings_`: Estimated basis of the central subsapce 82 | - `x_scores_`: The projected X data. 83 | - `x_loc_`: location estimate for X 84 | - `x_sca_`: scale estimate for X 85 | - ` ols_obj` : fitted OLS objected 86 | - `y_loc_`: y location estimate 87 | - `y_sca_`: y scale estimate 88 | 89 | Attributes created only when corresponding input flags are `True`: 90 | - `whitening_`: whitened data matrix (usually denoted K) 91 | - `scaling_object_`: scaling object from `VersatileScaler` 92 | 93 | 94 | Methods 95 | -------- 96 | - `fit(X, *args, **kwargs)`: fit model 97 | - `predict(X)`: make predictions based on fit 98 | - `transform(X)`: project X onto latent space 99 | - `getattr()`: get list of attributes 100 | - `setattr(*kwargs)`: set individual attribute of sprm object 101 | 102 | The `fit` function takes several optional input arguments for user defined objective functions. 103 | 104 | 105 | 106 | 107 | 108 | References 109 | ---------- 110 | 1. [Sufficient Dimension Reduction via Distance Covariance](https://doi.org/10.1080/10618600.2015.1026601), Wenhui Sheng and Xiangrong Yin in: Journal of Computational and Graphical Statistics (2016), 25, issue 1, pages 91-104. 111 | 2. 
[A martingale-difference-divergence-based estimation of central mean subspace](https://dx.doi.org/10.4310/19-SII562), Yu Zhang, Jicai Liu, Yuesong Wu and Xiangzhong Fang, in: Statistics and Its Interface (2019), 12, number 3, pages 489-501. 112 | 3. [Sliced Inverse Regression for Dimension Reduction](https://www.tandfonline.com/doi/abs/10.1080/01621459.1991.10475035) Li K-C, Journal of the American Statistical Association (1991), 86, 316-327. 113 | 4. [Sliced Inverse Regression for Dimension Reduction: Comment](https://www.jstor.org/stable/2290564?seq=1#metadata_info_tab_contents), R.D. Cook, and Sanford Weisberg, Journal of the American Statistical Association (1991), 86, 328-332. 114 | 5. [On directional regression for dimension reduction](https://doi.org/10.1198/016214507000000536) , B. Li and S.Wang, Journal of the American Statistical Association (2007), 102:997–1008. 115 | 6. [On principal hessian directions for data visualization and dimension reduction:Another application of stein’s lemma](https://www.tandfonline.com/doi/abs/10.1080/01621459.1992.10476258), K.-C. Li. , Journal of the American Statistical Association(1992)., 87,1025–1039. 116 | 7. [Dimension Reduction for Conditional Mean in Regression](https://pdfs.semanticscholar.org/fd99/4f0cd554790eb8e0449440a59dcd47cf3396.pdf), R. D. Cook and B. Li., The Annals of Statistics(2002)30(2):455–474. 117 | 8. [Robust Sufficient Dimension Reduction Via Ball Covariance](https://www.sciencedirect.com/science/article/pii/S0167947319301380) Jia Zhang and Xin Chen, Computational Statistics and Data Analysis 140 (2019) 144–154 118 | -------------------------------------------------------------------------------- /src/direpack/plot/sudire_plot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 11 17:25:42 2020 4 | 5 | @author: Emmanuel Jordy Menvouta 6 | """ 7 | 8 | from __future__ import absolute_import, division, print_function 9 | from __future__ import unicode_literals 10 | 11 | 12 | 13 | from ..sudire.sudire import sudire 14 | from ..utils.utils import MyException 15 | import matplotlib.pyplot as pp 16 | import numpy as np 17 | 18 | 19 | class sudire_plot(sudire): 20 | 21 | def __init__(self,res_sudire,colors,markers=['o','d','v'],*args): 22 | """ 23 | Initialize with 24 | res_sudire, a sudire class object 25 | 26 | Only mandatory input is colors, a list of colors for 27 | [0] borders of pane 28 | [1] plot background 29 | [2] marker fill 30 | [3] diagonal line 31 | [4] marker contour, if different from fill 32 | [5] marker color for new cases, if applicable 33 | 34 | """ 35 | if not(isinstance(res_sudire,sudire)): 36 | raise(MyException("Object supplied to sudireplot needs to be a sudire object")) 37 | self.res_sudire = res_sudire 38 | self.colors = colors 39 | self.markers = markers 40 | 41 | def plot_yyp(self,ytruev=[],Xn=[],label=[],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False): 42 | """ 43 | plot_yyp will plot y vs y predicted for sudire M opbjects 44 | Optional inputs: 45 | ytruev: array (new_cases,) of predictands 46 | Xn: array (new_cases,variables) of predictors 47 | If these arguments are supplied, sudire predictions for ytrue will be 48 | made from Xn through res_sudire.predict() 49 | label: string: name of variable to be plotted. Will show in legend. 
50 | names: list or tuple of strings, casenames from training set 51 | namesv: list or tuple of strings, casenames from test set 52 | title: String containing plot title 53 | legend_pos: string containing legend position 54 | onlyval: boolean: only plot validation cases 55 | """ 56 | 57 | if len(label)==0: 58 | label = 'none' 59 | fig = pp.figure() 60 | fig.set_facecolor(self.colors[0]) 61 | pp.rcParams['axes.facecolor'] = self.colors[1] 62 | ax1 = fig.add_subplot(111) 63 | if (not(onlyval)): 64 | ytruec = self.res_sudire.y0 65 | if len(ytruec.shape) >1: 66 | ytruec = np.array(ytruec).reshape(-1).astype('float64') 67 | ypredc = np.array(self.res_sudire.ols_obj_.fittedvalues).T.reshape(-1) 68 | 69 | ax1.scatter(ytruec, ypredc, c=self.colors[2], label=label, 70 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 71 | pp.xlabel("y-true") 72 | pp.ylabel("y-pred") 73 | 74 | else: 75 | if (len(Xn)==0): 76 | ValueError('In onlyval=True mode, new cases Xn need to be provided') 77 | if not(len(Xn)==0): 78 | if len(ytruev.shape) >1: 79 | ytruev = np.array(ytruev).reshape(-1).astype('float64') 80 | ypredv = self.res_sudire.predict(Xn) 81 | ypredv = np.array(ypredv).reshape(-1).astype('float64') 82 | 83 | ax1.scatter(ytruev,ypredv,c=self.colors[5],label=label, 84 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 85 | pp.xlabel("y-true") 86 | pp.ylabel("y-pred") 87 | 88 | x_abline = np.array(ax1.get_xbound()) 89 | ax1.add_line(pp.Line2D(x_abline,x_abline,color=self.colors[3])) 90 | if len(label)==0: 91 | ax1.legend_.remove() 92 | else: 93 | pp.legend(loc=legend_pos) 94 | if len(names)>0: 95 | if not(onlyval): 96 | for i in range(0,len(names)-1): 97 | ax1.annotate(names[i], (ytruec[i],ypredc[i])) 98 | if len(namesv)>0: 99 | for i in range(0,len(namesv)-1): 100 | ax1.annotate(namesv[i], (ytruev[i],ypredv[i])) 101 | if len(title)>0: 102 | pp.title(title) 103 | pp.show() 104 | 105 | def plot_projections(self,Xn=[],label=[],components = [0,1],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False): 106 | 107 | """ 108 | plot_projections will plot the score space 109 | Optional inputs: 110 | Xn: array (new_cases,variables) of predictors 111 | If supplied, sudire projections for new cases will be 112 | made from Xn through res_sudire.transform() 113 | label: string: name of variable to be plotted. Will show in legend. 
114 | names: list or tuple of strings, casenames from training set 115 | namesv: list or tuple of strings, casenames from test set 116 | title: String containing plot title 117 | legend_pos: string containing legend position 118 | onlyval: boolean: only plot validation cases 119 | """ 120 | 121 | if len(label)==0: 122 | label = 'none' 123 | fig = pp.figure() 124 | fig.set_facecolor(self.colors[0]) 125 | pp.rcParams['axes.facecolor'] = self.colors[1] 126 | ax1 = fig.add_subplot(111) 127 | if (not(onlyval)): 128 | Tc = np.array(self.res_sudire.x_scores_) 129 | ax1.scatter(Tc[:,components[0]], Tc[:,components[1]], c=self.colors[2], label=label, 130 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 131 | else: 132 | if (len(Xn)==0): 133 | ValueError('In onlyval=True mode, new cases Xn need to be provided') 134 | if not(len(Xn)==0): 135 | Tv = np.array(self.res_sudire.transform(Xn)) 136 | ax1.scatter(Tv[:,components[0]], Tv[:,components[1]],c=self.colors[5],label=label, 137 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 138 | if len(label)==0: 139 | ax1.legend_.remove() 140 | else: 141 | pp.legend(loc=legend_pos) 142 | if len(names)>0: 143 | if not(onlyval): 144 | for i in range(0,len(names)-1): 145 | ax1.annotate(names[i], (Tc[i,components[0]], Tc[i,components[1]])) 146 | if len(namesv)>0: 147 | for i in range(0,len(namesv)-1): 148 | ax1.annotate(namesv[i], (Tv[i,components[0]], Tv[i,components[1]])) 149 | if len(title)>0: 150 | pp.title(title) 151 | pp.show() 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /src/direpack/ppdire/_ppdire_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Jan 2 2020 5 | 6 | @author: Sven Serneels, Ponalytics. 7 | """ 8 | 9 | import numpy as np 10 | import pandas as ps 11 | 12 | 13 | def pp_objective(x,est,X,opt_args): 14 | 15 | """ 16 | Optimization objective for ppdire 17 | 18 | """ 19 | 20 | n = len(x) 21 | x = np.array(x).reshape((n,1)) 22 | return(-est.fit(np.matmul(X,x),**opt_args)) 23 | 24 | def gridplane(X,most,pi_arguments={},**kwargs): 25 | 26 | """ 27 | Function for grid search in a plane in two dimensions 28 | 29 | Required: X, np.array(n,2), data, 30 | most, class object, projection index. Designed for 31 | dicomo or capi classes. 32 | Optional: pi_arguments, dict: arguments to pass on to projection index, 33 | plus a few local arguments such as optrange and square_pi 34 | (see ppdire for explanation) 35 | 36 | y, np.array(n,1), second block of data 37 | biascorr, to apply bias correction at normal distribution 38 | alphamat, np.array: matrix of alpha angles to be scanned. 
39 | 40 | 41 | Values: 42 | wi, np.array(p,1): optimal direction 43 | maximo, float: optimal value of projection index 44 | 45 | Note: this function is written exclusively to be called from within the ppdire class 46 | 47 | """ 48 | 49 | 50 | if (('biascorr' not in kwargs) and ('biascorr' not in pi_arguments)): 51 | biascorr = False 52 | else: 53 | biascorr = kwargs.get('biascorr') 54 | 55 | if len(pi_arguments) == 0: 56 | 57 | pi_arguments = { 58 | 'alpha': 0, 59 | 'ndir': 1000, 60 | 'trimming': 0, 61 | 'biascorr': biascorr, 62 | 'dmetric' : 'euclidean', 63 | 'alphamat': None, 64 | 'optrange': (-1,1), 65 | 'square_pi': False 66 | } 67 | 68 | if ('y' in kwargs): 69 | y = kwargs.pop('y') 70 | pi_arguments['y'] = y 71 | 72 | optrange = pi_arguments['optrange'] 73 | optmax = optrange[1] 74 | 75 | alphamat = kwargs.pop('alphamat',pi_arguments['alphamat']) 76 | if (alphamat != None): 77 | optrange = np.sign(optrange) 78 | stop0s = np.arcsin(optrange[0]) 79 | stop1s = np.arcsin(optrange[1]) 80 | stop1c = np.arccos(optrange[0]) 81 | stop0c = np.arccos(optrange[1]) 82 | anglestart = max(stop0c,stop0s) 83 | anglestop = max(stop1c,stop1s) 84 | nangle = np.linspace(anglestart,anglestop,pi_arguments['ndir'],endpoint=False) 85 | alphamat = np.array([np.cos(nangle), np.sin(nangle)]) 86 | if optmax != 1: 87 | alphamat *= optmax 88 | 89 | tj = np.matmul(X,alphamat) 90 | if pi_arguments['square_pi']: 91 | meas = [most.fit(tj[:,i],**pi_arguments)**2 92 | for i in np.arange(0,pi_arguments['ndir'])] 93 | else: 94 | meas = [most.fit(tj[:,i],**pi_arguments) 95 | for i in np.arange(0,pi_arguments['ndir'])] 96 | 97 | maximo = np.max(meas) 98 | indmax = np.where(meas == maximo)[0] 99 | if len(indmax)>0: 100 | indmax = indmax[0] 101 | wi = np.array(alphamat[:,indmax]).reshape((2,1)) 102 | 103 | return(wi,maximo) 104 | 105 | 106 | 107 | def gridplane_2(X,most,q,div,pi_arguments={},**kwargs): 108 | 109 | """ 110 | Function for refining a grid search in a plane in two dimensions 111 | 112 | Required: X, np.array(n,2), data 113 | most, class object, projection index. Designed for 114 | dicomo or capi classes. 115 | q, np.array(1,1), last obtained suboptimal direction component 116 | div, float, number of subsegments to divide angle into 117 | 118 | Optional: pi_arguments, dict: arguments to pass on to projection index, 119 | plus a few local arguments such as optrange and square_pi 120 | (see ppdire for explanation) 121 | 122 | y, np.array(n,1), second block of data 123 | biascorr, to apply bias correction at normal distribution 124 | alphamat, np.array: matrix of alpha angles to be scanned. 
125 | 126 | pi_arguments is a dict of arguments passed on to the projection index 127 | 128 | Values: 129 | wi, np.array(p,1): optimal direction 130 | maximo, float: optimal value of projection index 131 | 132 | Note: this function is written to be called from within the ppdire class 133 | 134 | """ 135 | 136 | if (('biascorr' not in kwargs) and ('biascorr' not in pi_arguments)): 137 | biascorr = False 138 | else: 139 | biascorr = kwargs.get('biascorr') 140 | 141 | if len(pi_arguments) == 0: 142 | 143 | pi_arguments = { 144 | 'alpha': 0, 145 | 'ndir': 1000, 146 | 'trimming': 0, 147 | 'biascorr': biascorr, 148 | 'dmetric' : 'euclidean', 149 | 'alphamat': None, 150 | 'optrange': (-1,1), 151 | 'square_pi': False 152 | } 153 | 154 | 155 | if 'y' in kwargs: 156 | y = kwargs.pop('y') 157 | pi_arguments['y'] = y 158 | 159 | optrange = pi_arguments['optrange'] 160 | optmax = optrange[1] 161 | 162 | alphamat = kwargs.pop('alphamat',pi_arguments['alphamat']) 163 | if (alphamat != None).any(): 164 | anglestart = min(pi_arguments['_stop0c'],pi_arguments['_stop0s']) 165 | anglestop = min(pi_arguments['_stop1c'],pi_arguments['_stop1s']) 166 | nangle = np.linspace(anglestart,anglestop,pi_arguments['ndir'],endpoint=True) 167 | alphamat = np.array([np.cos(nangle), np.sin(nangle)]) 168 | if optmax != 1: 169 | alphamat *= optmax 170 | alpha1 = alphamat 171 | divisor = np.sqrt(1 + 2*np.multiply(alphamat[0,:].reshape(1,-1),alphamat[1,:].reshape(1,-1))*q[0]) 172 | alpha1 = np.divide(alphamat,np.repeat(divisor,2,0)) 173 | tj = np.dot(X,alpha1) 174 | 175 | if pi_arguments['square_pi']: 176 | meas = [most.fit(tj[:,i],**pi_arguments)**2 177 | for i in np.arange(0,pi_arguments['ndir'])] 178 | else: 179 | meas = [most.fit(tj[:,i],**pi_arguments) 180 | for i in np.arange(0,pi_arguments['ndir'])] 181 | 182 | maximo = np.max(meas) 183 | indmax = np.where(meas == maximo)[0] 184 | if len(indmax)>0: 185 | indmax = indmax[0] 186 | wi = np.array(alpha1[:,indmax]).reshape((2,1)) 187 | 188 | return(wi,maximo) 189 | -------------------------------------------------------------------------------- /src/direpack/plot/ppdire_plot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 11 17:25:42 2020 4 | 5 | @author: Emmanuel Jordy Menvouta & Sven Serneels 6 | """ 7 | 8 | from __future__ import absolute_import, division, print_function 9 | from __future__ import unicode_literals 10 | 11 | 12 | 13 | from ..ppdire.ppdire import ppdire 14 | from ..utils.utils import MyException 15 | import matplotlib.pyplot as pp 16 | import numpy as np 17 | 18 | 19 | class ppdire_plot(ppdire): 20 | 21 | def __init__(self,res_ppdire,colors,markers=['o','d','v'],*args): 22 | """ 23 | Initialize with 24 | res_ppdire, a ppdire class object 25 | 26 | Only mandatory input is colors, a list of colors for 27 | [0] borders of pane 28 | [1] plot background 29 | [2] marker fill 30 | [3] diagonal line 31 | [4] marker contour, if different from fill 32 | [5] marker color for new cases, if applicable 33 | 34 | """ 35 | if not(isinstance(res_ppdire,ppdire)): 36 | raise(MyException("Object supplied to ppdireplot needs to be a ppdire object")) 37 | self.res_ppdire = res_ppdire 38 | self.colors = colors 39 | self.markers = markers 40 | 41 | def plot_yyp(self,ytruev=[],Xn=[],label=[],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False): 42 | """ 43 | plot_yyp will plot y vs y predicted for ppdire M opbjects 44 | Optional inputs: 45 | ytruev: array (new_cases,) of 
predictands 46 | Xn: array (new_cases,variables) of predictors 47 | If these arguments are supplied, ppdire predictions for ytrue will be 48 | made from Xn through res_ppdire.predict() 49 | label: string: name of variable to be plotted. Will show in legend. 50 | names: list or tuple of strings, casenames from training set 51 | namesv: list or tuple of strings, casenames from test set 52 | title: String containing plot title 53 | legend_pos: string containing legend position 54 | onlyval: boolean: only plot validation cases 55 | """ 56 | 57 | if len(label)==0: 58 | label = 'none' 59 | fig = pp.figure() 60 | fig.set_facecolor(self.colors[0]) 61 | pp.rcParams['axes.facecolor'] = self.colors[1] 62 | ax1 = fig.add_subplot(111) 63 | if (not(onlyval)): 64 | ytruec = self.res_ppdire.y0 65 | if len(ytruec.shape) >1: 66 | ytruec = np.array(ytruec).reshape(-1).astype('float64') 67 | ypredc = np.array(self.res_ppdire.fitted_).T.reshape(-1) 68 | labelcr = label + ' Training' 69 | ax1.scatter(ytruec, ypredc, c=self.colors[2], label=labelcr, 70 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 71 | pp.xlabel("y-true") 72 | pp.ylabel("y-pred") 73 | 74 | else: 75 | if (len(Xn)==0): 76 | ValueError('In onlyval=True mode, new cases Xn need to be provided') 77 | if not(len(Xn)==0): 78 | if len(ytruev.shape) >1: 79 | ytruev = np.array(ytruev).reshape(-1).astype('float64') 80 | ypredv = self.res_ppdire.predict(Xn) 81 | ypredv = np.array(ypredv).reshape(-1).astype('float64') 82 | labelvr = label + ' Test' 83 | ax1.scatter(ytruev,ypredv,c=self.colors[5],label=labelvr, 84 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 85 | pp.xlabel("y-true") 86 | pp.ylabel("y-pred") 87 | 88 | x_abline = np.array(ax1.get_xbound()) 89 | ax1.add_line(pp.Line2D(x_abline,x_abline,color=self.colors[3])) 90 | if len(label)==0: 91 | ax1.legend_.remove() 92 | else: 93 | pp.legend(loc=legend_pos) 94 | if len(names)>0: 95 | if not(onlyval): 96 | for i in range(0,len(names)-1): 97 | ax1.annotate(names[i], (ytruec[i],ypredc[i])) 98 | if len(namesv)>0: 99 | for i in range(0,len(namesv)-1): 100 | ax1.annotate(namesv[i], (ytruev[i],ypredv[i])) 101 | if len(title)>0: 102 | pp.title(title) 103 | pp.show() 104 | 105 | def plot_projections(self,Xn=[],label=[],components = [0,1],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False): 106 | 107 | """ 108 | plot_projections will plot the score space 109 | Optional inputs: 110 | Xn: array (new_cases,variables) of predictors 111 | If supplied, ppdire projections for new cases will be 112 | made from Xn through res_ppdire.transform() 113 | label: string: name of variable to be plotted. Will show in legend. 
114 | names: list or tuple of strings, casenames from training set 115 | namesv: list or tuple of strings, casenames from test set 116 | title: String containing plot title 117 | legend_pos: string containing legend position 118 | onlyval: boolean: only plot validation cases 119 | """ 120 | 121 | if len(label)==0: 122 | label = 'none' 123 | fig = pp.figure() 124 | fig.set_facecolor(self.colors[0]) 125 | pp.rcParams['axes.facecolor'] = self.colors[1] 126 | ax1 = fig.add_subplot(111) 127 | if (not(onlyval)): 128 | Tc = np.array(self.res_ppdire.x_scores_) 129 | labelcr = label + ' Training' 130 | ax1.scatter(Tc[:,components[0]], Tc[:,components[1]], c=self.colors[2], label=labelcr, 131 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 132 | else: 133 | if (len(Xn)==0): 134 | ValueError('In onlyval=True mode, new cases Xn need to be provided') 135 | if not(len(Xn)==0): 136 | Tv = np.array(self.res_ppdire.transform(Xn)) 137 | labelvr = label + ' Test' 138 | ax1.scatter(Tv[:,components[0]], Tv[:,components[1]],c=self.colors[5],label=labelvr, 139 | zorder=1,edgecolors=self.colors[4],marker=self.markers[0]) 140 | if len(label)==0: 141 | ax1.legend_.remove() 142 | else: 143 | pp.legend(loc=legend_pos) 144 | if len(names)>0: 145 | if not(onlyval): 146 | for i in range(0,len(names)-1): 147 | ax1.annotate(names[i], (Tc[i,components[0]], Tc[i,components[1]])) 148 | if len(namesv)>0: 149 | for i in range(0,len(namesv)-1): 150 | ax1.annotate(namesv[i], (Tv[i,components[0]], Tv[i,components[1]])) 151 | if len(title)>0: 152 | pp.title(title) 153 | pp.show() 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /docs/ppdire.rst: -------------------------------------------------------------------------------- 1 | .. _ppdire: 2 | 3 | 4 | ################ 5 | ppdire 6 | ################ 7 | 8 | Beyond discussion, the class of dimension reduction with the longest standing history accessible through direpack, is projection pursuit (PP) dimension reduction. 9 | Let $\mathbf{X}$ be a data matrix that is a sample of $n$ cases of a $p$ variate random variable and $\mathbf{y}$ be a sample of a corresponding depending variable, when applicable. 10 | The set of projection pursuit scores $\mathbf{t}_i$ that span the columns of $\mathbf{T}$ are defined as linear combinations of the original variables: $\mathbf{T} = \mathbf{X}\mathbf{W}$, where the $\mathbf{w}_i$ are 11 | the solution to the optimization problem: 12 | 13 | .. math:: 14 | :label: optim_ppdire 15 | :nowrap: 16 | 17 | \begin{equation*} 18 | \begin{aligned} 19 | & \underset{\mathbf{a}}{\text{maximise}} & & \mathfrak{P}\left(\mathbb{S}\left(\mathbf{a}^T\mathbf{X}\right)\right) \\ 20 | & \text{subject to} & & \mathbf{w}_i^T\mathbf{X}^T\mathbf{X}\mathbf{w}_j = 0 \mbox{ and } \parallel \mathbf{w}_i\parallel_2 = 1,\\ 21 | \end{aligned} 22 | \end{equation*} 23 | 24 | 25 | 26 | where $i,j \in [1,\min(n,p)]$, $j > i$ and the set $\mathbb{S} = \{\mathbf{X},\mathbf{y}\}$ if data for a dependent variable $Y$ exist and is a singleton containing $\mathbf{X}$ otherwise. 27 | Maximization of this criterion is very flexible and the properties of the dimension reduction accomplished according to it can vary widely, mainly dependent on the presence or absence of dependent 28 | variable data, as well as on $\mathfrak{P}$, which in the PP literature is referred to as the projection index. 29 | 30 | dicomo 31 | =========== 32 | 33 | The projection index determines which method is being calculated. 
34 | In direpack, projection pursuit can be called through the ppdire subpackage and class object, which allows the user to pass any function of appropriate dimensionality as a projection index. 35 | However, a set of popular projection indices derived from (co-)moments is provided as well through the dicomo subpackage. For several of these, plugging them in leads to well-established methods. They comprise: 36 | 37 | * Moment statistics: variance (PCA), higher order moments 38 | * Co-moment statistics: covariance (PLS), higher order co-moments 39 | * Standardized moments: skewness (ICA), kurtosis (ICA) 40 | * Standardized co-moments: correlation coefficient (CCA), co-skewness, co-kurtosis 41 | * Linear combinations of (standardized co-) moments. Here, the capi.py file in the ppdire subpackage delivers the co-moment analysis projection index (Serneels, 2019). 42 | * Products of (co-)moments. In particular, the continuum association measure has been provided, which is given by $\mathop{\mbox{cont}}(\mathbf{X},\mathbf{y}) = \mathop{\mbox{cov}}(\mathbf{X},\mathbf{y})\mathop{\mbox{var}}(\mathbf{X})^{\alpha-1}$. Using this continuum measure produces continuum regression (CR; Stone and Brooks, 1990). CR is equivalent to PLS for $\alpha = 1$ and approaches PCA as $\alpha \rightarrow\infty$. 43 | 44 | 45 | 46 | pp optimizers 47 | ============== 48 | 49 | The early ideas behind PP were to scan all directions in search of those maximizing the projection index, as denoted in :eq:`optim_ppdire`. This essentially corresponds to a brute-force optimization technique, which can be computationally very demanding. 50 | Not every method requires it: both PCA and PLS, for instance, can be solved analytically, leading to efficient algorithms that do not directly optimize :eq:`optim_ppdire`. Whenever the plugged-in projection index leads to a convex optimization problem, it is advisable to apply an efficient numerical optimization technique. For that purpose, ppdire has the option to use scipy.optimize’s sequential least squares quadratic programming optimization (SLSQP). However, for projection indices based on ordering or ranking data, such as medians or trimmed (co-)moments, the problem is no longer convex and cannot be solved through SLSQP. 51 | For those purposes, the grid algorithm is included, which was originally developed to compute RCR (Filzmoser, Serneels, Croux, and Van Espen, 2006). 52 | 53 | Regularized regression 54 | ======================= 55 | 56 | While the main focus of direpack is dimension reduction, all dimension reduction techniques offer a bridge to regularized regression. 57 | This can be achieved by regressing the dependent variable onto the estimated dimension-reduced space. The latter provides regularization of the covariance matrix, 58 | due to the constraints in :eq:`optim_ppdire`, and allows regression for an undersampled $\mathbf{X}$. The classical option is to predict $\mathbf{y}$ through least squares regression: 59 | 60 | .. math:: 61 | :nowrap: 62 | 63 | \begin{equation*} 64 | \hat{\mathbf{y}} = \hat{\mathbf{T}} \hat{\mathbf{T}}^T\mathbf{y} 65 | \end{equation*} 66 | 67 | which again leads to well-established methods such as principal component regression (PCR), PLS regression, etc. 68 | 69 | 70 | 71 | Usage 72 | =========== 73 | .. currentmodule:: direpack.ppdire.ppdire 74 | 75 | .. autosummary:: 76 | :toctree: generated/ 77 | :caption: PPDIRE 78 | 79 | ppdire 80 | 81 | 82 | .. currentmodule:: direpack.dicomo.dicomo 83 | 
84 | .. autosummary:: 85 | :toctree: generated/ 86 | :caption: DICOMO 87 | 88 | dicomo 89 | 90 | 91 | 92 | 93 | 94 | 95 | Dependencies 96 | ================ 97 | 98 | - From `sklearn.base`: `BaseEstimator`,`TransformerMixin`,`RegressorMixin` 99 | - From `sklearn.utils`: `_BaseComposition` 100 | - `copy` 101 | - `scipy.stats` 102 | - From `scipy.linalg`: `pinv2` 103 | - From `scipy.optimize`: `minimize` 104 | - `numpy` 105 | - From `statsmodels.regression.quantile_regression`: `QuantReg` 106 | - From `sklearn.utils.extmath`: `svd_flip` 107 | 108 | 109 | References 110 | ========== 111 | 1. Peter Filzmoser, Sven Serneels, Christophe Croux and Pierre J. Van Espen, Robust Multivariate Methods: The Projection Pursuit Approach, in: From Data and Information Analysis to Knowledge Engineering, Spiliopoulou, M., Kruse, R., Borgelt, C., Nuernberger, A. and Gaul, W., eds., Springer Verlag, Berlin, Germany, 2006, pages 270--277. 112 | 113 | 2. Sven Serneels, Projection pursuit based generalized betas accounting for higher order co-moment effects in financial market analysis, in: JSM Proceedings, Business and Economic Statistics Section. Alexandria, VA: American Statistical Association, 2019, 3009-3035. 114 | 115 | 3. Chen, Z. and Li, G., Robust principal components and dispersion matrices via projection pursuit, Research Report, Department of Statistics, Harvard University, 1981. 116 | 117 | 4. Sven Serneels, Peter Filzmoser, Christophe Croux, Pierre J. Van Espen, Robust Continuum Regression, Chemometrics and Intelligent Laboratory Systems, 76 (2005), 197-204. 118 | 119 | 5. Stone M, Brooks RJ (1990). “Continuum Regression: Cross-Validated Sequentially Constructed Prediction Embracing Ordinary Least Squares, Partial Least Squares and Principal Components Regression.” Journal of the Royal Statistical Society. Series B (Methodological), 52, 237–269. 120 | -------------------------------------------------------------------------------- /src/direpack/ppdire/capi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun May 12 10:03:05 2019 5 | 6 | @author: Sven Serneels, Ponalytics. 7 | """ 8 | 9 | from sklearn.base import BaseEstimator 10 | from sklearn.utils.metaestimators import _BaseComposition 11 | from collections import defaultdict 12 | import inspect 13 | from ..dicomo.dicomo import dicomo 14 | from ..dicomo._dicomo_utils import * 15 | 16 | 17 | class capi(_BaseComposition, BaseEstimator): 18 | 19 | """ 20 | CAPI Co-moment analysis projection index 21 | 22 | The CAPI projection index to estimate generalized betas was first introduced 23 | in: 24 | 25 | S. Serneels, Projection pursuit based generalized betas accounting for 26 | higher order co-moment effects in financial market analysis, in: 27 | JSM Proceedings, Business and Economic Statistics Section. 28 | Alexandria, VA: American Statistical Association, 2019, 3009-3035. 29 | 30 | Class arguments 31 | 32 | max_degree, int: maximal degree of co-moments to be used. In [2,3,4]. 33 | 34 | projection_index, class object: class used to calculate co-moments. 35 | Written to work with the dicomo class, yet other plugins could be written. 36 | 37 | pi_arguments, dict: dict of arguments to pass on to projection_index 38 | 39 | weights, list of float: weights to be used in the linear combination of co-moments. 
40 | 41 | centring, bool 42 | 43 | scaling, bool whether to calculate CAPI based on scaled higher co-moments 44 | (co-skewness, co-kurtosis) or raw higher co-moments 45 | 46 | options, either a list of co-moment options to be included, or 'all' (e.g. 47 | option=i calculates M3,i and M4,i etc.) 48 | 49 | After intializing the object, call object.fit(x,y,**kwargs) to evaluate. 50 | CAPI takes no direct kwargs, yet passes all kwargs on to the fit method of 51 | the projection index. 52 | 53 | """ 54 | 55 | def __init__( 56 | self, 57 | max_degree=2, 58 | projection_index=dicomo, 59 | pi_arguments={}, 60 | weights=[1, 1, 1, -1, -1, -1], 61 | centring=False, 62 | scaling=True, 63 | options="all", 64 | ): 65 | self.max_degree = max_degree 66 | self.projection_index = projection_index 67 | self.pi_arguments = pi_arguments 68 | self.weights = weights 69 | self.most = self.projection_index(**self.pi_arguments) 70 | self.scaling = scaling 71 | self.options = options 72 | self.capi_index_ = None 73 | if self.max_degree > 4: 74 | raise (ValueError("Maximal degree is 4.")) 75 | 76 | def fit(self, x, y, **kwargs): 77 | 78 | if self.scaling: 79 | order_kwargs = ["cov", "cos", "cok"] 80 | else: 81 | order_kwargs = ["com", "com", "com"] 82 | 83 | if self.max_degree < 2: 84 | raise (ValueError("capi not meaningful for max_degree < 2")) 85 | if self.options == "all": 86 | options = np.arange(1, 4) 87 | else: 88 | options = np.array(self.options, ndmin=1) 89 | moments = np.zeros(6) 90 | fit_arguments = {"order": 0, "y": y} 91 | fit_arguments = {**kwargs, **fit_arguments} 92 | init_moment_calc = 2 93 | k = 0 94 | for i in range(init_moment_calc, self.max_degree + 1): 95 | fit_arguments["order"] = i 96 | self.most.set_params(mode=order_kwargs[i - 2]) 97 | l = min(i - 1, len(options)) 98 | for j in options[np.arange(0, l)]: 99 | fit_arguments["option"] = j 100 | moments[i - 3 + j + k] = self.most.fit(x, **fit_arguments) 101 | if i == 3: 102 | k += 1 103 | capi_index_ = np.dot(self.weights, moments) 104 | self.capi_index_ = capi_index_ 105 | self.moments_ = moments 106 | return capi_index_ 107 | 108 | @classmethod 109 | def _get_param_names(cls): 110 | """Get parameter names for the estimator""" 111 | # fetch the constructor or the original constructor before 112 | # deprecation wrapping if any 113 | init = getattr(cls.__init__, "deprecated_original", cls.__init__) 114 | if init is object.__init__: 115 | # No explicit constructor to introspect 116 | return [] 117 | 118 | # introspect the constructor arguments to find the model parameters 119 | # to represent 120 | init_signature = inspect.signature(init) 121 | # Consider the constructor parameters excluding 'self' 122 | parameters = [ 123 | p 124 | for p in init_signature.parameters.values() 125 | if p.name != "self" and p.kind != p.VAR_KEYWORD 126 | ] 127 | for p in parameters: 128 | if p.kind == p.VAR_POSITIONAL: 129 | raise RuntimeError( 130 | "scikit-learn estimators should always " 131 | "specify their parameters in the signature" 132 | " of their __init__ (no varargs)." 133 | " %s with constructor %s doesn't " 134 | " follow this convention." % (cls, init_signature) 135 | ) 136 | # Extract and sort argument names excluding 'self' 137 | return sorted([p.name for p in parameters]) 138 | 139 | def get_params(self, deep=False): 140 | """Get parameters for this estimator. 141 | Parameters 142 | ---------- 143 | deep : boolean, optional 144 | If True, will return the parameters for this estimator and 145 | contained subobjects that are estimators. 
146 | Returns 147 | ------- 148 | params : mapping of string to any 149 | Parameter names mapped to their values. 150 | ------ 151 | Copied from ScikitLlearn instead of imported to avoid 'deep=True' 152 | """ 153 | out = dict() 154 | for key in self._get_param_names(): 155 | value = getattr(self, key, None) 156 | if deep and hasattr(value, "get_params"): 157 | deep_items = value.get_params().items() 158 | out.update((key + "__" + k, val) for k, val in deep_items) 159 | out[key] = value 160 | return out 161 | 162 | def set_params(self, **params): 163 | """Set the parameters of this estimator. 164 | Copied from ScikitLearn, adapted to avoid calling 'deep=True' 165 | Returns 166 | ------- 167 | self 168 | ------ 169 | Copied from ScikitLlearn instead of imported to avoid 'deep=True' 170 | """ 171 | if not params: 172 | # Simple optimization to gain speed (inspect is slow) 173 | return self 174 | valid_params = self.get_params() 175 | 176 | nested_params = defaultdict(dict) # grouped by prefix 177 | for key, value in params.items(): 178 | key, delim, sub_key = key.partition("__") 179 | if key not in valid_params: 180 | raise ValueError( 181 | "Invalid parameter %s for estimator %s. " 182 | "Check the list of available parameters " 183 | "with `estimator.get_params().keys()`." % (key, self) 184 | ) 185 | 186 | if delim: 187 | nested_params[key][sub_key] = value 188 | else: 189 | setattr(self, key, value) 190 | valid_params[key] = value 191 | 192 | for key, sub_params in nested_params.items(): 193 | valid_params[key].set_params(**sub_params) 194 | 195 | return self 196 | -------------------------------------------------------------------------------- /src/direpack/ipopt_temp/ipopt_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | cyipopt: Python wrapper for the Ipopt optimization package, written in Cython. 5 | 6 | Copyright (C) 2012-2015 Amit Aides 7 | Copyright (C) 2015-2018 Matthias Kümmerer 8 | 9 | Author: Matthias Kümmerer 10 | (original Author: Amit Aides ) 11 | URL: https://github.com/matthias-k/cyipopt 12 | License: EPL 1.0 13 | 14 | This section is copied from ipopt until the fix in jacobians.py gets included into 15 | that package. 
16 | 17 | """ 18 | 19 | from __future__ import absolute_import, unicode_literals 20 | import sys 21 | 22 | from builtins import bytes # from the future package 23 | import numpy as np 24 | try: 25 | import scipy 26 | except ImportError: # scipy is not installed 27 | SCIPY_INSTALLED = False 28 | else: 29 | SCIPY_INSTALLED = True 30 | del scipy 31 | from scipy.optimize import approx_fprime 32 | try: 33 | from scipy.optimize import OptimizeResult 34 | except ImportError: 35 | # in scipy 0.14 Result was renamed to OptimzeResult 36 | from scipy.optimize import Result 37 | OptimizeResult = Result 38 | 39 | import cyipopt 40 | from .jacobian import FunctionWithApproxJacobianCentral,FunctionWithApproxJacobian 41 | 42 | 43 | class IpoptProblemWrapper(object): 44 | def __init__(self, fun, args=(), kwargs=None, jac=None, hess=None, hessp=None, 45 | constraints=(), eps=1e-8): 46 | if not SCIPY_INSTALLED: 47 | raise ImportError('Install SciPy to use the `IpoptProblemWrapper` class.') 48 | self.fun_with_jac = None 49 | self.last_x = None 50 | if hess is not None or hessp is not None: 51 | raise NotImplementedError('Using hessian matrixes is not yet implemented!') 52 | if jac is None: 53 | #fun = FunctionWithApproxJacobian(fun, epsilon=eps, verbose=False) 54 | jac = lambda x0, *args, **kwargs: approx_fprime(x0, fun, eps, *args, **kwargs) 55 | elif jac is True: 56 | self.fun_with_jac = fun 57 | elif not callable(jac): 58 | raise NotImplementedError('jac has to be bool or a function') 59 | self.fun = fun 60 | self.jac = jac 61 | self.args = args 62 | self.kwargs = kwargs or {} 63 | self._constraint_funs = [] 64 | self._constraint_jacs = [] 65 | self._constraint_args = [] 66 | if isinstance(constraints, dict): 67 | constraints = (constraints, ) 68 | for con in constraints: 69 | con_fun = con['fun'] 70 | con_jac = con.get('jac', None) 71 | if con_jac is None: 72 | con_fun = FunctionWithApproxJacobian(con_fun, epsilon=eps, verbose=False) 73 | con_jac = con_fun.jac 74 | con_args = con.get('args', []) 75 | self._constraint_funs.append(con_fun) 76 | self._constraint_jacs.append(con_jac) 77 | self._constraint_args.append(con_args) 78 | # Set up evaluation counts 79 | self.nfev = 0 80 | self.njev = 0 81 | self.nit = 0 82 | 83 | def evaluate_fun_with_grad(self, x): 84 | if self.last_x is None or not np.all(self.last_x == x): 85 | self.last_x = x 86 | self.nfev += 1 87 | self.last_value = self.fun(x, *self.args, **self.kwargs) 88 | return self.last_value 89 | 90 | def objective(self, x): 91 | if self.fun_with_jac: 92 | return self.evaluate_fun_with_grad(x)[0] 93 | 94 | self.nfev += 1 95 | return self.fun(x, *self.args, **self.kwargs) 96 | 97 | def gradient(self, x, **kwargs): 98 | if self.fun_with_jac: 99 | return self.evaluate_fun_with_grad(x)[1] 100 | 101 | self.njev += 1 102 | return self.jac(x, *self.args, **self.kwargs) # .T 103 | 104 | def constraints(self, x): 105 | con_values = [] 106 | for fun, args in zip(self._constraint_funs, self._constraint_args): 107 | con_values.append(fun(x, *args)) 108 | return np.hstack(con_values) 109 | 110 | def jacobian(self, x): 111 | con_values = [] 112 | for fun, args in zip(self._constraint_jacs, self._constraint_args): 113 | con_values.append(fun(x, *args)) 114 | return np.vstack(con_values) 115 | 116 | def intermediate( 117 | self, 118 | alg_mod, 119 | iter_count, 120 | obj_value, 121 | inf_pr, 122 | inf_du, 123 | mu, 124 | d_norm, 125 | regularization_size, 126 | alpha_du, 127 | alpha_pr, 128 | ls_trials 129 | ): 130 | 131 | self.nit = iter_count 132 | 133 | 134 | def 
get_bounds(bounds): 135 | if bounds is None: 136 | return None, None 137 | else: 138 | lb = [b[0] for b in bounds] 139 | ub = [b[1] for b in bounds] 140 | return lb, ub 141 | 142 | 143 | def get_constraint_bounds(constraints, x0, INF=1e19): 144 | if isinstance(constraints, dict): 145 | constraints = (constraints, ) 146 | cl = [] 147 | cu = [] 148 | if isinstance(constraints, dict): 149 | constraints = (constraints, ) 150 | for con in constraints: 151 | m = len(np.atleast_1d(con['fun'](x0, *con.get('args', [])))) 152 | cl.extend(np.zeros(m)) 153 | if con['type'] == 'eq': 154 | cu.extend(np.zeros(m)) 155 | elif con['type'] == 'ineq': 156 | cu.extend(INF*np.ones(m)) 157 | else: 158 | raise ValueError(con['type']) 159 | cl = np.array(cl) 160 | cu = np.array(cu) 161 | 162 | return cl, cu 163 | 164 | 165 | def replace_option(options, oldname, newname): 166 | if oldname in options: 167 | if newname not in options: 168 | options[newname] = options.pop(oldname) 169 | 170 | def convert_to_bytes(options): 171 | if sys.version_info >= (3, 0): 172 | for key in list(options.keys()): 173 | try: 174 | if bytes(key, 'utf-8') != key: 175 | options[bytes(key, 'utf-8')] = options[key] 176 | options.pop(key) 177 | except TypeError: 178 | pass 179 | 180 | def minimize_ipopt(fun, x0, args=(), kwargs=None, method=None, jac=None, hess=None, hessp=None, 181 | bounds=None, constraints=(), tol=None, callback=None, options=None): 182 | """ 183 | Minimize a function using ipopt. The call signature is exactly like for 184 | `scipy.optimize.mimize`. In options, all options are directly passed to 185 | ipopt. Check [http://www.coin-or.org/Ipopt/documentation/node39.html] for 186 | details. 187 | The options `disp` and `maxiter` are automatically mapped to their 188 | ipopt-equivalents `print_level` and `max_iter`. 
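Example (a minimal sketch; the quadratic objective, the constraint and the option values below are purely illustrative, and `cyipopt` needs to be installed):

    import numpy as np

    fun = lambda x: np.sum((x - 1.0) ** 2)                    # smooth objective
    cons = ({'type': 'ineq', 'fun': lambda x: x[0] - 0.5},)   # enforce x[0] >= 0.5
    res = minimize_ipopt(fun, x0=np.zeros(2), constraints=cons,
                         bounds=[(-2.0, 2.0), (-2.0, 2.0)],
                         options={'disp': 0, 'maxiter': 200})
    print(res.x, res.fun)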
189 | """ 190 | if not SCIPY_INSTALLED: 191 | raise ImportError('Install SciPy to use the `minimize_ipopt` function.') 192 | 193 | _x0 = np.atleast_1d(x0) 194 | problem = IpoptProblemWrapper(fun, args=args, kwargs=kwargs, jac=jac, hess=hess, 195 | hessp=hessp, constraints=constraints) 196 | lb, ub = get_bounds(bounds) 197 | 198 | cl, cu = get_constraint_bounds(constraints, x0) 199 | 200 | if options is None: 201 | options = {} 202 | 203 | nlp = cyipopt.problem(n = len(_x0), 204 | m = len(cl), 205 | problem_obj=problem, 206 | lb=lb, 207 | ub=ub, 208 | cl=cl, 209 | cu=cu) 210 | 211 | # python3 compatibility 212 | convert_to_bytes(options) 213 | 214 | # Rename some default scipy options 215 | replace_option(options, b'disp', b'print_level') 216 | replace_option(options, b'maxiter', b'max_iter') 217 | if b'print_level' not in options: 218 | options[b'print_level'] = 0 219 | if b'tol' not in options: 220 | options[b'tol'] = tol or 1e-8 221 | if b'mu_strategy' not in options: 222 | options[b'mu_strategy'] = b'adaptive' 223 | if b'hessian_approximation' not in options: 224 | if hess is None and hessp is None: 225 | options[b'hessian_approximation'] = b'limited-memory' 226 | for option, value in options.items(): 227 | try: 228 | nlp.addOption(option, value) 229 | except TypeError as e: 230 | raise TypeError('Invalid option for IPOPT: {0}: {1} (Original message: "{2}")'.format(option, value, e)) 231 | 232 | x, info = nlp.solve(_x0) 233 | 234 | if np.asarray(x0).shape == (): 235 | x = x[0] 236 | 237 | return OptimizeResult(x=x, success=info['status'] == 0, status=info['status'], 238 | message=info['status_msg'], 239 | fun=info['obj_val'], 240 | info=info, 241 | nfev=problem.nfev, 242 | njev=problem.njev, 243 | nit=problem.nit) 244 | -------------------------------------------------------------------------------- /docs/ppdire.md: -------------------------------------------------------------------------------- 1 | Projection Pursuit Dimension Reduction 2 | ====================================== 3 | 4 | A `scikit-learn` compatible Python 3 package for Projection Pursuit Dimension Reduction. 5 | This class implements a very general framweork for projection pursuit, giving access to 6 | methods ranging from PP-PCA to CAPI generalized betas. 7 | 8 | Description 9 | ----------- 10 | 11 | Projection pursuit (PP) provides a very general framework for dimension reduction and regression. The 12 | `ppdire` class provides a framework to calculate PP estimates based on a wide variety of projection 13 | indices. 14 | 15 | While the class will also work with user-defined projection indices, a set of projection indices are 16 | included into the `direpack` package as two ancillary classes: 17 | - `dicomo` class for (co-)moment statistics (separate folder), cf the [dicomo Documentation file](https://github.com/SvenSerneels/direpack/blob/master/docs/dicomo.md) 18 | - `capi` specifically for analyzing financial market returns based on a linear combination of co-moments \[2\] 19 | 20 | When using the `dicomo` class as a plugin, several well-known multivariate dimension reduction techniques 21 | are accessible, as well as robust alternatives thereto. For more details, have a look at the [ppdire examples notebook](https://github.com/SvenSerneels/direpack/blob/master/examples/ppdire_example.ipynb). 22 | 23 | The `ppdire` class allows for calculation of the projection pursuit optimization either 24 | through `scipy.optimize` or through the native grid\[1\] algorithm. 
Optimization through 25 | `scipy.optimize` is much more efficient, yet it will only provide correct results 26 | for classical projection indices. The native grid algorithm should be used when 27 | the projection index involves order statistics of any kind, such as ranks, trimming, 28 | winsorizing, or empirical quantiles. 29 | 30 | Remarks: 31 | - all the methods contained in this package have been designed for continuous data. They do not work correctly for categorical or textual data. 32 | - this package focuses on projection pursuit dimension reduction. Regression methods that involve a dimension reduction step can be accessed through it 33 | (e.g. PCR, PLS, RCR, ...), yet the package does not provide an implementation for projection pursuit regression (PPR). To access PPR, we refer to 34 | the `projection-pursuit` package, also distributed through PIP. 35 | 36 | The code is aligned to `scikit-learn`, such that modules such as `GridSearchCV` can flawlessly be applied to it. 37 | 38 | The `ppdire` folder contains 39 | - The estimator (`ppdire.py`) 40 | - A class for the co-moment analysis projection index (`capi.py`) 41 | - Ancillary functions for projection pursuit (`_ppdire_utils.py`) 42 | 43 | The ppdire class 44 | ================ 45 | 46 | Dependencies 47 | ------------ 48 | - From `sklearn.base`: `BaseEstimator`,`TransformerMixin`,`RegressorMixin` 49 | - From `sklearn.utils`: `_BaseComposition` 50 | - `copy` 51 | - `scipy.stats` 52 | - From `scipy.linalg`: `pinv2` 53 | - From `scipy.optimize`: `minimize` 54 | - `numpy` 55 | - From `statsmodels.regression.quantile_regression`: `QuantReg` 56 | - From `sklearn.utils.extmath`: `svd_flip` 57 | 58 | 59 | Parameters 60 | ---------- 61 | - `projection_index`, function or class. `dicomo` and `capi` supplied in this 62 | package can both be used, but user defined projection indices can 63 | be processed 64 | - `pi_arguments`, dict. Dict of arguments to be passed on to `projection index` 65 | - `n_components`, int. number of components to be estimated 66 | - `trimming`, float. trimming percentage for projection index, to be entered as pct/100 67 | - `alpha`, float. Continuum coefficient. Only relevant if `ppdire` is used to 68 | estimate (classical or robust) continuum regression. 69 | - `optimizer`: str. Presently: either `'grid'` (native optimizer) or 70 | any of the options in `scipy-optimize` (e.g. `'SLSQP'`) 71 | - `optimizer_options`: dict with options to pass on to the optimizer. 72 | If `optimizer == 'grid'`, 73 | * `ndir`: int: Number of directions to calculate per iteration. 74 | * `maxiter`: int. Maximal number of iterations. 75 | - `optimizer_constraints`: dict or list of dicts, further constraints to be 76 | passed on to the optimizer function. 77 | - `regopt`, str. Regression option for regression step y~T. Can be set 78 | to `'OLS'` (default), `'robust'` (will run `sprm.rm`) or `'quantile'` 79 | (`statsmodels.regression.quantreg`). 80 | - `center`, str. How to center the data. options accepted are options from 81 | `direpack`'s `VersatileScaler`. 82 | - `center_data`, bool. 83 | - `scale_data`, bool. Note: if set to `False`, convergence to correct optimum 84 | is not a given. Will throw a warning. 85 | - `whiten_data`, bool. Typically used for ICA (kurtosis as PI) 86 | - `square_pi`, bool. Whether to square the projection index upon evaluation. 87 | - `compression`, bool. If `True`, an internal SVD compression step is used for 88 | flat data tables (p > n). Speds up the calculations. 89 | - `copy`, bool. 
Whether to make a deep copy of the input data or not. 90 | - `verbose`, bool. Set to `True` prints the iteration number. 91 | - `return_scaling_object`, bool. 92 | Note: several interesting parameters can also be passed to the `fit` method. 93 | 94 | Attributes 95 | ---------- 96 | Attributes always provided 97 | - `x_weights_`: X block PPDIRE weighting vectors (usually denoted W) 98 | - `x_loadings_`: X block PPDIRE loading vectors (usually denoted P) 99 | - `x_scores_`: X block PPDIRE score vectors (usually denoted T) 100 | - `x_ev_`: X block explained variance per component 101 | - `x_Rweights_`: X block SIMPLS style weighting vectors (usually denoted R) 102 | - `x_loc_`: X block location estimate 103 | - `x_sca_`: X block scale estimate 104 | - `crit_values_`: vector of evaluated values for the optimization objective. 105 | - `Maxobjf_`: vector containing the optimized objective per component. 106 | 107 | Attributes created when more than one block of data is provided: 108 | - `C_`: vector of inner relationship between response and latent variables block 109 | - `coef_`: vector of regression coefficients, if second data block provided 110 | - `intercept_`: intercept 111 | - `coef_scaled_`: vector of scaled regression coeeficients (when scaling option used) 112 | - `intercept_scaled_`: scaled intercept 113 | - `residuals_`: vector of regression residuals 114 | - `y_ev_`: y block explained variance 115 | - `fitted_`: fitted response 116 | - `y_loc_`: y location estimate 117 | - `y_sca_`: y scale estimate 118 | 119 | Attributes created only when corresponding input flags are `True`: 120 | - `whitening_`: whitened data matrix (usually denoted K) 121 | - `mixing_`: mixing matrix estimate 122 | - `scaling_object_`: scaling object from `VersatileScaler` 123 | 124 | 125 | Methods 126 | -------- 127 | - `fit(X, *args, **kwargs)`: fit model 128 | - `predict(X)`: make predictions based on fit 129 | - `transform(X)`: project X onto latent space 130 | - `getattr()`: get list of attributes 131 | - `setattr(*kwargs)`: set individual attribute of sprm object 132 | 133 | The `fit` function takes several optional input arguments. These are flags that 134 | typically would not need to be cross-validated. They are: 135 | - `y`, numpy vector or 1D matrix, either as `arg` directly or as `kwarg` 136 | - `h`, int. Overrides `n_components` for an individual call to `fit`. Use with caution. 137 | - `dmetric`, str. Distance metric used internally. Defaults to `'euclidean'` 138 | - `mixing`, bool. Return mixing matrix? 139 | - Further parameters to the regression methods can be passed on here 140 | as additional `kwargs`. 141 | 142 | 143 | Ancillary functions 144 | ------------------- 145 | - `dicomo` (class): (co-)moments 146 | - `capi` (class): co-moment analysis projection index 147 | 148 | 149 | References 150 | ---------- 151 | 1. [Robust Multivariate Methods: The Projection Pursuit Approach](https://link.springer.com/chapter/10.1007/3-540-31314-1_32), Peter Filzmoser, Sven Serneels, Christophe Croux and Pierre J. Van Espen, in: From Data and Information Analysis to Knowledge Engineering, 152 | Spiliopoulou, M., Kruse, R., Borgelt, C., Nuernberger, A. and Gaul, W., eds., 153 | Springer Verlag, Berlin, Germany, 154 | 2006, pages 270--277. 155 | 2. [Projection pursuit based generalized betas accounting for higher order co-moment effects in financial market analysis](https://arxiv.org/pdf/1908.00141.pdf), Sven Serneels, in: 156 | JSM Proceedings, Business and Economic Statistics Section. 
Alexandria, VA: American Statistical Association, 2019, 3009-3035. 157 | 3. Robust principal components and dispersion matrices via projection pursuit, Chen, Z. and Li, G., Research Report, Department of Statistics, Harvard University, 1981. 158 | 4. [Robust Continuum Regression](https://www.sciencedirect.com/science/article/abs/pii/S0169743904002667), Sven Serneels, Peter Filzmoser, Christophe Croux, Pierre J. Van Espen, Chemometrics and Intelligent Laboratory Systems, 76 (2005), 197-204. 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | `direpack`: a Python 3 library for state-of-the-art statistical dimension reduction techniques 2 | ============================================================================================== 3 | 4 | This package delivers a `scikit-learn` compatible Python 3 package for sundry state-of-the art multivariate statistical methods, with 5 | a focus on dimension reduction. 6 | 7 | The categories of methods delivered in this package, are: 8 | - Projection pursuit dimension reduction (`ppdire`) 9 | - Sufficient dimension reduction (`sudire`) 10 | - Robust M-estimators for dimension reduction (`sprm`) 11 | each of which are presented as `scikit-learn` compatible objects in the corresponding folders. 12 | 13 | We hope that this package leads to scientific success. If it does so, we kindly ask to cite the [official `direpack` publication](https://www.sciencedirect.com/science/article/pii/S235271102200200X) \[0\], as well as the original publication of the corresponding method. 14 | 15 | The package also contains a set of tools for pre- and postprocessing: 16 | - The `preprocessing` folder provides classical and robust centring and scaling, as well as spatial sign transforms \[4\] and the robbustness inducing wrapping transformation \[15\]. 17 | - The `dicomo` folder contains a versatile class to access a wide variety of moment and co-moment statistics, and statistics derived from those. Check out the [dicomo Documentation file](https://github.com/SvenSerneels/direpack/blob/master/docs/dicomo.md) and the [dicomo Examples Notebook](https://github.com/SvenSerneels/direpack/blob/master/examples/dicomo_example.ipynb). 18 | - Plotting utilities in the `plot` folder 19 | - Cross-validation utilities in the `cross-validation` folder 20 | 21 | ![AIG sprm score space](https://github.com/SvenSerneels/direpack/blob/master/img/AIG_T12.png "AIG SPRM score space") 22 | 23 | 24 | Methods in the `sprm` folder 25 | ---------------------------- 26 | - The estimator (`sprm.py`) \[1\] 27 | - The Sparse NIPALS (SNIPLS) estimator \[3\](`snipls.py`) 28 | - Robust M regression estimator (`rm.py`) 29 | - Ancillary functions for M-estimation (`_m_support_functions.py`) 30 | 31 | Methods in the `ppdire` folder 32 | ------------------------------ 33 | The `ppdire` class will give access to a wide range of projection pursuit dimension reduction techniques. 34 | These include slower approximate estimates for well-established methods such as PCA, PLS and continuum regression. 35 | However, the class provides unique access to a set of robust options, such as robust continuum regression (RCR) \[5\], through its native `grid` optimization algorithm, first 36 | published for RCR as well \[6\]. Moreover, `ppdire` is also a great gateway to calculate generalized betas, using the CAPI projection index \[7\]. 
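
As a sketch, a generalized beta could be estimated along the following lines (the data and parameter values are purely illustrative; see the [ppdire example notebook](https://github.com/SvenSerneels/direpack/blob/master/examples/ppdire_example.ipynb) for worked examples):

    import numpy as np
    from direpack.ppdire.ppdire import ppdire
    from direpack.ppdire.capi import capi

    X = np.random.randn(250, 5)   # illustrative explanatory return series
    y = np.random.randn(250, 1)   # illustrative asset return series

    est = ppdire(projection_index=capi, pi_arguments={'max_degree': 3},
                 n_components=1, optimizer='grid')
    est.fit(X, y=y)
    generalized_beta = est.x_weights_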
37 | 38 | The code is orghanized in 39 | - `ppdire.py` - the main PP dimension reduction class 40 | - `capi.py` - the co-moment analysis projection index. 41 | 42 | Methods in the `sudire` folder 43 | ------------------------------ 44 | The `sudire` folder gives access to an extensive set of methods that resort under the umbrella of sufficient dimension reduction. 45 | These range from meanwhile long-standing, well-accepted approaches, such as sliced inverse regression (SIR) and the closely related SAVE \[8,9\], 46 | through methods such as directional regression \[10\] and principal Hessian directions \[11\], and more. However, the package also contains some 47 | of the most recently developed, state-of-the-art sufficient dimension reduction techniques, that require no distributional assumptions. 48 | The options provided in this category are based on energy statistics (distance covariance \[12\] or martingale difference divergence \[13\]) and 49 | ball statistics (ball covariance) \[14\]. All of these options can be called by setting the corresponding parameters in the `sudire` class, cf. [the docs](https://github.com/SvenSerneels/direpack/blob/master/docs/sudire.md). 50 | Note: the ball covariance option will require some lines to be uncommented as indicated. We decided not to make that option generally available, 51 | since it depends on the `Ball` package that seems to be difficult to install on certain architectures. 52 | 53 | How to install 54 | -------------- 55 | The package is distributed through PyPI, so install through: 56 | 57 | pip install direpack 58 | 59 | Note that some of the key methods in the `sudire` subpackage rely on the IPOPT 60 | optimization package, which according to their recommendation, can best be installed 61 | directly as: 62 | 63 | conda install -c conda-forge cyipopt 64 | 65 | Documentation 66 | ============= 67 | 68 | - Detailed documentation can be found in the [ReadTheDocs page](https://direpack.readthedocs.io/en/latest/index.html). 69 | - A more extensive description on the background is presented in the [official `direpack` publication](https://www.sciencedirect.com/science/article/pii/S235271102200200X). 70 | - Examples on how to use each of the `dicomo`, `ppdire`, `sprm` and `sudire` classes are presented as Jupyter notebooks in the [examples](https://github.com/SvenSerneels/direpack/blob/master/examples) folder 71 | - Furthemore, the [docs](https://github.com/SvenSerneels/direpack/blob/master/docs) folder contains a few markdown files on usage of the classes. 72 | 73 | 74 | 75 | References 76 | ========== 77 | 0. [`direpack`: A Python 3 package for state-of-the-art statistical dimensionality reduction methods](https://www.sciencedirect.com/science/article/pii/S235271102200200X), Emmanuel Jordy Menvouta, Sven Serneels, Tim Verdonck, SoftwareX, 21 (2023), 101282. 78 | 1. [Sparse partial robust M regression](https://www.sciencedirect.com/science/article/abs/pii/S0169743915002440), Irene Hoffmann, Sven Serneels, Peter Filzmoser, Christophe Croux, Chemometrics and Intelligent Laboratory Systems, 149 (2015), 50-59. 79 | 2. [Partial robust M regression](https://doi.org/10.1016/j.chemolab.2005.04.007), Sven Serneels, Christophe Croux, Peter Filzmoser, Pierre J. Van Espen, Chemometrics and Intelligent Laboratory Systems, 79 (2005), 55-64. 80 | 3. [Sparse and robust PLS for binary classification](https://onlinelibrary.wiley.com/doi/abs/10.1002/cem.2775), I. Hoffmann, P. Filzmoser, S. Serneels, K. Varmuza, Journal of Chemometrics, 30 (2016), 153-162. 81 | 4. 
[Spatial Sign Preprocessing:  A Simple Way To Impart Moderate Robustness to Multivariate Estimators](https://pubs.acs.org/doi/abs/10.1021/ci050498u), Sven Serneels, Evert De Nolf, Pierre J. Van Espen, Journal of Chemical Information and Modeling, 46 (2006), 1402-1409. 82 | 5. [Robust Continuum Regression](https://www.sciencedirect.com/science/article/abs/pii/S0169743904002667), Sven Serneels, Peter Filzmoser, Christophe Croux, Pierre J. Van Espen, Chemometrics and Intelligent Laboratory Systems, 76 (2005), 197-204. 83 | 6. [Robust Multivariate Methods: The Projection Pursuit Approach](https://link.springer.com/chapter/10.1007/3-540-31314-1_32), Peter Filzmoser, Sven Serneels, Christophe Croux and Pierre J. Van Espen, in: From Data and Information Analysis to Knowledge Engineering, Spiliopoulou, M., Kruse, R., Borgelt, C., Nuernberger, A. and Gaul, W., eds., Springer Verlag, Berlin, Germany, 2006, pages 270--277. 84 | 7. [Projection pursuit based generalized betas accounting for higher order co-moment effects in financial market analysis](https://arxiv.org/pdf/1908.00141.pdf), Sven Serneels, in: JSM Proceedings, Business and Economic Statistics Section. Alexandria, VA: American Statistical Association, 2019, 3009-3035. 85 | 8. [Sliced Inverse Regression for Dimension Reduction](https://www.tandfonline.com/doi/abs/10.1080/01621459.1991.10475035) Li K-C, Journal of the American Statistical Association (1991), 86, 316-327. 86 | 9. [Sliced Inverse Regression for Dimension Reduction: Comment](https://www.jstor.org/stable/2290564?seq=1#metadata_info_tab_contents), R.D. Cook, and Sanford Weisberg, Journal of the American Statistical Association (1991), 86, 328-332. 87 | 10. [On directional regression for dimension reduction](https://doi.org/10.1198/016214507000000536) , B. Li and S.Wang, Journal of the American Statistical Association (2007), 102:997–1008. 88 | 11. [On principal hessian directions for data visualization and dimension reduction:Another application of stein’s lemma](https://www.tandfonline.com/doi/abs/10.1080/01621459.1992.10476258), K.-C. Li. , Journal of the American Statistical Association(1992)., 87,1025–1039. 89 | 12. [Sufficient Dimension Reduction via Distance Covariance](https://doi.org/10.1080/10618600.2015.1026601), Wenhui Sheng and Xiangrong Yin in: Journal of Computational and Graphical Statistics (2016), 25, issue 1, pages 91-104. 90 | 13. [A martingale-difference-divergence-based estimation of central mean subspace](https://dx.doi.org/10.4310/19-SII562), Yu Zhang, Jicai Liu, Yuesong Wu and Xiangzhong Fang, in: Statistics and Its Interface (2019), 12, number 3, pages 489-501. 91 | 14. [Robust Sufficient Dimension Reduction Via Ball Covariance](https://www.sciencedirect.com/science/article/pii/S0167947319301380) Jia Zhang and Xin Chen, Computational Statistics and Data Analysis 140 (2019) 144–154 92 | 15. [Fast Robust Correlation for High-Dimensional Data](https://www.tandfonline.com/doi/full/10.1080/00401706.2019.1677270) Jakob Raymaekers and Peter J. Rousseeuw, Technometrics, 63 (2021), 184-198. 93 | 94 | 95 | [Release Notes](https://github.com/SvenSerneels/direpack/blob/master/direpack_Release_Notes.md) can be checked out in the repository. 96 | 97 | [A list of possible topics for further development](https://github.com/SvenSerneels/direpack/blob/master/direpack_Future_Dev.md) is provided as well. Additions and comments are welcome! 
-------------------------------------------------------------------------------- /src/direpack/preprocessing/_preproc_utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sat Dec 21 10:55:24 2019 5 | 6 | Set of help functions for robust centring and scaling 7 | 8 | @author: Sven Serneels, Ponalytics 9 | """ 10 | 11 | import numpy as np 12 | import pandas as ps 13 | import scipy.stats as sps 14 | import scipy.optimize as spo 15 | from statsmodels import robust as srs 16 | import copy 17 | 18 | 19 | def _handle_zeros_in_scale(scale, copy=True): 20 | """ 21 | Makes sure that whenever scale is zero, we handle it correctly. 22 | This happens in most scalers when we have constant features. 23 | Taken from ScikitLearn.preprocesssing""" 24 | 25 | # if we are fitting on 1D arrays, scale might be a scalar 26 | if np.isscalar(scale): 27 | if scale == 0.0: 28 | scale = 1.0 29 | return scale 30 | elif isinstance(scale, np.ndarray): 31 | if copy: 32 | # New array to avoid side-effects 33 | scale = scale.copy() 34 | scale[scale == 0.0] = 1.0 35 | return scale 36 | 37 | 38 | def _check_trimming(t): 39 | 40 | if (t > 0.99) or (t < 0): 41 | raise (ValueError("Trimming fraction must be in [0,1)")) 42 | 43 | 44 | def mad(X, c=0.6744897501960817, **kwargs): 45 | """ 46 | Column-wise median absolute deviation. **kwargs included to allow 47 | general function call in scale_data. 48 | """ 49 | 50 | s = median(np.abs(X - median(X, axis=0)), axis=0) / c 51 | s = np.array(s).reshape(-1) 52 | # statsmodels.robust.mad is not as flexible toward matrix input, 53 | # sometimes throws a value error in ufunc 54 | return s 55 | 56 | 57 | def median(X, **kwargs): 58 | """ 59 | Column-wise median. **kwargs included to allow 60 | general function call in scale_data. 61 | """ 62 | 63 | if np.isnan(X).any(): 64 | m = np.nanmedian(X, axis=0) 65 | else: 66 | m = np.median(X, axis=0) 67 | m = np.array(m).reshape(-1) 68 | 69 | return m 70 | 71 | 72 | def mean(X, trimming=0): 73 | """ 74 | Column-wise mean or trimmed mean. Trimming to be entered as fraction. 75 | """ 76 | 77 | if trimming == 0: 78 | if np.isnan(X).any(): 79 | m = np.nanmean(X, axis=0) 80 | else: 81 | m = np.mean(X, axis=0) 82 | else: 83 | # Returns all NaN if missings in X 84 | m = sps.trim_mean(X, trimming, 0) 85 | 86 | return m 87 | 88 | 89 | def std(X, trimming=0): 90 | """ 91 | Column-wise standard devaition or trimmed std. 92 | Trimming to be entered as fraction. 
93 | """ 94 | 95 | if trimming == 0: 96 | if np.isnan(X).any(): 97 | s = np.power(np.nanvar(X, axis=0), 0.5) 98 | else: 99 | s = np.power(np.var(X, axis=0), 0.5) 100 | s = np.array(s).reshape(-1) 101 | else: 102 | var = sps.trim_mean( 103 | np.square(X - sps.trim_mean(X, trimming, 0)), trimming, 0 104 | ) 105 | s = np.sqrt(var) 106 | return s 107 | 108 | 109 | def _euclidnorm(x): 110 | """ 111 | Euclidean norm of a vector 112 | """ 113 | 114 | if np.isnan(x).any(): 115 | return np.sqrt(np.nansum(np.square(x))) 116 | else: 117 | return np.sqrt(np.sum(np.square(x))) 118 | 119 | 120 | def _diffmat_objective(a, X): 121 | """ 122 | Utility to l1median, matrix of differences 123 | """ 124 | 125 | (n, p) = X.shape 126 | return X - np.tile(a, (n, 1)) 127 | 128 | 129 | def _l1m_objective(a, X, *args): 130 | """ 131 | Optimization objective for l1median 132 | """ 133 | 134 | if np.isnan(X).any(): 135 | return np.nansum( 136 | np.apply_along_axis(_euclidnorm, 1, _diffmat_objective(a, X)) 137 | ) 138 | else: 139 | return np.sum( 140 | np.apply_along_axis(_euclidnorm, 1, _diffmat_objective(a, X)) 141 | ) 142 | 143 | 144 | def _l1m_jacobian(a, X): 145 | """ 146 | Jacobian for l1median 147 | """ 148 | 149 | (n, p) = X.shape 150 | dX = _diffmat_objective(a, X) 151 | dists = np.apply_along_axis(_euclidnorm, 1, dX) 152 | dists = _handle_zeros_in_scale(dists) 153 | dX /= np.tile(np.array(dists).reshape(n, 1), (1, p)) 154 | if np.isnan(X).any(): 155 | return -np.nansum(dX, axis=0) 156 | else: 157 | return -np.sum(dX, axis=0) 158 | 159 | 160 | def _l1median( 161 | X, x0, method="SLSQP", tol=1e-8, options={"maxiter": 2000}, **kwargs 162 | ): 163 | """ 164 | Optimization for l1median 165 | """ 166 | 167 | mu = spo.minimize( 168 | _l1m_objective, 169 | x0, 170 | args=(X), 171 | jac=_l1m_jacobian, 172 | tol=tol, 173 | options=options, 174 | method=method, 175 | ) 176 | return mu 177 | 178 | 179 | def l1median(X, **kwargs): 180 | """ 181 | l1median wrapper to generically convert matrices as some of the scipy 182 | optimization options will crash when provided matrix input. 183 | """ 184 | 185 | if "x0" not in kwargs: 186 | x0 = median(X) 187 | 188 | if type(X) == np.matrix: 189 | X = np.array(X) 190 | 191 | if len(X.shape) == 2: 192 | (n, p) = X.shape 193 | else: 194 | p = 1 195 | 196 | if p < 2: 197 | return median(X) 198 | else: 199 | return _l1median(X, x0, **kwargs).x 200 | 201 | 202 | def kstepLTS(X, maxit=5, tol=1e-10, **kwargs): 203 | """ 204 | Computes the K-step LTS estimator of location 205 | It uses the spatial median as a starting value, and yields an 206 | estimator with improved statistical efficiency, but at a higher 207 | computational cost. 
208 | Inputs: 209 | X: data matrix 210 | maxit: maximum number of iterations 211 | tol: convergence tolerance 212 | Outputs: 213 | m2: location estimate 214 | """ 215 | n, p = X.shape 216 | m1 = l1median(X) # initial estimate 217 | m2 = copy.deepcopy(m1) 218 | iteration = 0 219 | unconverged = True 220 | while unconverged and (iteration < maxit): 221 | if np.isnan(X).any(): 222 | dists = np.nansum(np.square(X - m1), axis=1) 223 | else: 224 | dists = np.sum(np.square(X - m1), axis=1) 225 | cutdist = np.sort(dists, axis=0)[int(np.floor((n + 1) / 2)) - 1] 226 | hsubset = np.where(dists <= cutdist)[0] 227 | m2 = np.array(mean(X[hsubset, :])).reshape((p,)) 228 | unconverged = max(abs(m1 - m2)) > tol 229 | iteration += 1 230 | m1 = copy.deepcopy(m2) 231 | 232 | return m2 233 | 234 | 235 | def scaleTau2(x0, c1=4.5, c2=3, consistency=True, **kwargs): 236 | """ 237 | Tau estimator of scale 238 | Inputs: 239 | x0: array or matrix, data 240 | c1: consistency factor for initial estimate 241 | c2: consistency factor for final estimate 242 | consistency: str or bool, 243 | False, True, or "finiteSample" 244 | Output: 245 | the scale estimate 246 | """ 247 | 248 | x = copy.deepcopy(x0) 249 | n, p = x.shape 250 | if np.isnan(x).any(): 251 | summ = np.nansum 252 | else: 253 | summ = np.sum 254 | medx = median(x) 255 | xc = abs(x - medx) 256 | sigma0 = median(xc) 257 | if c1 > 0: 258 | xc /= sigma0 * c1 259 | w = 1 - np.square(xc) 260 | w = np.square((abs(w) + w) / 2) 261 | mu = summ(np.multiply(x, w)) / summ(w) 262 | else: 263 | mu = medx 264 | x -= mu 265 | x /= sigma0 266 | rho = np.square(x) 267 | rho[np.where(rho > c2**2)[0]] = c2**2 268 | if consistency: 269 | 270 | def Erho(b): 271 | return ( 272 | 2 273 | * ( 274 | (1 - b**2) * sps.norm.cdf(b) 275 | - b * sps.norm.pdf(b) 276 | + b**2 277 | ) 278 | - 1 279 | ) 280 | 281 | def Es2(c2): 282 | return Erho(c2 * sps.norm.ppf(3 / 4)) 283 | 284 | if consistency == "finiteSample": 285 | nEs2 = (n - 2) * Es2(c2) 286 | else: 287 | nEs2 = n * Es2(c2) 288 | else: 289 | nEs2 = n 290 | return np.array(sigma0 * np.sqrt(summ(rho) / nEs2)).reshape((p,)) 291 | 292 | 293 | def scale_data(X, m, s): 294 | """ 295 | Column-wise data scaling on location and scale estimates. 296 | 297 | """ 298 | 299 | n = X.shape 300 | if len(n) > 1: 301 | p = n[1] 302 | else: 303 | p = 1 304 | n = n[0] 305 | 306 | s = _handle_zeros_in_scale(s) 307 | 308 | if p == 1: 309 | Xm = X - float(m) 310 | Xs = Xm / s 311 | else: 312 | Xm = X - np.array([m for i in range(1, n + 1)]) 313 | Xs = Xm / np.array([s for i in range(1, n + 1)]) 314 | return Xs 315 | 316 | 317 | def wrap_univ(dd, scale=False, locX=None, scaleX=None): 318 | """ 319 | # Computes the univariate wrapping transformation 320 | # Reference: Jakob Raymaekers & Peter J. Rousseeuw (2021) 321 | # Fast Robust Correlation for High-Dimensional Data, 322 | # Technometrics, 63:2, 184-198. 
323 | args: 324 | dd, np.array: vector of distances 325 | scale, bool: if True, will scale data about med/mad 326 | returns: 327 | xi, np.array: wrapped vector 328 | """ 329 | b = 1.5 330 | c = 4 331 | q1 = 1.540793 332 | q2 = 0.8622731 333 | if dd.dtype == "O": 334 | dd = dd.astype("float") 335 | if scale: 336 | locX = median(dd) 337 | scaleX = mad(dd) 338 | xi = (dd - locX) / scaleX 339 | else: 340 | xi = np.array(dd) 341 | indMid = np.where((np.abs(xi) < c) & (np.abs(xi) >= b))[0] 342 | indHigh = np.where(np.abs(xi) >= c)[0] 343 | xi[indMid] = ( 344 | q1 345 | * np.tanh(q2 * (c - np.abs(xi[indMid]))) 346 | * np.abs(xi[indMid]) 347 | / xi[indMid] 348 | ) 349 | xi[indHigh] = 0 350 | xi = xi * scaleX + locX 351 | 352 | return xi 353 | 354 | 355 | def wrap(X, locX, scaleX): 356 | """ 357 | wrap - wrap matrix column wise 358 | """ 359 | 360 | if len(X.shape) == 1: 361 | X = X.reshape(-1, 1) 362 | 363 | return np.array( 364 | [ 365 | wrap_univ(X[:, i], locX=locX[i], scale=True, scaleX=scaleX[i]) 366 | for i in range(X.shape[1]) 367 | ] 368 | ).transpose() 369 | -------------------------------------------------------------------------------- /src/direpack/preprocessing/robcent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # @author: Sven Serneels, Ponalytics 4 | # Created on Sun Feb 4 2018 5 | # Updated on Sun Dec 16 2018 6 | # Refactored on Sat Dec 21 2019 7 | # Refactored on Sat Mar 28 2020 8 | 9 | 10 | # Class for classical and robust centering and scaling of input data for 11 | # regression and machine learning 12 | 13 | # Version 2.0: Code entirely restructured compared to version 1.0. 14 | # Code made consistent with sklearn logic: fit(data,params) yields results. 15 | # Code makes more effciient use of numpy builtin estimators. 16 | # Version 3.0: 17 | # Code now takes strings or functions as input to centring and scaling. 18 | # Utility functions have been moved to _preproc_utilities.py 19 | # Code now supplied for l1median cetring, with options to use different 20 | # scipy.optimize optimization algorithms 21 | # Version 4.0: 22 | # Made the API compatible for ScikitLearn pipelines. However, some nonstandard 23 | # functions and output remain for backwards compatibility. Functionality for 24 | # sparse matrices still has to be implemented. 25 | 26 | 27 | # Ancillary functions in _preproc_utilities.py: 28 | 29 | # - `scale_data(X,m,s)`: centers and scales X on center m (as vector) and scale s (as vector). 30 | # - `mean(X,trimming)`: Column-wise mean. 31 | # - `median(X)`: Column-wise median. 32 | # - `l1median(X)`: L1 or spatial median. Optional arguments: 33 | # - `x0`: starting point for optimization, defaults to column wise median 34 | # - `method`: optimization algorithm, defaults to 'SLSQP' 35 | # - `tol`: tolerance, defaults to 1e-8 36 | # - `options`: list of options for `scipy.optimize.minimize` 37 | # - `kstepLTS(X): k-step LTS estimator of location. 38 | # - `maxit`: int, number of iterations to compute maximally 39 | # - `tol`: float, tolerance for convergence 40 | # - `std(X,trimming)`: Column-wise std. 41 | # - `mad(X,c)`: Column-wise median absolute deviation, with consistency factor c. 
42 | # - `scaleTau2(x0, c1 = 4.5, c2 = 3, consistency = True)`: Tau estimator of scale 43 | # with consistency parameters c1 and c2 and option for consistency correction 44 | # (True, False or 'finiteSample') 45 | 46 | 47 | from __future__ import absolute_import, division, print_function 48 | from __future__ import unicode_literals 49 | 50 | from sklearn.base import BaseEstimator, TransformerMixin 51 | from sklearn.utils.metaestimators import _BaseComposition 52 | from sklearn.utils.validation import check_is_fitted 53 | import numpy as np 54 | from ..utils.utils import ( 55 | MyException, 56 | convert_X_input, 57 | convert_y_input, 58 | _check_input, 59 | ) 60 | from ._preproc_utilities import * 61 | from ._preproc_utilities import _check_trimming, wrap_univ 62 | 63 | __all__ = ["VersatileScaler", "robcent", "versatile_scale", "Wrapper", "wrap"] 64 | 65 | 66 | class VersatileScaler(_BaseComposition, TransformerMixin, BaseEstimator): 67 | 68 | """ 69 | VersatileScaler Center and Scale data about classical or robust location and scale estimates 70 | 71 | Parameters 72 | ---------- 73 | `center`: str or callable, location estimator. String has to be name of the 74 | function to be used, or 'None'. 75 | `scale`: str or callable, scale estimator 76 | `trimming`: trimming percentage to be used in location and scale estimation. 77 | 78 | 79 | Attributes 80 | ---------- 81 | Arguments for methods: 82 | - `X`: array-like, n x p, the data. 83 | - `trimming`: float, fraction to be trimmed (must be in (0,1)). 84 | 85 | 86 | 87 | 88 | Remarks 89 | ------- 90 | Options for classical estimators 'mean' and 'std' also give access to robust 91 | trimmed versions. 92 | 93 | """ 94 | 95 | def __init__(self, center="mean", scale="std", trimming=0): 96 | """ 97 | Initialize values. Check if correct options provided. 98 | """ 99 | 100 | self.center = center 101 | self.scale = scale 102 | self.trimming = trimming 103 | 104 | def fit(self, X): 105 | """ 106 | Estimate location and scale, store these in the class object. 107 | Trimming fraction can be provided as keyword argument. 108 | """ 109 | 110 | X = _check_input(X) 111 | 112 | _check_trimming(self.trimming) 113 | 114 | if type(self.center) is str: 115 | center = eval(self.center) 116 | else: 117 | center = self.center 118 | 119 | if type(self.scale) is str: 120 | scale = eval(self.scale) 121 | else: 122 | scale = self.scale 123 | 124 | n = X.shape 125 | if len(n) > 1: 126 | p = n[1] 127 | else: 128 | p = 1 129 | n = n[0] 130 | 131 | if self.center == "None": 132 | m = np.repeat(0, p) 133 | else: 134 | m = center(X, trimming=self.trimming) 135 | 136 | # Keeping col_loc_ for older version compatibility 137 | setattr(self, "col_loc_", m) 138 | # sklearn standard 139 | setattr(self, "center_", m) 140 | 141 | if self.scale == "None": 142 | s = np.repeat(1, p) 143 | else: 144 | s = scale(X, trimming=self.trimming) 145 | 146 | # Keeping col_sca_ for older version compatibility 147 | setattr(self, "col_sca_", s) 148 | # sklearn standard 149 | setattr(self, "scale_", s) 150 | 151 | def transform(self, X): 152 | """ 153 | Center and/or scale training data to pre-estimated location and scale 154 | """ 155 | 156 | X = _check_input(X) 157 | check_is_fitted(self, ["center_", "scale_"]) 158 | 159 | Xs = scale_data(X, self.center_, self.scale_) 160 | setattr(self, "datas_", Xs) 161 | 162 | return Xs 163 | 164 | def predict(self, Xn): 165 | """ 166 | Standardize new data on previously estimated location and scale. 167 | Number of columns needs to match. 
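Illustrative usage sketch (hypothetical data; any numeric arrays with matching numbers of columns will do):

    >>> import numpy as np
    >>> X = np.random.standard_normal((50, 4))
    >>> Xn = np.random.standard_normal((10, 4))
    >>> vs = VersatileScaler(center="median", scale="mad")
    >>> Xs = vs.fit_transform(X)
    >>> vs.predict(Xn).shape
    (10, 4)
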
168 | """ 169 | 170 | Xn = _check_input(Xn) 171 | Xns = scale_data(Xn, self.col_loc_, self.col_sca_) 172 | setattr(self, "datans_", Xns) 173 | 174 | return Xns 175 | 176 | def fit_transform(self, X): 177 | """ 178 | Estimate center and scale for training data and scale these data 179 | """ 180 | 181 | self.fit(X) 182 | self.transform(X) 183 | 184 | return self.datas_ 185 | 186 | def inverse_transform(self, Xs=None): 187 | """ 188 | Transform scaled data back to their original scale 189 | """ 190 | 191 | check_is_fitted(self, ["center_", "scale_"]) 192 | if Xs is not None: 193 | Xs = _check_input(Xs) 194 | else: 195 | Xs = self.datas_ 196 | 197 | return np.multiply(Xs, self.scale_) + self.center_ 198 | 199 | 200 | # For backwards compatibility 201 | robcent = VersatileScaler 202 | 203 | 204 | def versatile_scale(X, center="l1median", scale="mad", trimming=0): 205 | """ 206 | Wrapper to scale based on present robcent implementation that uses 207 | `fit` instead of `transform` 208 | """ 209 | 210 | rc = VersatileScaler(center=center, scale=scale, trimming=trimming) 211 | return rc.fit_transform(X) 212 | 213 | 214 | class Wrapper(_BaseComposition, TransformerMixin, BaseEstimator): 215 | 216 | """ 217 | Wrapper Perform robustness inducing 'wrapping' transformation using 218 | optimal plugins and parameters from the literature 219 | 220 | Parameters 221 | ---------- 222 | 223 | 224 | Attributes 225 | ---------- 226 | Arguments for methods: 227 | - `X`: array-like, n x p, the data. 228 | 229 | Reference 230 | --------- 231 | Jakob Raymaekers & Peter J. Rousseeuw (2021), Fast Robust Correlation for 232 | High-Dimensional Data, Technometrics, 63:2, 184-198. 233 | 234 | """ 235 | 236 | def __init__(self): 237 | """ 238 | Initialize values. Check if correct options provided. 239 | """ 240 | 241 | self.center = "median" 242 | self.scale = "mad" 243 | self.trimming = 0 244 | 245 | def fit(self, X): 246 | """ 247 | Estimate location and scale, store these in the class object. 248 | Trimming fraction can be provided as keyword argument. 249 | """ 250 | 251 | X = _check_input(X) 252 | 253 | _check_trimming(self.trimming) 254 | 255 | if type(self.center) is str: 256 | center = eval(self.center) 257 | else: 258 | center = self.center 259 | 260 | if type(self.scale) is str: 261 | scale = eval(self.scale) 262 | else: 263 | scale = self.scale 264 | 265 | n = X.shape 266 | if len(n) > 1: 267 | p = n[1] 268 | else: 269 | p = 1 270 | n = n[0] 271 | 272 | if self.center == "None": 273 | m = np.repeat(0, p) 274 | else: 275 | m = center(X, trimming=self.trimming) 276 | 277 | # Keeping col_loc_ for older version compatibility 278 | setattr(self, "col_loc_", m) 279 | # sklearn standard 280 | setattr(self, "center_", m) 281 | 282 | if self.scale == "None": 283 | s = np.repeat(1, p) 284 | else: 285 | s = scale(X, trimming=self.trimming) 286 | 287 | # Keeping col_sca_ for older version compatibility 288 | setattr(self, "col_sca_", s) 289 | # sklearn standard 290 | setattr(self, "scale_", s) 291 | 292 | def transform(self, X): 293 | """ 294 | Project data points to their wrapped counterparts 295 | """ 296 | 297 | X = _check_input(X) 298 | check_is_fitted(self, ["center_", "scale_"]) 299 | 300 | Xw = wrap(X, self.center_, self.scale_) 301 | setattr(self, "dataw_", Xw) 302 | 303 | return Xw 304 | 305 | def predict(self, Xn): 306 | """ 307 | Wrap new data using previously estimated location and scale. 308 | Number of columns needs to match. 
309 | """ 310 | 311 | Xn = _check_input(Xn) 312 | Xnw = wrap(Xn, self.col_loc_, self.col_sca_) 313 | setattr(self, "datanw_", Xnw) 314 | 315 | return Xnw 316 | 317 | def fit_transform(self, X): 318 | """ 319 | Estimate center and scale for training data wrap these data 320 | """ 321 | 322 | self.fit(X) 323 | self.transform(X) 324 | 325 | return self.dataw_ 326 | -------------------------------------------------------------------------------- /docs/sprm.md: -------------------------------------------------------------------------------- 1 | Sparse partial robust M regression 2 | ================================== 3 | 4 | Description 5 | ----------- 6 | 7 | The `sprm` module in `direpack` comprises code for Sparse Partial Robust M-regeression, as 8 | well as a few closely related estimators: the Sparse NIPALS estimator (a non-robust option 9 | for sparse PLS) and the Robust M-regression estoimator (multiple regression based on the same 10 | re-weighting priciple as SPRM, yet without dimension reduction). 11 | 12 | The SPRM method performs four tasks at the same time in a single, consistent estimate: 13 | - *regression*: yields regression coefficients and predicts responses 14 | - *dimension reduction*: calculates interpretable PLS-like components maximizing covariance to the predictand in a robust way 15 | - *variable selection*: depending on the paramter settings, can yield highly sparse regression coefficients that contain exact zero elements 16 | - *outlier detection and compensation*: yields a set of case weights in \[0,1\]. The lower the weight, the more outlying a case is. The estimate itself is outlier robust. 17 | 18 | Note: all the methods contained in this package have been designed for continuous data. They do not work correctly for caetgorical or textual data. 19 | 20 | The code is aligned to ScikitLearn, such that modules such as `GridSearchCV` can flawlessly be applied to it. 21 | 22 | The repository contains 23 | - The estimator (`sprm.py`) 24 | 25 | - Options for data pre-processing (`robcent.py`) 26 | - The Sparse NIPALS (SNIPLS) estimator \[3\](`snipls.py`) 27 | - Robust M regression estimator (`rm.py`) 28 | - Ancillary functions for M-estimation (`_m_support_functions.py`) 29 | 30 | Note that the `plot` folder contains some plotting functionality specific to SPRM (`sprm_plot.py`). 31 | 32 | 33 | 34 | 1\. The SPRM estimator 35 | ====================== 36 | 37 | The main SPRM implementation yields a class with the following structure: 38 | 39 | 1\.1\. Dependencies 40 | ----------------- 41 | - From ``: `BaseEstimator, TransformerMixin, RegressorMixin` 42 | - From ``: `_BaseComposition` 43 | - `copy` 44 | - From ``: `norm, chi2` 45 | - `numpy` 46 | - From ``: `pyplot`. 47 | - From ``: `robust`. 48 | 49 | 1\.2\. Parameters 50 | --------------- 51 | - `eta`: float. Sparsity parameter in \[0,1). Note that `eta=0` returns the non-sparse, yet robust, partial robust M-regression (PRM) \[2\]. 52 | - `n_components`: int > 1. Note that if applied on data, `n_components` shall take a value <= min(x_data.shape) 53 | - `fun`: str, downweighting function. `'Hampel'` (recommended), `'Fair'` or `'Huber'` 54 | - `probp1`: float, probability cutoff for start of downweighting (e.g. 0.95) 55 | - `probp2`: float, probability cutoff for start of steep downweighting (e.g. 0.975, only relevant if `fun='Hampel'`) 56 | - `probp3`: float, probability cutoff for start of outlier omission (e.g. 
0.999, only relevant if `fun='Hampel'`) 57 | - `centring`: str, type of centring (`'mean'`, `'median'`,`'l1median'` or `'kstepLTS'`) 58 | - `scaling`: str, type of scaling (`'std'`,`'mad'`, `'scaleTau2'`, the latter recommended, or `'None'`) 59 | - `verbose`: boolean, specifying verbose mode 60 | - `maxit`: int, maximal number of iterations in M algorithm 61 | - `tol`: float, tolerance for convergence in M algorithm 62 | - `start_cutoff_mode`: str, value `'specific'` will set starting value cutoffs specific to X and y (preferred); any other value will set X and y stating cutoffs identically. The non-specific setting yields identical results to the SPRM R implementation available from [CRAN](https://cran.r-project.org/web/packages/sprm/index.html). 63 | - `start_X_init`: str, values `'pcapp'` will include a PCA/broken stick projection to calculate the initial predictor block caseweights; any other value will just calculate initial predictor block case weights based on Euclidian distances within that block. The is less stable for very flat data (p >> n). 64 | - `colums` (def `False`): Either bool, list, numpy array or pandas Index 65 | if `False`, no column names supplied 66 | if `True`, 67 | if X data are supplied as a pandas DataFrame, will extract column 68 | names from the frame 69 | else throws an error 70 | if a list, array or Index (will only take length x_data.shape[1]), 71 | the column names of the x_data supplied in this list, 72 | will be printed in verbose mode 73 | - `copy` (def `True`): boolean, whether to create deep copy of the data in the calculation process 74 | 75 | 1\.3\. Attributes 76 | --------------- 77 | - `x_weights_`: X block PLS weighting vectors (usually denoted W) 78 | - `x_loadings_`: X block PLS loading vectors (usually denoted P) 79 | - `C_`: vector of inner relationship between response and latent variablesblock re 80 | - `x_scores_`: X block PLS score vectors (usually denoted T) 81 | - `coef_`: vector of regression coefficients 82 | - `intercept_`: intercept 83 | - `coef_scaled_`: vector of scaled regression coeeficients (when scaling option used) 84 | - `intercept_scaled_`: scaled intercept 85 | - `residuals_`: vector of regression residuals 86 | - `x_ev_`: X block explained variance per component 87 | - `y_ev_`: y block explained variance 88 | - `fitted_`: fitted response 89 | - `x_Rweights_`: X block SIMPLS style weighting vectors (usually denoted R) 90 | - `x_caseweights_`: X block case weights 91 | - `y_caseweights_`: y block case weights 92 | - `caseweights_`: combined case weights 93 | - `colret_`: names of variables retained in the sparse model 94 | - `x_loc_`: X block location estimate 95 | - `y_loc_`: y location estimate 96 | - `x_sca_`: X block scale estimate 97 | - `y_sca_`: y scale estimate 98 | - `non_zero_scale_vars_`: indicator vector of variables in X with nonzero scale 99 | 100 | 1\.4\. Methods 101 | ------------ 102 | - `fit(X,y)`: fit model 103 | - `predict(X)`: make predictions based on fit 104 | - `transform(X)`: project X onto latent space 105 | - `weightnewx(X)`: calculate X case weights 106 | - `getattr()`: get list of attributes 107 | - `setattr(**kwargs)`: set individual attribute of sprm object 108 | - `valscore(X,y,scoring)`: option to use weighted scoring function in cross-validation if scoring=weighted 109 | 110 | 1\.5\. 
Ancillary functions 111 | ------------------------ 112 | - `snipls` (class): sparse NIPALS regression (first described in: \[3\]) 113 | - `Hampel`: Hampel weight function 114 | - `Huber`: Huber weight function 115 | - `Fair`: Fair weight function 116 | - `brokenstick`: broken stick rule to estimate number of relevant principal components 117 | - `VersatileScaler` (class): centring and scaling data, with several robust options beyond `sklearn`'s `RobustScaler` 118 | - `sprm_plot` (class): plotting SPRM results 119 | - `sprm_plot_cv` (class): plotting SPRM cross-validation results 120 | 121 | 122 | 2\. The Robust M (RM) estimator 123 | ============================== 124 | 125 | RM has been implemented to be consistent with SPRM. It takes the same arguments, except for `eta`, `n_components` and `columns`, 126 | because it does not perform dimension reduction nor variable selection. For the same reasons, the outputs are limited to regression 127 | outputs. Therefore, dimension reduction outputs like `x_scores_`, `x_loadings_`, etc. are not provided. For R adepts, note that a 128 | [cellwise robust](https://github.com/SebastiaanHoppner/CRM) version of RM has recently been introduced. 129 | 130 | 131 | 3\. The Sparse NIPALS (SNIPLS) estimator 132 | ======================================= 133 | 134 | SNIPLS is the non-robust sparse univariate PLS algorithm \[3\]. SNIPLS has been implemented to be consistent with SPRM. It takes the same arguments, except for `'fun'` and `'probp1'` through `'probp3'`, since these are robustness parameters. For the same reasons, the outputs are limited to sparse dimension reduction and regression outputs. Robustness related outputs like `x_caseweights_` cannot be provided. 135 | 136 | 137 | 4\. Plotting functionality 138 | ========================= 139 | 140 | The file `sprm_plot.py` contains a set of plot functions based on Matplotlib. The class sprm_plot contains plots for sprm objects, wheras the class sprm_plot_cv contains a plot for cross-validation. 141 | 142 | 4\.1\. Dependencies 143 | ----------------- 144 | - `pandas` 145 | - `numpy` 146 | - `matplotlib.pyplot` 147 | - for plotting cross-validation results: `sklearn.model_selection.GridSearchCV` 148 | 149 | 4\.2\. Paramaters 150 | --------------- 151 | - `res_sprm`, sprm. An sprm class object that has been fit. 152 | - `colors`, list of str entries. Only mandatory input. Elements determine colors as: 153 | - \[0\]: borders of pane 154 | - \[1\]: plot background 155 | - \[2\]: marker fill 156 | - \[3\]: diagonal line 157 | - \[4\]: marker contour, if different from fill 158 | - \[5\]: marker color for new cases, if applicable 159 | - \[6\]: marker color for harsh calibration outliers 160 | - \[7\]: marker color for harsh prediction outliers 161 | - `markers`, a list of str entries. Elements determkine markers for: 162 | - \[0\]: regular cases 163 | - \[1\]: moderate outliers 164 | - \[2\]: harsh outliers 165 | 166 | 4\.3\. Methods 167 | ------------ 168 | - `plot_coeffs(entity="coef_",truncation=0,columns=[],title=[])`: Plot regression coefficients, loadings, etc. with the option only to plot the x% smallest and largets coefficients (truncation) 169 | - `plot_yyp(ytruev=[],Xn=[],label=[],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False)`: Plot y vs y predicted. 170 | - `plot_projections(Xn=[],label=[],components = [0,1],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False)`: Plot score space. 
171 | - `plot_caseweights(Xn=[],label=[],names=[],namesv=[],title=[],legend_pos='lower right',onlyval=False,mode='overall')`: Plot caseweights, with the option to plot `'x'`, `'y'` or `'overall'` case weights for cases used to train the model. For new cases, only `'x'` weights can be plotted.
172 |
173 | 4\.4\. Remark
174 | -----------
175 | The latter 3 methods will work both for cases that the model has been trained with (no additional input) and for new cases (requires Xn and, in the case of plot_yyp, ytruev), with the option to plot only the latter (option onlyval = True). All three functions have the option to plot case names if supplied as a list.
176 |
177 | 4\.5\. Ancillary classes
178 | ----------------------
179 | - `sprm_plot_cv` has method `eta_ncomp_contour(title)` to plot sklearn GridSearchCV results
180 |
181 |
182 | References
183 | ==========
184 | 1. [Sparse partial robust M regression](https://www.sciencedirect.com/science/article/abs/pii/S0169743915002440), Irene Hoffmann, Sven Serneels, Peter Filzmoser, Christophe Croux, Chemometrics and Intelligent Laboratory Systems, 149 (2015), 50-59.
185 | 2. [Partial robust M regression](https://doi.org/10.1016/j.chemolab.2005.04.007), Sven Serneels, Christophe Croux, Peter Filzmoser, Pierre J. Van Espen, Chemometrics and Intelligent Laboratory Systems, 79 (2005), 55-64.
186 | 3. [Sparse and robust PLS for binary classification](https://onlinelibrary.wiley.com/doi/abs/10.1002/cem.2775), I. Hoffmann, P. Filzmoser, S. Serneels, K. Varmuza, Journal of Chemometrics, 30 (2016), 153-162.
187 |
--------------------------------------------------------------------------------
/src/direpack/sprm/rm.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Thu Jan 24 2019
3 |
4 | Module containing:
5 |
6 | Estimators
7 | ----------
8 | Robust M Regression (RM)
9 |
10 | Depends on robcent class for robustly centering and scaling data, as well as on
11 | the functions in _m_support_functions.
12 |
13 | @author: Sven Serneels, Ponalytics
14 | """
15 | from __future__ import absolute_import, division, print_function
16 | from __future__ import unicode_literals
17 | from sklearn.base import RegressorMixin, BaseEstimator
18 | from sklearn.utils.metaestimators import _BaseComposition
19 | import copy
20 | import numpy as np
21 | import pandas as ps
22 | from scipy.stats import norm, chi2
23 | from ..preprocessing.robcent import VersatileScaler
24 | from ..utils.utils import MyException, _predict_check_input, _check_input
25 | from ._m_support_functions import *
26 |
27 |
28 | class rm(_BaseComposition, BaseEstimator, RegressorMixin):
29 |
30 | """
31 | Robust M Regression
32 |
33 | Parameters:
34 | -----------
35 | fun: str, downweighting function. 'Hampel' (recommended), 'Fair' or
36 | 'Huber'
37 | probp1: float, probability cutoff for start of downweighting
38 | (e.g. 0.95)
39 | probp2: float, probability cutoff for start of steep downweighting
40 | (e.g. 0.975, only relevant if fun='Hampel')
41 | probp3: float, probability cutoff for start of outlier omission
42 | (e.g.
0.999, only relevant if fun='Hampel') 43 | centre: str, type of centring (`'mean'`, `'median'` or `'l1median'`, 44 | the latter recommended statistically, if too slow, switch to `'median'`) 45 | scale: str, type of scaling ('std','mad' [recommended] or 'None') 46 | verbose: boolean, specifying verbose mode 47 | maxit: int, maximal number of iterations in M algorithm 48 | tol: float, tolerance for convergence in M algorithm 49 | start_cutoff_mode: str, values: 50 | 'specific' will set starting value cutoffs specific to X and y (preferred); 51 | any other value will set X and y stating cutoffs identically. 52 | The latter yields identical results to the SPRM R implementation available from 53 | CRAN. 54 | copy (def True): boolean, whether to copy data 55 | Note: copy not yet aligned with sklearn def 56 | 57 | """ 58 | 59 | def __init__( 60 | self, 61 | fun="Hampel", 62 | probp1=0.95, 63 | probp2=0.975, 64 | probp3=0.999, 65 | centre="median", 66 | scale="mad", 67 | start_cutoff_mode="specific", 68 | verbose=True, 69 | maxit=100, 70 | tol=0.01, 71 | copy=True, 72 | ): 73 | self.fun = fun 74 | self.probp1 = probp1 75 | self.probp2 = probp2 76 | self.probp3 = probp3 77 | self.centre = centre 78 | self.scale = scale 79 | self.start_cutoff_mode = start_cutoff_mode 80 | self.verbose = verbose 81 | self.maxit = maxit 82 | self.tol = tol 83 | self.copy = copy 84 | self.probctx_ = "irrelevant" 85 | self.probcty_ = "irrelevant" 86 | self.hampelbx_ = "irrelevant" 87 | self.hampelby__ = "irrelevant" 88 | self.hampelrx_ = "irrelevant" 89 | self.hampelry_ = "irrelevant" 90 | 91 | def fit(self, X, y): 92 | if self.copy: 93 | self.X = copy.deepcopy(X) 94 | self.y = copy.deepcopy(y) 95 | (n, p) = X.shape 96 | if not (self.fun in ("Hampel", "Huber", "Fair")): 97 | raise MyException( 98 | "Invalid weighting function. Choose Hampel, Huber or Fair for parameter fun." 99 | ) 100 | if (self.probp1 > 1) | (self.probp1 <= 0): 101 | raise MyException("probp1 is a probability. Choose a value between 0 and 1") 102 | if self.fun == "Hampel": 103 | if not ( 104 | (self.probp1 < self.probp2) 105 | & (self.probp2 < self.probp3) 106 | & (self.probp3 <= 1) 107 | ): 108 | raise MyException( 109 | "Wrong choise of parameters for Hampel function. 
Use 0 1: 119 | y = np.array(y).reshape(-1).astype("float64") 120 | ny = y.shape[0] 121 | if ny != n: 122 | raise MyException("Number of cases in y and X must be identical.") 123 | 124 | scaling = VersatileScaler(center=self.centre, scale=self.scale) 125 | Xs = scaling.fit_transform(X).astype("float64") 126 | mX = scaling.col_loc_ 127 | sX = scaling.col_sca_ 128 | ys = scaling.fit_transform(y).astype("float64") 129 | my = scaling.col_loc_ 130 | sy = scaling.col_sca_ 131 | ys = np.array(ys).reshape(-1) 132 | 133 | wx = np.sqrt(np.array(np.sum(np.square(Xs), 1), dtype=np.float64)) 134 | wx = wx / np.median(wx) 135 | if [self.centre, self.scale] == ["median", "mad"]: 136 | wy = np.array(abs(ys), dtype=np.float64) 137 | else: 138 | wy = (y - np.median(y)) / (1.4826 * np.median(abs(y - np.median(y)))) 139 | self.probcty_ = norm.ppf(self.probp1) 140 | if self.start_cutoff_mode == "specific": 141 | self.probctx_ = chi2.ppf(self.probp1, p) 142 | else: 143 | self.probctx_ = self.probcty_ 144 | if self.fun == "Fair": 145 | wx = Fair(wx, self.probctx_) 146 | wy = Fair(wy, self.probcty_) 147 | if self.fun == "Huber": 148 | wx = Huber(wx, self.probctx_) 149 | wy = Huber(wy, self.probcty_) 150 | if self.fun == "Hampel": 151 | self.hampelby_ = norm.ppf(self.probp2) 152 | self.hampelry_ = norm.ppf(self.probp3) 153 | if self.start_cutoff_mode == "specific": 154 | self.hampelbx_ = chi2.ppf(self.probp2, p) 155 | self.hampelrx_ = chi2.ppf(self.probp3, p) 156 | else: 157 | self.hampelbx_ = self.hampelby_ 158 | self.hampelrx_ = self.hampelry_ 159 | wx = Hampel(wx, self.probctx_, self.hampelbx_, self.hampelrx_) 160 | wy = Hampel(wy, self.probcty_, self.hampelby_, self.hampelry_) 161 | wx = np.array(wx).reshape(-1) 162 | w = (wx * wy).astype("float64") 163 | if (w < 1e-06).any(): 164 | w0 = np.where(w < 1e-06)[0] 165 | w[w0] = 1e-06 166 | we = np.array(w, dtype=np.float64) 167 | else: 168 | we = np.array(w, dtype=np.float64) 169 | wye = wy 170 | WEmat = np.array([np.sqrt(we) for i in range(1, p + 1)], ndmin=1).T 171 | Xw = np.multiply(Xs, WEmat).astype("float64") 172 | yw = ys * np.sqrt(we) 173 | loops = 1 174 | rold = 1e-5 175 | difference = 1 176 | 177 | while (difference > self.tol) & (loops < self.maxit): 178 | b = np.linalg.lstsq(Xw, yw, rcond=None) 179 | b = np.array(b[0]).reshape(-1, 1) 180 | yp = np.dot(Xs, b).reshape(-1) 181 | r = ys - yp 182 | if len(r) / 2 > np.sum(r == 0): 183 | r = abs(r) / (1.4826 * np.median(abs(r))) 184 | else: 185 | r = abs(r) / (1.4826 * np.median(abs(r[r != 0]))) 186 | wye = r 187 | if self.fun == "Fair": 188 | wye = Fair(wye, self.probcty_) 189 | if self.fun == "Huber": 190 | wye = Huber(wye, self.probcty_) 191 | if self.fun == "Hampel": 192 | wye = Hampel(wye, self.probcty_, self.hampelby_, self.hampelry_) 193 | b2sum = np.sum(np.square(b)) 194 | difference = abs(b2sum - rold) / rold 195 | rold = b2sum 196 | we = (wye * wx).astype("float64") 197 | w0 = [] 198 | if any(we < 1e-06): 199 | w0 = np.where(we < 1e-06)[0] 200 | we[w0] = 1e-06 201 | we = np.array(we, dtype=np.float64) 202 | if len(w0) >= (n / 2): 203 | break 204 | WEmat = np.array([np.sqrt(we) for i in range(1, p + 1)], ndmin=1).T 205 | Xw = np.multiply(Xs, WEmat).astype("float64") 206 | yw = ys * np.sqrt(we) 207 | loops += 1 208 | if difference > self.maxit: 209 | print( 210 | "Warning: Method did not converge. 
The scaled difference between norms of the coefficient vectors is " 211 | + str(round(difference, 4)) 212 | ) 213 | plotprec = False 214 | if plotprec: 215 | print(str(loops - 1)) 216 | w = we 217 | w[w0] = 0 218 | wx[w0] = 0 219 | wy = wye 220 | wy[w0] = 0 221 | Xrw = np.array(np.multiply(Xs, np.sqrt(WEmat)).astype("float64")) 222 | scaling.set_params(scale="None") 223 | Xrw = scaling.fit_transform(Xrw) 224 | b_rescaled = np.multiply(np.reshape(sy / sX, (p, 1)), b) 225 | yp_rescaled = np.matmul(X, b_rescaled).reshape(-1) 226 | if self.centre == "mean": 227 | intercept = np.mean(y - yp_rescaled) 228 | else: 229 | intercept = np.median(y - yp_rescaled) 230 | yfit = yp_rescaled + intercept 231 | if self.scale != "None": 232 | if self.centre == "mean": 233 | b0 = np.mean(ys.astype("float64") - np.matmul(Xs.astype("float64"), b)) 234 | else: 235 | b0 = np.median( 236 | np.array(ys.astype("float64") - np.matmul(Xs.astype("float64"), b)) 237 | ) 238 | else: 239 | if self.centre == "mean": 240 | ytil = np.array(np.matmul(X, b)).reshape(-1) 241 | intercept = np.mean(y - ytil) 242 | else: 243 | intercept = np.median(y - ytil) 244 | b0 = intercept 245 | r = y - yfit 246 | setattr(self, "coef_", b_rescaled) 247 | setattr(self, "intercept_", intercept) 248 | setattr(self, "coef_scaled_", b) 249 | setattr(self, "intercept_scaled_", b0) 250 | setattr(self, "residuals_", r) 251 | setattr(self, "fitted_", yfit) 252 | setattr(self, "x_caseweights_", wx) 253 | setattr(self, "y_caseweights_", wy) 254 | setattr(self, "caseweights_", w) 255 | setattr(self, "x_loc_", mX) 256 | setattr(self, "y_loc_", my) 257 | setattr(self, "x_sca_", sX) 258 | setattr(self, "y_sca_", sy) 259 | setattr(self, "scaling_", scaling) 260 | return self 261 | pass 262 | 263 | def predict(self, Xn): 264 | n, p, Xn = _predict_check_input(Xn) 265 | if p != self.X.shape[1]: 266 | raise ( 267 | ValueError( 268 | "New data must have seame number of columns as the ones the model has been trained with" 269 | ) 270 | ) 271 | return np.matmul(Xn, self.coef_) + self.intercept_ 272 | -------------------------------------------------------------------------------- /src/direpack/sprm/snipls.py: -------------------------------------------------------------------------------- 1 | # Created on Fri Apr 26 19:27:52 2019 2 | 3 | # @author: sven 4 | 5 | 6 | from __future__ import absolute_import, division, print_function 7 | from __future__ import unicode_literals 8 | from sklearn.base import RegressorMixin, BaseEstimator, TransformerMixin 9 | from sklearn.utils.metaestimators import _BaseComposition 10 | import copy 11 | import numpy as np 12 | import pandas as ps 13 | from ..preprocessing.robcent import VersatileScaler 14 | from ..utils.utils import MyException, _predict_check_input, _check_input, nandot, nanmatdot 15 | from ..preprocessing._preproc_utilities import scale_data 16 | 17 | 18 | class snipls(_BaseComposition, BaseEstimator, TransformerMixin, RegressorMixin): 19 | """ 20 | SNIPLS Sparse Nipals Algorithm 21 | 22 | Algorithm first outlined in: 23 | Sparse and robust PLS for binary classification, 24 | I. Hoffmann, P. Filzmoser, S. Serneels, K. Varmuza, 25 | Journal of Chemometrics, 30 (2016), 153-162. 26 | 27 | As of driepack-1.1.2, snipls works when there are missing data in the inputs 28 | 29 | Parameters 30 | ----------- 31 | 32 | eta : float. 33 | Sparsity parameter in [0,1) 34 | 35 | n_components : int, 36 | min 1. 
Note that if applied on data, n_components shall take a value <= min(x_data.shape) 37 | 38 | verbose: Boolean (def true) 39 | to print intermediate set of columns retained 40 | 41 | columns : Either boolean, list, numpy array or pandas Index (def false) 42 | if False, no column names supplied; if True, if X data are supplied as a pandas data frame, will extract column names from the frame throws an error for other data input types if a list, array or Index (will only take length x_data.shape[1]), the column names of the x_data supplied in this list, will be printed in verbose mode. 43 | 44 | centre : str, 45 | type of centring (`'mean'` [recommended], `'median'` or `'l1median'`), 46 | 47 | scale : str, 48 | type of scaling ('std','mad' or 'None') 49 | 50 | copy : (def True): boolean, 51 | whether to copy data. Note : copy not yet aligned with sklearn def - we always copy 52 | 53 | 54 | Attributes 55 | ------------ 56 | Attributes always provided: 57 | 58 | - `x_weights_`: X block PLS weighting vectors (usually denoted W) 59 | - `x_loadings_`: X block PLS loading vectors (usually denoted P) 60 | - `C_`: vector of inner relationship between response and latent variablesblock re 61 | - `x_scores_`: X block PLS score vectors (usually denoted T) 62 | - `coef_`: vector of regression coefficients 63 | - `intercept_`: intercept 64 | - `coef_scaled_`: vector of scaled regression coeeficients (when scaling option used) 65 | - `intercept_scaled_`: scaled intercept 66 | - `residuals_`: vector of regression residuals 67 | - `x_ev_`: X block explained variance per component 68 | - `y_ev_`: y block explained variance 69 | - `fitted_`: fitted response 70 | - `x_Rweights_`: X block SIMPLS style weighting vectors (usually denoted R) 71 | - `colret_`: names of variables retained in the sparse model 72 | - `x_loc_`: X block location estimate 73 | - `y_loc_`: y location estimate 74 | - `x_sca_`: X block scale estimate 75 | - `y_sca_`: y scale estimate 76 | - `centring_`: scaling object used internally (from `VersatileScaler`) 77 | 78 | """ 79 | 80 | def __init__( 81 | self, 82 | eta=0.5, 83 | n_components=1, 84 | verbose=True, 85 | columns=False, 86 | centre="mean", 87 | scale="None", 88 | copy=True, 89 | ): 90 | assert eta >= 0 and eta < 1, "eta needs to be in [0,1)" 91 | assert isinstance( 92 | n_components, int) and n_components > 0, "number of components needs to be positive integer" 93 | self.eta = eta 94 | self.n_components = n_components 95 | self.verbose = verbose 96 | self.columns = columns 97 | self.centre = centre 98 | self.scale = scale 99 | self.copy = copy 100 | 101 | def fit(self, X, y): 102 | """ 103 | Fit a SNIPLS model. 104 | 105 | Parameters 106 | ------------ 107 | 108 | X : numpy array 109 | Input data. 
110 | 111 | y : vector or 1D matrix 112 | Response data 113 | 114 | """ 115 | if type(self.columns) is list: 116 | self.columns = np.array(self.columns) 117 | elif type(self.columns) is bool: 118 | if type(X) != ps.core.frame.DataFrame and self.columns: 119 | raise ( 120 | MyException( 121 | "Columns set to true can only extract column names for data frame input" 122 | ) 123 | ) 124 | if type(X) == ps.core.frame.DataFrame: 125 | if type(self.columns) is bool and self.columns: 126 | self.columns = X.columns 127 | X = X.to_numpy() 128 | (n, p) = X.shape 129 | if type(y) in [ps.core.frame.DataFrame, ps.core.series.Series]: 130 | y = y.to_numpy() 131 | X = _check_input(X) 132 | y = _check_input(y) 133 | ny = y.shape[0] 134 | if ny != n: 135 | if y.ndim == 2: 136 | y = y.T 137 | else: 138 | raise (MyException("Number of cases in X and y needs to agree")) 139 | y = y.astype("float64") 140 | if self.copy: 141 | X0 = copy.deepcopy(X) 142 | y0 = copy.deepcopy(y) 143 | else: 144 | X0 = X 145 | y0 = y 146 | self.X = X0 147 | self.y = y0 148 | X0 = X0.astype("float64") 149 | centring = VersatileScaler(center=self.centre, scale=self.scale) 150 | X0 = centring.fit_transform(X0).astype("float64") 151 | mX = centring.col_loc_ 152 | sX = centring.col_sca_ 153 | y0 = centring.fit_transform(y0).astype("float64") 154 | my = centring.col_loc_ 155 | sy = centring.col_sca_ 156 | if np.isnan(X0).any() or np.isnan(y0).any(): 157 | S = nanmatdot(X0.T, X0) 158 | dot = nandot 159 | else: 160 | S = np.matmul(X0.T, X0) 161 | dot = np.dot 162 | s0 = dot(X0.T, y0) 163 | T = np.empty((n, self.n_components), float) 164 | W = np.empty((p, self.n_components), float) 165 | P = np.empty((p, self.n_components), float) 166 | C = np.empty((self.n_components, 1), float) 167 | Xev = np.empty((self.n_components, 1), float) 168 | yev = np.empty((self.n_components, 1), float) 169 | B = np.empty((p, 1), float) 170 | oldgoodies = np.array([]) 171 | Xi = X0 172 | yi = y0 173 | for i in range(1, self.n_components + 1): 174 | wh = dot(Xi.T, yi) 175 | wh = wh / np.linalg.norm(wh, "fro") 176 | # goodies = abs(wh)-llambda/2 lambda definition 177 | goodies = abs(wh) - self.eta * max(abs(wh)) 178 | wh = np.multiply(goodies, np.sign(wh)) 179 | goodies = np.where((goodies > 0))[0] 180 | goodies = np.union1d(oldgoodies, goodies) 181 | oldgoodies = goodies 182 | if len(goodies) == 0: 183 | colret = None 184 | print( 185 | "No variables retained at" 186 | + str(i) 187 | + "latent variables" 188 | + "and lambda = " 189 | + str(self.eta) 190 | + ", try lower lambda" 191 | ) 192 | break 193 | elimvars = np.setdiff1d(range(0, p), goodies) 194 | wh[elimvars] = 0 195 | th = dot(Xi, wh) 196 | nth = np.linalg.norm(th, "fro") 197 | ch = dot(yi.T, th) / (nth ** 2) 198 | ph = dot(Xi.T, dot(Xi, wh)) / (nth ** 2) 199 | Xi = Xi - np.dot(th, ph.T) 200 | yi = yi - np.dot(th, ch) 201 | ph[elimvars] = 0 202 | W[:, i - 1] = np.reshape(wh, p) 203 | P[:, i - 1] = np.reshape(ph, p) 204 | C[i - 1] = ch 205 | T[:, i - 1] = np.reshape(th, n) 206 | Xev[i - 1] = ( 207 | (nth ** 2 * np.linalg.norm(ph, "fro") ** 2) 208 | / np.nansum(np.square(X0)) 209 | * 100 210 | ) 211 | yev[i - 1] = np.nansum(nth ** 2 * (ch ** 2)) / \ 212 | np.nansum(np.power(y0, 2)) * 100 213 | if type(self.columns) == bool: 214 | colret = goodies 215 | else: 216 | colret = self.columns[np.setdiff1d(range(0, p), elimvars)] 217 | if self.verbose: 218 | print( 219 | "Variables retained for " 220 | + str(i) 221 | + " latent variable(s):" 222 | + "\n" 223 | + str(colret) 224 | + ".\n" 225 | ) 226 | if 
len(goodies) > 0: 227 | R = np.matmul( 228 | W[:, range(0, i)], 229 | np.linalg.inv( 230 | np.matmul(P[:, range(0, i)].T, W[:, range(0, i)])), 231 | ) 232 | B = np.matmul( 233 | W[:, range(0, i)], 234 | np.matmul( 235 | np.linalg.inv( 236 | np.matmul( 237 | np.matmul(W[:, range(0, i)].T, S), 238 | W[:, range(0, i)], 239 | ) 240 | ), 241 | np.matmul(W[:, range(0, i)].T, s0), 242 | ), 243 | ) 244 | else: 245 | B = np.empty((p, 1)) 246 | B.fill(0) 247 | R = B 248 | T = np.empty((n, self.n_components)) 249 | T.fill(0) 250 | B_rescaled = np.multiply(np.array(sy / sX).reshape((p, 1)), B) 251 | yp_rescaled = dot(X, B_rescaled) 252 | if self.centre == "mean": 253 | intercept = np.nanmean(y - yp_rescaled) 254 | elif self.centre == "None": 255 | intercept = 0 256 | else: 257 | intercept = np.nanmedian(y - yp_rescaled) 258 | yfit = yp_rescaled + intercept 259 | yfit = yfit.reshape(-1) 260 | r = y.ravel() - yfit 261 | setattr(self, "x_weights_", W) 262 | setattr(self, "x_loadings_", P) 263 | setattr(self, "C_", C) 264 | setattr(self, "x_scores_", T) 265 | setattr(self, "coef_", B_rescaled) 266 | setattr(self, "coef_scaled_", B) 267 | setattr(self, "intercept_", intercept) 268 | setattr(self, "x_ev_", Xev) 269 | setattr(self, "y_ev_", yev) 270 | setattr(self, "fitted_", yfit) 271 | setattr(self, "residuals_", r) 272 | setattr(self, "x_Rweights_", R) 273 | setattr(self, "colret_", colret) 274 | setattr(self, "x_loc_", mX) 275 | setattr(self, "y_loc_", my) 276 | setattr(self, "x_sca_", sX) 277 | setattr(self, "y_sca_", sy) 278 | setattr(self, "centring_", centring) 279 | return self 280 | 281 | def predict(self, Xn): 282 | """ 283 | Predict using a SNIPLS model. 284 | 285 | Parameters 286 | ------------ 287 | 288 | Xn : numpy array or data frame 289 | Input data. 290 | 291 | """ 292 | n, p, Xn = _predict_check_input(Xn) 293 | if p != self.X.shape[1]: 294 | raise ( 295 | ValueError( 296 | "New data must have same number of columns as the ones the model has been trained with" 297 | ) 298 | ) 299 | return np.matmul(Xn, self.coef_) + self.intercept_ 300 | 301 | def transform(self, Xn): 302 | """ 303 | Transform input data. 304 | 305 | 306 | Parameters 307 | ------------ 308 | 309 | Xn : numpy array or data frame 310 | Input data. 311 | 312 | """ 313 | n, p, Xn = _predict_check_input(Xn) 314 | if p != self.X.shape[1]: 315 | raise ( 316 | ValueError( 317 | "New data must have seame number of columns as the ones the model has been trained with" 318 | ) 319 | ) 320 | Xnc = scale_data(Xn, self.x_loc_, self.x_sca_) 321 | return np.dot(Xnc, self.x_Rweights_) 322 | --------------------------------------------------------------------------------