├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── colorectal_cancer_nmr.csv ├── pyopls │   ├── .gitignore │   ├── __init__.py │   ├── kernel_density.py │   ├── opls.py │   ├── permutation_test.py │   ├── tests │   │   ├── features.npy │   │   ├── target.npy │   │   └── test_opls.py │   └── validation.py ├── requirements.txt ├── roc_curve.png ├── scores.png ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .idea/* 107 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | - "3.7" 5 | - "3.8" 6 | #- "nightly" # nightly build turned off for now because numpy cannot be installed using pip yet 7 | install: 8 | - pip install -r requirements.txt 9 | script: 10 | - python -m pytest pyopls/tests/ 11 | deploy: 12 | provider: pypi 13 | user: __token__ 14 | password: 15 | secure: HJJK7VFqtvpfrGKm2Tvikmur5xVbeLIKT1d6sz+NR9KII9OYReTGi6gGjFSxmPOLTLKuMHpJ8GLfUipK0RiwQYZMe6ve9kuAX4w3Gcn3juR2CLRjc78zwg3u7vQzunuUmXO1vjDhIOawgnbwcaHKZ0bnnHCs39uwUfdxq673SrmlSLCNrRLLsfUOMycCX1HVtAlSDGvdPdsl6DMx+YqJp05uc6oVk50RgojQxjetJtonUcEeEIWXm6V/twQc4wa8mlt/ZnCnkF3kccQUp+a4/P7uoHzF6GLOjJ15rvPz6f/J8wZr1n9Jz5Ng0U4aJFX3clVMGJLcOaRf3owcydIQvfyKzwKuqPN0nnvPv4FMKTNPEMpLVwIc4PUJFrZ1g4013tZZT6zuEqKYrOPpz/nifobWhLUc2ktHt3t0B6VR2VOcYJRWlMpuI9gwAYq0dMtlnIWReYeerLWMLnpzFwAHfogOjKoCAIEPM/PmtjNh2hrYIZZhiH+wCuqcYG4SrC9J0azx8vXWzbxYPRNfvZpxmNjZ2kAeEYsQ0BEL1UI9839rmkOGyVqbEZ9VP1GHg5KsifmeD+VzZEaJOS+PYu8YPuHrfah7MYswaQC4uyU/vzEJlc3oZ/15rqzsgj9GKqWmMS25OvXxOilt9W5CpIKrV7nNzfn22JOKHRMwjPS28f8= 16 | skip_existing: true 17 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Wright State University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyopls - Orthogonal Projection to Latent Structures in Python. 2 | [![Build Status](https://travis-ci.org/BiRG/pyopls.svg?branch=master)](https://travis-ci.org/BiRG/pyopls) 3 | 4 | This package provides a scikit-learn-style transformer to perform OPLS. 5 | OPLS is a pre-processing method to remove variation from the descriptor 6 | variables that is orthogonal to the target variable (1). 7 | 8 | This package also provides a class to validate OPLS models using a 9 | 1-component PLS regression with cross-validation and permutation tests (2) 10 | for both regression and classification metrics (from permutations of the 11 | target) and feature PLS loadings (from permutations of the features). 12 | 13 | ## Table of Contents 14 | 1. [Installation](#installation) 15 | 2. [Notes](#notes) 16 | 3. [Examples](#examples) 17 | 1. [OPLS and PLS-DA](#opls-and-pls-da) 18 | 2. [Validation](#validation) 19 | 4. [References](#references) 20 | 5. [Data Acknowledgment](#data-acknowledgment) 21 | 22 | 23 | ## Installation 24 | pyopls is available via [pypi](https://pypi.org/project/pyopls/): 25 | ```shell 26 | pip install pyopls 27 | ``` 28 | You may also install the current master directly from this 29 | repository: 30 | ```shell 31 | pip install git+git://github.com/BiRG/pyopls.git 32 | ``` 33 | New versions are uploaded to pypi whenever the version number is 34 | incremented in `setup.py` on the master branch. 35 | 36 | 37 | ## Notes 38 | * The implementation provided here is equivalent to that of the 39 | [libPLS MATLAB library](http://libpls.net/), which is a faithful 40 | recreation of Trygg and Wold's algorithm. 41 | * This package uses a different definition for R2X, however (see 42 | below). 43 | * `OPLS` inherits `sklearn.base.TransformerMixin` (like 44 | `sklearn.decomposition.PCA`) but does not inherit 45 | `sklearn.base.RegressorMixin` because it is not a regressor like 46 | `sklearn.cross_decomposition.PLSRegression`. You can use the output of 47 | `OPLS.transform()` as an input to another regressor or classifier. 
48 | * Like `sklearn.cross_decomposition.PLSRegression`, `OPLS` will center 49 | both X and Y before performing the algorithm. This makes centering by 50 | class in PLS-DA models unnecessary. 51 | * The `score()` function of `OPLS` returns the R2X score, the 52 | ratio of the variance in the transformed X to the variance in the 53 | original X. A lower score indicates more orthogonal variance removed. 54 | * `OPLS` only supports 1-column targets. 55 | 56 | ## Examples 57 | ### OPLS and PLS-DA 58 | A CSV file containing 1H-NMR spectra for 118 serum samples of patients 59 | with colon cancer diagnoses and healthy controls is located in 60 | `colorectal_cancer_nmr.csv` in the root of this repository (see 61 | acknowledgment below). 62 | 63 | OPLS-processed data require only 1 PLS component. Performing a 64 | 39-component OPLS improves cross-validated accuracy from 70% to 100%, 65 | AUC from 0.578 to 1 and DQ2 (3) from -0.11 to 0.98. 66 | 67 | ```python 68 | import pandas as pd 69 | import numpy as np 70 | import matplotlib.pyplot as plt 71 | from sklearn.metrics import roc_curve, roc_auc_score 72 | from pyopls import OPLS 73 | from sklearn.cross_decomposition import PLSRegression 74 | from sklearn.model_selection import cross_val_predict, LeaveOneOut 75 | from sklearn.metrics import r2_score, accuracy_score 76 | 77 | 78 | spectra = pd.read_csv('colorectal_cancer_nmr.csv', index_col=0) 79 | spectra = spectra[spectra.classification.isin(['Colorectal Cancer', 'Healthy Control'])] 80 | target = spectra.classification.apply(lambda x: 1 if x == 'Colorectal Cancer' else -1) 81 | spectra = spectra.drop('classification', axis=1) 82 | 83 | opls = OPLS(39) 84 | Z = opls.fit_transform(spectra, target) 85 | 86 | pls = PLSRegression(1) 87 | y_pred = cross_val_predict(pls, spectra, target, cv=LeaveOneOut()) 88 | q_squared = r2_score(target, y_pred) # -0.107 89 | dq_squared = r2_score(target, np.clip(y_pred, -1, 1)) # -0.106 90 | accuracy = accuracy_score(target, np.sign(y_pred)) # 0.705 91 | 92 | processed_y_pred = cross_val_predict(pls, Z, target, cv=LeaveOneOut()) 93 | processed_q_squared = r2_score(target, processed_y_pred) # 0.981 94 | processed_dq_squared = r2_score(target, np.clip(processed_y_pred, -1, 1)) # 0.984 95 | processed_accuracy = accuracy_score(target, np.sign(processed_y_pred)) # 1.0 96 | 97 | r2_X = opls.score(spectra) # 7.8e-12 (most variance is removed) 98 | 99 | fpr, tpr, thresholds = roc_curve(target, y_pred) 100 | roc_auc = roc_auc_score(target, y_pred) 101 | proc_fpr, proc_tpr, proc_thresholds = roc_curve(target, processed_y_pred) 102 | proc_roc_auc = roc_auc_score(target, processed_y_pred) 103 | 104 | plt.figure(0) 105 | plt.plot(fpr, tpr, lw=2, color='blue', label=f'Unprocessed (AUC={roc_auc:.4f})') 106 | plt.plot(proc_fpr, proc_tpr, lw=2, color='red', 107 | label=f'39-component OPLS (AUC={proc_roc_auc:.4f})') 108 | plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--') 109 | plt.xlabel('False Positive Rate') 110 | plt.ylabel('True Positive Rate') 111 | plt.title('ROC Curve') 112 | plt.legend(loc='lower right') 113 | plt.show() 114 | 115 | plt.figure(1) 116 | pls.fit(Z, target) 117 | df = pd.DataFrame(np.column_stack([pls.x_scores_, opls.T_ortho_[:, 0]]), 118 | index=spectra.index, columns=['t', 't_ortho']) 119 | pos_df = df[target==1] 120 | neg_df = df[target==-1] 121 | plt.scatter(neg_df['t'], neg_df['t_ortho'], c='blue', label='Healthy Control') 122 | plt.scatter(pos_df['t'], pos_df['t_ortho'], c='red', label='Colorectal Cancer') 123 | plt.title('PLS Scores') 124 | 
plt.xlabel('t') 125 | plt.ylabel('t_ortho') 126 | plt.legend(loc='upper right') 127 | plt.show() 128 | ``` 129 | #### ROC Curve 130 | ![roc curve](roc_curve.png) 131 | #### Scores Plot 132 | ![scores plot](scores.png) 133 | ### Validation 134 | The `fit()` method of `OPLSValidator` will find the optimum number of 135 | components to remove, then evaluate the results on a 1-component 136 | `sklearn.cross_decomposition.PLSRegression` model. A permutation test is 137 | performed for each metric by permuting the target and for the PLS 138 | loadings by permuting the features. 139 | 140 | This snippet will determine the best number of components to remove, 141 | perform permutation tests for regression metrics and perform two-tailed 142 | permutation tests for each feature (bin) relative to its loading. The 143 | feature permutation tests for the colorectal cancer dataset would take 144 | quite some time, as they require that the model be fit as many as 874k 145 | times. So instead, we look at the 146 | [UCI ML Wine Dataset](https://archive.ics.uci.edu/ml/datasets/Wine) 147 | provided by 148 | [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html). 149 | The feature permutation tests reveal that hue and malic acid do not 150 | differentiate class 1 from class 0. 151 | 152 | ```python 153 | import pandas as pd 154 | from pyopls import OPLSValidator 155 | from sklearn.datasets import load_wine 156 | 157 | wine_data = load_wine() 158 | df = pd.DataFrame(wine_data['data'], columns=wine_data['feature_names']) 159 | df['classification'] = wine_data['target'] 160 | df = df[df.classification.isin((0, 1))] 161 | target = df.classification.apply(lambda x: 1 if x else -1) # discriminant for class 1 vs class 0 162 | X = df[[c for c in df.columns if c!='classification']] 163 | 164 | validator = OPLSValidator(k=-1).fit(X, target) 165 | 166 | Z = validator.opls_.transform(X) 167 | 168 | feature_df = pd.DataFrame() 169 | feature_df['feature_name'] = wine_data['feature_names'] 170 | feature_df['feature_p_value'] = validator.feature_p_values_ 171 | feature_df['feature_loading'] = validator.pls_.x_loadings_ 172 | print(feature_df.loc[feature_df.feature_loading.abs().sort_values(ascending=False).index].to_markdown()) # Pandas 1.0+ required for to_markdown 173 | ``` 174 | #### Feature importances 175 | | | feature\_name | feature\_p\_value | feature\_loading | 176 | |---:|:-----------------------------|------------------:|-----------------:| 177 | | 12 | proline | 0.00990099 | 0.385955 | 178 | | 9 | color_intensity | 0.00990099 | 0.381981 | 179 | | 0 | alcohol | 0.00990099 | 0.379567 | 180 | | 6 | flavanoids | 0.00990099 | 0.359975 | 181 | | 5 | total_phenols | 0.00990099 | 0.336182 | 182 | | 11 | od280/od315_of_diluted_wines | 0.00990099 | 0.299045 | 183 | | 3 | alcalinity_of_ash | 0.00990099 | -0.239887 | 184 | | 2 | ash | 0.00990099 | 0.22916 | 185 | | 7 | nonflavanoid_phenols | 0.00990099 | -0.224338 | 186 | | 4 | magnesium | 0.00990099 | 0.18662 | 187 | | 8 | proanthocyanins | 0.00990099 | 0.181767 | 188 | | 1 | malic_acid | 0.564356 | 0.0293328 | 189 | | 10 | hue | 0.623762 | 0.0210777 | 190 | 191 | ## References 192 | 1. Johan Trygg and Svante Wold. Orthogonal projections to latent structures (O-PLS). 193 | *J. Chemometrics* 2002; 16: 119-128. DOI: [10.1002/cem.695](https://dx.doi.org/10.1002/cem.695) 194 | 2. Eugene Edgington and Patrick Onghena. "Calculating P-Values" in *Randomization tests*, 4th edition. 195 | New York: Chapman & Hall/CRC, 2007, pp. 33-53. 
DOI: [10.1201/9781420011814](https://doi.org/10.1201/9781420011814). 196 | 3. Johan A. Westerhuis, Ewoud J. J. van Velzen, Huub C. J. Hoefsloot, Age K. Smilde. Discriminant Q-squared for 197 | improved discrimination in PLSDA models. *Metabolomics* 2008; 4: 293-296. 198 | DOI: [10.1007/s11306-008-0126-2](https://doi.org/10.1007/s11306-008-0126-2) 199 | 200 | ## Data Acknowledgment 201 | The test dataset provided at `colorectal_cancer_nmr.csv` in the root of this repository is 202 | available at the NIH Common Fund's National Metabolomics Data Repository 203 | (NMDR) website, the Metabolomics Workbench, 204 | [https://metabolomicsworkbench.org](https://metabolomicsworkbench.org), where it has been assigned Project 205 | ID PR000227. The data can be accessed directly via its Project DOI 206 | [10.21228/M89P43](https://dx.doi.org/10.21228/M89P43). This work is 207 | supported by NIH grant U2C-DK119886. 208 | 209 | *Note*: The test dataset consists only of those spectra belonging to 210 | samples labeled "Colorectal Cancer" or "Healthy Control". The "target" 211 | variable has the value -1 for samples labeled "Healthy Control" and 212 | value +1 for samples labeled "Colorectal Cancer". 213 | -------------------------------------------------------------------------------- /pyopls/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /pyopls/__init__.py: -------------------------------------------------------------------------------- 1 | from pyopls.opls import OPLS 2 | from pyopls.validation import OPLSValidator, OPLSDAValidator 3 | -------------------------------------------------------------------------------- /pyopls/kernel_density.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from joblib import Parallel, delayed 3 | from sklearn.model_selection import GridSearchCV, KFold, LeaveOneOut 4 | from sklearn.neighbors import KernelDensity 5 | 6 | 7 | class OPLSKernelDensity: 8 | @staticmethod 9 | def _estimate_bandwidth(vals, grid_search_num, cv, n_jobs, verbose, pre_dispatch): 10 | grid = GridSearchCV(KernelDensity(kernel='gaussian'), 11 | {'bandwidth': 10 ** np.linspace(-1, 1, grid_search_num)}, 12 | cv=cv, n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch, iid=False) 13 | grid.fit(vals.reshape(-1, 1)) 14 | return grid.best_params_['bandwidth'] 15 | 16 | @staticmethod 17 | def _kde(x, vals, bw): 18 | kd = KernelDensity(kernel='gaussian', bandwidth=bw).fit(vals.reshape(-1, 1)) 19 | return kd.score_samples(x.reshape(-1, 1)) 20 | 21 | @staticmethod 22 | def _estimate_kde_abscissa(vals, num): 23 | return np.linspace(vals.min(), vals.max(), num) 24 | 25 | def get_kdes(self, opls_cv, num=None, bandwidth=None, k=5, grid_search_num=100, 26 | n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 27 | # Get a kernel-density estimate for permutation test results 28 | num = num or 2 * opls_cv.n_permutations 29 | 30 | def _named_kde(key, x, vals, bw): 31 | return key, self._kde(x, vals, bw) 32 | 33 | def _named_abscissa(key, vals, n): 34 | return key, self._estimate_kde_abscissa(vals, n) 35 | 36 | if k == -1: 37 | cv = LeaveOneOut() 38 | else: 39 | cv = KFold(k) 40 | loading_bandwidths = [ 41 | self._estimate_bandwidth(vals, grid_search_num, cv, n_jobs, verbose, pre_dispatch) 42 | for vals in np.hsplit(opls_cv.permutation_loadings_, opls_cv.permutation_loadings_.shape[1]) 43 | ] if bandwidth is None else [bandwidth for _ in range(opls_cv.permutation_loadings_.shape[1])] 44 | 45 | loading_abscissae = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 46 | delayed(self._estimate_kde_abscissa)(vals, num) 47 | for vals in 
np.hsplit(opls_cv.permutation_loadings_, opls_cv.permutation_loadings_.shape[1]) 48 | ) 49 | loading_kdes = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 50 | delayed(self._kde)(x, vals, bw) 51 | for x, vals, bw in zip(loading_abscissae, 52 | np.hsplit(opls_cv.permutation_loadings_, opls_cv.permutation_loadings_.shape[1]), 53 | loading_bandwidths) 54 | ) 55 | results = { 56 | 'loadings': { 57 | 'x': np.column_stack(loading_abscissae), 58 | 'kde': np.column_stack(loading_kdes), 59 | 'h': np.hstack(loading_bandwidths) 60 | } 61 | } 62 | metrics = { 63 | 'q_squared': opls_cv.permutation_q_squared_, 64 | 'r_squared_Y': opls_cv.permutation_r_squared_Y_, 65 | 'discriminator_q_squared': opls_cv.permutation_discriminator_q_squared_, 66 | 'accuracy': opls_cv.permutation_accuracy_, 67 | 'roc_auc': opls_cv.permutation_roc_auc_ 68 | } 69 | metrics = {key: value for key, value in metrics.items() if value is not None} 70 | metric_bandwidths = { 71 | key: self._estimate_bandwidth(value, grid_search_num, cv, n_jobs, verbose, pre_dispatch) 72 | for key, value in metrics.items() 73 | } if bandwidth is None else {key: bandwidth for key in metrics.keys()} 74 | metric_abscissae = { 75 | res[0]: res[1] for res in Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 76 | delayed(_named_abscissa)(key, value, num) for key, value in metrics.items()) 77 | } 78 | metric_kdes = { 79 | res[0]: res[1] for res in Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 80 | delayed(_named_kde)(key, metric_abscissae[key], value, metric_bandwidths[key]) 81 | for key, value in metrics.items() 82 | ) 83 | } 84 | 85 | for key in metrics.keys(): 86 | results[key] = { 87 | 'x': metric_abscissae[key], 88 | 'kde': metric_kdes[key], 89 | 'h': metric_bandwidths[key] 90 | } 91 | 92 | return results 93 | -------------------------------------------------------------------------------- /pyopls/opls.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Wright State University 2 | # Author: Daniel Foose 3 | # License: MIT 4 | 5 | import numpy as np 6 | from sklearn.base import BaseEstimator, TransformerMixin 7 | from sklearn.utils import check_array 8 | from sklearn.utils.validation import check_consistent_length 9 | 10 | 11 | def _center_scale_xy(X, Y, scale=True): 12 | """ Center X, Y and scale if the scale parameter==True 13 | 14 | Returns 15 | ------- 16 | X, Y, x_mean, y_mean, x_std, y_std 17 | """ 18 | # center 19 | x_mean = X.mean(axis=0) 20 | X -= x_mean 21 | y_mean = Y.mean(axis=0) 22 | Y -= y_mean 23 | # scale 24 | if scale: 25 | x_std = X.std(axis=0, ddof=1) 26 | x_std[x_std == 0.0] = 1.0 27 | X /= x_std 28 | y_std = Y.std(axis=0, ddof=1) 29 | y_std[y_std == 0.0] = 1.0 30 | Y /= y_std 31 | else: 32 | x_std = np.ones(X.shape[1]) 33 | y_std = np.ones(Y.shape[1]) 34 | return X, Y, x_mean, y_mean, x_std, y_std 35 | 36 | 37 | class OPLS(BaseEstimator, TransformerMixin): 38 | """Orthogonal Projection to Latent Structures (O-PLS) 39 | 40 | This class implements the O-PLS algorithm for one (and only one) response as described by [Trygg 2002]. 41 | This is equivalent to the implementation of the libPLS MATLAB library (http://libpls.net/) 42 | 43 | Parameters 44 | ---------- 45 | n_components: int, number of orthogonal components to filter. (default 5). 46 | 47 | scale: boolean, scale data? 
(default True) 48 | 49 | Attributes 50 | ---------- 51 | W_ortho_ : weights orthogonal to y 52 | 53 | P_ortho_ : loadings orthogonal to y 54 | 55 | T_ortho_ : scores orthogonal to y 56 | 57 | x_mean_ : mean of the X provided to fit() 58 | y_mean_ : mean of the Y provided to fit() 59 | x_std_ : std deviation of the X provided to fit() 60 | y_std_ : std deviation of the Y provided to fit() 61 | 62 | References 63 | ---------- 64 | Johan Trygg and Svante Wold. Orthogonal projections to latent structures (O-PLS). 65 | J. Chemometrics 2002; 16: 119-128. DOI: 10.1002/cem.695 66 | """ 67 | def __init__(self, n_components=5, scale=True): 68 | self.n_components = n_components 69 | self.scale = scale 70 | 71 | self.W_ortho_ = None 72 | self.P_ortho_ = None 73 | self.T_ortho_ = None 74 | 75 | self.x_mean_ = None 76 | self.y_mean_ = None 77 | self.x_std_ = None 78 | self.y_std_ = None 79 | 80 | def fit(self, X, Y): 81 | """Fit model to data 82 | 83 | Parameters 84 | ---------- 85 | X : array-like, shape = [n_samples, n_features] 86 | Training vectors, where n_samples is the number of samples and 87 | n_features is the number of predictors. 88 | 89 | Y : array-like, shape = [n_samples, 1] 90 | Target vector, where n_samples is the number of samples. 91 | This implementation only supports a single response (target) variable. 92 | 93 | """ 94 | 95 | # copy since this will contain the residuals (deflated) matrices 96 | check_consistent_length(X, Y) 97 | X = check_array(X, dtype=np.float64, copy=True, ensure_min_samples=2) 98 | Y = check_array(Y, dtype=np.float64, copy=True, ensure_2d=False) 99 | if Y.ndim == 1: 100 | Y = Y.reshape(-1, 1) 101 | 102 | X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = _center_scale_xy(X, Y, self.scale) 103 | 104 | Z = X.copy() 105 | w = np.dot(X.T, Y) # calculate weight vector 106 | w /= np.linalg.norm(w) # normalize weight vector 107 | 108 | W_ortho = [] 109 | T_ortho = [] 110 | P_ortho = [] 111 | 112 | for i in range(self.n_components): 113 | t = np.dot(Z, w) # scores vector 114 | p = np.dot(Z.T, t) / np.dot(t.T, t).item() # loadings of X 115 | w_ortho = p - np.dot(w.T, p).item() / np.dot(w.T, w).item() * w # orthogonal weight 116 | w_ortho = w_ortho / np.linalg.norm(w_ortho) # normalize orthogonal weight 117 | t_ortho = np.dot(Z, w_ortho) # orthogonal scores 118 | p_ortho = np.dot(Z.T, t_ortho) / np.dot(t_ortho.T, t_ortho).item() 119 | Z -= np.dot(t_ortho, p_ortho.T) 120 | W_ortho.append(w_ortho) 121 | T_ortho.append(t_ortho) 122 | P_ortho.append(p_ortho) 123 | 124 | self.W_ortho_ = np.hstack(W_ortho) 125 | self.T_ortho_ = np.hstack(T_ortho) 126 | self.P_ortho_ = np.hstack(P_ortho) 127 | 128 | return self 129 | 130 | def transform(self, X): 131 | """Get the non-orthogonal components of X (which are considered in prediction). 132 | 133 | Parameters 134 | ---------- 135 | X : array-like, shape = [n_samples, n_features] 136 | Training or test vectors, where n_samples is the number of samples and 137 | n_features is the number of predictors (which should be the same predictors the model was trained on). 
138 | 139 | Returns 140 | ------- 141 | X_res, X with the orthogonal data filtered out 142 | """ 143 | Z = check_array(X, copy=True) 144 | 145 | Z -= self.x_mean_ 146 | if self.scale: 147 | Z /= self.x_std_ 148 | 149 | # filter out orthogonal components of X 150 | for i in range(self.n_components): 151 | t = np.dot(Z, self.W_ortho_[:, i]).reshape(-1, 1) 152 | Z -= np.dot(t, self.P_ortho_[:, i].T.reshape(1, -1)) 153 | 154 | return Z 155 | 156 | def fit_transform(self, X, y=None, **fit_params): 157 | """ Learn and apply the filtering on the training data and get the filtered X 158 | 159 | Parameters 160 | ---------- 161 | X : array-like, shape=[n_samples, n_features] 162 | Training vectors, where n_samples is the number of samples and 163 | n_features is the number of predictors. 164 | 165 | y : array-like, shape = [n_samples, 1] 166 | Target vector, where n_samples is the number of samples. 167 | This O-PLS implementation only supports a single response (target) variable. 168 | Y=None will raise ValueError from fit(). 169 | 170 | Returns 171 | ------- 172 | X_filtered 173 | """ 174 | return self.fit(X, y).transform(X) 175 | 176 | def score(self, X): 177 | """ Return the coefficient of determination R^2X of the transformation. 178 | Parameters 179 | ---------- 180 | X : array-like of shape (n_samples, n_features) 181 | Test samples. These should have the same predictors (n_features) 182 | as the X provided to fit(), since the stored centering, scaling 183 | and orthogonal loadings are applied to them before scoring. 184 | 185 | 186 | Returns 187 | ------- 188 | score : float 189 | The amount of variation in X explained by the transformed X. A lower number indicates more orthogonal 190 | variation has been removed. 191 | """ 192 | X = check_array(X) 193 | Z = self.transform(X) 194 | return np.sum(np.square(Z)) / np.sum(np.square(X - self.x_mean_)) # Z is already properly centered 195 | -------------------------------------------------------------------------------- /pyopls/permutation_test.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from sys import stderr 3 | 4 | import numpy as np 5 | from joblib import Parallel, delayed 6 | from sklearn.base import is_classifier, clone, ClassifierMixin 7 | from sklearn.exceptions import DataConversionWarning 8 | from sklearn.metrics import r2_score, accuracy_score 9 | from sklearn.model_selection import check_cv, cross_val_predict 10 | from sklearn.utils import indexable, check_random_state 11 | 12 | 13 | def passthrough_scorer(estimator, *args, **kwargs): 14 | """Function that wraps estimator.score""" 15 | return estimator.score(*args, **kwargs) 16 | 17 | 18 | def non_cv_permutation_test_score(estimator, X, y, groups=None, 19 | n_permutations=100, n_jobs=None, random_state=0, 20 | verbose=0, pre_dispatch='2*n_jobs', scorers=None): 21 | """Evaluate the significance of several non-cross-validated scores with permutations 22 | 23 | Read more in the scikit-learn User Guide. 24 | 25 | Parameters 26 | ---------- 27 | estimator : estimator object implementing 'fit' 28 | The object to use to fit the data. 29 | 30 | X : array-like of shape at least 2D 31 | The data to fit. 32 | 33 | y : array-like 34 | The target variable to try to predict in the case of 35 | supervised learning. 36 | 37 | groups : array-like, with shape (n_samples,), optional 38 | Labels to constrain permutation within groups, i.e. 
``y`` values 39 | are permuted among samples with the same group identifier. 40 | When not specified, ``y`` values are permuted among all samples. 41 | 42 | When a grouped cross-validator is used, the group labels are 43 | also passed on to the ``split`` method of the cross-validator. The 44 | cross-validator uses them for grouping the samples while splitting 45 | the dataset into train/test set. 46 | 47 | scorers : list of callables or None, optional, default: None 48 | A list of scoring functions of the form scorer(estimator, X, y). If None, the estimator's ``score`` method is used. 49 | 50 | 51 | n_permutations : integer, optional 52 | Number of times to permute ``y``. 53 | 54 | n_jobs : int or None, optional (default=None) 55 | The number of CPUs to use to do the computation. 56 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 57 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 58 | for more details. 59 | 60 | random_state : int, RandomState instance or None, optional (default=0) 61 | If int, random_state is the seed used by the random number generator; 62 | If RandomState instance, random_state is the random number generator; 63 | If None, the random number generator is the RandomState instance used 64 | by `np.random`. 65 | 66 | verbose : integer, optional 67 | The verbosity level. 68 | 69 | 70 | pre_dispatch : int, or string, optional 71 | Controls the number of jobs that get dispatched during parallel 72 | execution. Reducing this number can be useful to avoid an 73 | explosion of memory consumption when more jobs get dispatched 74 | than CPUs can process. This parameter can be: 75 | 76 | - None, in which case all the jobs are immediately 77 | created and spawned. Use this for lightweight and 78 | fast-running jobs, to avoid delays due to on-demand 79 | spawning of the jobs 80 | 81 | - An int, giving the exact number of total jobs that are 82 | spawned 83 | 84 | - A string, giving an expression as a function of n_jobs, 85 | as in '2*n_jobs' 86 | 87 | Returns 88 | ------- 89 | A list of one (score, permutation_scores, pvalue) tuple per scorer, where: 90 | score : float 91 | The true score without permuting targets. 92 | permutation_scores : array, shape (n_permutations,) 93 | The scores obtained for each permutation. 94 | 95 | pvalue : float 96 | The p-value, which approximates the probability that the score would 97 | be obtained by chance. This is calculated as: 98 | 99 | `(C + 1) / (n_permutations + 1)` 100 | 101 | Where C is the number of permutations whose score >= the true score. 102 | 103 | The best possible p-value is 1/(n_permutations + 1), the worst is 1.0. 104 | 105 | Notes 106 | ----- 107 | This function implements Test 1 in: 108 | 109 | Ojala and Garriga. Permutation Tests for Studying Classifier 110 | Performance. The Journal of Machine Learning Research (2010) 111 | vol. 11 112 | 113 | """ 114 | X, y, groups = indexable(X, y, groups) 115 | 116 | random_state = check_random_state(random_state) 117 | if scorers is None or not len(scorers): 118 | if hasattr(estimator, 'score'): 119 | scorers = [passthrough_scorer] 120 | else: 121 | raise TypeError( 122 | "If no scoring is specified, the estimator passed should " 123 | "have a 'score' method. The estimator %r does not." 124 | % estimator) 125 | 126 | # We clone the estimator to make sure that all the folds are independent, and that it is pickle-able. 
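# Compute the score once on the unpermuted data, then fit and score a fresh clone on each of n_permutations shuffled copies of y; the p-value for each scorer is (C + 1) / (n_permutations + 1), where C counts permutations that score at least as well as the true score.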
128 | score = _non_cv_permutation_test_score(clone(estimator), X, y, groups, scorers) 129 | permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 130 | delayed(_non_cv_permutation_test_score)( 131 | clone(estimator), X, _shuffle(y, groups, random_state), 132 | groups, scorers) 133 | for _ in range(n_permutations)) 134 | permutation_scores = np.array(permutation_scores) 135 | pvalue = (np.sum(permutation_scores >= score, axis=0) + 1.0) / (n_permutations + 1) 136 | return [(score[i], permutation_scores[:, i], pvalue[i]) for i in range(len(scorers))] 137 | 138 | 139 | def _non_cv_permutation_test_score(estimator, X, y, groups, scorers): 140 | """Auxiliary function for permutation_test_score""" 141 | estimator.fit(X, y) 142 | return [scorer(estimator, X, y) for scorer in scorers] 143 | 144 | 145 | def permutation_test_score(estimator, X, y, groups=None, cv='warn', 146 | n_permutations=100, n_jobs=None, random_state=0, 147 | verbose=0, pre_dispatch='2*n_jobs', cv_score_functions=None, 148 | fit_params=None, method='predict', parallel_by='permutation'): 149 | """Evaluate the significance of several cross-validated scores with permutations 150 | 151 | Note: this is different from sklearn.model_selection.permutation_test_score in two ways. 152 | 1. The scikit-learn method calculates the metrics for each CV split, which makes using metrics like r-squared with 153 | LeaveOneOut impossible. This method uses sklearn.model_selection.cross_val_predict to predict the left-out labels, 154 | then calculates the metrics for that prediction. 155 | 2. The scikit-learn method only evaluates one metric at a time; this one evaluates an arbitrary number of metrics. 156 | 157 | Parameters 158 | ---------- 159 | estimator : estimator object implementing 'fit' 160 | The object to use to fit the data. 161 | 162 | X : array-like of shape at least 2D 163 | The data to fit. 164 | 165 | y : array-like 166 | The target variable to try to predict in the case of 167 | supervised learning. 168 | 169 | groups : array-like, with shape (n_samples,), optional 170 | Labels to constrain permutation within groups, i.e. ``y`` values 171 | are permuted among samples with the same group identifier. 172 | When not specified, ``y`` values are permuted among all samples. 173 | 174 | When a grouped cross-validator is used, the group labels are 175 | also passed on to the ``split`` method of the cross-validator. The 176 | cross-validator uses them for grouping the samples while splitting 177 | the dataset into train/test set. 178 | 179 | cv_score_functions : list of callables or None, optional, default: None 180 | A list of score functions of the form score(y_true, y_pred) (like r2_score, accuracy_score). 181 | If you have special arguments for your score function you should create another function with 182 | the required prototype that wraps that function. 183 | 184 | cv : int, cross-validation generator or an iterable, optional 185 | Determines the cross-validation splitting strategy. 186 | Possible inputs for cv are: 187 | 188 | - None, to use the default 3-fold cross validation, 189 | - integer, to specify the number of folds in a `(Stratified)KFold`, 190 | - :term:`CV splitter`, 191 | - An iterable yielding (train, test) splits as arrays of indices. 192 | 193 | For integer/None inputs, if the estimator is a classifier and ``y`` is 194 | either binary or multiclass, :class:`StratifiedKFold` is used. In all 195 | other cases, :class:`KFold` is used. 
196 | 197 | n_permutations : integer, optional 198 | Number of times to permute ``y``. 199 | 200 | n_jobs : int or None, optional (default=None) 201 | The number of CPUs to use to do the computation. 202 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 203 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 204 | for more details. 205 | 206 | random_state : int, RandomState instance or None, optional (default=0) 207 | If int, random_state is the seed used by the random number generator; 208 | If RandomState instance, random_state is the random number generator; 209 | If None, the random number generator is the RandomState instance used 210 | by `np.random`. 211 | 212 | verbose : integer, optional 213 | The verbosity level. 214 | 215 | pre_dispatch : int, or string, optional 216 | Controls the number of jobs that get dispatched during parallel 217 | execution. Reducing this number can be useful to avoid an 218 | explosion of memory consumption when more jobs get dispatched 219 | than CPUs can process. This parameter can be: 220 | 221 | - None, in which case all the jobs are immediately 222 | created and spawned. Use this for lightweight and 223 | fast-running jobs, to avoid delays due to on-demand 224 | spawning of the jobs 225 | 226 | - An int, giving the exact number of total jobs that are 227 | spawned 228 | 229 | - A string, giving an expression as a function of n_jobs, 230 | as in '2*n_jobs' 231 | 232 | fit_params : dict, optional 233 | Parameters to pass to the fit method of the estimator. 234 | 235 | method : string, optional, default: 'predict' 236 | Invokes the passed method name of the passed estimator. For 237 | method='predict_proba', the columns correspond to the classes 238 | in sorted order. 239 | 240 | parallel_by : string, optional, default: 'permutation' 241 | Whether to parallelize the estimation step or the permutation step. 242 | Either 'estimation' or 'permutation'. If 'estimation', the training of each cross-validation 243 | fold gets its own job. If 'permutation', each permutation of the target gets its own job. 244 | 245 | Returns 246 | ------- 247 | A list of one (score, permutation_scores, pvalue) tuple per score function, where: 248 | score : float 249 | The true score without permuting targets. 250 | permutation_scores : array, shape (n_permutations,) 251 | The scores obtained for each permutation. 252 | 253 | pvalue : float 254 | The p-value, which approximates the probability that the score would 255 | be obtained by chance. This is calculated as: 256 | 257 | `(C + 1) / (n_permutations + 1)` 258 | 259 | Where C is the number of permutations whose score >= the true score. 260 | 261 | The best possible p-value is 1/(n_permutations + 1), the worst is 1.0. 262 | 263 | Notes 264 | ----- 265 | This function implements Test 1 in: 266 | 267 | Ojala and Garriga. Permutation Tests for Studying Classifier 268 | Performance. The Journal of Machine Learning Research (2010) 269 | vol. 11 270 | 271 | """ 272 | X, y, groups = indexable(X, y, groups) 273 | 274 | cv = check_cv(cv, y, classifier=is_classifier(estimator)) 275 | random_state = check_random_state(random_state) 276 | if cv_score_functions is None: 277 | if isinstance(estimator, ClassifierMixin): 278 | cv_score_functions = [accuracy_score] 279 | else: 280 | cv_score_functions = [r2_score] 281 | # We clone the estimator to make sure that all the folds are independent, and that it is pickle-able. 
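# Compute the true (unpermuted) score first. Depending on parallel_by, the permuted targets are then scored either with parallelism inside each cross-validation run ('estimation') or with one parallel job per permutation ('permutation').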
283 | score = _permutation_test_score(clone(estimator), X, y, groups, cv, 284 | n_jobs, verbose, fit_params, pre_dispatch, 285 | method, cv_score_functions) 286 | if parallel_by == 'estimation': 287 | permutation_scores = np.vstack([ 288 | _permutation_test_score( 289 | clone(estimator), X, _shuffle(y, groups, random_state), 290 | groups, cv, n_jobs, verbose, fit_params, pre_dispatch, 291 | method, cv_score_functions 292 | ) for _ in range(n_permutations) 293 | ]) 294 | elif parallel_by == 'permutation': 295 | permutation_scores = np.vstack( 296 | Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 297 | delayed(_permutation_test_score)( 298 | clone(estimator), X, _shuffle(y, groups, random_state), 299 | groups, cv, fit_params=fit_params, method=method, score_functions=cv_score_functions 300 | ) for _ in range(n_permutations) 301 | ) 302 | ) 303 | else: 304 | raise ValueError(f'Invalid option for parallel_by {parallel_by}') 305 | pvalue = (np.sum(permutation_scores >= score, axis=0) + 1.0) / (n_permutations + 1) 306 | return [(score[i], permutation_scores[:, i], pvalue[i]) for i in range(len(score))] 307 | # return score, permutation_scores, pvalue 308 | 309 | 310 | def _permutation_test_score(estimator, X, y, groups=None, cv='warn', 311 | n_jobs=None, verbose=0, fit_params=None, 312 | pre_dispatch='2*n_jobs', method='predict', 313 | score_functions=None): 314 | """Auxiliary function for permutation_test_score""" 315 | if score_functions is None: 316 | score_functions = [r2_score] 317 | y_pred = cross_val_predict(estimator, X, y, groups, cv, n_jobs, verbose, fit_params, pre_dispatch, method) 318 | cv_scores = [score_function(y, y_pred) for score_function in score_functions] 319 | return np.array(cv_scores) 320 | 321 | 322 | def _shuffle(y, groups, random_state): 323 | """Return a shuffled copy of y, shuffling within groups when they are given.""" 324 | if groups is None: 325 | indices = random_state.permutation(len(y)) 326 | else: 327 | indices = np.arange(len(groups)) 328 | for group in np.unique(groups): 329 | this_mask = (groups == group) 330 | indices[this_mask] = random_state.permutation(indices[this_mask]) 331 | return safe_indexing(y, indices) 332 | 333 | 334 | def feature_permutation_loading(estimator, X, y, initial_permutations=100, alpha=0.2, final_permutations=500, 335 | random_state=0, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 336 | """Determine the significance of each feature 337 | 338 | This is done by permuting each feature in X and measuring the loading. 339 | The feature is considered significant if the loadings are significantly different. 340 | 341 | This is always done with a regular PLS regressor. 342 | For PLS-DA, the target should be binarized first. 343 | 344 | Parameters 345 | ---------- 346 | estimator : estimator object implementing 'fit' with x_loadings_ 347 | The object to use to fit the data. This should have an [n_features, 1] x_loadings_ array. This can be a 348 | one-component PLS or OPLS model. 349 | 350 | X : array-like, shape = [n_samples, n_features] 351 | Training vectors, where n_samples is the number of samples and 352 | n_features is the number of predictors. 353 | 354 | y : array-like, shape = [n_samples, 1] 355 | Target vector, where n_samples is the number of samples. 356 | This implementation only supports a single response (target) variable. 357 | 358 | initial_permutations : int 359 | The number of permutations to perform for all features. 360 | 361 | alpha : float, in range (0, 1) 362 | The threshold for significance. 
If a feature is found significant in the first round, it will be retested with 363 | final_permutations in the second round. 364 | 365 | final_permutations : int 366 | The number of permutations to perform during the second round to retest points found significant in the first 367 | round. 368 | 369 | n_jobs : int or None, optional (default=None) 370 | The number of CPUs to use to do the computation. 371 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 372 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 373 | for more details. 374 | 375 | verbose : integer, optional 376 | The verbosity level. 377 | 378 | pre_dispatch : int, or string, optional 379 | Controls the number of jobs that get dispatched during parallel 380 | execution. Reducing this number can be useful to avoid an 381 | explosion of memory consumption when more jobs get dispatched 382 | than CPUs can process. This parameter can be: 383 | 384 | - None, in which case all the jobs are immediately 385 | created and spawned. Use this for lightweight and 386 | fast-running jobs, to avoid delays due to on-demand 387 | spawning of the jobs 388 | 389 | - An int, giving the exact number of total jobs that are 390 | spawned 391 | 392 | - A string, giving an expression as a function of n_jobs, 393 | as in '2*n_jobs' 394 | 395 | random_state : int, RandomState instance or None, optional (default=0) 396 | If int, random_state is the seed used by the random number generator; 397 | If RandomState instance, random_state is the random number generator; 398 | If None, the random number generator is the RandomState instance used 399 | by `np.random`. 400 | 401 | Returns 402 | ------- 403 | x_loadings : array [n_features] 404 | The x_loadings found from non-permuted data. 405 | 406 | permutation_x_loadings: array [initial_permutations, n_features] 407 | The one-component PLS loadings for each permutation in the first round. 408 | 409 | p_values: array [n_features] 410 | The p-values for each feature. The null hypothesis is that permuting the feature does not change its weight 411 | in the one-component PLS model. 412 | """ 413 | 414 | def feature_ind_generator(n_permutations_, feature_inds): 415 | """ 416 | Repeats each value in feature_inds n_permutations_ times. 
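For example, feature_inds=[3, 7] with n_permutations_=2 yields 3, 3, 7, 7.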
417 | """ 418 | i = 0 419 | count = 0 420 | while count < (n_permutations_ * len(feature_inds)): 421 | yield feature_inds[i] 422 | count += 1 423 | if (count % n_permutations_) == 0: 424 | i += 1 425 | 426 | def _log(txt): 427 | if verbose in range(1, 51): 428 | stderr.write(txt + '\n') 429 | if verbose > 50: 430 | print(txt) 431 | 432 | random_state = check_random_state(random_state) 433 | n_features = X.shape[1] 434 | x_loadings = np.ravel(estimator.fit(X, y).x_loadings_) 435 | loading_max = np.max((x_loadings, -1 * x_loadings), axis=0) 436 | loading_min = np.min((x_loadings, -1 * x_loadings), axis=0) 437 | 438 | _log('Performing initial permutation tests.') 439 | permutation_x_loadings = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 440 | delayed(_feature_permutation_loading)( 441 | clone(estimator), _feature_shuffle(X, feature_ind, random_state), y, x_loadings, feature_ind) 442 | for feature_ind in feature_ind_generator(initial_permutations, [i for i in range(n_features)])) 443 | permutation_x_loadings = np.array(permutation_x_loadings).reshape(n_features, initial_permutations).T 444 | 445 | _log('Calculating p values.') 446 | p_values = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 447 | delayed(_loading_p_value)(permutation_x_loading, upper, lower, initial_permutations) 448 | for permutation_x_loading, upper, lower in zip(np.hsplit(permutation_x_loadings, n_features), 449 | loading_max, loading_min) 450 | ) 451 | 452 | # Retest values found significant in first round 453 | retest_columns = [i for i in range(n_features) if p_values[i] < (alpha / 2.0)] # remember, this is two-tailed 454 | retest_loading_max = np.max((x_loadings[retest_columns], -1 * x_loadings[retest_columns]), axis=0) 455 | retest_loading_min = np.min((x_loadings[retest_columns], -1 * x_loadings[retest_columns]), axis=0) 456 | 457 | _log(f'Re-testing {len(retest_columns)} features') 458 | retest_loadings = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 459 | delayed(_feature_permutation_loading)( 460 | clone(estimator), _feature_shuffle(X, feature_ind, random_state), y, x_loadings, feature_ind) 461 | for feature_ind in feature_ind_generator(final_permutations, retest_columns)) 462 | retest_loadings = np.array(retest_loadings).reshape(len(retest_columns), final_permutations).T 463 | 464 | # replace p-values with the more accurate ones 465 | if len(retest_columns): 466 | _log(f'Calculating p values for {len(retest_columns)} features.') 467 | p_values = np.array(p_values) 468 | p_values[retest_columns] = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 469 | delayed(_loading_p_value)(retest_loading, upper, lower, final_permutations) 470 | for retest_loading, upper, lower in zip(np.hsplit(retest_loadings, len(retest_columns)), 471 | retest_loading_max, retest_loading_min) 472 | ) 473 | else: 474 | _log('No significant features after first round of tests.') 475 | p_values = np.array(p_values) 476 | p_values[p_values > 1] = 1 # if feature_min=feature_max=loading=0 values will be greater than 1 477 | return x_loadings, permutation_x_loadings, p_values 478 | 479 | 480 | def _feature_permutation_loading(estimator, X, y, reference_loadings, feature_ind): 481 | """Auxiliary function for feature_permutation_loading""" 482 | # Note that since the loading only depends on training data, we don't use cross-validation 483 | test_loadings = np.ravel(estimator.fit(X, y).x_loadings_) 484 | # make directions the same 485 | err1 = 
(np.sum(np.square(test_loadings[:feature_ind] - reference_loadings[:feature_ind])) 486 | + np.sum(np.square(test_loadings[feature_ind:] - reference_loadings[feature_ind:]))) 487 | err2 = (np.sum(np.square(test_loadings[:feature_ind] + reference_loadings[:feature_ind])) 488 | + np.sum(np.square(test_loadings[feature_ind:] + reference_loadings[feature_ind:]))) 489 | sign = -1 if err2 < err1 else 1 490 | return sign * test_loadings[feature_ind] 491 | 492 | 493 | def _feature_shuffle(X, feature_ind, random_state): 494 | X = X.copy() 495 | random_state.shuffle(X[:, feature_ind]) 496 | return X 497 | 498 | 499 | def _loading_p_value(permutation_loadings, upper, lower, n_permutations): 500 | return (np.sum(permutation_loadings >= upper) + np.sum(permutation_loadings <= lower) + 1) / (n_permutations + 1) 501 | 502 | 503 | def safe_indexing(X, indices): 504 | """Return items or rows from X using indices. 505 | 506 | Allows simple indexing of lists or arrays. 507 | This is copied from the deprecated sklearn.utils.safe_indexing 508 | 509 | Parameters 510 | ---------- 511 | X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series. 512 | Data from which to sample rows or items. 513 | indices : array-like of int 514 | Indices according to which X will be subsampled. 515 | 516 | Returns 517 | ------- 518 | subset 519 | Subset of X on first axis 520 | 521 | Notes 522 | ----- 523 | CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are 524 | not supported. 525 | """ 526 | if hasattr(X, "iloc"): 527 | # Work-around for indexing with read-only indices in pandas 528 | indices = indices if indices.flags.writeable else indices.copy() 529 | # Pandas Dataframes and Series 530 | try: 531 | return X.iloc[indices] 532 | except ValueError: 533 | # Cython typed memoryviews internally used in pandas do not support 534 | # readonly buffers. 
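# Fall back to a copy of the frame, whose buffers are writable, so .iloc can index it.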
535 | warnings.warn("Copying input dataframe for slicing.", 536 | DataConversionWarning) 537 | return X.copy().iloc[indices] 538 | elif hasattr(X, "shape"): 539 | if hasattr(X, 'take') and (hasattr(indices, 'dtype') and 540 | indices.dtype.kind == 'i'): 541 | # This is often substantially faster than X[indices] 542 | return X.take(indices, axis=0) 543 | else: 544 | return X[indices] 545 | else: 546 | return [X[idx] for idx in indices] 547 | -------------------------------------------------------------------------------- /pyopls/tests/features.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BiRG/pyopls/be801f1e021898d3337ba30795644130ec425e49/pyopls/tests/features.npy -------------------------------------------------------------------------------- /pyopls/tests/target.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BiRG/pyopls/be801f1e021898d3337ba30795644130ec425e49/pyopls/tests/target.npy -------------------------------------------------------------------------------- /pyopls/tests/test_opls.py: -------------------------------------------------------------------------------- 1 | def test_opls(): 2 | import numpy as np 3 | from pyopls import OPLS 4 | from sklearn.cross_decomposition import PLSRegression 5 | from sklearn.metrics import r2_score 6 | from sklearn.model_selection import cross_val_predict, LeaveOneOut 7 | 8 | # paths relative to repo 9 | spectra = np.load('pyopls/tests/features.npy') 10 | target = np.load('pyopls/tests/target.npy') 11 | 12 | score = -1 13 | n_components = 0 14 | for n_components in range(1, spectra.shape[1]): 15 | opls = OPLS(n_components=n_components) 16 | Z = opls.fit(spectra, target).transform(spectra) 17 | y_pred = cross_val_predict(PLSRegression(n_components=1), Z, target, cv=LeaveOneOut()) 18 | score_i = r2_score(target, y_pred) 19 | if score_i < score: 20 | n_components -= 1 21 | break 22 | score = score_i 23 | 24 | opls = OPLS(n_components=n_components) 25 | opls.fit(spectra, target) 26 | assert opls.n_components == n_components 27 | assert opls.P_ortho_.shape == (spectra.shape[1], n_components) 28 | assert opls.T_ortho_.shape == (spectra.shape[0], n_components) 29 | assert opls.W_ortho_.shape == (spectra.shape[1], n_components) 30 | assert opls.x_mean_.shape == (spectra.shape[1],) 31 | assert opls.x_std_.shape == (spectra.shape[1],) 32 | assert opls.y_mean_.shape == (1,) 33 | assert opls.y_std_.shape == (1,) 34 | 35 | Z = opls.transform(spectra) 36 | assert Z.shape == spectra.shape 37 | 38 | pls = PLSRegression(n_components=1) 39 | uncorrected_r2 = r2_score(target, pls.fit(spectra, target).predict(spectra)) 40 | corrected_r2 = r2_score(target, pls.fit(Z, target).predict(Z)) 41 | uncorrected_q2 = r2_score(target, cross_val_predict(pls, spectra, target, cv=LeaveOneOut())) 42 | corrected_q2 = r2_score(target, cross_val_predict(pls, Z, target, cv=LeaveOneOut())) 43 | 44 | assert uncorrected_r2 < corrected_r2 45 | assert uncorrected_q2 < corrected_q2 46 | -------------------------------------------------------------------------------- /pyopls/validation.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from sys import stderr 3 | 4 | import numpy as np 5 | from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, ClassifierMixin 6 | from sklearn.cross_decomposition import PLSRegression 7 | from sklearn.metrics import accuracy_score, 
roc_auc_score, roc_curve, r2_score 8 | from sklearn.model_selection import KFold, StratifiedKFold, LeaveOneOut, cross_val_predict 9 | from sklearn.preprocessing import LabelBinarizer 10 | from sklearn.utils import check_array 11 | from sklearn.utils.multiclass import type_of_target 12 | from sklearn.utils.validation import check_is_fitted 13 | 14 | from .opls import OPLS 15 | from .permutation_test import permutation_test_score, feature_permutation_loading 16 | 17 | 18 | def discriminator_accuracy(y_true, y_pred): 19 | try: 20 | return accuracy_score(y_true.astype(int), np.sign(y_pred).astype(int)) 21 | except ValueError as e: 22 | warnings.warn(str(e), UserWarning) 23 | return float('nan') 24 | 25 | 26 | def discriminator_roc_auc(y_true, y_pred): 27 | try: 28 | return roc_auc_score(y_true, np.clip(y_pred, -1, 1)) 29 | except ValueError as e: 30 | warnings.warn(str(e), UserWarning) 31 | return float('nan') 32 | 33 | 34 | def discriminator_r2_score(y_true, y_pred): 35 | return r2_score(y_true, np.clip(y_pred, -1, 1)) 36 | 37 | 38 | def neg_press(y_true, y_pred): 39 | return -1 * np.sum(np.square(y_true - y_pred)) 40 | 41 | 42 | def neg_pressd(y_true, y_pred): 43 | return -1 * np.sum(np.square(y_true - np.clip(y_pred, -1, 1))) 44 | 45 | 46 | class OPLSValidator(BaseEstimator, TransformerMixin, RegressorMixin): 47 | """Cross Validation and Diagnostics of Orthogonal Projection to Latent Structures (O-PLS) 48 | 49 | Parameters 50 | ---------- 51 | min_n_components : int, minimum number of orthogonal components to remove 52 | 53 | k : int 54 | number of folds for k-fold cross-validation (default 10). If set to -1, leave-one-out cross-validation is used. 55 | 56 | scale : boolean, scale data? (default True) 57 | 58 | n_permutations : int, number of times to permute the target for the metric permutation tests 59 | 60 | Attributes 61 | ---------- 62 | q_squared_: float, overall Q-squared metric for the regression, the R-squared value of the left-out data. 63 | 64 | permutation_q_squared_: array [n_splits*n_permutations] 65 | The R-squared metric for the left-out data for each permutation 66 | 67 | q_squared_p_value_ : float 68 | The p-value for the permutation test on Q-squared 69 | 70 | 71 | r_squared_Y_: float, overall R-squared metric for the regression 72 | 73 | r_squared_X_: float, overall R-squared X metric (the ratio of the variance in the transformed X to the variance in the original X) 74 | 75 | discriminant_q_squared_: float 76 | Discriminant Q-squared, if this is an OPLSDA problem. Discriminant Q-squared disregards the error of class 77 | predictions whose values are beyond the class labels (e.g. it treats predictions of -1.5 as -1 and 1.5 as 1). 78 | 79 | permutation_discriminant_q_squared_: array [n_splits*n_permutations] 80 | The discriminant R-squared metric for the left-out data for each permutation. 81 | 82 | discriminant_q_squared_p_value_ : float 83 | The p-value for the permutation test on DQ-squared 84 | 85 | accuracy_ : float, accuracy for discrimination 86 | 87 | discriminant_r_squared_: float 88 | Discriminant R-squared, if this is an OPLSDA problem. Discriminant R-squared disregards the error of class 89 | predictions whose values are beyond the class labels (e.g. it treats predictions of -1.5 as -1 and 1.5 as 1). 
90 | 91 | permutation_accuracy_: array [n_splits*n_permutations] 92 | The accuracy of the left-out data for each permutation 93 | 94 | accuracy_p_value_: float 95 | The p-value for the permutation test on accuracy 96 | 97 | roc_auc_ : float, area under ROC curve for discrimination 98 | 99 | permutation_roc_auc_: array [n_splits*n_permutations] 100 | The area under the ROC curve of the left-out data for each permutation. 101 | 102 | roc_auc_p_value_: float 103 | The p-value for the permutation test on the area under the ROC curve. 104 | 105 | n_components_ : int 106 | The optimal number of orthogonal components to remove 107 | 108 | feature_significance_ : array [n_features], type bool 109 | Whether permuting the feature results in a significantly different loading for that feature in the model. 110 | Defined as the loading for the non-permuted data being outside the "middle" of the distribution of loadings 111 | for the permuted data, where the boundaries are a percentile range defined by outer_alpha. 112 | 113 | feature_p_values_ : array [n_features] 114 | An estimated p-value for the significance of the feature, defined as the fraction of permuted loading values 115 | that fall inside (-p, p), where p is the loading for the non-permuted data. 116 | 117 | permutation_loadings_ : array [n_inner_permutations, n_features] 118 | Values for the loadings for the permuted data. 119 | 120 | loadings_ : array [n_features] 121 | Loadings for the non-permuted data 122 | 123 | opls_ : OPLS 124 | The OPLS transformer 125 | 126 | pls_ : PLSRegression 127 | A 1-component PLS regressor used to evaluate the OPLS transform 128 | 129 | 130 | References 131 | ---------- 132 | Johan Trygg and Svante Wold. Orthogonal projections to latent structures (O-PLS). 133 | J. Chemometrics 2002; 16: 119-128. DOI: 10.1002/cem.695 134 | 135 | Johan A. Westerhuis, Ewoud J. J. van Velzen, Huub C. J. Hoefsloot and Age K. Smilde. 136 | Discriminant Q-squared (DQ-squared) for improved discrimination in PLSDA models. 137 | Metabolomics (2008) 4: 293.
https://doi.org/10.1007/s11306-008-0126-2 138 | """ 139 | 140 | def __init__(self, 141 | min_n_components=1, 142 | k=10, 143 | scale=True, 144 | force_regression=False, 145 | n_permutations=100, 146 | n_inner_permutations=100, 147 | n_outer_permutations=500, 148 | inner_alpha=0.2, 149 | outer_alpha=0.05): 150 | self.min_n_components = min_n_components 151 | self.k = k 152 | self.scale = scale 153 | self.n_permutations = n_permutations 154 | self.n_inner_permutations = n_inner_permutations 155 | self.n_outer_permutations = n_outer_permutations 156 | self.inner_alpha = inner_alpha 157 | self.outer_alpha = outer_alpha 158 | self.force_regression = force_regression 159 | self.n_components_ = None 160 | self.feature_significance_ = None 161 | self.feature_p_values_ = None 162 | 163 | self.r_squared_Y_ = None 164 | self.discriminant_r_squared_ = None 165 | self.r_squared_X_ = None 166 | 167 | self.q_squared_ = None 168 | self.permutation_q_squared_ = None 169 | self.q_squared_p_value_ = None 170 | 171 | self.accuracy_ = None 172 | self.permutation_accuracy_ = None 173 | self.accuracy_p_value_ = None 174 | 175 | self.roc_auc_ = None 176 | self.permutation_roc_auc_ = None 177 | self.roc_auc_p_value_ = None 178 | 179 | self.discriminant_q_squared_ = None 180 | self.discriminant_q_squared_p_value_ = None 181 | self.permutation_discriminant_q_squared_ = None 182 | 183 | self.permutation_loadings_ = None 184 | self.pls_ = None # a 1-component PLSRegression 185 | self.opls_ = None # OPLS transform 186 | self.loadings_ = None 187 | self.binarizer_ = None 188 | 189 | @staticmethod 190 | def _get_validator(Y, k): 191 | if k == -1: 192 | return LeaveOneOut() 193 | else: 194 | if type_of_target(Y) in ('binary', 'multiclass'): 195 | return StratifiedKFold(k) 196 | else: 197 | return KFold(k) 198 | 199 | def is_discrimination(self, Y): 200 | return type_of_target(Y).startswith('binary') and not self.force_regression 201 | 202 | def _get_score_function(self, Y): 203 | return neg_pressd if self.is_discrimination(Y) else neg_press 204 | 205 | def _validate(self, X, Y, n_components, score_function, cv=None, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 206 | cv = cv or self._get_validator(Y, self.k) 207 | Z = OPLS(n_components, self.scale).fit_transform(X, Y) 208 | y_pred = cross_val_predict(PLSRegression(1, scale=self.scale), Z, Y, cv=cv, n_jobs=n_jobs, verbose=verbose, 209 | pre_dispatch=pre_dispatch) 210 | return score_function(Y, y_pred) 211 | 212 | def _process_binary_target(self, y, pos_label=None): 213 | self.binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1)  # keyword arguments; these are keyword-only in current scikit-learn 214 | self.binarizer_.fit(y) 215 | if pos_label is not None and self.binarizer_.transform([pos_label])[0] == -1: 216 | self.binarizer_.classes_ = np.flip(self.binarizer_.classes_) 217 | return self.binarizer_.transform(y).astype(float) 218 | 219 | def _check_target(self, y, pos_label=None): 220 | y = check_array(y, dtype=None, copy=True, ensure_2d=False).reshape(-1, 1) 221 | if type_of_target(y).startswith('multiclass') and not self.force_regression: 222 | raise ValueError('Multiclass input not directly supported. ' 223 | 'Try binarizing with sklearn.preprocessing.LabelBinarizer.') 224 | if self.is_discrimination(y): 225 | y = self._process_binary_target(y, pos_label) 226 | else: 227 | self.binarizer_ = None 228 | return y 229 | 230 | def _determine_n_components(self, X, y, cv=None, scoring=None, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 231 | """Determine number of orthogonal components to remove.
232 | 233 | Orthogonal components are removed until removing an additional component no longer improves the performance 234 | of the k-fold cross-validated OPLS estimator, as measured by the residual sum of squares of the left-out 235 | data. 236 | 237 | Parameters 238 | ---------- 239 | X : array-like, shape = [n_samples, n_features] 240 | Training vectors, where n_samples is the number of samples and 241 | n_features is the number of predictors. 242 | 243 | y : array-like, shape = [n_samples, 1] 244 | Target vector, where n_samples is the number of samples. 245 | This implementation only supports a single response (target) variable. 246 | 247 | cv : sklearn.model_selection.BaseCrossValidator 248 | A cross-validator. If None, _get_validator() is used to select one: sklearn.model_selection.LeaveOneOut 249 | if k=-1; otherwise sklearn.model_selection.StratifiedKFold for binary or multiclass targets and 250 | sklearn.model_selection.KFold for continuous targets. 251 | 252 | scoring : callable 253 | Score function with signature score(y_true, y_pred). If None, negative PRESS is used for regression and negative discriminant PRESS (which clips predictions to [-1, 1]) is used for discrimination. 254 | 255 | n_jobs : int or None, optional (default=None) 256 | The number of CPUs to use to do the computation. 257 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 258 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 259 | for more details. 260 | 261 | verbose : integer, optional 262 | The verbosity level. 263 | 264 | pre_dispatch : int, or string, optional 265 | Controls the number of jobs that get dispatched during parallel 266 | execution. Reducing this number can be useful to avoid an 267 | explosion of memory consumption when more jobs get dispatched 268 | than CPUs can process. This parameter can be: 269 | 270 | - None, in which case all the jobs are immediately 271 | created and spawned. Use this for lightweight and 272 | fast-running jobs, to avoid delays due to on-demand 273 | spawning of the jobs 274 | 275 | - An int, giving the exact number of total jobs that are 276 | spawned 277 | 278 | - A string, giving an expression as a function of n_jobs, 279 | as in '2*n_jobs' 280 | Returns 281 | ------- 282 | n_components: int 283 | The number of components to remove to maximize Q-squared 284 | 285 | """ 286 | cv = cv or self._get_validator(y, self.k) 287 | scoring = scoring or self._get_score_function(y) 288 | n_components = self.min_n_components 289 | 290 | score = self._validate(X, y, n_components, scoring, cv, n_jobs, verbose, pre_dispatch) 291 | while n_components < X.shape[1]: 292 | next_score = self._validate(X, y, n_components + 1, scoring, cv, n_jobs, verbose, pre_dispatch) 293 | if next_score <= score:  # an additional component did not improve the cross-validated score 294 | break 295 | else: 296 | score = next_score 297 | n_components += 1 298 | return n_components 299 | 300 | def _determine_significant_features(self, 301 | X, 302 | y, 303 | n_components, 304 | random_state=0, 305 | n_jobs=None, 306 | verbose=0, 307 | pre_dispatch='2*n_jobs'): 308 | """Determine the significance of each feature 309 | 310 | This is done by permuting each feature in X and measuring its loading. 311 | The feature is considered significant if the non-permuted loading differs significantly from the permuted loadings. 312 | 313 | This is always done with a regular PLS regressor; 314 | for PLS-DA problems the target should be binarized first. 315 | 316 | Parameters 317 | ---------- 318 | X : array-like, shape = [n_samples, n_features] 319 | Training vectors, where n_samples is the number of samples and 320 | n_features is the number of predictors.
321 | 322 | y : array-like, shape = [n_samples, 1] 323 | Target vector, where n_samples is the number of samples. 324 | This implementation only supports a single response (target) variable. 325 | 326 | n_components : int 327 | The number of orthogonal components to remove 328 | 329 | n_jobs : int or None, optional (default=None) 330 | The number of CPUs to use to do the computation. 331 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 332 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 333 | for more details. 334 | 335 | verbose : integer, optional 336 | The verbosity level. 337 | 338 | 339 | pre_dispatch : int, or string, optional 340 | Controls the number of jobs that get dispatched during parallel 341 | execution. Reducing this number can be useful to avoid an 342 | explosion of memory consumption when more jobs get dispatched 343 | than CPUs can process. This parameter can be: 344 | 345 | - None, in which case all the jobs are immediately 346 | created and spawned. Use this for lightweight and 347 | fast-running jobs, to avoid delays due to on-demand 348 | spawning of the jobs 349 | 350 | - An int, giving the exact number of total jobs that are 351 | spawned 352 | 353 | - A string, giving an expression as a function of n_jobs, 354 | as in '2*n_jobs' 355 | 356 | Returns 357 | ------- 358 | significance: array [n_features], type bool 359 | Whether a particular feature is significant. 360 | 361 | p_values: array [n_features] 362 | The p-values for each feature. The null hypothesis is that permuting the feature does not change its loading 363 | in the one-component PLS model. 364 | 365 | permuted_loadings: array [n_inner_permutations, n_features] 366 | The one-component PLS loadings for each permutation 367 | """ 368 | Z = OPLS(n_components, self.scale).fit_transform(X, y) 369 | x_loadings, permutation_x_loadings, p_values = feature_permutation_loading( 370 | PLSRegression(1, scale=self.scale), Z, y, self.n_inner_permutations, self.inner_alpha, 371 | self.n_outer_permutations, random_state, n_jobs, verbose, pre_dispatch 372 | ) 373 | return p_values < self.outer_alpha, p_values, permutation_x_loadings 374 | 375 | def cross_val_roc_curve(self, X, y, cv=None, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 376 | check_is_fitted(self, ['opls_', 'pls_', 'binarizer_'])  # check fit first so an unfitted model raises NotFittedError, not AttributeError 377 | Z = self.opls_.transform(X) 378 | cv = cv or self._get_validator(y, self.k) 379 | y_pred = cross_val_predict(PLSRegression(1, scale=self.scale), Z, y, cv=cv, n_jobs=n_jobs, verbose=verbose, 380 | pre_dispatch=pre_dispatch) 381 | return roc_curve(y, y_pred) 382 | 383 | def fit(self, X, y, n_components=None, cv=None, pos_label=None, 384 | random_state=0, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 385 | """Evaluate the quality of the OPLS regressor 386 | 387 | The Q-squared value and a p-value for each feature's significance are determined. The fitted models can be 388 | accessed as opls_ (the OPLS transformer) and pls_ (the 1-component PLS regressor). 389 | 390 | Parameters 391 | ---------- 392 | X : array-like, shape = [n_samples, n_features] 393 | Training vectors, where n_samples is the number of samples and 394 | n_features is the number of predictors. 395 | 396 | y : array-like, shape = [n_samples, 1] 397 | Target vector, where n_samples is the number of samples. 398 | This implementation only supports a single response (target) variable. 399 | 400 | n_components : int 401 | The number of orthogonal components to remove.
Will be determined by _determine_n_components() if None 402 | 403 | cv : sklearn.model_selection.BaseCrossValidator 404 | A cross-validator to use for the determination of the number of components and the Q-squared value. 405 | If None, a cross-validator will be selected based on the value of k and the values of the target variable: 406 | sklearn.model_selection.LeaveOneOut if k=-1; otherwise sklearn.model_selection.StratifiedKFold if the 407 | target is binary or multiclass and sklearn.model_selection.KFold if it is continuous. 408 | 409 | pos_label : string 410 | If this is a discrimination problem, the value of the target corresponding to "1". 411 | 412 | random_state : int, RandomState instance or None, optional (default=0) 413 | If int, random_state is the seed used by the random number generator; 414 | If RandomState instance, random_state is the random number generator; 415 | If None, the random number generator is the RandomState instance used 416 | by `np.random`. 417 | 418 | n_jobs : int or None, optional (default=None) 419 | The number of CPUs to use to do the computation. 420 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 421 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 422 | for more details. 423 | 424 | verbose : integer, optional 425 | The verbosity level. 426 | 427 | 428 | pre_dispatch : int, or string, optional 429 | Controls the number of jobs that get dispatched during parallel 430 | execution. Reducing this number can be useful to avoid an 431 | explosion of memory consumption when more jobs get dispatched 432 | than CPUs can process. This parameter can be: 433 | 434 | - None, in which case all the jobs are immediately 435 | created and spawned. Use this for lightweight and 436 | fast-running jobs, to avoid delays due to on-demand 437 | spawning of the jobs 438 | 439 | - An int, giving the exact number of total jobs that are 440 | spawned 441 | 442 | - A string, giving an expression as a function of n_jobs, 443 | as in '2*n_jobs' 444 | 445 | """ 446 | 447 | def _log(txt): 448 | if 0 < verbose <= 50: 449 | stderr.write(txt + '\n') 450 | if verbose > 50: 451 | print(txt) 452 | 453 | X = check_array(X, dtype=float, copy=True) 454 | y = self._check_target(y, pos_label) 455 | 456 | if not n_components: 457 | _log('Determining number of components to remove.') 458 | n_components = self._determine_n_components(X, y, cv, n_jobs=n_jobs, verbose=verbose, 459 | pre_dispatch=pre_dispatch) 460 | _log(f'Removing {n_components} orthogonal components.') 461 | self.n_components_ = n_components  # already determined above when not supplied by the caller 462 | 463 | self.opls_ = OPLS(self.n_components_, self.scale).fit(X, y) 464 | Z = self.opls_.transform(X) 465 | self.pls_ = PLSRegression(1, scale=self.scale).fit(Z, y) 466 | self.r_squared_X_ = self.opls_.score(X) 467 | y_pred = self.pls_.predict(Z) 468 | self.r_squared_Y_ = r2_score(y, y_pred) 469 | if self.is_discrimination(y): 470 | self.discriminant_r_squared_ = r2_score(y, np.clip(y_pred, -1, 1)) 471 | 472 | cv = cv or self._get_validator(y, self.k) 473 | 474 | score_functions = [r2_score] 475 | if self.is_discrimination(y): 476 | score_functions += [discriminator_r2_score, discriminator_accuracy, discriminator_roc_auc] 477 | 478 | _log('Performing cross-validated metric permutation tests.') 479 | 480 | cv_results = permutation_test_score(PLSRegression(1, scale=self.scale), Z, y, cv=cv, 481 | n_permutations=self.n_permutations, cv_score_functions=score_functions, 482 | n_jobs=n_jobs, verbose=verbose,
pre_dispatch=pre_dispatch) 483 | if self.is_discrimination(y): 484 | [ 485 | (self.q_squared_, self.permutation_q_squared_, self.q_squared_p_value_), 486 | (self.discriminant_q_squared_, self.permutation_discriminant_q_squared_, 487 | self.discriminant_q_squared_p_value_), 488 | (self.accuracy_, self.permutation_accuracy_, self.accuracy_p_value_), 489 | (self.roc_auc_, self.permutation_roc_auc_, self.roc_auc_p_value_) 490 | ] = cv_results 491 | else: 492 | [ 493 | (self.q_squared_, self.permutation_q_squared_, self.q_squared_p_value_) 494 | ] = cv_results 495 | 496 | _log('Estimating feature significance.') 497 | 498 | (self.feature_significance_, 499 | self.feature_p_values_, 500 | self.permutation_loadings_) = self._determine_significant_features(X, y, self.n_components_, random_state, 501 | n_jobs, verbose, pre_dispatch) 502 | return self 503 | 504 | def transform(self, X): 505 | return self.opls_.transform(X) 506 | 507 | def predict(self, X): 508 | Z = self.transform(X) 509 | return self.pls_.predict(Z) 510 | 511 | def score(self, X, y, sample_weight=None): 512 | Z = self.transform(X) 513 | return r2_score(y, self.pls_.predict(Z), sample_weight=sample_weight)  # pass sample_weight through instead of silently ignoring it 514 | 515 | def discriminator_roc(self, X, y): 516 | Z = self.transform(X) 517 | return roc_curve(y, self.pls_.predict(Z)) 518 | 519 | 520 | class OPLSDAValidator(OPLSValidator, ClassifierMixin): 521 | def __init__(self, 522 | min_n_components=1, 523 | k=10, 524 | scale=True, 525 | force_regression=False, 526 | n_permutations=100, 527 | n_inner_permutations=100, 528 | n_outer_permutations=500, 529 | inner_alpha=0.2, 530 | outer_alpha=0.01): 531 | super().__init__(min_n_components, 532 | k, 533 | scale, 534 | force_regression, 535 | n_permutations, 536 | n_inner_permutations, 537 | n_outer_permutations, 538 | inner_alpha, 539 | outer_alpha) 540 | 541 | def score(self, X, y, sample_weight=None): 542 | Z = self.transform(X) 543 | y_pred = self.pls_.predict(Z) 544 | return r2_score(y, np.clip(y_pred, -1, 1), sample_weight=sample_weight) 545 | 546 | def predict(self, X): 547 | Z = self.opls_.transform(X) 548 | values = np.sign(self.pls_.predict(Z)) 549 | return self.binarizer_.inverse_transform(values).reshape(-1, 1) 550 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.11.0 2 | scipy>=0.18.0 3 | scikit-learn>=0.18.0 -------------------------------------------------------------------------------- /roc_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BiRG/pyopls/be801f1e021898d3337ba30795644130ec425e49/roc_curve.png -------------------------------------------------------------------------------- /scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BiRG/pyopls/be801f1e021898d3337ba30795644130ec425e49/scores.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', 'r') as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name='pyopls', 8 | version='20.03-1', 9 | author='BiRG @ Wright State
University', 10 | author_email='foose.3@wright.edu', 11 | description='Orthogonal Projection to Latent Structures', 12 | long_description=long_description, 13 | long_description_content_type='text/markdown', 14 | url='https://github.com/BiRG/pyopls', 15 | keywords='metabolomics chemometrics partial-least-squares', 16 | download_url='https://github.com/BiRG/pyopls/archive/20.02.tar.gz', 17 | packages=setuptools.find_packages(), 18 | python_requires='>=3.5', 19 | install_requires=[ 20 | 'numpy>=1.11.0', 21 | 'scipy>=0.18.0', 22 | 'scikit-learn>=0.18.0' 23 | ], 24 | classifiers=[ 25 | 'Natural Language :: English', 26 | 'Intended Audience :: Science/Research', 27 | 'Programming Language :: Python :: 3.6', 28 | 'Programming Language :: Python :: 3.7', 29 | 'Programming Language :: Python :: 3.8', 30 | 'Programming Language :: Python :: 3.9', 31 | 'License :: OSI Approved :: MIT License', 32 | 'Operating System :: OS Independent', 33 | ], 34 | ) 35 | --------------------------------------------------------------------------------
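For reference, here is a minimal end-to-end usage sketch of the `OPLSValidator` API defined in `pyopls/validation.py` above. It is a sketch, not part of the package source: it assumes `OPLSValidator` is exported from the package's top-level `__init__.py` (as `OPLS` is in `pyopls/tests/test_opls.py`) and that the script is run from the repository root so the bundled test arrays resolve.

```python
import numpy as np

from pyopls import OPLSValidator

# The bundled test data, loaded the same way as in pyopls/tests/test_opls.py.
spectra = np.load('pyopls/tests/features.npy')
target = np.load('pyopls/tests/target.npy')

# k=-1 selects leave-one-out cross-validation; n_permutations is kept small
# here so the sketch runs quickly (the default is 100).
validator = OPLSValidator(k=-1, n_permutations=25).fit(spectra, target)

print(f'Removed {validator.n_components_} orthogonal component(s)')
print(f'Q2 = {validator.q_squared_:.3f} (p = {validator.q_squared_p_value_:.3f})')
print(f'{int(validator.feature_significance_.sum())} feature(s) significant at alpha = {validator.outer_alpha}')

# The corrected feature matrix and predictions from the fitted 1-component
# PLS model are available through the usual scikit-learn-style methods.
Z = validator.transform(spectra)
y_pred = validator.predict(spectra)
```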