├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── colorectal_cancer_nmr.csv ├── pyopls │   ├── .gitignore │   ├── __init__.py │   ├── kernel_density.py │   ├── opls.py │   ├── permutation_test.py │   ├── tests │   │   ├── features.npy │   │   ├── target.npy │   │   └── test_opls.py │   └── validation.py ├── requirements.txt ├── roc_curve.png ├── scores.png ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .idea/* 107 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | - "3.7" 5 | - "3.8" 6 | #- "nightly" # nightly build turned off for now because numpy cannot be installed using pip yet 7 | install: 8 | - pip install -r requirements.txt 9 | script: 10 | - python -m pytest pyopls/tests/ 11 | deploy: 12 | provider: pypi 13 | user: __token__ 14 | password: 15 | secure: HJJK7VFqtvpfrGKm2Tvikmur5xVbeLIKT1d6sz+NR9KII9OYReTGi6gGjFSxmPOLTLKuMHpJ8GLfUipK0RiwQYZMe6ve9kuAX4w3Gcn3juR2CLRjc78zwg3u7vQzunuUmXO1vjDhIOawgnbwcaHKZ0bnnHCs39uwUfdxq673SrmlSLCNrRLLsfUOMycCX1HVtAlSDGvdPdsl6DMx+YqJp05uc6oVk50RgojQxjetJtonUcEeEIWXm6V/twQc4wa8mlt/ZnCnkF3kccQUp+a4/P7uoHzF6GLOjJ15rvPz6f/J8wZr1n9Jz5Ng0U4aJFX3clVMGJLcOaRf3owcydIQvfyKzwKuqPN0nnvPv4FMKTNPEMpLVwIc4PUJFrZ1g4013tZZT6zuEqKYrOPpz/nifobWhLUc2ktHt3t0B6VR2VOcYJRWlMpuI9gwAYq0dMtlnIWReYeerLWMLnpzFwAHfogOjKoCAIEPM/PmtjNh2hrYIZZhiH+wCuqcYG4SrC9J0azx8vXWzbxYPRNfvZpxmNjZ2kAeEYsQ0BEL1UI9839rmkOGyVqbEZ9VP1GHg5KsifmeD+VzZEaJOS+PYu8YPuHrfah7MYswaQC4uyU/vzEJlc3oZ/15rqzsgj9GKqWmMS25OvXxOilt9W5CpIKrV7nNzfn22JOKHRMwjPS28f8= 16 | skip_existing: true 17 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Wright State University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyopls - Orthogonal Projection to Latent Structures in Python. 2 | [![Build Status](https://travis-ci.org/BiRG/pyopls.svg?branch=master)](https://travis-ci.org/BiRG/pyopls) 3 | 4 | This package provides a scikit-learn-style transformer to perform OPLS. 5 | OPLS is a pre-processing method to remove variation from the descriptor 6 | variables that is orthogonal to the target variable (1). 7 | 8 | This package also provides a class to validate OPLS models using a 9 | 1-component PLS regression with cross-validation and permutation tests (2) 10 | for both regression and classification metrics (from permutations of the 11 | target) and feature PLS loadings (from permutations of the features). 12 | 13 | ## Table of Contents 14 | 1. [Installation](#installation) 15 | 2. [Notes](#notes) 16 | 3. [Examples](#examples) 17 | 1. [OPLS and PLS-DA](#opls-and-pls-da) 18 | 2. [Validation](#validation) 19 | 4. [References](#references) 20 | 5. [Data Acknowledgment](#data-acknowledgment) 21 | 22 | 23 | ## Installation 24 | pyopls is available via [pypi](https://pypi.org/project/pyopls/): 25 | ```shell 26 | pip install pyopls 27 | ``` 28 | You may also install the current master directly from this 29 | repository: 30 | ```shell 31 | pip install git+git://github.com/BiRG/pyopls.git 32 | ``` 33 | New versions are uploaded to pypi whenever the version number is 34 | incremented in `setup.py` on the master branch. 35 | 36 | 37 | ## Notes 38 | * The implementation provided here is equivalent to that of the 39 | [libPLS MATLAB library](http://libpls.net/), which is a faithful 40 | recreation of Trygg and Wold's algorithm. 41 | * This package uses a different definition for R2X, however (see 42 | below). 43 | * `OPLS` inherits `sklearn.base.TransformerMixin` (like 44 | `sklearn.decomposition.PCA`) but does not inherit 45 | `sklearn.base.RegressorMixin` because it is not a regressor like 46 | `sklearn.cross_decomposition.PLSRegression`. You can use the output of 47 | `OPLS.transform()` as an input to another regressor or classifier. 
48 | * Like `sklearn.cross_decomposition.PLSRegression`, `OPLS` will center 49 | both X and Y before performing the algorithm. This makes centering by 50 | class in PLS-DA models unnecessary. 51 | * The `score()` function of `OPLS` returns the R2X score, the 52 | ratio of the variance in the transformed X to the variance in the 53 | original X. A lower score indicates more orthogonal variance removed. 54 | * `OPLS` only supports 1-column targets. 55 | 56 | ## Examples 57 | ### OPLS and PLS-DA 58 | A CSV file containing 1H-NMR spectra for 118 serum samples of patients 59 | with colon cancer diagnoses and healthy controls is located in 60 | `colorectal_cancer_nmr.csv` in the root of this repository (see 61 | acknowledgment below). 62 | 63 | OPLS-processed data require only 1 PLS component. Performing a 64 | 39-component OPLS improves cross-validated accuracy from 70% to 100%, 65 | AUC from 0.578 to 1 and DQ2 (3) from -0.11 to 0.98. 66 | 67 | ```python 68 | import pandas as pd 69 | import numpy as np 70 | import matplotlib.pyplot as plt 71 | from sklearn.metrics import roc_curve, roc_auc_score 72 | from pyopls import OPLS 73 | from sklearn.cross_decomposition import PLSRegression 74 | from sklearn.model_selection import cross_val_predict, LeaveOneOut 75 | from sklearn.metrics import r2_score, accuracy_score 76 | 77 | 78 | spectra = pd.read_csv('colorectal_cancer_nmr.csv', index_col=0) 79 | spectra = spectra[spectra.classification.isin(['Colorectal Cancer', 'Healthy Control'])] 80 | target = spectra.classification.apply(lambda x: 1 if x == 'Colorectal Cancer' else -1) 81 | spectra = spectra.drop('classification', axis=1) 82 | 83 | opls = OPLS(39) 84 | Z = opls.fit_transform(spectra, target) 85 | 86 | pls = PLSRegression(1) 87 | y_pred = cross_val_predict(pls, spectra, target, cv=LeaveOneOut()) 88 | q_squared = r2_score(target, y_pred) # -0.107 89 | dq_squared = r2_score(target, np.clip(y_pred, -1, 1)) # -0.106 90 | accuracy = accuracy_score(target, np.sign(y_pred)) # 0.705 91 | 92 | processed_y_pred = cross_val_predict(pls, Z, target, cv=LeaveOneOut()) 93 | processed_q_squared = r2_score(target, processed_y_pred) # 0.981 94 | processed_dq_squared = r2_score(target, np.clip(processed_y_pred, -1, 1)) # 0.984 95 | processed_accuracy = accuracy_score(target, np.sign(processed_y_pred)) # 1.0 96 | 97 | r2_X = opls.score(spectra) # 7.8e-12 (most variance is removed) 98 | 99 | fpr, tpr, thresholds = roc_curve(target, y_pred) 100 | roc_auc = roc_auc_score(target, y_pred) 101 | proc_fpr, proc_tpr, proc_thresholds = roc_curve(target, processed_y_pred) 102 | proc_roc_auc = roc_auc_score(target, processed_y_pred) 103 | 104 | plt.figure(0) 105 | plt.plot(fpr, tpr, lw=2, color='blue', label=f'Unprocessed (AUC={roc_auc:.4f})') 106 | plt.plot(proc_fpr, proc_tpr, lw=2, color='red', 107 | label=f'39-component OPLS (AUC={proc_roc_auc:.4f})') 108 | plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--') 109 | plt.xlabel('False Positive Rate') 110 | plt.ylabel('True Positive Rate') 111 | plt.title('ROC Curve') 112 | plt.legend(loc='lower right') 113 | plt.show() 114 | 115 | plt.figure(1) 116 | pls.fit(Z, target) 117 | df = pd.DataFrame(np.column_stack([pls.x_scores_, opls.T_ortho_[:, 0]]), 118 | index=spectra.index, columns=['t', 't_ortho']) 119 | pos_df = df[target==1] 120 | neg_df = df[target==-1] 121 | plt.scatter(neg_df['t'], neg_df['t_ortho'], c='blue', label='Healthy Control') 122 | plt.scatter(pos_df['t'], pos_df['t_ortho'], c='red', label='Colorectal Cancer') 123 | plt.title('PLS Scores') 124 | 
plt.xlabel('t') 125 | plt.ylabel('t_ortho') 126 | plt.legend(loc='upper right') 127 | plt.show() 128 | ``` 129 | #### ROC Curve 130 | ![roc curve](roc_curve.png) 131 | #### Scores Plot 132 | ![scores plot](scores.png) 133 | ### Validation 134 | The `fit()` method of `OPLSValidator` will find the optimum number of 135 | components to remove, then evaluate the results on a 1-component 136 | `sklearn.cross_decomposition.PLSRegression` model. A permutation test is 137 | performed for each metric by permuting the target and for the PLS 138 | loadings by permuting the features. 139 | 140 | This snippet will determine the best number of components to remove, 141 | perform permutation tests for regression metrics and perform two-tailed 142 | permutation tests for each feature (bin) relative to its loading. The 143 | feature permutation tests for the colorectal cancer dataset would take 144 | quite some time, as they require that the model be fit as many as 874k 145 | times. So instead, we look at the 146 | [UCI ML Wine Dataset](https://archive.ics.uci.edu/ml/datasets/Wine) 147 | provided by 148 | [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html). 149 | The feature permutation tests reveal that hue and malic acid do not 150 | differentiate class 1 from class 0. 151 | 152 | ```python 153 | import pandas as pd 154 | from pyopls import OPLSValidator 155 | from sklearn.datasets import load_wine 156 | 157 | wine_data = load_wine() 158 | df = pd.DataFrame(wine_data['data'], columns=wine_data['feature_names']) 159 | df['classification'] = wine_data['target'] 160 | df = df[df.classification.isin((0, 1))] 161 | target = df.classification.apply(lambda x: 1 if x else -1) # discriminant for class 1 vs class 0 162 | X = df[[c for c in df.columns if c!='classification']] 163 | 164 | validator = OPLSValidator(k=-1).fit(X, target) 165 | 166 | Z = validator.opls_.transform(X) 167 | 168 | feature_df = pd.DataFrame() 169 | feature_df['feature_name'] = wine_data['feature_names'] 170 | feature_df['feature_p_value'] = validator.feature_p_values_ 171 | feature_df['feature_loading'] = validator.pls_.x_loadings_ 172 | print(feature_df.loc[feature_df.feature_loading.abs().sort_values(ascending=False).index].to_markdown()) # Pandas 1.0+ required for to_markdown 173 | ``` 174 | #### Feature importances 175 | | | feature\_name | feature\_p\_value | feature\_loading | 176 | |---:|:-----------------------------|------------------:|-----------------:| 177 | | 12 | proline | 0.00990099 | 0.385955 | 178 | | 9 | color_intensity | 0.00990099 | 0.381981 | 179 | | 0 | alcohol | 0.00990099 | 0.379567 | 180 | | 6 | flavanoids | 0.00990099 | 0.359975 | 181 | | 5 | total_phenols | 0.00990099 | 0.336182 | 182 | | 11 | od280/od315_of_diluted_wines | 0.00990099 | 0.299045 | 183 | | 3 | alcalinity_of_ash | 0.00990099 | -0.239887 | 184 | | 2 | ash | 0.00990099 | 0.22916 | 185 | | 7 | nonflavanoid_phenols | 0.00990099 | -0.224338 | 186 | | 4 | magnesium | 0.00990099 | 0.18662 | 187 | | 8 | proanthocyanins | 0.00990099 | 0.181767 | 188 | | 1 | malic_acid | 0.564356 | 0.0293328 | 189 | | 10 | hue | 0.623762 | 0.0210777 | 190 | 191 | ## References 192 | 1. Johan Trygg and Svante Wold. Orthogonal projections to latent structures (O-PLS). 193 | *J. Chemometrics* 2002; 16: 119-128. DOI: [10.1002/cem.695](https://dx.doi.org/10.1002/cem.695) 194 | 2. Eugene Edgington and Patrick Onghena. "Calculating P-Values" in *Randomization tests*, 4th edition. 195 | New York: Chapman & Hall/CRC, 2007, pp. 33-53. 
DOI: [10.1201/9781420011814](https://doi.org/10.1201/9781420011814). 196 | 3. Johan A. Westerhuis, Ewoud J. J. van Velzen, Huub C. J. Hoefsloot, Age K. Smilde. Discriminant Q-squared for 197 | improved discrimination in PLSDA models. *Metabolomics* 2008; 4: 293-296. 198 | DOI: [10.1007/s11306-008-0126-2](https://doi.org/10.1007/s11306-008-0126-2) 199 | 200 | ## Data Acknowledgment 201 | The test dataset provided at `colorectal_cancer_nmr.csv` in the root of this repository is 202 | available at the NIH Common Fund's National Metabolomics Data Repository 203 | (NMDR) website, the Metabolomics Workbench, 204 | [https://metabolomicsworkbench.org](https://metabolomicsworkbench.org), where it has been assigned Project 205 | ID PR000227. The data can be accessed directly via its Project DOI 206 | [10.21228/M89P43](https://dx.doi.org/10.21228/M89P43). This work is 207 | supported by NIH grant U2C-DK119886. 208 | 209 | *Note*: The test dataset consists only of those spectra belonging to 210 | samples labeled "Colorectal Cancer" or "Healthy Control". The "target" 211 | variable has the value -1 for samples labeled "Healthy Control" and 212 | value +1 for samples labeled "Colorectal Cancer". 213 | -------------------------------------------------------------------------------- /pyopls/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /pyopls/__init__.py: -------------------------------------------------------------------------------- 1 | from pyopls.opls import OPLS 2 | from pyopls.validation import OPLSValidator, OPLSDAValidator 3 | -------------------------------------------------------------------------------- /pyopls/kernel_density.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from joblib import Parallel, delayed 3 | from sklearn.model_selection import GridSearchCV, KFold, LeaveOneOut 4 | from sklearn.neighbors import KernelDensity 5 | 6 | 7 | class OPLSKernelDensity: 8 | @staticmethod 9 | def _estimate_bandwidth(vals, grid_search_num, cv, n_jobs, verbose, pre_dispatch): 10 | grid = GridSearchCV(KernelDensity(kernel='gaussian'), 11 | {'bandwidth': 10 ** np.linspace(-1, 1, grid_search_num)}, 12 | cv=cv, n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch, iid=False) 13 | grid.fit(vals.reshape(-1, 1)) 14 | return grid.best_params_['bandwidth'] 15 | 16 | @staticmethod 17 | def _kde(x, vals, bw): 18 | kd = KernelDensity(kernel='gaussian', bandwidth=bw).fit(vals.reshape(-1, 1)) 19 | return kd.score_samples(x.reshape(-1, 1)) 20 | 21 | @staticmethod 22 | def _estimate_kde_abscissa(vals, num): 23 | return np.linspace(vals.min(), vals.max(), num) 24 | 25 | def get_kdes(self, opls_cv, num=None, bandwidth=None, k=5, grid_search_num=100, 26 | n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 27 | # Get a kernel-density estimate for permutation test results 28 | num = num or 2 * opls_cv.n_permutations 29 | 30 | def _named_kde(key, x, vals, bw): 31 | return key, self._kde(x, vals, bw) 32 | 33 | def _named_abscissa(key, vals, n): 34 | return key, self._estimate_kde_abscissa(vals, n) 35 | 36 | if k == -1: 37 | cv = LeaveOneOut() 38 | else: 39 | cv = KFold(k) 40 | loading_bandwidths = [ 41 | self._estimate_bandwidth(vals, grid_search_num, cv, n_jobs, verbose, pre_dispatch) 42 | for vals in np.hsplit(opls_cv.permutation_loadings_, opls_cv.permutation_loadings_.shape[1]) 43 | ] if bandwidth is None else [bandwidth for _ in range(opls_cv.permutation_loadings_.shape[1])] 44 | 45 | loading_abscissae = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 46 | delayed(self._estimate_kde_abscissa)(vals, num) 47 | for vals in 
np.hsplit(opls_cv.permutation_loadings_, opls_cv.permutation_loadings_.shape[1]) 48 | ) 49 | loading_kdes = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 50 | delayed(self._kde)(x, vals, bw) 51 | for x, vals, bw in zip(loading_abscissae, 52 | np.hsplit(opls_cv.permutation_loadings_, opls_cv.permutation_loadings_.shape[1]), 53 | loading_bandwidths) 54 | ) 55 | results = { 56 | 'loadings': { 57 | 'x': np.column_stack(loading_abscissae), 58 | 'kde': np.column_stack(loading_kdes), 59 | 'h': np.hstack(loading_bandwidths) 60 | } 61 | } 62 | metrics = { 63 | 'q_squared': opls_cv.permutation_q_squared_, 64 | 'r_squared_Y': opls_cv.permutation_r_squared_Y_, 65 | 'discriminator_q_squared': opls_cv.permutation_discriminator_q_squared_, 66 | 'accuracy': opls_cv.permutation_accuracy_, 67 | 'roc_auc': opls_cv.permutation_roc_auc_ 68 | } 69 | metrics = {key: value for key, value in metrics.items() if value is not None} 70 | metric_bandwidths = { 71 | key: self._estimate_bandwidth(value, grid_search_num, cv, n_jobs, verbose, pre_dispatch) 72 | for key, value in metrics.items() 73 | } if bandwidth is None else {key: bandwidth for key in metrics.keys()} 74 | metric_abscissae = { 75 | res[0]: res[1] for res in Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 76 | delayed(_named_abscissa)(key, value, num) for key, value in metrics.items()) 77 | } 78 | metric_kdes = { 79 | res[0]: res[1] for res in Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 80 | delayed(_named_kde)(key, metric_abscissae[key], value, metric_bandwidths[key]) 81 | for key, value in metrics.items() 82 | ) 83 | } 84 | 85 | for key in metrics.keys(): 86 | results[key] = { 87 | 'x': metric_abscissae[key], 88 | 'kde': metric_kdes[key], 89 | 'h': metric_bandwidths[key] 90 | } 91 | 92 | return results 93 | -------------------------------------------------------------------------------- /pyopls/opls.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 Wright State University 2 | # Author: Daniel Foose 3 | # License: MIT 4 | 5 | import numpy as np 6 | from sklearn.base import BaseEstimator, TransformerMixin 7 | from sklearn.utils import check_array 8 | from sklearn.utils.validation import check_consistent_length 9 | 10 | 11 | def _center_scale_xy(X, Y, scale=True): 12 | """ Center X, Y and scale if the scale parameter==True 13 | 14 | Returns 15 | ------- 16 | X, Y, x_mean, y_mean, x_std, y_std 17 | """ 18 | # center 19 | x_mean = X.mean(axis=0) 20 | X -= x_mean 21 | y_mean = Y.mean(axis=0) 22 | Y -= y_mean 23 | # scale 24 | if scale: 25 | x_std = X.std(axis=0, ddof=1) 26 | x_std[x_std == 0.0] = 1.0 27 | X /= x_std 28 | y_std = Y.std(axis=0, ddof=1) 29 | y_std[y_std == 0.0] = 1.0 30 | Y /= y_std 31 | else: 32 | x_std = np.ones(X.shape[1]) 33 | y_std = np.ones(Y.shape[1]) 34 | return X, Y, x_mean, y_mean, x_std, y_std 35 | 36 | 37 | class OPLS(BaseEstimator, TransformerMixin): 38 | """Orthogonal Projection to Latent Structures (O-PLS) 39 | 40 | This class implements the O-PLS algorithm for one (and only one) response as described by [Trygg 2002]. 41 | This is equivalent to the implementation of the libPLS MATLAB library (http://libpls.net/) 42 | 43 | Parameters 44 | ---------- 45 | n_components: int, number of orthogonal components to filter. (default 5). 46 | 47 | scale: boolean, scale data? 
(default True) 48 | 49 | Attributes 50 | ---------- 51 | W_ortho_ : weights orthogonal to y 52 | 53 | P_ortho_ : loadings orthogonal to y 54 | 55 | T_ortho_ : scores orthogonal to y 56 | 57 | x_mean_ : mean of the X provided to fit() 58 | y_mean_ : mean of the Y provided to fit() 59 | x_std_ : std deviation of the X provided to fit() 60 | y_std_ : std deviation of the Y provided to fit() 61 | 62 | References 63 | ---------- 64 | Johan Trygg and Svante Wold. Orthogonal projections to latent structures (O-PLS). 65 | J. Chemometrics 2002; 16: 119-128. DOI: 10.1002/cem.695 66 | """ 67 | def __init__(self, n_components=5, scale=True): 68 | self.n_components = n_components 69 | self.scale = scale 70 | 71 | self.W_ortho_ = None 72 | self.P_ortho_ = None 73 | self.T_ortho_ = None 74 | 75 | self.x_mean_ = None 76 | self.y_mean_ = None 77 | self.x_std_ = None 78 | self.y_std_ = None 79 | 80 | def fit(self, X, Y): 81 | """Fit model to data 82 | 83 | Parameters 84 | ---------- 85 | X : array-like, shape = [n_samples, n_features] 86 | Training vectors, where n_samples is the number of samples and 87 | n_features is the number of predictors. 88 | 89 | Y : array-like, shape = [n_samples, 1] 90 | Target vector, where n_samples is the number of samples. 91 | This implementation only supports a single response (target) variable. 92 | 93 | """ 94 | 95 | # copy since this will contain the residuals (deflated) matrices 96 | check_consistent_length(X, Y) 97 | X = check_array(X, dtype=np.float64, copy=True, ensure_min_samples=2) 98 | Y = check_array(Y, dtype=np.float64, copy=True, ensure_2d=False) 99 | if Y.ndim == 1: 100 | Y = Y.reshape(-1, 1) 101 | 102 | X, Y, self.x_mean_, self.y_mean_, self.x_std_, self.y_std_ = _center_scale_xy(X, Y, self.scale) 103 | 104 | Z = X.copy() 105 | w = np.dot(X.T, Y) # calculate weight vector 106 | w /= np.linalg.norm(w) # normalize weight vector 107 | 108 | W_ortho = [] 109 | T_ortho = [] 110 | P_ortho = [] 111 | 112 | for i in range(self.n_components): 113 | t = np.dot(Z, w) # scores vector 114 | p = np.dot(Z.T, t) / np.dot(t.T, t).item() # loadings of X 115 | w_ortho = p - np.dot(w.T, p).item() / np.dot(w.T, w).item() * w # orthogonal weight 116 | w_ortho = w_ortho / np.linalg.norm(w_ortho) # normalize orthogonal weight 117 | t_ortho = np.dot(Z, w_ortho) # orthogonal scores 118 | p_ortho = np.dot(Z.T, t_ortho) / np.dot(t_ortho.T, t_ortho).item() 119 | Z -= np.dot(t_ortho, p_ortho.T) 120 | W_ortho.append(w_ortho) 121 | T_ortho.append(t_ortho) 122 | P_ortho.append(p_ortho) 123 | 124 | self.W_ortho_ = np.hstack(W_ortho) 125 | self.T_ortho_ = np.hstack(T_ortho) 126 | self.P_ortho_ = np.hstack(P_ortho) 127 | 128 | return self 129 | 130 | def transform(self, X): 131 | """Get the non-orthogonal components of X (which are considered in prediction). 132 | 133 | Parameters 134 | ---------- 135 | X : array-like, shape = [n_samples, n_features] 136 | Training or test vectors, where n_samples is the number of samples and 137 | n_features is the number of predictors (which should be the same predictors the model was trained on). 
138 | 139 | Returns 140 | ------- 141 | X_res, X with the orthogonal data filtered out 142 | """ 143 | Z = check_array(X, copy=True) 144 | 145 | Z -= self.x_mean_ 146 | if self.scale: 147 | Z /= self.x_std_ 148 | 149 | # filter out orthogonal components of X 150 | for i in range(self.n_components): 151 | t = np.dot(Z, self.W_ortho_[:, i]).reshape(-1, 1) 152 | Z -= np.dot(t, self.P_ortho_[:, i].T.reshape(1, -1)) 153 | 154 | return Z 155 | 156 | def fit_transform(self, X, y=None, **fit_params): 157 | """ Learn and apply the filtering on the training data and get the filtered X 158 | 159 | Parameters 160 | ---------- 161 | X : array-like, shape=[n_samples, n_features] 162 | Training vectors, where n_samples is the number of samples and 163 | n_features is the number of predictors. 164 | 165 | y : array-like, shape = [n_samples, 1] 166 | Target vector, where n_samples is the number of samples. 167 | This O-PLS implementation only supports a single response (target) variable. 168 | Y=None will raise ValueError from fit(). 169 | 170 | Returns 171 | ------- 172 | X_filtered 173 | """ 174 | return self.fit(X, y).transform(X) 175 | 176 | def score(self, X): 177 | """ Return the coefficient of determination R^2X of the transformation. 178 | Parameters 179 | ---------- 180 | X : array-like of shape (n_samples, n_features) 181 | Test samples. These should have the same predictors (n_features) 182 | as the X provided to fit(), since the stored centering, scaling 183 | and orthogonal loadings are applied to them before scoring. 184 | 185 | 186 | Returns 187 | ------- 188 | score : float 189 | The amount of variation in X explained by the transformed X. A lower number indicates more orthogonal 190 | variation has been removed. 191 | """ 192 | X = check_array(X) 193 | Z = self.transform(X) 194 | return np.sum(np.square(Z)) / np.sum(np.square(X - self.x_mean_)) # Z is already properly centered 195 | -------------------------------------------------------------------------------- /pyopls/permutation_test.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from sys import stderr 3 | 4 | import numpy as np 5 | from joblib import Parallel, delayed 6 | from sklearn.base import is_classifier, clone, ClassifierMixin 7 | from sklearn.exceptions import DataConversionWarning 8 | from sklearn.metrics import r2_score, accuracy_score 9 | from sklearn.model_selection import check_cv, cross_val_predict 10 | from sklearn.utils import indexable, check_random_state 11 | 12 | 13 | def passthrough_scorer(estimator, *args, **kwargs): 14 | """Function that wraps estimator.score""" 15 | return estimator.score(*args, **kwargs) 16 | 17 | 18 | def non_cv_permutation_test_score(estimator, X, y, groups=None, 19 | n_permutations=100, n_jobs=None, random_state=0, 20 | verbose=0, pre_dispatch='2*n_jobs', scorers=None): 21 | """Evaluate the significance of several non-cross-validated scores with permutations 22 | 23 | Read more in the scikit-learn User Guide. 24 | 25 | Parameters 26 | ---------- 27 | estimator : estimator object implementing 'fit' 28 | The object to use to fit the data. 29 | 30 | X : array-like of shape at least 2D 31 | The data to fit. 32 | 33 | y : array-like 34 | The target variable to try to predict in the case of 35 | supervised learning. 36 | 37 | groups : array-like, with shape (n_samples,), optional 38 | Labels to constrain permutation within groups, i.e. 
``y`` values 39 | are permuted among samples with the same group identifier. 40 | When not specified, ``y`` values are permuted among all samples. 41 | 42 | When a grouped cross-validator is used, the group labels are 43 | also passed on to the ``split`` method of the cross-validator. The 44 | cross-validator uses them for grouping the samples while splitting 45 | the dataset into train/test set. 46 | 47 | scorers : list of callables or None, optional, default: None 48 | A list of scoring functions of the form scorer(estimator, X, y). If None, the estimator's ``score`` method is used. 49 | 50 | 51 | n_permutations : integer, optional 52 | Number of times to permute ``y``. 53 | 54 | n_jobs : int or None, optional (default=None) 55 | The number of CPUs to use to do the computation. 56 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 57 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 58 | for more details. 59 | 60 | random_state : int, RandomState instance or None, optional (default=0) 61 | If int, random_state is the seed used by the random number generator; 62 | If RandomState instance, random_state is the random number generator; 63 | If None, the random number generator is the RandomState instance used 64 | by `np.random`. 65 | 66 | verbose : integer, optional 67 | The verbosity level. 68 | 69 | 70 | pre_dispatch : int, or string, optional 71 | Controls the number of jobs that get dispatched during parallel 72 | execution. Reducing this number can be useful to avoid an 73 | explosion of memory consumption when more jobs get dispatched 74 | than CPUs can process. This parameter can be: 75 | 76 | - None, in which case all the jobs are immediately 77 | created and spawned. Use this for lightweight and 78 | fast-running jobs, to avoid delays due to on-demand 79 | spawning of the jobs 80 | 81 | - An int, giving the exact number of total jobs that are 82 | spawned 83 | 84 | - A string, giving an expression as a function of n_jobs, 85 | as in '2*n_jobs' 86 | 87 | Returns 88 | ------- 89 | A list of one (score, permutation_scores, pvalue) tuple per scorer, where: 90 | score : float 91 | The true score without permuting targets. 92 | permutation_scores : array, shape (n_permutations,) 93 | The scores obtained for each permutation. 94 | 95 | pvalue : float 96 | The p-value, which approximates the probability that the score would 97 | be obtained by chance. This is calculated as: 98 | 99 | `(C + 1) / (n_permutations + 1)` 100 | 101 | Where C is the number of permutations whose score >= the true score. 102 | 103 | The best possible p-value is 1/(n_permutations + 1), the worst is 1.0. 104 | 105 | Notes 106 | ----- 107 | This function implements Test 1 in: 108 | 109 | Ojala and Garriga. Permutation Tests for Studying Classifier 110 | Performance. The Journal of Machine Learning Research (2010) 111 | vol. 11 112 | 113 | """ 114 | X, y, groups = indexable(X, y, groups) 115 | 116 | random_state = check_random_state(random_state) 117 | if scorers is None or not len(scorers): 118 | if hasattr(estimator, 'score'): 119 | scorers = [passthrough_scorer] 120 | else: 121 | raise TypeError( 122 | "If no scoring is specified, the estimator passed should " 123 | "have a 'score' method. The estimator %r does not." 124 | % estimator) 125 | 126 | # We clone the estimator to make sure that all the folds are independent, and that it is pickle-able. 
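# Compute the score once on the unpermuted data, then fit and score a fresh clone on each of n_permutations shuffled copies of y; the p-value for each scorer is (C + 1) / (n_permutations + 1), where C counts permutations that score at least as well as the true score.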
128 | score = _non_cv_permutation_test_score(clone(estimator), X, y, groups, scorers) 129 | permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 130 | delayed(_non_cv_permutation_test_score)( 131 | clone(estimator), X, _shuffle(y, groups, random_state), 132 | groups, scorers) 133 | for _ in range(n_permutations)) 134 | permutation_scores = np.array(permutation_scores) 135 | pvalue = (np.sum(permutation_scores >= score, axis=0) + 1.0) / (n_permutations + 1) 136 | return [(score[i], permutation_scores[:, i], pvalue[i]) for i in range(len(scorers))] 137 | 138 | 139 | def _non_cv_permutation_test_score(estimator, X, y, groups, scorers): 140 | """Auxiliary function for permutation_test_score""" 141 | estimator.fit(X, y) 142 | return [scorer(estimator, X, y) for scorer in scorers] 143 | 144 | 145 | def permutation_test_score(estimator, X, y, groups=None, cv='warn', 146 | n_permutations=100, n_jobs=None, random_state=0, 147 | verbose=0, pre_dispatch='2*n_jobs', cv_score_functions=None, 148 | fit_params=None, method='predict', parallel_by='permutation'): 149 | """Evaluate the significance of several cross-validated scores with permutations 150 | 151 | Note: this is different from sklearn.model_selection.permutation_test_score in two ways. 152 | 1. The scikit-learn method calculates the metrics for each CV split, which makes using metrics like r-squared with 153 | LeaveOneOut impossible. This method uses sklearn.model_selection.cross_val_predict to predict the left-out labels, 154 | then calculates the metrics for that prediction. 155 | 2. The scikit-learn method only evaluates one metric at a time; this one evaluates an arbitrary number of metrics. 156 | 157 | Parameters 158 | ---------- 159 | estimator : estimator object implementing 'fit' 160 | The object to use to fit the data. 161 | 162 | X : array-like of shape at least 2D 163 | The data to fit. 164 | 165 | y : array-like 166 | The target variable to try to predict in the case of 167 | supervised learning. 168 | 169 | groups : array-like, with shape (n_samples,), optional 170 | Labels to constrain permutation within groups, i.e. ``y`` values 171 | are permuted among samples with the same group identifier. 172 | When not specified, ``y`` values are permuted among all samples. 173 | 174 | When a grouped cross-validator is used, the group labels are 175 | also passed on to the ``split`` method of the cross-validator. The 176 | cross-validator uses them for grouping the samples while splitting 177 | the dataset into train/test set. 178 | 179 | cv_score_functions : list of callables or None, optional, default: None 180 | A list of score functions of the form score(y_true, y_pred) (like r2_score, accuracy_score). 181 | If you have special arguments for your score function you should create another function with 182 | the required prototype that wraps that function. 183 | 184 | cv : int, cross-validation generator or an iterable, optional 185 | Determines the cross-validation splitting strategy. 186 | Possible inputs for cv are: 187 | 188 | - None, to use the default 3-fold cross validation, 189 | - integer, to specify the number of folds in a `(Stratified)KFold`, 190 | - :term:`CV splitter`, 191 | - An iterable yielding (train, test) splits as arrays of indices. 192 | 193 | For integer/None inputs, if the estimator is a classifier and ``y`` is 194 | either binary or multiclass, :class:`StratifiedKFold` is used. In all 195 | other cases, :class:`KFold` is used. 
196 | 197 | n_permutations : integer, optional 198 | Number of times to permute ``y``. 199 | 200 | n_jobs : int or None, optional (default=None) 201 | The number of CPUs to use to do the computation. 202 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 203 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 204 | for more details. 205 | 206 | random_state : int, RandomState instance or None, optional (default=0) 207 | If int, random_state is the seed used by the random number generator; 208 | If RandomState instance, random_state is the random number generator; 209 | If None, the random number generator is the RandomState instance used 210 | by `np.random`. 211 | 212 | verbose : integer, optional 213 | The verbosity level. 214 | 215 | pre_dispatch : int, or string, optional 216 | Controls the number of jobs that get dispatched during parallel 217 | execution. Reducing this number can be useful to avoid an 218 | explosion of memory consumption when more jobs get dispatched 219 | than CPUs can process. This parameter can be: 220 | 221 | - None, in which case all the jobs are immediately 222 | created and spawned. Use this for lightweight and 223 | fast-running jobs, to avoid delays due to on-demand 224 | spawning of the jobs 225 | 226 | - An int, giving the exact number of total jobs that are 227 | spawned 228 | 229 | - A string, giving an expression as a function of n_jobs, 230 | as in '2*n_jobs' 231 | 232 | fit_params : dict, optional 233 | Parameters to pass to the fit method of the estimator. 234 | 235 | method : string, optional, default: 'predict' 236 | Invokes the passed method name of the passed estimator. For 237 | method='predict_proba', the columns correspond to the classes 238 | in sorted order. 239 | 240 | parallel_by : string, optional, default: 'permutation' 241 | Whether to parallelize the estimation step or the permutation step. 242 | Either 'estimation' or 'permutation'. If 'estimation', the training of each cross-validation 243 | fold gets its own job. If 'permutation', each permutation of the target gets its own job. 244 | 245 | Returns 246 | ------- 247 | A list of one (score, permutation_scores, pvalue) tuple per score function, where: 248 | score : float 249 | The true score without permuting targets. 250 | permutation_scores : array, shape (n_permutations,) 251 | The scores obtained for each permutation. 252 | 253 | pvalue : float 254 | The p-value, which approximates the probability that the score would 255 | be obtained by chance. This is calculated as: 256 | 257 | `(C + 1) / (n_permutations + 1)` 258 | 259 | Where C is the number of permutations whose score >= the true score. 260 | 261 | The best possible p-value is 1/(n_permutations + 1), the worst is 1.0. 262 | 263 | Notes 264 | ----- 265 | This function implements Test 1 in: 266 | 267 | Ojala and Garriga. Permutation Tests for Studying Classifier 268 | Performance. The Journal of Machine Learning Research (2010) 269 | vol. 11 270 | 271 | """ 272 | X, y, groups = indexable(X, y, groups) 273 | 274 | cv = check_cv(cv, y, classifier=is_classifier(estimator)) 275 | random_state = check_random_state(random_state) 276 | if cv_score_functions is None: 277 | if isinstance(estimator, ClassifierMixin): 278 | cv_score_functions = [accuracy_score] 279 | else: 280 | cv_score_functions = [r2_score] 281 | # We clone the estimator to make sure that all the folds are independent, and that it is pickle-able. 
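# Compute the true (unpermuted) score first. Depending on parallel_by, the permuted targets are then scored either with parallelism inside each cross-validation run ('estimation') or with one parallel job per permutation ('permutation').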
283 | score = _permutation_test_score(clone(estimator), X, y, groups, cv, 284 | n_jobs, verbose, fit_params, pre_dispatch, 285 | method, cv_score_functions) 286 | if parallel_by == 'estimation': 287 | permutation_scores = np.vstack([ 288 | _permutation_test_score( 289 | clone(estimator), X, _shuffle(y, groups, random_state), 290 | groups, cv, n_jobs, verbose, fit_params, pre_dispatch, 291 | method, cv_score_functions 292 | ) for _ in range(n_permutations) 293 | ]) 294 | elif parallel_by == 'permutation': 295 | permutation_scores = np.vstack( 296 | Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 297 | delayed(_permutation_test_score)( 298 | clone(estimator), X, _shuffle(y, groups, random_state), 299 | groups, cv, fit_params=fit_params, method=method, score_functions=cv_score_functions 300 | ) for _ in range(n_permutations) 301 | ) 302 | ) 303 | else: 304 | raise ValueError(f'Invalid option for parallel_by {parallel_by}') 305 | pvalue = (np.sum(permutation_scores >= score, axis=0) + 1.0) / (n_permutations + 1) 306 | return [(score[i], permutation_scores[:, i], pvalue[i]) for i in range(len(score))] 307 | # return score, permutation_scores, pvalue 308 | 309 | 310 | def _permutation_test_score(estimator, X, y, groups=None, cv='warn', 311 | n_jobs=None, verbose=0, fit_params=None, 312 | pre_dispatch='2*n_jobs', method='predict', 313 | score_functions=None): 314 | """Auxiliary function for permutation_test_score""" 315 | if score_functions is None: 316 | score_functions = [r2_score] 317 | y_pred = cross_val_predict(estimator, X, y, groups, cv, n_jobs, verbose, fit_params, pre_dispatch, method) 318 | cv_scores = [score_function(y, y_pred) for score_function in score_functions] 319 | return np.array(cv_scores) 320 | 321 | 322 | def _shuffle(y, groups, random_state): 323 | """Return a shuffled copy of y, shuffling within groups when they are given.""" 324 | if groups is None: 325 | indices = random_state.permutation(len(y)) 326 | else: 327 | indices = np.arange(len(groups)) 328 | for group in np.unique(groups): 329 | this_mask = (groups == group) 330 | indices[this_mask] = random_state.permutation(indices[this_mask]) 331 | return safe_indexing(y, indices) 332 | 333 | 334 | def feature_permutation_loading(estimator, X, y, initial_permutations=100, alpha=0.2, final_permutations=500, 335 | random_state=0, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 336 | """Determine the significance of each feature 337 | 338 | This is done by permuting each feature in X and measuring the loading. 339 | The feature is considered significant if the loadings are significantly different. 340 | 341 | This is always done with a regular PLS regressor. 342 | For PLS-DA, the target should be binarized first. 343 | 344 | Parameters 345 | ---------- 346 | estimator : estimator object implementing 'fit' with x_loadings_ 347 | The object to use to fit the data. This should have an [n_features, 1] x_loadings_ array. This can be a 348 | one-component PLS or OPLS model. 349 | 350 | X : array-like, shape = [n_samples, n_features] 351 | Training vectors, where n_samples is the number of samples and 352 | n_features is the number of predictors. 353 | 354 | y : array-like, shape = [n_samples, 1] 355 | Target vector, where n_samples is the number of samples. 356 | This implementation only supports a single response (target) variable. 357 | 358 | initial_permutations : int 359 | The number of permutations to perform for all features. 360 | 361 | alpha : float, in range (0, 1) 362 | The threshold for significance. 
If a feature is found significant in the first round, it will be retested with 363 | final_permutations in the second round. 364 | 365 | final_permutations : int 366 | The number of permutations to perform during the second round to retest points found significant in the first 367 | round. 368 | 369 | n_jobs : int or None, optional (default=None) 370 | The number of CPUs to use to do the computation. 371 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 372 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 373 | for more details. 374 | 375 | verbose : integer, optional 376 | The verbosity level. 377 | 378 | pre_dispatch : int, or string, optional 379 | Controls the number of jobs that get dispatched during parallel 380 | execution. Reducing this number can be useful to avoid an 381 | explosion of memory consumption when more jobs get dispatched 382 | than CPUs can process. This parameter can be: 383 | 384 | - None, in which case all the jobs are immediately 385 | created and spawned. Use this for lightweight and 386 | fast-running jobs, to avoid delays due to on-demand 387 | spawning of the jobs 388 | 389 | - An int, giving the exact number of total jobs that are 390 | spawned 391 | 392 | - A string, giving an expression as a function of n_jobs, 393 | as in '2*n_jobs' 394 | 395 | random_state : int, RandomState instance or None, optional (default=0) 396 | If int, random_state is the seed used by the random number generator; 397 | If RandomState instance, random_state is the random number generator; 398 | If None, the random number generator is the RandomState instance used 399 | by `np.random`. 400 | 401 | Returns 402 | ------- 403 | x_loadings : array [n_features] 404 | The x_loadings found from non-permuted data. 405 | 406 | permutation_x_loadings: array [initial_permutations, n_features] 407 | The one-component PLS loadings for each permutation in the first round. 408 | 409 | p_values: array [n_features] 410 | The p-values for each feature. The null hypothesis is that permuting the feature does not change its weight 411 | in the one-component PLS model. 412 | """ 413 | 414 | def feature_ind_generator(n_permutations_, feature_inds): 415 | """ 416 | Repeats each value in feature_inds n_permutations_ times. 
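For example, feature_inds=[3, 7] with n_permutations_=2 yields 3, 3, 7, 7.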
417 | """ 418 | i = 0 419 | count = 0 420 | while count < (n_permutations_ * len(feature_inds)): 421 | yield feature_inds[i] 422 | count += 1 423 | if (count % n_permutations_) == 0: 424 | i += 1 425 | 426 | def _log(txt): 427 | if verbose in range(1, 51): 428 | stderr.write(txt + '\n') 429 | if verbose > 50: 430 | print(txt) 431 | 432 | random_state = check_random_state(random_state) 433 | n_features = X.shape[1] 434 | x_loadings = np.ravel(estimator.fit(X, y).x_loadings_) 435 | loading_max = np.max((x_loadings, -1 * x_loadings), axis=0) 436 | loading_min = np.min((x_loadings, -1 * x_loadings), axis=0) 437 | 438 | _log('Performing initial permutation tests.') 439 | permutation_x_loadings = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 440 | delayed(_feature_permutation_loading)( 441 | clone(estimator), _feature_shuffle(X, feature_ind, random_state), y, x_loadings, feature_ind) 442 | for feature_ind in feature_ind_generator(initial_permutations, [i for i in range(n_features)])) 443 | permutation_x_loadings = np.array(permutation_x_loadings).reshape(n_features, initial_permutations).T 444 | 445 | _log('Calculating p values.') 446 | p_values = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 447 | delayed(_loading_p_value)(permutation_x_loading, upper, lower, initial_permutations) 448 | for permutation_x_loading, upper, lower in zip(np.hsplit(permutation_x_loadings, n_features), 449 | loading_max, loading_min) 450 | ) 451 | 452 | # Retest values found significant in first round 453 | retest_columns = [i for i in range(n_features) if p_values[i] < (alpha / 2.0)] # remember, this is two-tailed 454 | retest_loading_max = np.max((x_loadings[retest_columns], -1 * x_loadings[retest_columns]), axis=0) 455 | retest_loading_min = np.min((x_loadings[retest_columns], -1 * x_loadings[retest_columns]), axis=0) 456 | 457 | _log(f'Re-testing {len(retest_columns)} features') 458 | retest_loadings = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 459 | delayed(_feature_permutation_loading)( 460 | clone(estimator), _feature_shuffle(X, feature_ind, random_state), y, x_loadings, feature_ind) 461 | for feature_ind in feature_ind_generator(final_permutations, retest_columns)) 462 | retest_loadings = np.array(retest_loadings).reshape(len(retest_columns), final_permutations).T 463 | 464 | # replace p-values with the more accurate ones 465 | if len(retest_columns): 466 | _log(f'Calculating p values for {len(retest_columns)} features.') 467 | p_values = np.array(p_values) 468 | p_values[retest_columns] = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)( 469 | delayed(_loading_p_value)(retest_loading, upper, lower, final_permutations) 470 | for retest_loading, upper, lower in zip(np.hsplit(retest_loadings, len(retest_columns)), 471 | retest_loading_max, retest_loading_min) 472 | ) 473 | else: 474 | _log('No significant features after first round of tests.') 475 | p_values = np.array(p_values) 476 | p_values[p_values > 1] = 1 # if feature_min=feature_max=loading=0 values will be greater than 1 477 | return x_loadings, permutation_x_loadings, p_values 478 | 479 | 480 | def _feature_permutation_loading(estimator, X, y, reference_loadings, feature_ind): 481 | """Auxiliary function for feature_permutation_loading""" 482 | # Note that since the loading only depends on training data, we don't use cross-validation 483 | test_loadings = np.ravel(estimator.fit(X, y).x_loadings_) 484 | # make directions the same 485 | err1 = 
(np.sum(np.square(test_loadings[:feature_ind] - reference_loadings[:feature_ind])) 486 | + np.sum(np.square(test_loadings[feature_ind:] - reference_loadings[feature_ind:]))) 487 | err2 = (np.sum(np.square(test_loadings[:feature_ind] + reference_loadings[:feature_ind])) 488 | + np.sum(np.square(test_loadings[feature_ind:] + reference_loadings[feature_ind:]))) 489 | sign = -1 if err2 < err1 else 1 490 | return sign * test_loadings[feature_ind] 491 | 492 | 493 | def _feature_shuffle(X, feature_ind, random_state): 494 | X = X.copy() 495 | random_state.shuffle(X[:, feature_ind]) 496 | return X 497 | 498 | 499 | def _loading_p_value(permutation_loadings, upper, lower, n_permutations): 500 | return (np.sum(permutation_loadings >= upper) + np.sum(permutation_loadings <= lower) + 1) / (n_permutations + 1) 501 | 502 | 503 | def safe_indexing(X, indices): 504 | """Return items or rows from X using indices. 505 | 506 | Allows simple indexing of lists or arrays. 507 | This is copied from the deprecated sklearn.utils.safe_indexing 508 | 509 | Parameters 510 | ---------- 511 | X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series. 512 | Data from which to sample rows or items. 513 | indices : array-like of int 514 | Indices according to which X will be subsampled. 515 | 516 | Returns 517 | ------- 518 | subset 519 | Subset of X on first axis 520 | 521 | Notes 522 | ----- 523 | CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are 524 | not supported. 525 | """ 526 | if hasattr(X, "iloc"): 527 | # Work-around for indexing with read-only indices in pandas 528 | indices = indices if indices.flags.writeable else indices.copy() 529 | # Pandas Dataframes and Series 530 | try: 531 | return X.iloc[indices] 532 | except ValueError: 533 | # Cython typed memoryviews internally used in pandas do not support 534 | # readonly buffers. 
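# Fall back to a copy of the frame, whose buffers are writable, so .iloc can index it.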
535 | warnings.warn("Copying input dataframe for slicing.", 536 | DataConversionWarning) 537 | return X.copy().iloc[indices] 538 | elif hasattr(X, "shape"): 539 | if hasattr(X, 'take') and (hasattr(indices, 'dtype') and 540 | indices.dtype.kind == 'i'): 541 | # This is often substantially faster than X[indices] 542 | return X.take(indices, axis=0) 543 | else: 544 | return X[indices] 545 | else: 546 | return [X[idx] for idx in indices] 547 | -------------------------------------------------------------------------------- /pyopls/tests/features.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BiRG/pyopls/be801f1e021898d3337ba30795644130ec425e49/pyopls/tests/features.npy -------------------------------------------------------------------------------- /pyopls/tests/target.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BiRG/pyopls/be801f1e021898d3337ba30795644130ec425e49/pyopls/tests/target.npy -------------------------------------------------------------------------------- /pyopls/tests/test_opls.py: -------------------------------------------------------------------------------- 1 | def test_opls(): 2 | import numpy as np 3 | from pyopls import OPLS 4 | from sklearn.cross_decomposition import PLSRegression 5 | from sklearn.metrics import r2_score 6 | from sklearn.model_selection import cross_val_predict, LeaveOneOut 7 | 8 | # paths relative to repo 9 | spectra = np.load('pyopls/tests/features.npy') 10 | target = np.load('pyopls/tests/target.npy') 11 | 12 | score = -1 13 | n_components = 0 14 | for n_components in range(1, spectra.shape[1]): 15 | opls = OPLS(n_components=n_components) 16 | Z = opls.fit(spectra, target).transform(spectra) 17 | y_pred = cross_val_predict(PLSRegression(n_components=1), Z, target, cv=LeaveOneOut()) 18 | score_i = r2_score(target, y_pred) 19 | if score_i < score: 20 | n_components -= 1 21 | break 22 | score = score_i 23 | 24 | opls = OPLS(n_components=n_components) 25 | opls.fit(spectra, target) 26 | assert opls.n_components == n_components 27 | assert opls.P_ortho_.shape == (spectra.shape[1], n_components) 28 | assert opls.T_ortho_.shape == (spectra.shape[0], n_components) 29 | assert opls.W_ortho_.shape == (spectra.shape[1], n_components) 30 | assert opls.x_mean_.shape == (spectra.shape[1],) 31 | assert opls.x_std_.shape == (spectra.shape[1],) 32 | assert opls.y_mean_.shape == (1,) 33 | assert opls.y_std_.shape == (1,) 34 | 35 | Z = opls.transform(spectra) 36 | assert Z.shape == spectra.shape 37 | 38 | pls = PLSRegression(n_components=1) 39 | uncorrected_r2 = r2_score(target, pls.fit(spectra, target).predict(spectra)) 40 | corrected_r2 = r2_score(target, pls.fit(Z, target).predict(Z)) 41 | uncorrected_q2 = r2_score(target, cross_val_predict(pls, spectra, target, cv=LeaveOneOut())) 42 | corrected_q2 = r2_score(target, cross_val_predict(pls, Z, target, cv=LeaveOneOut())) 43 | 44 | assert uncorrected_r2 < corrected_r2 45 | assert uncorrected_q2 < corrected_q2 46 | -------------------------------------------------------------------------------- /pyopls/validation.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from sys import stderr 3 | 4 | import numpy as np 5 | from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, ClassifierMixin 6 | from sklearn.cross_decomposition import PLSRegression 7 | from sklearn.metrics import accuracy_score, 
roc_auc_score, roc_curve, r2_score 8 | from sklearn.model_selection import KFold, StratifiedKFold, LeaveOneOut, cross_val_predict 9 | from sklearn.preprocessing import LabelBinarizer 10 | from sklearn.utils import check_array 11 | from sklearn.utils.multiclass import type_of_target 12 | from sklearn.utils.validation import check_is_fitted 13 | 14 | from .opls import OPLS 15 | from .permutation_test import permutation_test_score, feature_permutation_loading 16 | 17 | 18 | def discriminator_accuracy(y_true, y_pred): 19 | try: 20 | return accuracy_score(y_true.astype(int), np.sign(y_pred).astype(int)) 21 | except ValueError as e: 22 | warnings.warn(str(e), UserWarning) 23 | return float('nan') 24 | 25 | 26 | def discriminator_roc_auc(y_true, y_pred): 27 | try: 28 | return roc_auc_score(y_true, np.clip(y_pred, -1, 1)) 29 | except ValueError as e: 30 | warnings.warn(str(e), UserWarning) 31 | return float('nan') 32 | 33 | 34 | def discriminator_r2_score(y_true, y_pred): 35 | return r2_score(y_true, np.clip(y_pred, -1, 1)) 36 | 37 | 38 | def neg_press(y_true, y_pred): 39 | return -1 * np.sum(np.square(y_true - y_pred)) 40 | 41 | 42 | def neg_pressd(y_true, y_pred): 43 | return -1 * np.sum(np.square(y_true - np.clip(y_pred, -1, 1))) 44 | 45 | 46 | class OPLSValidator(BaseEstimator, TransformerMixin, RegressorMixin): 47 | """Cross Validation and Diagnostics of Orthogonal Projection to Latent Structures (O-PLS) 48 | 49 | Parameters 50 | ---------- 51 | min_n_components : int, minimum number of orthogonal components to remove 52 | 53 | k : int 54 | number of folds for k-fold cross-validation (default 10). If set to -1, leave-one-out cross-validation is used. 55 | 56 | scale : boolean, scale data? (default True) 57 | 58 | n_permutations : int, number of times to permute the target for the metric permutation tests 59 | 60 | Attributes 61 | ---------- 62 | q_squared_: float, overall Q-squared metric for the regression, the R-squared value of the left-out data. 63 | 64 | permutation_q_squared_: array [n_splits*n_permutations] 65 | The R-squared metric for the left-out data for each permutation 66 | 67 | q_squared_p_value_ : float 68 | The p-value for the permutation test on Q-squared 69 | 70 | 71 | r_squared_Y_: float, overall R-squared metric for the regression 72 | 73 | r_squared_X_: float, overall R-squared X metric (the ratio of the variance in the transformed X to the variance in the original X) 74 | 75 | discriminant_q_squared_: float 76 | Discriminant Q-squared, if this is an OPLSDA problem. Discriminant Q-squared disregards the error of class 77 | predictions whose values are beyond the class labels (e.g. it treats predictions of -1.5 as -1 and 1.5 as 1). 78 | 79 | permutation_discriminant_q_squared_: array [n_splits*n_permutations] 80 | The discriminant R-squared metric for the left-out data for each permutation. 81 | 82 | discriminant_q_squared_p_value_ : float 83 | The p-value for the permutation test on DQ-squared 84 | 85 | accuracy_ : float, accuracy for discrimination 86 | 87 | discriminant_r_squared_: float 88 | Discriminant R-squared, if this is an OPLSDA problem. Discriminant R-squared disregards the error of class 89 | predictions whose values are beyond the class labels (e.g. it treats predictions of -1.5 as -1 and 1.5 as 1). 
90 | 91 | permutation_accuracy_: array [n_splits*n_permutations] 92 | The accuracy of the left-out data for each permutation 93 | 94 | accuracy_p_value_: float 95 | The p-value for the permutation test on accuracy 96 | 97 | roc_auc_ : float, area under ROC curve for discrimination 98 | 99 | permutation_roc_auc_: array [n_splits*n_permutations] 100 | The area under the ROC curve of the left-out data for each permutation. 101 | 102 | roc_auc_p_value_: float 103 | The p-value for the permutation test on the area under the ROC curve. 104 | 105 | n_components_ : int 106 | The optimal number of orthogonal components to remove 107 | 108 | feature_significance_ : array [n_features], type bool 109 | Whether permuting the feature results in a significantly different loading for that feature in the model. 110 | Defined as the loading for the non-permuted data being outside the "middle" of the distribution of loadings 111 | for the permuted data, where the boundaries are a percentile range defined by outer_alpha. 112 | 113 | feature_p_values_ : array [n_features] 114 | An estimated p-value for the significance of the feature, defined as the fraction of permuted loading values 115 | that fall inside (-p, p), where p is the loading for the non-permuted data. 116 | 117 | permutation_loadings_ : array [n_inner_permutations, n_features] 118 | Values for the loadings for the permuted data. 119 | 120 | loadings_ : array [n_features] 121 | Loadings for the non-permuted data 122 | 123 | opls_ : OPLS 124 | The OPLS transformer 125 | 126 | pls_ : PLSRegression 127 | A 1-component PLS regressor used to evaluate the OPLS transform 128 | 129 | 130 | References 131 | ---------- 132 | Johan Trygg and Svante Wold. Orthogonal projections to latent structures (O-PLS). 133 | J. Chemometrics 2002; 16: 119-128. DOI: 10.1002/cem.695 134 | 135 | Johan A. Westerhuis, Ewoud J. J. van Velzen, Huub C. J. Hoefsloot and Age K. Smilde. 136 | Discriminant Q-squared (DQ-squared) for improved discrimination in PLSDA models. 137 | Metabolomics (2008) 4: 293.
https://doi.org/10.1007/s11306-008-0126-2 138 | """ 139 | 140 | def __init__(self, 141 | min_n_components=1, 142 | k=10, 143 | scale=True, 144 | force_regression=False, 145 | n_permutations=100, 146 | n_inner_permutations=100, 147 | n_outer_permutations=500, 148 | inner_alpha=0.2, 149 | outer_alpha=0.05): 150 | self.min_n_components = min_n_components 151 | self.k = k 152 | self.scale = scale 153 | self.n_permutations = n_permutations 154 | self.n_inner_permutations = n_inner_permutations 155 | self.n_outer_permutations = n_outer_permutations 156 | self.inner_alpha = inner_alpha 157 | self.outer_alpha = outer_alpha 158 | self.force_regression = force_regression 159 | self.n_components_ = None 160 | self.feature_significance_ = None 161 | self.feature_p_values_ = None 162 | 163 | self.r_squared_Y_ = None 164 | self.discriminant_r_squared_ = None 165 | self.r_squared_X_ = None 166 | 167 | self.q_squared_ = None 168 | self.permutation_q_squared_ = None 169 | self.q_squared_p_value_ = None 170 | 171 | self.accuracy_ = None 172 | self.permutation_accuracy_ = None 173 | self.accuracy_p_value_ = None 174 | 175 | self.roc_auc_ = None 176 | self.permutation_roc_auc_ = None 177 | self.roc_auc_p_value_ = None 178 | 179 | self.discriminant_q_squared_ = None 180 | self.discriminant_q_squared_p_value_ = None 181 | self.permutation_discriminant_q_squared_ = None 182 | 183 | self.permutation_loadings_ = None 184 | self.pls_ = None # a 1-component PLSRegression 185 | self.opls_ = None # OPLS transform 186 | self.loadings_ = None 187 | self.binarizer_ = None 188 | 189 | @staticmethod 190 | def _get_validator(Y, k): 191 | if k == -1: 192 | return LeaveOneOut() 193 | else: 194 | if type_of_target(Y) in ('binary', 'multiclass'): 195 | return StratifiedKFold(k) 196 | else: 197 | return KFold(k) 198 | 199 | def is_discrimination(self, Y): 200 | return type_of_target(Y).startswith('binary') and not self.force_regression 201 | 202 | def _get_score_function(self, Y): 203 | return neg_pressd if self.is_discrimination(Y) else neg_press 204 | 205 | def _validate(self, X, Y, n_components, score_function, cv=None, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 206 | cv = cv or self._get_validator(Y, self.k) 207 | Z = OPLS(n_components, self.scale).fit_transform(X, Y) 208 | y_pred = cross_val_predict(PLSRegression(1, scale=self.scale), Z, Y, cv=cv, n_jobs=n_jobs, verbose=verbose, 209 | pre_dispatch=pre_dispatch) 210 | return score_function(Y, y_pred) 211 | 212 | def _process_binary_target(self, y, pos_label=None): 213 | self.binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1)  # keyword arguments; these are keyword-only in current scikit-learn 214 | self.binarizer_.fit(y) 215 | if pos_label is not None and self.binarizer_.transform([pos_label])[0] == -1: 216 | self.binarizer_.classes_ = np.flip(self.binarizer_.classes_) 217 | return self.binarizer_.transform(y).astype(float) 218 | 219 | def _check_target(self, y, pos_label=None): 220 | y = check_array(y, dtype=None, copy=True, ensure_2d=False).reshape(-1, 1) 221 | if type_of_target(y).startswith('multiclass') and not self.force_regression: 222 | raise ValueError('Multiclass input not directly supported. ' 223 | 'Try binarizing with sklearn.preprocessing.LabelBinarizer.') 224 | if self.is_discrimination(y): 225 | y = self._process_binary_target(y, pos_label) 226 | else: 227 | self.binarizer_ = None 228 | return y 229 | 230 | def _determine_n_components(self, X, y, cv=None, scoring=None, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 231 | """Determine number of orthogonal components to remove.
232 | 233 | Orthogonal components are removed until removing an additional component no longer improves the performance 234 | of the k-fold cross-validated OPLS estimator, as measured by the residual sum of squares of the left-out 235 | data. 236 | 237 | Parameters 238 | ---------- 239 | X : array-like, shape = [n_samples, n_features] 240 | Training vectors, where n_samples is the number of samples and 241 | n_features is the number of predictors. 242 | 243 | y : array-like, shape = [n_samples, 1] 244 | Target vector, where n_samples is the number of samples. 245 | This implementation only supports a single response (target) variable. 246 | 247 | cv : sklearn.model_selection.BaseCrossValidator 248 | A cross-validator. If None, _get_validator() is used to select one: sklearn.model_selection.LeaveOneOut 249 | if k=-1; otherwise sklearn.model_selection.StratifiedKFold for binary or multiclass targets and 250 | sklearn.model_selection.KFold for continuous targets. 251 | 252 | scoring : callable 253 | Score function with signature score(y_true, y_pred). If None, negative PRESS is used for regression and negative discriminant PRESS (which clips predictions to [-1, 1]) is used for discrimination. 254 | 255 | n_jobs : int or None, optional (default=None) 256 | The number of CPUs to use to do the computation. 257 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 258 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 259 | for more details. 260 | 261 | verbose : integer, optional 262 | The verbosity level. 263 | 264 | pre_dispatch : int, or string, optional 265 | Controls the number of jobs that get dispatched during parallel 266 | execution. Reducing this number can be useful to avoid an 267 | explosion of memory consumption when more jobs get dispatched 268 | than CPUs can process. This parameter can be: 269 | 270 | - None, in which case all the jobs are immediately 271 | created and spawned. Use this for lightweight and 272 | fast-running jobs, to avoid delays due to on-demand 273 | spawning of the jobs 274 | 275 | - An int, giving the exact number of total jobs that are 276 | spawned 277 | 278 | - A string, giving an expression as a function of n_jobs, 279 | as in '2*n_jobs' 280 | Returns 281 | ------- 282 | n_components: int 283 | The number of components to remove to maximize Q-squared 284 | 285 | """ 286 | cv = cv or self._get_validator(y, self.k) 287 | scoring = scoring or self._get_score_function(y) 288 | n_components = self.min_n_components 289 | 290 | score = self._validate(X, y, n_components, scoring, cv, n_jobs, verbose, pre_dispatch) 291 | while n_components < X.shape[1]: 292 | next_score = self._validate(X, y, n_components + 1, scoring, cv, n_jobs, verbose, pre_dispatch) 293 | if next_score <= score:  # an additional component did not improve the cross-validated score 294 | break 295 | else: 296 | score = next_score 297 | n_components += 1 298 | return n_components 299 | 300 | def _determine_significant_features(self, 301 | X, 302 | y, 303 | n_components, 304 | random_state=0, 305 | n_jobs=None, 306 | verbose=0, 307 | pre_dispatch='2*n_jobs'): 308 | """Determine the significance of each feature 309 | 310 | This is done by permuting each feature in X and measuring its loading. 311 | The feature is considered significant if the non-permuted loading differs significantly from the permuted loadings. 312 | 313 | This is always done with a regular PLS regressor; 314 | for PLS-DA problems the target should be binarized first. 315 | 316 | Parameters 317 | ---------- 318 | X : array-like, shape = [n_samples, n_features] 319 | Training vectors, where n_samples is the number of samples and 320 | n_features is the number of predictors.
321 | 322 | y : array-like, shape = [n_samples, 1] 323 | Target vector, where n_samples is the number of samples. 324 | This implementation only supports a single response (target) variable. 325 | 326 | n_components : int 327 | The number of orthogonal components to remove 328 | 329 | n_jobs : int or None, optional (default=None) 330 | The number of CPUs to use to do the computation. 331 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 332 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 333 | for more details. 334 | 335 | verbose : integer, optional 336 | The verbosity level. 337 | 338 | 339 | pre_dispatch : int, or string, optional 340 | Controls the number of jobs that get dispatched during parallel 341 | execution. Reducing this number can be useful to avoid an 342 | explosion of memory consumption when more jobs get dispatched 343 | than CPUs can process. This parameter can be: 344 | 345 | - None, in which case all the jobs are immediately 346 | created and spawned. Use this for lightweight and 347 | fast-running jobs, to avoid delays due to on-demand 348 | spawning of the jobs 349 | 350 | - An int, giving the exact number of total jobs that are 351 | spawned 352 | 353 | - A string, giving an expression as a function of n_jobs, 354 | as in '2*n_jobs' 355 | 356 | Returns 357 | ------- 358 | significance: array [n_features], type bool 359 | Whether a particular feature is significant. 360 | 361 | p_values: array [n_features] 362 | The p-values for each feature. The null hypothesis is that permuting the feature does not change its loading 363 | in the one-component PLS model. 364 | 365 | permuted_loadings: array [n_inner_permutations, n_features] 366 | The one-component PLS loadings for each permutation 367 | """ 368 | Z = OPLS(n_components, self.scale).fit_transform(X, y) 369 | x_loadings, permutation_x_loadings, p_values = feature_permutation_loading( 370 | PLSRegression(1, scale=self.scale), Z, y, self.n_inner_permutations, self.inner_alpha, 371 | self.n_outer_permutations, random_state, n_jobs, verbose, pre_dispatch 372 | ) 373 | return p_values < self.outer_alpha, p_values, permutation_x_loadings 374 | 375 | def cross_val_roc_curve(self, X, y, cv=None, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 376 | check_is_fitted(self, ['opls_', 'pls_', 'binarizer_'])  # check fit first so an unfitted model raises NotFittedError, not AttributeError 377 | Z = self.opls_.transform(X) 378 | cv = cv or self._get_validator(y, self.k) 379 | y_pred = cross_val_predict(PLSRegression(1, scale=self.scale), Z, y, cv=cv, n_jobs=n_jobs, verbose=verbose, 380 | pre_dispatch=pre_dispatch) 381 | return roc_curve(y, y_pred) 382 | 383 | def fit(self, X, y, n_components=None, cv=None, pos_label=None, 384 | random_state=0, n_jobs=None, verbose=0, pre_dispatch='2*n_jobs'): 385 | """Evaluate the quality of the OPLS regressor 386 | 387 | The Q-squared value and a p-value for each feature's significance are determined. The fitted models can be 388 | accessed as opls_ (the OPLS transformer) and pls_ (the 1-component PLS regressor). 389 | 390 | Parameters 391 | ---------- 392 | X : array-like, shape = [n_samples, n_features] 393 | Training vectors, where n_samples is the number of samples and 394 | n_features is the number of predictors. 395 | 396 | y : array-like, shape = [n_samples, 1] 397 | Target vector, where n_samples is the number of samples. 398 | This implementation only supports a single response (target) variable. 399 | 400 | n_components : int 401 | The number of orthogonal components to remove.
Will be determined by _determine_n_components() if None 402 | 403 | cv : sklearn.model_selection.BaseCrossValidator 404 | A cross-validator to use for the determination of the number of components and the Q-squared value. 405 | If None, a cross-validator will be selected based on the value of k and the values of the target variable: 406 | sklearn.model_selection.LeaveOneOut if k=-1; otherwise sklearn.model_selection.StratifiedKFold if the 407 | target is binary or multiclass and sklearn.model_selection.KFold if it is continuous. 408 | 409 | pos_label : string 410 | If this is a discrimination problem, the value of the target corresponding to "1". 411 | 412 | random_state : int, RandomState instance or None, optional (default=0) 413 | If int, random_state is the seed used by the random number generator; 414 | If RandomState instance, random_state is the random number generator; 415 | If None, the random number generator is the RandomState instance used 416 | by `np.random`. 417 | 418 | n_jobs : int or None, optional (default=None) 419 | The number of CPUs to use to do the computation. 420 | ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 421 | ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 422 | for more details. 423 | 424 | verbose : integer, optional 425 | The verbosity level. 426 | 427 | 428 | pre_dispatch : int, or string, optional 429 | Controls the number of jobs that get dispatched during parallel 430 | execution. Reducing this number can be useful to avoid an 431 | explosion of memory consumption when more jobs get dispatched 432 | than CPUs can process. This parameter can be: 433 | 434 | - None, in which case all the jobs are immediately 435 | created and spawned. Use this for lightweight and 436 | fast-running jobs, to avoid delays due to on-demand 437 | spawning of the jobs 438 | 439 | - An int, giving the exact number of total jobs that are 440 | spawned 441 | 442 | - A string, giving an expression as a function of n_jobs, 443 | as in '2*n_jobs' 444 | 445 | """ 446 | 447 | def _log(txt): 448 | if 0 < verbose <= 50: 449 | stderr.write(txt + '\n') 450 | if verbose > 50: 451 | print(txt) 452 | 453 | X = check_array(X, dtype=float, copy=True) 454 | y = self._check_target(y, pos_label) 455 | 456 | if not n_components: 457 | _log('Determining number of components to remove.') 458 | n_components = self._determine_n_components(X, y, cv, n_jobs=n_jobs, verbose=verbose, 459 | pre_dispatch=pre_dispatch) 460 | _log(f'Removing {n_components} orthogonal components.') 461 | self.n_components_ = n_components  # already determined above when not supplied by the caller 462 | 463 | self.opls_ = OPLS(self.n_components_, self.scale).fit(X, y) 464 | Z = self.opls_.transform(X) 465 | self.pls_ = PLSRegression(1, scale=self.scale).fit(Z, y) 466 | self.r_squared_X_ = self.opls_.score(X) 467 | y_pred = self.pls_.predict(Z) 468 | self.r_squared_Y_ = r2_score(y, y_pred) 469 | if self.is_discrimination(y): 470 | self.discriminant_r_squared_ = r2_score(y, np.clip(y_pred, -1, 1)) 471 | 472 | cv = cv or self._get_validator(y, self.k) 473 | 474 | score_functions = [r2_score] 475 | if self.is_discrimination(y): 476 | score_functions += [discriminator_r2_score, discriminator_accuracy, discriminator_roc_auc] 477 | 478 | _log('Performing cross-validated metric permutation tests.') 479 | 480 | cv_results = permutation_test_score(PLSRegression(1, scale=self.scale), Z, y, cv=cv, 481 | n_permutations=self.n_permutations, cv_score_functions=score_functions, 482 | n_jobs=n_jobs, verbose=verbose,
pre_dispatch=pre_dispatch) 483 | if self.is_discrimination(y): 484 | [ 485 | (self.q_squared_, self.permutation_q_squared_, self.q_squared_p_value_), 486 | (self.discriminant_q_squared_, self.permutation_discriminant_q_squared_, 487 | self.discriminant_q_squared_p_value_), 488 | (self.accuracy_, self.permutation_accuracy_, self.accuracy_p_value_), 489 | (self.roc_auc_, self.permutation_roc_auc_, self.roc_auc_p_value_) 490 | ] = cv_results 491 | else: 492 | [ 493 | (self.q_squared_, self.permutation_q_squared_, self.q_squared_p_value_) 494 | ] = cv_results 495 | 496 | _log('Estimating feature significance.') 497 | 498 | (self.feature_significance_, 499 | self.feature_p_values_, 500 | self.permutation_loadings_) = self._determine_significant_features(X, y, self.n_components_, random_state, 501 | n_jobs, verbose, pre_dispatch) 502 | return self 503 | 504 | def transform(self, X): 505 | return self.opls_.transform(X) 506 | 507 | def predict(self, X): 508 | Z = self.transform(X) 509 | return self.pls_.predict(Z) 510 | 511 | def score(self, X, y, sample_weight=None): 512 | Z = self.transform(X) 513 | return r2_score(y, self.pls_.predict(Z), sample_weight=sample_weight)  # pass sample_weight through instead of silently ignoring it 514 | 515 | def discriminator_roc(self, X, y): 516 | Z = self.transform(X) 517 | return roc_curve(y, self.pls_.predict(Z)) 518 | 519 | 520 | class OPLSDAValidator(OPLSValidator, ClassifierMixin): 521 | def __init__(self, 522 | min_n_components=1, 523 | k=10, 524 | scale=True, 525 | force_regression=False, 526 | n_permutations=100, 527 | n_inner_permutations=100, 528 | n_outer_permutations=500, 529 | inner_alpha=0.2, 530 | outer_alpha=0.01): 531 | super().__init__(min_n_components, 532 | k, 533 | scale, 534 | force_regression, 535 | n_permutations, 536 | n_inner_permutations, 537 | n_outer_permutations, 538 | inner_alpha, 539 | outer_alpha) 540 | 541 | def score(self, X, y, sample_weight=None): 542 | Z = self.transform(X) 543 | y_pred = self.pls_.predict(Z) 544 | return r2_score(y, np.clip(y_pred, -1, 1), sample_weight=sample_weight) 545 | 546 | def predict(self, X): 547 | Z = self.opls_.transform(X) 548 | values = np.sign(self.pls_.predict(Z)) 549 | return self.binarizer_.inverse_transform(values).reshape(-1, 1) 550 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.11.0 2 | scipy>=0.18.0 3 | scikit-learn>=0.18.0 -------------------------------------------------------------------------------- /roc_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BiRG/pyopls/be801f1e021898d3337ba30795644130ec425e49/roc_curve.png -------------------------------------------------------------------------------- /scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BiRG/pyopls/be801f1e021898d3337ba30795644130ec425e49/scores.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', 'r') as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name='pyopls', 8 | version='20.03-1', 9 | author='BiRG @ Wright State
University', 10 | author_email='foose.3@wright.edu', 11 | description='Orthogonal Projection to Latent Structures', 12 | long_description=long_description, 13 | long_description_content_type='text/markdown', 14 | url='https://github.com/BiRG/pyopls', 15 | keywords='metabolomics chemometrics partial-least-squares', 16 | download_url='https://github.com/BiRG/pyopls/archive/20.02.tar.gz', 17 | packages=setuptools.find_packages(), 18 | python_requires='>=3.5', 19 | install_requires=[ 20 | 'numpy>=1.11.0', 21 | 'scipy>=0.18.0', 22 | 'scikit-learn>=0.18.0' 23 | ], 24 | classifiers=[ 25 | 'Natural Language :: English', 26 | 'Intended Audience :: Science/Research', 27 | 'Programming Language :: Python :: 3.6', 28 | 'Programming Language :: Python :: 3.7', 29 | 'Programming Language :: Python :: 3.8', 30 | 'Programming Language :: Python :: 3.9', 31 | 'License :: OSI Approved :: MIT License', 32 | 'Operating System :: OS Independent', 33 | ], 34 | ) 35 | --------------------------------------------------------------------------------
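For reference, here is a minimal end-to-end usage sketch of the `OPLSValidator` API defined in `pyopls/validation.py` above. It is a sketch, not part of the package source: it assumes `OPLSValidator` is exported from the package's top-level `__init__.py` (as `OPLS` is in `pyopls/tests/test_opls.py`) and that the script is run from the repository root so the bundled test arrays resolve.

```python
import numpy as np

from pyopls import OPLSValidator

# The bundled test data, loaded the same way as in pyopls/tests/test_opls.py.
spectra = np.load('pyopls/tests/features.npy')
target = np.load('pyopls/tests/target.npy')

# k=-1 selects leave-one-out cross-validation; n_permutations is kept small
# here so the sketch runs quickly (the default is 100).
validator = OPLSValidator(k=-1, n_permutations=25).fit(spectra, target)

print(f'Removed {validator.n_components_} orthogonal component(s)')
print(f'Q2 = {validator.q_squared_:.3f} (p = {validator.q_squared_p_value_:.3f})')
print(f'{int(validator.feature_significance_.sum())} feature(s) significant at alpha = {validator.outer_alpha}')

# The corrected feature matrix and predictions from the fitted 1-component
# PLS model are available through the usual scikit-learn-style methods.
Z = validator.transform(spectra)
y_pred = validator.predict(spectra)
```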