├── hidimstat
    ├── knockoffs
    │   ├── tests
    │   │   ├── __init__.py
    │   │   ├── test_stat_coef_diff.py
    │   │   ├── test_data_simulation.py
    │   │   ├── test_model_x_knockoff.py
    │   │   ├── test_generate_knockoff.py
    │   │   └── test_knockoff_aggregation.py
    │   ├── __init__.py
    │   ├── data_simulation.py
    │   ├── knockoffs.py
    │   ├── knockoff_aggregation.py
    │   ├── stat_coef_diff.py
    │   ├── utils.py
    │   └── gaussian_knockoff.py
    ├── version.py
    ├── setup.py
    ├── __init__.py
    ├── test
    │   ├── test_permutation_test.py
    │   ├── test_standardized_svr.py
    │   ├── test_multi_sample_split.py
    │   ├── test_adaptive_permutation_threshold.py
    │   ├── test_desparsified_lasso.py
    │   ├── test_clustered_inference.py
    │   ├── test_ensemble_clustered_inference.py
    │   ├── test_noise_std.py
    │   ├── test_scenario.py
    │   └── test_stat_tools.py
    ├── standardized_svr.py
    ├── adaptive_permutation_threshold.py
    ├── multi_sample_split.py
    ├── ensemble_clustered_inference.py
    ├── permutation_test.py
    ├── scenario.py
    ├── noise_std.py
    ├── clustered_inference.py
    ├── stat_tools.py
    └── desparsified_lasso.py
├── requirements.txt
├── examples
    ├── figures
    │   ├── fig1_nguyen_et_al.png
    │   ├── meg_somato_sLORETA.png
    │   └── meg_somato_cd-MTLasso.png
    ├── README.txt
    ├── plot_fmri_data_example.py
    └── plot_2D_simulation_example.py
├── doc
    ├── doc-requirements.txt
    ├── api.rst
    ├── _static
    │   └── style.css
    ├── Makefile
    ├── index.rst
    └── conf.py
├── .gitignore
├── .github
    └── workflows
    │   ├── circle_artifacts.yml
    │   └── deploy_ghpages.yml
├── MANIFEST.in
├── codecov.yml
├── .travis.yml
├── LICENSE
├── setup.py
├── .circleci
    └── config.yml
├── examples_not_exhibited
    └── plot_fig_1_nguyen_et_al.py
└── README.md

/hidimstat/knockoffs/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/hidimstat/version.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.1.0'
2 | 
--------------------------------------------------------------------------------
/hidimstat/knockoffs/tests/test_stat_coef_diff.py:
--------------------------------------------------------------------------------
1 | # To be done
2 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | joblib
3 | scipy
4 | scikit-learn
5 | 
--------------------------------------------------------------------------------
/examples/figures/fig1_nguyen_et_al.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ja-che/hidimstat/HEAD/examples/figures/fig1_nguyen_et_al.png
--------------------------------------------------------------------------------
/examples/figures/meg_somato_sLORETA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ja-che/hidimstat/HEAD/examples/figures/meg_somato_sLORETA.png
--------------------------------------------------------------------------------
/examples/figures/meg_somato_cd-MTLasso.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ja-che/hidimstat/HEAD/examples/figures/meg_somato_cd-MTLasso.png
--------------------------------------------------------------------------------
/examples/README.txt:
--------------------------------------------------------------------------------
1 | ..
_general_examples: 2 | 3 | Examples Gallery 4 | ================ 5 | 6 | .. contents:: Contents 7 | :local: 8 | :depth: 3 9 | -------------------------------------------------------------------------------- /doc/doc-requirements.txt: -------------------------------------------------------------------------------- 1 | joblib 2 | numpy 3 | numpydoc 4 | matplotlib 5 | pandas 6 | pillow 7 | scikit-learn 8 | scipy 9 | sphinx-bootstrap-theme 10 | sphinx-gallery 11 | mne 12 | pyvista 13 | pyvistaqt 14 | PyQt5 15 | nilearn 16 | memory_profiler 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Cache directories and files 2 | build 3 | dist 4 | doc 5 | examples/figures 6 | joblib 7 | .venv 8 | .pytest_cache 9 | .mypy_cache/ 10 | *.pyc 11 | __pycache__ 12 | *.egg-info 13 | .coverage 14 | 15 | # IDE specific folders 16 | .vscode 17 | 18 | .DS_Store 19 | coverage.xml 20 | 21 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/tests/test_data_simulation.py: -------------------------------------------------------------------------------- 1 | from hidimstat.knockoffs.data_simulation import simu_data 2 | 3 | n = 100 4 | p = 200 5 | seed = 42 6 | 7 | 8 | def test_simu_data(): 9 | X, y, _, _ = simu_data(n, p, seed=seed) 10 | 11 | assert X.shape == (n, p) 12 | assert y.size == n 13 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/__init__.py: -------------------------------------------------------------------------------- 1 | from .gaussian_knockoff import gaussian_knockoff_generation 2 | from .knockoffs import model_x_knockoff 3 | from .knockoff_aggregation import knockoff_aggregation 4 | from .stat_coef_diff import stat_coef_diff 5 | 6 | 7 | __all__ = [ 8 | 'gaussian_knockoff_generation', 9 | 'knockoff_aggregation', 10 | 'model_x_knockoff', 11 | 'stat_coef_diff', 12 | ] 13 | -------------------------------------------------------------------------------- /hidimstat/setup.py: -------------------------------------------------------------------------------- 1 | def configuration(parent_package='', top_path=None): 2 | from numpy.distutils.misc_util import Configuration 3 | 4 | config = Configuration('plotting', parent_package, top_path) 5 | 6 | config.add_subpackage('tests') 7 | 8 | return config 9 | 10 | 11 | if __name__ == '__main__': 12 | from numpy.distutils.core import setup 13 | setup(**configuration(top_path='').todict()) 14 | -------------------------------------------------------------------------------- /.github/workflows/circle_artifacts.yml: -------------------------------------------------------------------------------- 1 | on: [status] 2 | jobs: 3 | circleci_artifacts_redirector_job: 4 | runs-on: ubuntu-20.04 5 | name: Run CircleCI artifacts redirector 6 | steps: 7 | - name: GitHub Action step 8 | uses: larsoner/circleci-artifacts-redirector-action@master 9 | with: 10 | repo-token: ${{ secrets.GITHUB_TOKEN }} 11 | artifact-path: 0/dev/index.html 12 | circleci-jobs: build_docs 13 | job-title: Check the rendered docs here! 
14 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/tests/test_model_x_knockoff.py: -------------------------------------------------------------------------------- 1 | from hidimstat.knockoffs.data_simulation import simu_data 2 | from hidimstat.knockoffs import model_x_knockoff 3 | from hidimstat.knockoffs.utils import cal_fdp_power 4 | 5 | seed = 0 6 | fdr = 0.5 7 | 8 | 9 | def test_model_x_knockoff(): 10 | 11 | n = 300 12 | p = 100 13 | X, y, _, non_zero = simu_data(n, p, seed=seed) 14 | ko_result = model_x_knockoff(X, y, fdr=fdr, seed=seed+1) 15 | fdp, power = cal_fdp_power(ko_result, non_zero) 16 | 17 | assert fdp <= 0.2 18 | assert power > 0.7 19 | -------------------------------------------------------------------------------- /doc/api.rst: -------------------------------------------------------------------------------- 1 | .. _api_documentation: 2 | 3 | ================= 4 | API Documentation 5 | ================= 6 | 7 | Estimators 8 | ========== 9 | 10 | .. currentmodule:: hidimstat 11 | 12 | Functions 13 | ========= 14 | 15 | .. autosummary:: 16 | :toctree: generated/ 17 | 18 | ada_svr 19 | aggregate_quantiles 20 | clustered_inference 21 | desparsified_lasso 22 | ensemble_clustered_inference 23 | group_reid 24 | hd_inference 25 | multivariate_1D_simulation 26 | permutation_test_cv 27 | reid 28 | standardized_svr 29 | zscore_from_pval 30 | -------------------------------------------------------------------------------- /doc/_static/style.css: -------------------------------------------------------------------------------- 1 | 2 | blockquote p { 3 | font-size: 14px !important; 4 | } 5 | 6 | blockquote { 7 | margin: 0 0 4px !important; 8 | } 9 | 10 | code { 11 | color: #49759c !important; 12 | background-color: #f3f5f9 !important; 13 | } 14 | 15 | .alert-info { 16 | background-color: #adb8cb !important; 17 | border-color: #adb8cb !important; 18 | color: #2c3e50 !important; 19 | } 20 | 21 | .function dt { 22 | padding-top: 150px; 23 | margin-top: -150px; 24 | -webkit-background-clip: content-box; 25 | background-clip: content-box; 26 | } 27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include .circleci/config.yml 2 | include build_package.sh 3 | include *.txt 4 | include *.yml 5 | include LICENSE 6 | 7 | recursive-include doc *.css 8 | recursive-include doc *.py 9 | recursive-include doc *.rst 10 | recursive-include doc *.txt 11 | recursive-include doc Makefile 12 | recursive-include examples *.py 13 | recursive-include examples *.txt 14 | recursive-include examples_not_exhibited *.py 15 | recursive-include hidimstat *.py 16 | 17 | recursive-exclude doc/_build * 18 | recursive-exclude doc/generated * 19 | recursive-exclude doc/auto_examples * 20 | recursive-exclude examples/figures *.png 21 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | token: b7d1afb7-9730-4e21-882a-d0e893108def 3 | 4 | comment: false 5 | 6 | coverage: 7 | precision: 2 8 | round: down 9 | range: "70...100" 10 | status: 11 | project: 12 | default: 13 | # Commits pushed to master should not make the overall 14 | # project coverage decrease by more than 2%: 15 | target: auto 16 | threshold: 2% 17 | patch: 18 | default: 19 | # Be tolerant on slight code coverage diff on PRs to limit 20 
| # noisy red coverage status on github PRs. 21 | # Note The coverage stats are still uploaded 22 | # to codecov so that PR reviewers can see uncovered lines 23 | # in the github diff if they install the codecov browser 24 | # extension: 25 | # https://github.com/codecov/browser-extension 26 | target: auto 27 | threshold: 2% 28 | 29 | ignore: 30 | - "**/setup.py" 31 | 32 | -------------------------------------------------------------------------------- /hidimstat/__init__.py: -------------------------------------------------------------------------------- 1 | from .clustered_inference import clustered_inference, hd_inference 2 | from .desparsified_lasso import desparsified_lasso, desparsified_group_lasso 3 | from .ensemble_clustered_inference import ensemble_clustered_inference 4 | from .adaptive_permutation_threshold import ada_svr 5 | from .multi_sample_split import aggregate_quantiles 6 | from .noise_std import reid, group_reid 7 | from .permutation_test import permutation_test_cv 8 | from .scenario import multivariate_1D_simulation 9 | from .standardized_svr import standardized_svr 10 | from .stat_tools import zscore_from_pval 11 | from .version import __version__ 12 | 13 | __all__ = [ 14 | 'aggregate_quantiles', 15 | 'clustered_inference', 16 | 'desparsified_lasso', 17 | 'desparsified_group_lasso', 18 | 'ensemble_clustered_inference', 19 | 'ada_svr', 20 | 'group_reid', 21 | 'hd_inference', 22 | 'multivariate_1D_simulation', 23 | 'permutation_test_cv', 24 | 'reid', 25 | 'standardized_svr', 26 | 'zscore_from_pval', 27 | '__version__', 28 | ] 29 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/tests/test_generate_knockoff.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Authors: Binh Nguyen 3 | 4 | from hidimstat.knockoffs.data_simulation import simu_data 5 | from hidimstat.knockoffs.gaussian_knockoff import ( 6 | _estimate_distribution, gaussian_knockoff_generation) 7 | 8 | SEED = 42 9 | fdr = 0.1 10 | 11 | 12 | def test_estimate_distribution(): 13 | n = 100 14 | p = 50 15 | X, y, _, non_zero = simu_data(n, p, seed=SEED) 16 | mu, Sigma = _estimate_distribution(X, cov_estimator='ledoit_wolf') 17 | 18 | assert mu.size == p 19 | assert Sigma.shape == (p, p) 20 | 21 | mu, Sigma = _estimate_distribution(X, cov_estimator='graph_lasso') 22 | 23 | assert mu.size == p 24 | assert Sigma.shape == (p, p) 25 | 26 | 27 | def test_gaussian_knockoff_equi(): 28 | n = 100 29 | p = 50 30 | X, y, _, non_zero = simu_data(n, p, seed=SEED) 31 | mu, Sigma = _estimate_distribution(X, cov_estimator='ledoit_wolf') 32 | 33 | X_tilde = gaussian_knockoff_generation( 34 | X, mu, Sigma, method='equi', seed=SEED*2) 35 | 36 | assert X_tilde.shape == (n, p) 37 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/tests/test_knockoff_aggregation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hidimstat.knockoffs import knockoff_aggregation, model_x_knockoff 3 | from hidimstat.knockoffs.data_simulation import simu_data 4 | from hidimstat.knockoffs.utils import cal_fdp_power 5 | 6 | n = 500 7 | p = 100 8 | snr = 5 9 | n_bootstraps = 25 10 | fdr = 0.5 11 | X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) 12 | 13 | 14 | def test_knockoff_aggregation(): 15 | 16 | selected, aggregated_pval, pvals = knockoff_aggregation( 17 | X, y, fdr=fdr, n_bootstraps=n_bootstraps, 
verbose=True, random_state=0) 18 | 19 | fdp, power = cal_fdp_power(selected, non_zero_index) 20 | 21 | assert pvals.shape == (n_bootstraps, p) 22 | assert fdp < 0.5 23 | assert power > 0.1 24 | 25 | # Single AKO (or vanilla KO) 26 | selected = knockoff_aggregation( 27 | X, y, fdr=fdr, verbose=False, n_bootstraps=1, random_state=5) 28 | 29 | selected_ko = model_x_knockoff(X, y, fdr=fdr, seed=5) 30 | 31 | np.testing.assert_array_equal(selected, selected_ko) 32 | 33 | fdp, power = cal_fdp_power(selected, non_zero_index) 34 | 35 | assert fdp < 0.5 36 | assert power > 0.1 37 | -------------------------------------------------------------------------------- /hidimstat/test/test_permutation_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the permutation test module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal 7 | 8 | from hidimstat.scenario import multivariate_1D_simulation 9 | from hidimstat.permutation_test import permutation_test_cv 10 | 11 | 12 | def test_permutation_test(): 13 | '''Testing the procedure on a simulation with no structure and a support 14 | of size 1. Computing one-sided p-values, we want a low p-value 15 | for the first feature and p-values close to 0.5 for the others.''' 16 | 17 | n_samples, n_features = 20, 50 18 | support_size = 1 19 | sigma = 0.1 20 | rho = 0.0 21 | 22 | X_init, y, beta, noise = \ 23 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 24 | support_size=support_size, sigma=sigma, 25 | rho=rho, shuffle=False, seed=3) 26 | 27 | y = y - np.mean(y) 28 | X_init = X_init - np.mean(X_init, axis=0) 29 | 30 | pval_corr, one_minus_pval_corr = \ 31 | permutation_test_cv(X_init, y, n_permutations=100) 32 | 33 | expected = 0.5 * np.ones(n_features) 34 | expected[:support_size] = 0.0 35 | 36 | assert_almost_equal(pval_corr, expected, decimal=1) 37 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | notifications: 2 | email: false 3 | dist: bionic # ubuntu 18.04 4 | language: python 5 | os: linux 6 | jobs: 7 | include: 8 | - python: "3.6" 9 | env: ONLY_PYTEST=true 10 | 11 | before_install: 12 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 13 | - bash miniconda.sh -b -p $HOME/miniconda 14 | - export PATH="$HOME/miniconda/bin:$PATH" 15 | - conda config --set always_yes yes --set changeps1 no --set show_channel_urls yes 16 | - conda config --set channel_priority strict 17 | - conda config --set add_pip_as_python_dependency yes 18 | - conda config --remove channels defaults 19 | - conda config --add channels conda-forge 20 | - conda update -q conda 21 | 22 | install: 23 | - conda install --yes python=$TRAVIS_PYTHON_VERSION pip numpy scipy scikit-learn joblib pytest coverage -yq 24 | - pip install -U mne 25 | - pip install check-manifest flake8 26 | script: 27 | - set -e # exit at first failure otherwise test might fail but build still passes 28 | - check-manifest; 29 | - flake8 hidimstat examples; 30 | - if [ "$ONLY_PYTEST" = true ]; then 31 | coverage run -m pytest; 32 | coverage report; 33 | coverage html; 34 | fi 35 | - export CODECOV_TOKEN="b7d1afb7-9730-4e21-882a-d0e893108def" 36 | - bash <(curl -s https://codecov.io/bash) 37 | -------------------------------------------------------------------------------- /hidimstat/test/test_standardized_svr.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Test the standardized_svr module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal 7 | 8 | from hidimstat.scenario import multivariate_1D_simulation 9 | from hidimstat.stat_tools import pval_from_scale 10 | from hidimstat.standardized_svr import standardized_svr 11 | 12 | 13 | def test_standardized_svr(): 14 | '''Testing the procedure on a simulation with no structure and a support 15 | of size 1. Computing one-sided p-values, we want a low p-value 16 | for the first feature and p-values close to 0.5 for the others.''' 17 | 18 | n_samples, n_features = 20, 50 19 | support_size = 1 20 | sigma = 0.1 21 | rho = 0.0 22 | 23 | X_init, y, beta, noise = \ 24 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 25 | support_size=support_size, sigma=sigma, 26 | rho=rho, shuffle=False, seed=3) 27 | 28 | y = y - np.mean(y) 29 | X_init = X_init - np.mean(X_init, axis=0) 30 | 31 | beta_hat, scale_hat = standardized_svr(X_init, y) 32 | 33 | pval, pval_corr, _, _ = pval_from_scale(beta_hat, scale_hat) 34 | 35 | expected = 0.5 * np.ones(n_features) 36 | expected[:support_size] = 0.0 37 | 38 | assert_almost_equal(pval_corr, expected, decimal=1) 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2020, Jerome-Alexis Chevalier 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /hidimstat/test/test_multi_sample_split.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the multi_sample_split module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal, assert_equal 7 | 8 | from hidimstat.multi_sample_split import aggregate_medians, aggregate_quantiles 9 | 10 | 11 | def test_aggregate_medians(): 12 | '''Aggregated p-values is twice the median p-value. 
All p-values should 13 | be close to 0.04 and decreasing with respect to feature position.''' 14 | 15 | n_iter, n_features = 20, 5 16 | list_pval = (1.0 / (np.arange(n_iter * n_features) + 1)) 17 | list_pval = list_pval.reshape((n_iter, n_features)) 18 | list_pval[15:, :] = 3e-3 19 | 20 | pval = aggregate_medians(list_pval) 21 | expected = 0.04 * np.ones(n_features) 22 | 23 | assert_almost_equal(pval, expected, decimal=2) 24 | assert_equal(pval[-2] >= pval[-1], True) 25 | 26 | 27 | def test_aggregate_quantiles(): 28 | '''Aggregated p-values from adaptive quantiles formula. All p-values should 29 | be close to 0.04 and decreasing with respect to feature position.''' 30 | 31 | n_iter, n_features = 20, 5 32 | list_pval = (1.0 / (np.arange(n_iter * n_features) + 1)) 33 | list_pval = list_pval.reshape((n_iter, n_features)) 34 | list_pval[15:, :] = 3e-3 35 | 36 | pval = aggregate_quantiles(list_pval) 37 | expected = 0.03 * np.ones(n_features) 38 | 39 | assert_almost_equal(pval, expected, decimal=2) 40 | assert_equal(pval[-2] >= pval[-1], True) 41 | -------------------------------------------------------------------------------- /hidimstat/test/test_adaptive_permutation_threshold.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the adaptive_permutation_threshold module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal 7 | 8 | from hidimstat.scenario import multivariate_1D_simulation 9 | from hidimstat.stat_tools import pval_from_scale 10 | from hidimstat.adaptive_permutation_threshold import ada_svr 11 | 12 | 13 | def test_ada_svr(): 14 | '''Testing the procedure on a simulation with no structure and a support 15 | of size 1. Computing one-sided p-values, we want a low p-value 16 | for the first feature and p-values close to 0.5 for the others.''' 17 | 18 | n_samples, n_features = 20, 50 19 | support_size = 1 20 | sigma = 0.1 21 | rho = 0.0 22 | 23 | X_init, y, beta, noise = \ 24 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 25 | support_size=support_size, sigma=sigma, 26 | rho=rho, shuffle=False, seed=3) 27 | 28 | y = y - np.mean(y) 29 | X_init = X_init - np.mean(X_init, axis=0) 30 | 31 | beta_hat, scale_hat = ada_svr(X_init, y) 32 | 33 | pval, pval_corr, _, _ = pval_from_scale(beta_hat, scale_hat) 34 | 35 | expected = 0.5 * np.ones(n_features) 36 | expected[:support_size] = 0.0 37 | 38 | assert_almost_equal(pval[:support_size], expected[:support_size], 39 | decimal=1) 40 | assert_almost_equal(pval_corr[support_size:], expected[support_size:], 41 | decimal=1) 42 | -------------------------------------------------------------------------------- /hidimstat/standardized_svr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | from sklearn.svm import LinearSVR 4 | from sklearn.model_selection import GridSearchCV 5 | from sklearn.pipeline import Pipeline 6 | 7 | 8 | def standardized_svr(X, y, Cs=np.logspace(-7, 1, 9), n_jobs=1): 9 | """Cross-validated SVR 10 | 11 | Parameters 12 | ----------- 13 | X : ndarray, shape (n_samples, n_features) 14 | Data. 15 | 16 | y : ndarray, shape (n_samples,) 17 | Target. 18 | 19 | Cs : ndarray, optional (default=np.logspace(-7, 1, 9)) 20 | The linear SVR regularization parameter is set by cross-val running 21 | a grid search on the list of hyper-parameters contained in Cs. 
22 | 23 | n_jobs : int or None, optional (default=1) 24 | Number of CPUs to use during the cross validation. 25 | 26 | Returns 27 | ------- 28 | beta_hat : array, shape (n_features,) 29 | Estimated parameter vector. 30 | 31 | scale : ndarray, shape (n_features,) 32 | Value of the standard deviation of the parameters. 33 | """ 34 | 35 | n_samples, n_features = X.shape 36 | 37 | steps = [('SVR', LinearSVR())] 38 | pipeline = Pipeline(steps) 39 | parameters = {'SVR__C': Cs} 40 | 41 | grid = GridSearchCV(pipeline, param_grid=parameters, n_jobs=n_jobs) 42 | grid.fit(X, y) 43 | 44 | beta_hat = grid.best_estimator_.named_steps['SVR'].coef_ 45 | 46 | std = norm(beta_hat) / np.sqrt(n_features) 47 | scale = std * np.ones(beta_hat.size) 48 | 49 | return beta_hat, scale 50 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/data_simulation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.linalg import toeplitz 3 | 4 | 5 | def simu_data(n, p, rho=0.25, snr=2.0, sparsity=0.06, effect=1.0, seed=None): 6 | """Function to simulate data follow an autoregressive structure with Toeplitz 7 | covariance matrix 8 | 9 | Parameters 10 | ---------- 11 | n : int 12 | number of observations 13 | p : int 14 | number of variables 15 | sparsity : float, optional 16 | ratio of number of variables with non-zero coefficients over total 17 | coefficients 18 | rho : float, optional 19 | correlation parameter 20 | effect : float, optional 21 | signal magnitude, value of non-null coefficients 22 | seed : None or Int, optional 23 | random seed for generator 24 | 25 | Returns 26 | ------- 27 | X : ndarray, shape (n, p) 28 | Design matrix resulted from simulation 29 | y : ndarray, shape (n, ) 30 | Response vector resulted from simulation 31 | beta_true : ndarray, shape (n, ) 32 | Vector of true coefficient value 33 | non_zero : ndarray, shape (n, ) 34 | Vector of non zero coefficients index 35 | 36 | """ 37 | # Setup seed generator 38 | rng = np.random.default_rng(seed) 39 | 40 | # Number of non-null 41 | k = int(sparsity * p) 42 | 43 | # Generate the variables from a multivariate normal distribution 44 | mu = np.zeros(p) 45 | Sigma = toeplitz(rho ** np.arange(0, p)) # covariance matrix of X 46 | # X = np.dot(np.random.normal(size=(n, p)), cholesky(Sigma)) 47 | X = rng.multivariate_normal(mu, Sigma, size=(n)) 48 | # Generate the response from a linear model 49 | non_zero = rng.choice(p, k) 50 | beta_true = np.zeros(p) 51 | beta_true[non_zero] = effect 52 | eps = rng.standard_normal(size=n) 53 | prod_temp = np.dot(X, beta_true) 54 | noise_mag = np.linalg.norm(prod_temp) / (snr * np.linalg.norm(eps)) 55 | y = prod_temp + noise_mag * eps 56 | 57 | return X, y, beta_true, non_zero 58 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | from setuptools import find_packages 7 | 8 | PKG = 'hidimstat' 9 | DESCRIPTION = "High-dimensional statistical inference tools for Python" 10 | LONG_DESCRIPTION = open('README.md').read() 11 | MAINTAINER = 'Chevalier (ja-che) and Nguyen (tbng)' 12 | MAINTAINER_EMAIL = 'jerome-alexis_chevalier@hotmail.fr' 13 | URL = 'https://github.com/ja-che/hidimstat' 14 | DOWNLOAD_URL = 'https://github.com/ja-che/hidimstat' 15 | LICENSE = 'BSD' 16 | 17 | 18 | def load_version(): 19 | """Executes 
hidimstat/version.py in a globals dictionary and return it. 20 | Following format from Nilearn repo on github. 21 | """ 22 | # load all vars into globals, otherwise 23 | # the later function call using global vars doesn't work. 24 | globals_dict = {} 25 | with open(os.path.join('hidimstat', 'version.py')) as fp: 26 | exec(fp.read(), globals_dict) 27 | 28 | return globals_dict 29 | 30 | 31 | def setup_package(version): 32 | local_path = os.path.dirname(os.path.abspath(sys.argv[0])) 33 | 34 | os.chdir(local_path) 35 | sys.path.insert(0, local_path) 36 | 37 | from numpy.distutils.core import setup 38 | 39 | setup( 40 | packages=find_packages(exclude=['contrib', 'docs', 'tests']), 41 | name=PKG, 42 | maintainer=MAINTAINER, 43 | include_package_data=True, 44 | maintainer_email=MAINTAINER_EMAIL, 45 | description=DESCRIPTION, 46 | long_description=LONG_DESCRIPTION, 47 | long_description_content_type='text/markdown', 48 | license=LICENSE, 49 | url=URL, 50 | version=version, 51 | # download_url=DOWNLOAD_URL, 52 | zip_safe=False, # the package can run out of an .egg file 53 | classifiers=[ 54 | 'Programming Language :: Python', 55 | 'Programming Language :: Python :: 3.5', 56 | 'Development Status :: 3 - Alpha' 57 | ], 58 | ) 59 | 60 | 61 | _VERSION_GLOBALS = load_version() 62 | VERSION = _VERSION_GLOBALS['__version__'] 63 | 64 | if __name__ == "__main__": 65 | setup_package(VERSION) 66 | -------------------------------------------------------------------------------- /hidimstat/adaptive_permutation_threshold.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def ada_svr(X, y, rcond=1e-3): 5 | """Statistical inference procedure presented in Gaonkar et al. [1]_. 6 | 7 | Parameters 8 | ----------- 9 | X : ndarray, shape (n_samples, n_features) 10 | Data. 11 | 12 | y : ndarray, shape (n_samples,) 13 | Target. 14 | 15 | rcond : float, optional (default=1e-3) 16 | Cutoff for small singular values. Singular values smaller 17 | than `rcond` * largest_singular_value are set to zero. 18 | 19 | Returns 20 | ------- 21 | beta_hat : array, shape (n_features,) 22 | Estimated parameter vector. 23 | 24 | scale : ndarray, shape (n_features,) 25 | Value of the standard deviation of the parameters. 26 | 27 | References 28 | ---------- 29 | .. [1] Gaonkar, B., & Davatzikos, C. (2012, October). Deriving statistical 30 | significance maps for SVM based image classification and group 31 | comparisons. In International Conference on Medical Image Computing 32 | and Computer-Assisted Intervention (pp. 723-730). Springer, Berlin, 33 | Heidelberg. 
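    Examples
    --------
    A minimal sketch of the call signature (random inputs, for illustration
    only; see the accompanying tests for a complete scenario):

    >>> import numpy as np
    >>> from hidimstat import ada_svr
    >>> X = np.random.randn(20, 50)
    >>> y = np.random.randn(20)
    >>> beta_hat, scale = ada_svr(X, y)
    >>> beta_hat.shape, scale.shape
    ((50,), (50,))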
34 | """ 35 | 36 | X = np.asarray(X) 37 | n_samples, n_features = X.shape 38 | 39 | K = _manual_inverting(np.dot(X, X.T), rcond=rcond) 40 | sum_K = np.sum(K) 41 | 42 | L = - np.outer(np.sum(K, axis=0), np.sum(K, axis=1)) / sum_K 43 | C = np.dot(X.T, K + L) 44 | 45 | beta_hat = np.dot(C, y) 46 | 47 | scale = np.sqrt(np.sum(C ** 2, axis=1)) 48 | 49 | return beta_hat, scale 50 | 51 | 52 | def _manual_inverting(X, rcond=1e-3, full_rank=False): 53 | 'Inverting taking care of low eigenvalues to increase numerical stability' 54 | 55 | X = np.asarray(X) 56 | n_samples, n_features = X.shape 57 | 58 | if n_samples != n_features: 59 | raise ValueError('The matrix is not a square matrix') 60 | 61 | U, s, V = np.linalg.svd(X, full_matrices=False) 62 | rank = np.sum(s > rcond * s.max()) 63 | s_inv = np.zeros(np.size(s)) 64 | s_inv[:rank] = 1 / s[:rank] 65 | 66 | if full_rank: 67 | s_inv[rank:] = 1 / (rcond * s.max()) 68 | 69 | X_inv = np.linalg.multi_dot([U, np.diag(s_inv), V]) 70 | 71 | return X_inv 72 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | _xvfb: &xvfb 4 | name: Start Xvfb virtual framebuffer 5 | command: | 6 | echo "export DISPLAY=:99" >> $BASH_ENV 7 | /sbin/start-stop-daemon --start --quiet --pidfile /tmp/custom_xvfb_99.pid --make-pidfile --background --exec /usr/bin/Xvfb -- :99 -screen 0 1280x1024x24 -ac +extension GLX +render -noreset -nolisten tcp -nolisten unix 8 | 9 | jobs: 10 | build_docs: 11 | docker: 12 | - image: circleci/python:3.8.5-buster 13 | steps: 14 | - checkout 15 | - run: 16 | name: Set BASH_ENV 17 | command: | 18 | set -e 19 | echo "set -e" >> $BASH_ENV 20 | echo "export XDG_RUNTIME_DIR=/tmp/runtime-circleci" >> $BASH_ENV 21 | echo "export MNE_3D_BACKEND=pyvista" >> $BASH_ENV 22 | echo "export PYTHONUNBUFFERED=1" >> $BASH_ENV 23 | echo "BASH_ENV:" 24 | cat $BASH_ENV 25 | - run: 26 | <<: *xvfb 27 | - run: 28 | name: Install OpenGL 29 | command: | 30 | sudo apt-get update --allow-releaseinfo-change 31 | sudo apt-get install -y git libopenmpi-dev openmpi-bin 32 | sudo apt-get install libosmesa6 libglx-mesa0 libopengl0 libglx0 libdbus-1-3 \ 33 | libxkbcommon-x11-0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0 \ 34 | libxcb-render-util0 libxcb-shape0 libxcb-xfixes0 libxcb-xinerama0 35 | sudo ln -s /usr/lib/x86_64-linux-gnu/libxcb-util.so.0 /usr/lib/x86_64-linux-gnu/libxcb-util.so.1 36 | - run: 37 | name: Install dependencies 38 | command: | 39 | python -m pip install --progress-bar off --upgrade pip setuptools wheel 40 | python -m pip install --progress-bar off -r doc/doc-requirements.txt 41 | python -m pip install -e . 
42 | echo "localhost slots=50">hostfile 43 | - run: 44 | name: Check PyQt5 45 | command: LD_DEBUG=libs python -c "from PyQt5.QtWidgets import QApplication, QWidget; app = QApplication([])" 46 | - run: 47 | name: Check installation 48 | command: | 49 | which python 50 | QT_DEBUG_PLUGINS=1 mne sys_info 51 | python -c "import numpy; numpy.show_config()" 52 | LIBGL_DEBUG=verbose python -c "import pyvistaqt; pyvistaqt.BackgroundPlotter(show=True)" 53 | - run: 54 | name: make html 55 | no_output_timeout: 30m 56 | command: | 57 | cd doc; 58 | export OMP_NUM_THREADS=1; 59 | make html; 60 | - store_artifacts: 61 | path: doc/_build/html/ 62 | destination: dev 63 | 64 | workflows: 65 | version: 2 66 | 67 | default: 68 | jobs: 69 | - build_docs 70 | -------------------------------------------------------------------------------- /.github/workflows/deploy_ghpages.yml: -------------------------------------------------------------------------------- 1 | name: Deploy GitHub pages 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: main 7 | 8 | jobs: 9 | build_docs: 10 | runs-on: ubuntu-latest 11 | env: 12 | DISPLAY: ':99' 13 | defaults: 14 | run: 15 | shell: bash 16 | steps: 17 | - uses: actions/checkout@v2 18 | - run: | 19 | sudo apt-get update 20 | sudo apt-get install -y libgl1-mesa-glx 21 | sudo apt-get install -yqq libxkbcommon-x11-0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0 libxcb-render-util0 libxcb-xinerama0 libxcb-xfixes0 libopengl0 22 | /sbin/start-stop-daemon --start --quiet --pidfile /tmp/custom_xvfb_99.pid --make-pidfile --background --exec /usr/bin/Xvfb -- :99 -screen 0 1400x900x24 -ac +extension GLX +render -noreset 23 | name: 'Setup xvfb' 24 | - run: | 25 | sudo apt-get install -y git libopenmpi-dev openmpi-bin 26 | pip install -r doc/doc-requirements.txt 27 | pip install --progress-bar off vtk==9.0.20210612.dev0 28 | pip install --progress-bar off https://github.com/sphinx-gallery/sphinx-gallery/zipball/master 29 | pip install -e . 30 | echo "localhost slots=50">hostfile 31 | name: 'Install dependencies' 32 | - run: | 33 | LD_DEBUG=libs python -c "from PyQt5.QtWidgets import QApplication, QWidget; app = QApplication([])" 34 | name: 'Check PyQt5' 35 | - run: | 36 | which python 37 | QT_DEBUG_PLUGINS=1 mne sys_info 38 | python -c "import numpy; numpy.show_config()" 39 | LIBGL_DEBUG=verbose python -c "import pyvistaqt; pyvistaqt.BackgroundPlotter(show=True)" 40 | name: 'Check installation' 41 | - run: sphinx-build -b html doc doc/_build/html 42 | name: 'Generate HTML docs' 43 | - name: Upload generated HTML as artifact 44 | uses: actions/upload-artifact@v2 45 | with: 46 | name: DocHTML 47 | path: doc/_build/html/ 48 | 49 | deploy_docs: 50 | if: ${{ github.ref == 'refs/heads/main' }} 51 | needs: 52 | build_docs 53 | runs-on: ubuntu-latest 54 | steps: 55 | - uses: actions/checkout@v2 56 | - name: Download artifacts 57 | uses: actions/download-artifact@v4.1.7 58 | with: 59 | name: DocHTML 60 | path: doc/_build/html/ 61 | - name: Commit to documentation branch 62 | run: | 63 | git clone --no-checkout --depth 1 https://github.com/${{ github.repository_owner }}/hidimstat.git --branch gh-pages --single-branch gh-pages 64 | cp -r doc/_build/html/* gh-pages/ 65 | cd gh-pages 66 | touch .nojekyll 67 | git config --local user.email "hidimstat@github.com" 68 | git config --local user.name "hidimstat GitHub Action" 69 | git add . 
70 | git commit -m "Update documentation" -a || true 71 | - name: Push changes 72 | uses: ad-m/github-push-action@v0.6.0 73 | with: 74 | branch: gh-pages 75 | directory: gh-pages 76 | github_token: ${{ secrets.GITHUB_TOKEN }} 77 | -------------------------------------------------------------------------------- /hidimstat/multi_sample_split.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def aggregate_medians(list_one_sided_pval): 5 | """Aggregation of survival function values taking twice the median 6 | 7 | Parameters 8 | ----------- 9 | list_one_sided_pval : ndarray, shape (n_iter, n_features) 10 | List of one-sided p-values. 11 | 12 | Returns 13 | ------- 14 | one_sided_pval : ndarray, shape (n_features,) 15 | Aggregated one-sided p-values. 16 | 17 | References 18 | ---------- 19 | .. [1] Meinshausen, N., Meier, L., & Bühlmann, P. (2009). P-values for 20 | high-dimensional regression. Journal of the American Statistical 21 | Association, 104(488), 1671-1681. 22 | """ 23 | 24 | n_iter, n_features = list_one_sided_pval.shape 25 | 26 | one_sided_pval = np.median(list_one_sided_pval, axis=0) 27 | one_sided_pval[one_sided_pval > 0.5] = \ 28 | np.maximum(0.5, 1 - (1 - one_sided_pval[one_sided_pval > 0.5]) * 2) 29 | one_sided_pval[one_sided_pval < 0.5] = \ 30 | np.minimum(0.5, one_sided_pval[one_sided_pval < 0.5] * 2) 31 | 32 | return one_sided_pval 33 | 34 | 35 | def aggregate_quantiles(list_one_sided_pval, gamma_min=0.2): 36 | """Aggregation of survival function values by adaptive quantile procedure 37 | 38 | Parameters 39 | ----------- 40 | list_one_sided_pval : ndarray, shape (n_iter, n_features) 41 | List of one-sided p-values. 42 | 43 | gamma_min : float, optional (default=0.2) 44 | Lowest gamma-quantile being considered to compute the adaptive 45 | quantile aggregation formula (cf. [1]_). 46 | 47 | Returns 48 | ------- 49 | one_sided_pval : ndarray, shape (n_features,) 50 | Aggregated one-sided p-values. 51 | 52 | References 53 | ---------- 54 | .. [1] Meinshausen, N., Meier, L., & Bühlmann, P. (2009). P-values for 55 | high-dimensional regression. Journal of the American Statistical 56 | Association, 104(488), 1671-1681. 
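    Examples
    --------
    A minimal sketch on synthetic p-values, shaped like those used in the
    accompanying tests (values are illustrative only):

    >>> import numpy as np
    >>> from hidimstat import aggregate_quantiles
    >>> list_pval = (1.0 / np.arange(1, 101)).reshape((20, 5))
    >>> aggregate_quantiles(list_pval).shape
    (5,)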
57 | """ 58 | 59 | n_iter, n_features = list_one_sided_pval.shape 60 | one_sided_pval = 0.5 * np.ones(n_features) 61 | 62 | m = n_iter + 1 63 | k = np.maximum(1, int(np.floor(gamma_min * n_iter))) 64 | r = 1 - np.log(gamma_min) 65 | seq = range(k, n_iter) 66 | 67 | ordered_pval = np.sort(list_one_sided_pval, axis=0) 68 | rev_ordered_pval = ordered_pval[::-1] 69 | 70 | for i in np.arange(n_features): 71 | 72 | adjusted_ordered_pval = \ 73 | min([ordered_pval[j, i] * m / (j + 1) for j in seq]) 74 | adjusted_ordered_pval = min(0.5, adjusted_ordered_pval) 75 | 76 | adjusted_rev_ordered_pval = \ 77 | max([1 - (1 - rev_ordered_pval[j, i]) * m / (j + 1) for j in seq]) 78 | adjusted_rev_ordered_pval = max(0.5, adjusted_rev_ordered_pval) 79 | 80 | if (1 - adjusted_rev_ordered_pval) < adjusted_ordered_pval: 81 | 82 | one_sided_pval[i] = \ 83 | np.maximum(0.5, 1 - (1 - adjusted_rev_ordered_pval) * r) 84 | 85 | else: 86 | 87 | one_sided_pval[i] = np.minimum(0.5, adjusted_ordered_pval * r) 88 | 89 | return one_sided_pval 90 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/knockoffs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Authors: Binh Nguyen 3 | """ 4 | Implementation of Model-X knockoffs inference procedure, introduced in 5 | Candes et. al. (2016) " Panning for Gold: Model-X Knockoffs for 6 | High-dimensional Controlled Variable Selection" 7 | 8 | """ 9 | import numpy as np 10 | from sklearn.preprocessing import StandardScaler 11 | from sklearn.utils.validation import check_memory 12 | 13 | from .gaussian_knockoff import (_estimate_distribution, 14 | gaussian_knockoff_generation) 15 | from .stat_coef_diff import _coef_diff_threshold, stat_coef_diff 16 | 17 | 18 | def model_x_knockoff(X, y, fdr=0.1, offset=1, method='equi', 19 | statistics='lasso_cv', shrink=False, centered=True, 20 | cov_estimator='ledoit_wolf', verbose=False, memory=None, 21 | n_jobs=1, seed=None): 22 | """Model-X Knockoff inference procedure to control False Discoveries Rate, 23 | based on Candes et. al. 
(2016) 24 | 25 | Parameters 26 | ---------- 27 | X : 2D ndarray (n_samples, n_features) 28 | design matrix 29 | 30 | y : 1D ndarray (n_samples, ) 31 | response vector 32 | 33 | fdr : float, optional 34 | desired controlled FDR level 35 | 36 | offset : int, 0 or 1, optional 37 | offset to calculate knockoff threshold, offset = 1 is equivalent to 38 | knockoff+ 39 | 40 | method : str, optional 41 | knockoff construction methods, either equi for equi-correlated knockoff 42 | or sdp for optimization scheme 43 | 44 | statistics : str, optional 45 | method to calculate knockoff test score 46 | 47 | shrink : bool, optional 48 | whether to shrink the empirical covariance matrix 49 | 50 | centered : bool, optional 51 | whether to standardize the data before doing the inference procedure 52 | 53 | cov_estimator : str, optional 54 | method of empirical covariance matrix estimation 55 | 56 | seed : int or None, optional 57 | random seed used to generate Gaussian knockoff variable 58 | 59 | Returns 60 | ------- 61 | selected : 1D array, int 62 | vector of index of selected variables 63 | 64 | test_score : 1D array, (n_features, ) 65 | vector of test statistic 66 | 67 | thres : float 68 | knockoff threshold 69 | 70 | X_tilde : 2D array, (n_samples, n_features) 71 | knockoff design matrix 72 | """ 73 | memory = check_memory(memory) 74 | 75 | if centered: 76 | X = StandardScaler().fit_transform(X) 77 | 78 | mu, Sigma = _estimate_distribution( 79 | X, shrink=shrink, cov_estimator=cov_estimator) 80 | 81 | X_tilde = gaussian_knockoff_generation(X, mu, Sigma, memory=memory, 82 | method=method, seed=seed) 83 | test_score = memory.cache( 84 | stat_coef_diff, ignore=['n_jobs', 'joblib_verbose'])( 85 | X, X_tilde, y, method=statistics, n_jobs=n_jobs) 86 | thres = _coef_diff_threshold(test_score, fdr=fdr, offset=offset) 87 | 88 | selected = np.where(test_score >= thres)[0] 89 | 90 | if verbose: 91 | return selected, test_score, thres, X_tilde 92 | 93 | return selected 94 | -------------------------------------------------------------------------------- /hidimstat/test/test_desparsified_lasso.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the desparsified_lasso module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal, assert_equal 7 | from scipy.linalg import toeplitz 8 | 9 | from hidimstat.scenario import multivariate_1D_simulation 10 | from hidimstat.scenario import multivariate_temporal_simulation 11 | from hidimstat.desparsified_lasso import desparsified_lasso 12 | from hidimstat.desparsified_lasso import desparsified_group_lasso 13 | 14 | 15 | def test_desparsified_lasso(): 16 | '''Testing the procedure on a simulation with no structure and 17 | a support of size 1. 
Computing 99% confidence bounds and checking 18 | that they contains the true parameter vector.''' 19 | 20 | n_samples, n_features = 50, 50 21 | support_size = 1 22 | sigma = 0.1 23 | rho = 0.0 24 | 25 | X, y, beta, noise = \ 26 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 27 | support_size=support_size, sigma=sigma, 28 | rho=rho, shuffle=False, seed=2) 29 | 30 | beta_hat, cb_min, cb_max = desparsified_lasso(X, y, confidence=0.99) 31 | 32 | assert_almost_equal(beta_hat, beta, decimal=1) 33 | assert_equal(cb_min < beta, True) 34 | assert_equal(cb_max > beta, True) 35 | 36 | beta_hat, cb_min, cb_max = \ 37 | desparsified_lasso(X, y, dof_ajdustement=True, confidence=0.99) 38 | 39 | assert_almost_equal(beta_hat, beta, decimal=1) 40 | assert_equal(cb_min < beta, True) 41 | assert_equal(cb_max > beta, True) 42 | 43 | 44 | def test_desparsified_group_lasso(): 45 | '''Testing the procedure on a simulation with no structure and 46 | a support of size 2. Computing one-sided p-values, we want 47 | low p-values for the features of the support and p-values 48 | close to 0.5 for the others.''' 49 | 50 | n_samples = 50 51 | n_features = 100 52 | n_times = 10 53 | support_size = 2 54 | sigma = 0.1 55 | rho = 0.9 56 | corr = toeplitz(np.geomspace(1, rho ** (n_times - 1), n_times)) 57 | cov = np.outer(sigma, sigma) * corr 58 | 59 | X, Y, beta, noise = \ 60 | multivariate_temporal_simulation(n_samples=n_samples, 61 | n_features=n_features, 62 | n_times=n_times, 63 | support_size=support_size, 64 | sigma=sigma, rho_noise=rho) 65 | 66 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 67 | desparsified_group_lasso(X, Y, cov=cov) 68 | 69 | expected_pval_corr = \ 70 | np.concatenate((np.zeros(support_size), 71 | 0.5 * np.ones(n_features - support_size))) 72 | 73 | assert_almost_equal(beta_hat, beta, decimal=1) 74 | assert_almost_equal(pval_corr, expected_pval_corr, decimal=1) 75 | 76 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 77 | desparsified_group_lasso(X, Y, test='F') 78 | 79 | assert_almost_equal(beta_hat, beta, decimal=1) 80 | assert_almost_equal(pval_corr, expected_pval_corr, decimal=1) 81 | 82 | # Testing error is raised when the covariance matrix has wrong shape 83 | bad_cov = np.delete(cov, 0, axis=1) 84 | np.testing.assert_raises(ValueError, desparsified_group_lasso, 85 | X=X, Y=Y, cov=bad_cov) 86 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/knockoff_aggregation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Authors: Binh Nguyen 3 | import numpy as np 4 | from joblib import Parallel, delayed 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.utils import check_random_state 7 | from sklearn.utils.validation import check_memory 8 | 9 | from .gaussian_knockoff import (_estimate_distribution, 10 | gaussian_knockoff_generation) 11 | from .stat_coef_diff import stat_coef_diff 12 | from .utils import fdr_threshold, quantile_aggregation 13 | 14 | 15 | def knockoff_aggregation(X, y, centered=True, shrink=False, 16 | construct_method='equi', fdr=0.1, fdr_control='bhq', 17 | reshaping_function=None, offset=1, 18 | statistic='lasso_cv', cov_estimator='ledoit_wolf', 19 | joblib_verbose=0, n_bootstraps=25, n_jobs=1, 20 | adaptive_aggregation=False, gamma=0.5, gamma_min=0.05, 21 | verbose=False, memory=None, random_state=None): 22 | 23 | # unnecessary to have n_jobs > number of bootstraps 24 | 
n_jobs = min(n_bootstraps, n_jobs) 25 | 26 | if centered: 27 | X = StandardScaler().fit_transform(X) 28 | 29 | mu, Sigma = _estimate_distribution( 30 | X, shrink=shrink, cov_estimator=cov_estimator) 31 | 32 | mem = check_memory(memory) 33 | stat_coef_diff_cached = mem.cache(stat_coef_diff, 34 | ignore=['n_jobs', 'joblib_verbose']) 35 | 36 | if n_bootstraps == 1: 37 | X_tilde = gaussian_knockoff_generation( 38 | X, mu, Sigma, method=construct_method, 39 | memory=memory, seed=random_state) 40 | ko_stat = stat_coef_diff_cached(X, X_tilde, y, method=statistic) 41 | pvals = _empirical_pval(ko_stat, offset) 42 | threshold = fdr_threshold(pvals, fdr=fdr, 43 | method=fdr_control) 44 | selected = np.where(pvals <= threshold)[0] 45 | 46 | if verbose: 47 | return selected, pvals 48 | 49 | return selected 50 | 51 | if isinstance(random_state, (int, np.int32, np.int64)): 52 | rng = check_random_state(random_state) 53 | elif random_state is None: 54 | rng = check_random_state(0) 55 | else: 56 | raise TypeError('Wrong type for random_state') 57 | 58 | seed_list = rng.randint(1, np.iinfo(np.int32).max, n_bootstraps) 59 | parallel = Parallel(n_jobs, verbose=joblib_verbose) 60 | X_tildes = parallel(delayed(gaussian_knockoff_generation)( 61 | X, mu, Sigma, method=construct_method, memory=memory, 62 | seed=seed) for seed in seed_list) 63 | 64 | ko_stats = parallel(delayed(stat_coef_diff_cached)( 65 | X, X_tildes[i], y, method=statistic) for i in range(n_bootstraps)) 66 | 67 | pvals = np.array([_empirical_pval(ko_stats[i], offset) 68 | for i in range(n_bootstraps)]) 69 | 70 | aggregated_pval = quantile_aggregation( 71 | pvals, gamma=gamma, gamma_min=gamma_min, 72 | adaptive=adaptive_aggregation) 73 | 74 | threshold = fdr_threshold(aggregated_pval, fdr=fdr, method=fdr_control, 75 | reshaping_function=reshaping_function) 76 | selected = np.where(aggregated_pval <= threshold)[0] 77 | 78 | if verbose: 79 | return selected, aggregated_pval, pvals 80 | 81 | return selected 82 | 83 | 84 | def _empirical_pval(test_score, offset=1): 85 | 86 | pvals = [] 87 | n_features = test_score.size 88 | 89 | if offset not in (0, 1): 90 | raise ValueError("'offset' must be either 0 or 1") 91 | 92 | test_score_inv = -test_score 93 | for i in range(n_features): 94 | if test_score[i] <= 0: 95 | pvals.append(1) 96 | else: 97 | pvals.append( 98 | (offset + np.sum(test_score_inv >= test_score[i])) / 99 | n_features 100 | ) 101 | 102 | return np.array(pvals) 103 | -------------------------------------------------------------------------------- /hidimstat/test/test_clustered_inference.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the clustered_inference module 3 | """ 4 | 5 | import numpy as np 6 | from sklearn.cluster import FeatureAgglomeration 7 | from sklearn.feature_extraction import image 8 | from numpy.testing import assert_almost_equal 9 | 10 | from hidimstat.scenario import multivariate_1D_simulation 11 | from hidimstat.scenario import multivariate_temporal_simulation 12 | from hidimstat.clustered_inference import clustered_inference 13 | 14 | 15 | def test_clustered_inference(): 16 | '''Testing the procedure on two simulations with a 1D data structure and 17 | with n << p: the first test has no temporal dimension, the second has a 18 | temporal dimension. The support is connected and of size 10, it must be 19 | recovered with a small spatial tolerance parametrized by `margin_size`. 
20 | Computing one sided p-values, we want low p-values for the features of 21 | the support and p-values close to 0.5 for the others.''' 22 | 23 | # Scenario 1: data with no temporal dimension 24 | # ########################################### 25 | n_samples, n_features = 100, 2000 26 | support_size = 10 27 | sigma = 5.0 28 | rho = 0.95 29 | n_clusters = 200 30 | margin_size = 5 31 | interior_support = support_size - margin_size 32 | extended_support = support_size + margin_size 33 | 34 | X_init, y, beta, epsilon = \ 35 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 36 | support_size=support_size, sigma=sigma, 37 | rho=rho, shuffle=False, seed=2) 38 | 39 | y = y - np.mean(y) 40 | X_init = X_init - np.mean(X_init, axis=0) 41 | 42 | connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) 43 | ward = FeatureAgglomeration(n_clusters=n_clusters, 44 | connectivity=connectivity, 45 | linkage='ward') 46 | 47 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 48 | clustered_inference(X_init, y, ward, n_clusters) 49 | 50 | expected = 0.5 * np.ones(n_features) 51 | expected[:support_size] = 0.0 52 | 53 | assert_almost_equal(pval_corr[:interior_support], 54 | expected[:interior_support]) 55 | assert_almost_equal(pval_corr[extended_support:200], 56 | expected[extended_support:200], 57 | decimal=1) 58 | 59 | # Scenario 2: temporal data 60 | # ######################### 61 | n_samples, n_features, n_times = 200, 2000, 10 62 | support_size = 10 63 | sigma = 5.0 64 | rho_noise = 0.9 65 | rho_data = 0.9 66 | n_clusters = 200 67 | margin_size = 5 68 | interior_support = support_size - margin_size 69 | extended_support = support_size + margin_size 70 | 71 | X, Y, beta, noise = \ 72 | multivariate_temporal_simulation(n_samples=n_samples, 73 | n_features=n_features, 74 | n_times=n_times, 75 | support_size=support_size, 76 | sigma=sigma, 77 | rho_noise=rho_noise, 78 | rho_data=rho_data, 79 | shuffle=False) 80 | 81 | connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) 82 | ward = FeatureAgglomeration(n_clusters=n_clusters, 83 | connectivity=connectivity, 84 | linkage='ward') 85 | 86 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 87 | clustered_inference(X, Y, ward, n_clusters, 88 | method='desparsified-group-lasso') 89 | 90 | expected = 0.5 * np.ones(n_features) 91 | expected[:support_size] = 0.0 92 | 93 | assert_almost_equal(pval_corr[:interior_support], 94 | expected[:interior_support], 95 | decimal=3) 96 | assert_almost_equal(pval_corr[extended_support:], 97 | expected[extended_support:], 98 | decimal=1) 99 | -------------------------------------------------------------------------------- /examples_not_exhibited/plot_fig_1_nguyen_et_al.py: -------------------------------------------------------------------------------- 1 | # Authors: Binh Nguyen 2 | """ 3 | Work in Progress : Histogram of KO vs AKO performance 4 | ===================================================== 5 | 6 | Example: reproducing Figure 1 in:: 7 | 8 | Nguyen et al. (2020) Aggregation of Multiple Knockoffs 9 | https://arxiv.org/abs/2002.09269 10 | 11 | To reduce the script runtime it is desirable to increase n_jobs parameter. 
12 | """ 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | from hidimstat.knockoffs import knockoff_aggregation, model_x_knockoff 16 | from hidimstat.knockoffs.data_simulation import simu_data 17 | from hidimstat.knockoffs.utils import cal_fdp_power 18 | from joblib import Parallel, delayed 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | color_blue = '#1f77b4' 22 | color_teal = '#1fbecf' 23 | 24 | 25 | def one_inference(n, p, snr, rho, sparsity, n_bootstraps=25, gamma=0.3, 26 | n_jobs=1, offset=1, fdr=0.1, seed=None): 27 | 28 | # Simulate data following autoregressive structure, seed is fixed to ensure 29 | # doing inference on only 1 simulation 30 | X, y, _, non_zero_index = simu_data(n=n, p=p, rho=rho, snr=snr, 31 | sparsity=sparsity, seed=42) 32 | X = StandardScaler().fit_transform(X) 33 | 34 | # Single knockoff -- has to do it 25 times to match the number of 35 | # bootstraps in AKO for fair comparison 36 | ko_fdps = [] 37 | ko_powers = [] 38 | 39 | for i in range(n_bootstraps): 40 | ko_selected = model_x_knockoff(X, y, fdr=fdr, offset=offset, 41 | n_jobs=n_jobs, seed=n_bootstraps*seed) 42 | ko_fdp, ko_power = cal_fdp_power(ko_selected, non_zero_index) 43 | ko_fdps.append(ko_fdp) 44 | ko_powers.append(ko_power) 45 | 46 | # Aggregated knockoff 47 | ako_selected = knockoff_aggregation(X, y, fdr=fdr, offset=offset, 48 | n_jobs=n_jobs, gamma=gamma, 49 | n_bootstraps=n_bootstraps, 50 | random_state=seed*2) 51 | 52 | ako_fdp, ako_power = cal_fdp_power(ako_selected, non_zero_index) 53 | 54 | return ko_fdps, ako_fdp, ko_powers, ako_power 55 | 56 | 57 | def plot(results, n_simu, fdr): 58 | 59 | ko_fdps = np.array([results[i][0] for i in range(n_simu)]).ravel() 60 | ako_fdps = np.array([results[i][1] for i in range(n_simu)]).ravel() 61 | ko_powers = np.array([results[i][2] for i in range(n_simu)]).ravel() 62 | ako_powers = np.array([results[i][3] for i in range(n_simu)]).ravel() 63 | 64 | # Plotting 65 | n_bins = 30 66 | fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(7, 4)) 67 | ax1.tick_params(labelsize=14) 68 | ax1.hist(ko_fdps, edgecolor='k', 69 | range=[0.0, 1.0], bins=n_bins, color=color_blue) 70 | ax1.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0) 71 | ax2.tick_params(labelsize=14) 72 | ax2.hist(ko_powers, edgecolor='k', 73 | range=[0.0, 1.0], bins=n_bins, color=color_blue) 74 | ax3.tick_params(labelsize=14) 75 | ax3.hist(ako_fdps, edgecolor='k', 76 | range=[0.0, 1.0], bins=n_bins, color=color_teal) 77 | ax3.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0) 78 | ax4.tick_params(labelsize=14) 79 | ax4.hist(ako_powers, edgecolor='k', 80 | range=[0.0, 1.0], bins=n_bins, color=color_teal) 81 | plt.tight_layout() 82 | 83 | figname = 'figures/histogram_ko_vs_ako.png' 84 | plt.savefig(figname) 85 | print(f'Save figure to {figname}') 86 | 87 | 88 | def main(): 89 | # Simulation paramaters 90 | n, p = 50, 200 91 | snr = 3.0 92 | rho = 0.5 93 | sparsity = 0.06 94 | offset = 1 95 | fdr = 0.05 96 | gamma = 0.3 97 | n_bootstraps = 10 98 | n_simu = 10 99 | offset = 1 100 | 101 | results = Parallel(n_jobs=1)( 102 | delayed(one_inference)( 103 | n=n, p=p, snr=snr, rho=rho, sparsity=sparsity, 104 | n_jobs=1, n_bootstraps=n_bootstraps, fdr=fdr, 105 | offset=offset, gamma=gamma, seed=seed) 106 | for seed in range(n_simu)) 107 | 108 | # Plotting 109 | plot(results, n_simu, fdr) 110 | print('Done!') 111 | 112 | 113 | # if __name__ == '__main__': 114 | # main() 115 | -------------------------------------------------------------------------------- 
/hidimstat/knockoffs/stat_coef_diff.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Authors: Binh Nguyen 3 | 4 | import numpy as np 5 | from sklearn.linear_model import (LassoCV, LogisticRegressionCV) 6 | from sklearn.model_selection import KFold 7 | # from sklearn.linear_model._coordinate_descent import _alpha_grid 8 | # from sklearn.model_selection import GridSearchCV 9 | 10 | 11 | def stat_coef_diff(X, X_tilde, y, method='lasso_cv', n_splits=5, n_jobs=1, 12 | n_lambdas=10, n_iter=1000, group_reg=1e-3, l1_reg=1e-3, 13 | joblib_verbose=0, return_coef=False, solver='liblinear', 14 | seed=0): 15 | """Calculate test statistic by doing estimation with Cross-validation on 16 | concatenated design matrix [X X_tilde] to find coefficients [beta 17 | beta_tilda]. The test statistic is then: 18 | 19 | W_j = abs(beta_j) - abs(beta_tilda_j) 20 | 21 | with j = 1, ..., n_features 22 | 23 | Parameters 24 | ---------- 25 | X : 2D ndarray (n_samples, n_features) 26 | Original design matrix 27 | 28 | X_tilde : 2D ndarray (n_samples, n_features) 29 | Knockoff design matrix 30 | 31 | y : 1D ndarray (n_samples, ) 32 | Response vector 33 | 34 | loss : str, optional 35 | if the response vector is continuous, the loss used should be 36 | 'least_square', otherwise 37 | if the response vector is binary, it should be 'logistic' 38 | 39 | n_splits : int, optional 40 | number of cross-validation folds 41 | 42 | solver : str, optional 43 | solver used by sklearn function LogisticRegressionCV 44 | 45 | n_regu : int, optional 46 | number of regulation used in the regression problem 47 | 48 | return_coef : bool, optional 49 | return regression coefficient if set to True 50 | 51 | Returns 52 | ------- 53 | test_score : 1D ndarray (n_features, ) 54 | vector of test statistic 55 | 56 | coef: 1D ndarray (n_features * 2, ) 57 | coefficients of the estimation problem 58 | """ 59 | 60 | n_features = X.shape[1] 61 | X_ko = np.column_stack([X, X_tilde]) 62 | lambda_max = np.max(np.dot(X_ko.T, y)) / (2 * n_features) 63 | lambdas = np.linspace( 64 | lambda_max*np.exp(-n_lambdas), lambda_max, n_lambdas) 65 | 66 | cv = KFold(n_splits=5, shuffle=True, random_state=seed) 67 | 68 | estimator = { 69 | 'lasso_cv': LassoCV(alphas=lambdas, n_jobs=n_jobs, 70 | verbose=joblib_verbose, max_iter=1e4, cv=cv), 71 | 'logistic_l1': LogisticRegressionCV( 72 | penalty='l1', max_iter=1e4, 73 | solver=solver, cv=cv, 74 | n_jobs=n_jobs, tol=1e-8), 75 | 'logistic_l2': LogisticRegressionCV( 76 | penalty='l2', max_iter=1e4, n_jobs=n_jobs, 77 | verbose=joblib_verbose, cv=cv, tol=1e-8), 78 | } 79 | 80 | try: 81 | clf = estimator[method] 82 | except KeyError: 83 | print('{} is not a valid estimator'.format(method)) 84 | 85 | clf.fit(X_ko, y) 86 | 87 | try: 88 | coef = np.ravel(clf.coef_) 89 | except AttributeError: 90 | coef = np.ravel(clf.best_estimator_.coef_) # for GridSearchCV object 91 | 92 | test_score = np.abs(coef[:n_features]) - np.abs(coef[n_features:]) 93 | 94 | if return_coef: 95 | return test_score, coef 96 | 97 | return test_score 98 | 99 | 100 | def _coef_diff_threshold(test_score, fdr=0.1, offset=1): 101 | """Calculate the knockoff threshold based on the procedure stated in the 102 | article. 
103 | 104 | Parameters 105 | ---------- 106 | test_score : 1D ndarray, shape (n_features, ) 107 | vector of test statistic 108 | 109 | fdr : float, optional 110 | desired controlled FDR level 111 | 112 | offset : int, 0 or 1, optional 113 | offset equals 1 is the knockoff+ procedure 114 | 115 | Returns 116 | ------- 117 | thres : float or np.inf 118 | threshold level 119 | """ 120 | if offset not in (0, 1): 121 | raise ValueError("'offset' must be either 0 or 1") 122 | 123 | t_mesh = np.sort(np.abs(test_score[test_score != 0])) 124 | for t in t_mesh: 125 | false_pos = np.sum(test_score <= -t) 126 | selected = np.sum(test_score >= t) 127 | if (offset + false_pos) / np.maximum(selected, 1) <= fdr: 128 | return t 129 | 130 | return np.inf 131 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: Binh Nguyen & Jerome-Alexis Chevalier 3 | import numpy as np 4 | 5 | 6 | def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False): 7 | if adaptive: 8 | return _adaptive_quantile_aggregation(pvals, gamma_min) 9 | else: 10 | return _fixed_quantile_aggregation(pvals, gamma) 11 | 12 | 13 | def fdr_threshold(pvals, fdr=0.1, method='bhq', reshaping_function=None): 14 | if method == 'bhq': 15 | return _bhq_threshold(pvals, fdr=fdr) 16 | elif method == 'bhy': 17 | return _bhy_threshold( 18 | pvals, fdr=fdr, reshaping_function=reshaping_function) 19 | else: 20 | raise ValueError( 21 | '{} is not support FDR control method'.format(method)) 22 | 23 | 24 | def cal_fdp_power(selected, non_zero_index, r_index=False): 25 | """ Calculate power and False Discovery Proportion 26 | 27 | Parameters 28 | ---------- 29 | selected: list index (in R format) of selected non-null variables 30 | non_zero_index: true index of non-null variables 31 | r_index : True if the index is taken from rpy2 inference 32 | 33 | Returns 34 | ------- 35 | fdp: False Discoveries Proportion 36 | power: percentage of correctly selected variables over total number of 37 | non-null variables 38 | 39 | """ 40 | # selected is the index list in R and will be different from index of 41 | # python by 1 unit 42 | 43 | if selected.size == 0: 44 | return 0.0, 0.0 45 | 46 | if r_index: 47 | selected = selected - 1 48 | 49 | true_positive = [i for i in selected if i in non_zero_index] 50 | false_positive = [i for i in selected if i not in non_zero_index] 51 | fdp = len(false_positive) / max(1, len(selected)) 52 | power = len(true_positive) / len(non_zero_index) 53 | 54 | return fdp, power 55 | 56 | 57 | def _bhq_threshold(pvals, fdr=0.1): 58 | """Standard Benjamini-Hochberg for controlling False discovery rate 59 | """ 60 | n_features = len(pvals) 61 | pvals_sorted = np.sort(pvals) 62 | selected_index = 2 * n_features 63 | for i in range(n_features - 1, -1, -1): 64 | if pvals_sorted[i] <= fdr * (i + 1) / n_features: 65 | selected_index = i 66 | break 67 | if selected_index <= n_features: 68 | return pvals_sorted[selected_index] 69 | else: 70 | return -1.0 71 | 72 | 73 | def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1): 74 | """Benjamini-Hochberg-Yekutieli procedure for controlling FDR, with input 75 | shape function. 
Reference: Ramdas et al (2017) 76 | """ 77 | n_features = len(pvals) 78 | pvals_sorted = np.sort(pvals) 79 | selected_index = 2 * n_features 80 | # Default value for reshaping function -- defined in 81 | # Benjamini & Yekutieli (2001) 82 | if reshaping_function is None: 83 | temp = np.arange(n_features) 84 | sum_inverse = np.sum(1 / (temp + 1)) 85 | return _bhq_threshold(pvals, fdr / sum_inverse) 86 | else: 87 | for i in range(n_features - 1, -1, -1): 88 | if pvals_sorted[i] <= fdr * reshaping_function(i + 1) / n_features: 89 | selected_index = i 90 | break 91 | if selected_index <= n_features: 92 | return pvals_sorted[selected_index] 93 | else: 94 | return -1.0 95 | 96 | 97 | def _fixed_quantile_aggregation(pvals, gamma=0.5): 98 | """Quantile aggregation function based on Meinshausen et al (2008) 99 | 100 | Parameters 101 | ---------- 102 | pvals : 2D ndarray (n_bootstrap, n_test) 103 | p-value (adjusted) 104 | 105 | gamma : float 106 | Percentile value used for aggregation. 107 | 108 | Returns 109 | ------- 110 | 1D ndarray (n_tests, ) 111 | Vector of aggregated p-value 112 | """ 113 | converted_score = (1 / gamma) * ( 114 | np.percentile(pvals, q=100*gamma, axis=0)) 115 | 116 | return np.minimum(1, converted_score) 117 | 118 | 119 | def _adaptive_quantile_aggregation(pvals, gamma_min=0.05): 120 | """adaptive version of the quantile aggregation method, Meinshausen et al. 121 | (2008)""" 122 | gammas = np.arange(gamma_min, 1.05, 0.05) 123 | list_Q = np.array([ 124 | _fixed_quantile_aggregation(pvals, gamma) for gamma in gammas]) 125 | 126 | return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0)) 127 | -------------------------------------------------------------------------------- /hidimstat/test/test_ensemble_clustered_inference.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the ensemble_clustered_inference module 3 | """ 4 | 5 | import numpy as np 6 | from sklearn.cluster import FeatureAgglomeration 7 | from sklearn.feature_extraction import image 8 | from numpy.testing import assert_almost_equal 9 | 10 | from hidimstat.scenario import multivariate_1D_simulation 11 | from hidimstat.scenario import multivariate_temporal_simulation 12 | from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference 13 | 14 | 15 | def test_ensemble_clustered_inference(): 16 | '''Testing the procedure on a simulation with a 1D data structure 17 | and with n << p: the first test has no temporal dimension, the second has a 18 | temporal dimension. The support is connected and of size 10, it must be 19 | recovered with a small spatial tolerance parametrized by `margin_size`. 
20 | Computing one sided p-values, we want low p-values for the features of 21 | the support and p-values close to 0.5 for the others.''' 22 | 23 | # Scenario 1: data with no temporal dimension 24 | # ########################################### 25 | inference_method = 'desparsified-lasso' 26 | n_samples, n_features = 100, 2000 27 | support_size = 10 28 | sigma = 5.0 29 | rho = 0.95 30 | 31 | X_init, y, beta, epsilon = \ 32 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 33 | support_size=support_size, sigma=sigma, 34 | rho=rho, shuffle=False, seed=0) 35 | 36 | margin_size = 5 37 | n_clusters = 200 38 | n_bootstraps = 3 39 | 40 | y = y - np.mean(y) 41 | X_init = X_init - np.mean(X_init, axis=0) 42 | 43 | connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) 44 | ward = FeatureAgglomeration(n_clusters=n_clusters, 45 | connectivity=connectivity, 46 | linkage='ward') 47 | 48 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 49 | ensemble_clustered_inference(X_init, y, ward, n_clusters, 50 | n_bootstraps=n_bootstraps, 51 | inference_method=inference_method) 52 | 53 | expected = 0.5 * np.ones(n_features) 54 | expected[:support_size] = 0.0 55 | 56 | assert_almost_equal(pval_corr[:support_size-margin_size], 57 | expected[:support_size-margin_size]) 58 | assert_almost_equal(pval_corr[support_size+margin_size:], 59 | expected[support_size+margin_size:], 60 | decimal=1) 61 | 62 | # Scenario 2: temporal data 63 | # ######################### 64 | inference_method = 'desparsified-group-lasso' 65 | n_samples, n_features, n_times = 200, 2000, 10 66 | support_size = 10 67 | sigma = 5.0 68 | rho_noise = 0.9 69 | rho_data = 0.9 70 | n_clusters = 200 71 | margin_size = 5 72 | interior_support = support_size - margin_size 73 | extended_support = support_size + margin_size 74 | n_bootstraps = 4 75 | 76 | X, Y, beta, noise = \ 77 | multivariate_temporal_simulation(n_samples=n_samples, 78 | n_features=n_features, 79 | n_times=n_times, 80 | support_size=support_size, 81 | sigma=sigma, 82 | rho_noise=rho_noise, 83 | rho_data=rho_data, 84 | shuffle=False) 85 | 86 | connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) 87 | ward = FeatureAgglomeration(n_clusters=n_clusters, 88 | connectivity=connectivity, 89 | linkage='ward') 90 | 91 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 92 | ensemble_clustered_inference(X, Y, ward, n_clusters, 93 | n_bootstraps=n_bootstraps, 94 | inference_method=inference_method) 95 | 96 | expected = 0.5 * np.ones(n_features) 97 | expected[:support_size] = 0.0 98 | 99 | assert_almost_equal(pval_corr[:interior_support], 100 | expected[:interior_support], 101 | decimal=3) 102 | assert_almost_equal(pval_corr[extended_support:], 103 | expected[extended_support:], 104 | decimal=1) 105 | -------------------------------------------------------------------------------- /hidimstat/test/test_noise_std.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the noise_std module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal 7 | from scipy.linalg import toeplitz 8 | 9 | from hidimstat.scenario import multivariate_1D_simulation 10 | from hidimstat.scenario import multivariate_temporal_simulation 11 | from hidimstat.noise_std import reid, group_reid, empirical_snr 12 | 13 | 14 | def test_reid(): 15 | '''Estimating noise standard deviation in two scenarios. 16 | First scenario: no structure and a support of size 2. 
17 | Second scenario: no structure and an empty support.''' 18 | 19 | n_samples, n_features = 50, 30 20 | sigma = 2.0 21 | 22 | # First expe 23 | # ########## 24 | support_size = 2 25 | 26 | X, y, beta, noise = \ 27 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 28 | support_size=support_size, sigma=sigma, 29 | seed=0) 30 | 31 | # max_iter=1 to get a better coverage 32 | sigma_hat, _ = reid(X, y, tol=1e-3, max_iter=1) 33 | expected = sigma 34 | 35 | assert_almost_equal(sigma_hat / expected, 1.0, decimal=0) 36 | 37 | # Second expe 38 | # ########### 39 | support_size = 0 40 | 41 | X, y, beta, noise = \ 42 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 43 | support_size=support_size, sigma=sigma, 44 | seed=1) 45 | 46 | sigma_hat, _ = reid(X, y) 47 | expected = sigma 48 | 49 | assert_almost_equal(sigma_hat / expected, 1.0, decimal=1) 50 | 51 | 52 | def test_group_reid(): 53 | '''Estimating (temporal) noise covariance matrix in two scenarios. 54 | First scenario: no data structure and a support of size 2. 55 | Second scenario: no data structure and an empty support.''' 56 | 57 | n_samples = 30 58 | n_features = 50 59 | n_times = 10 60 | sigma = 1.0 61 | rho = 0.9 62 | corr = toeplitz(np.geomspace(1, rho ** (n_times - 1), n_times)) 63 | cov = np.outer(sigma, sigma) * corr 64 | 65 | # First expe 66 | # ########## 67 | support_size = 2 68 | 69 | X, Y, beta, noise = \ 70 | multivariate_temporal_simulation(n_samples=n_samples, 71 | n_features=n_features, 72 | n_times=n_times, 73 | support_size=support_size, 74 | sigma=sigma, 75 | rho_noise=rho) 76 | 77 | # max_iter=1 to get a better coverage 78 | cov_hat, _ = group_reid(X, Y, tol=1e-3, max_iter=1) 79 | error_ratio = cov_hat / cov 80 | 81 | assert_almost_equal(np.max(error_ratio), 1.0, decimal=0) 82 | assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=1) 83 | 84 | cov_hat, _ = group_reid(X, Y, method='AR') 85 | error_ratio = cov_hat / cov 86 | 87 | assert_almost_equal(np.max(error_ratio), 1.0, decimal=0) 88 | assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=0) 89 | 90 | # Second expe 91 | # ########### 92 | support_size = 0 93 | 94 | X, Y, beta, noise = \ 95 | multivariate_temporal_simulation(n_samples=n_samples, 96 | n_features=n_features, 97 | n_times=n_times, 98 | support_size=support_size, 99 | sigma=sigma, 100 | rho_noise=rho, 101 | seed=1) 102 | 103 | cov_hat, _ = group_reid(X, Y) 104 | error_ratio = cov_hat / cov 105 | 106 | assert_almost_equal(np.max(error_ratio), 1.0, decimal=0) 107 | assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=1) 108 | 109 | cov_hat, _ = group_reid(X, Y, fit_Y=False, stationary=False) 110 | error_ratio = cov_hat / cov 111 | 112 | assert_almost_equal(np.max(error_ratio), 1.0, decimal=0) 113 | assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=0) 114 | 115 | cov_hat, _ = group_reid(X, Y, method='AR') 116 | error_ratio = cov_hat / cov 117 | 118 | assert_almost_equal(np.max(error_ratio), 1.0, decimal=0) 119 | assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=1) 120 | 121 | 122 | def test_empirical_snr(): 123 | '''Computing empirical signal to noise ratio from the target `y`, 124 | the data `X` and the true parameter vector `beta` in a simple 125 | scenario with a 1D data structure.''' 126 | 127 | n_samples, n_features = 30, 30 128 | support_size = 10 129 | sigma = 2.0 130 | 131 | X, y, beta, noise = \ 132 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 133 | support_size=support_size, 
sigma=sigma, 134 | seed=0) 135 | 136 | snr = empirical_snr(X, y, beta) 137 | expected = 2.0 138 | 139 | assert_almost_equal(snr, expected, decimal=0) 140 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/gaussian_knockoff.py: -------------------------------------------------------------------------------- 1 | """Generation of model-x knockoff following equi-correlated method or 2 | optimization scheme following Barber et al. (2015). Requires cvxopt. 3 | """ 4 | 5 | import warnings 6 | 7 | import numpy as np 8 | from sklearn.covariance import (GraphicalLassoCV, empirical_covariance, 9 | ledoit_wolf) 10 | from sklearn.utils.validation import check_memory 11 | 12 | 13 | def gaussian_knockoff_generation(X, mu, Sigma, method='equi', memory=None, 14 | seed=None): 15 | """Generate second-order knockoff variables using equi-correlated method. 16 | Reference: Candes et al. (2016), Barber et al. (2015) 17 | 18 | Parameters 19 | ---------- 20 | X: 2D ndarray (n_samples, n_features) 21 | original design matrix 22 | 23 | mu : 1D ndarray (n_features, ) 24 | vector of empirical mean values 25 | 26 | method: str 27 | method to generate gaussian knockoff 28 | 29 | Sigma : 2D ndarray (n_samples, n_features) 30 | empirical covariance matrix 31 | 32 | Returns 33 | ------- 34 | X_tilde : 2D ndarray (n_samples, n_features) 35 | knockoff design matrix 36 | """ 37 | memory = check_memory(memory) 38 | 39 | n_samples, n_features = X.shape 40 | if method == 'equi': 41 | Diag_s = np.diag(_s_equi(Sigma)) 42 | else: 43 | raise ValueError('{} is not a valid knockoff ' 44 | 'contriction method'.format(method)) 45 | 46 | Sigma_inv_s = np.linalg.solve(Sigma, Diag_s) 47 | 48 | # First part on the RHS of equation 1.4 in Barber & Candes (2015) 49 | Mu_tilde = X - np.dot(X - mu, Sigma_inv_s) 50 | # To calculate the Cholesky decomposition later on 51 | Sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s.dot(Diag_s)) 52 | while not _is_posdef(Sigma_tilde): 53 | Sigma_tilde += 1e-10 * np.eye(n_features) 54 | warnings.warn( 55 | 'The conditional covariance matrix for knockoffs is not positive ' 56 | 'definite. 
Adding minor positive value to the matrix.') 57 | 58 | rng = np.random.RandomState(seed) 59 | U_tilde = rng.randn(n_samples, n_features) 60 | # Equation 1.4 in Barber & Candes (2015) 61 | X_tilde = Mu_tilde + np.dot(U_tilde, np.linalg.cholesky(Sigma_tilde)) 62 | 63 | return X_tilde 64 | 65 | 66 | def _is_posdef(X, tol=1e-14): 67 | """Check a matrix is positive definite by calculating eigenvalue of the 68 | matrix 69 | 70 | Parameters 71 | ---------- 72 | X : 2D ndarray, shape (n_samples x n_features) 73 | Matrix to check 74 | 75 | tol : float, optional 76 | minimum threshold for eigenvalue 77 | 78 | Returns 79 | ------- 80 | True or False 81 | """ 82 | eig_value = np.linalg.eigvalsh(X) 83 | return np.all(eig_value > tol) 84 | 85 | 86 | def _cov_to_corr(Sigma): 87 | """Convert covariance matrix to correlation matrix 88 | 89 | Parameters 90 | ---------- 91 | Sigma : 2D ndarray (n_features, n_features) 92 | Covariance matrix 93 | 94 | Returns 95 | ------- 96 | Corr_matrix : 2D ndarray (n_features, n_features) 97 | Transformed correlation matrix 98 | """ 99 | 100 | features_std = np.sqrt(np.diag(Sigma)) 101 | Scale = np.outer(features_std, features_std) 102 | 103 | Corr_matrix = Sigma / Scale 104 | 105 | return Corr_matrix 106 | 107 | 108 | def _estimate_distribution(X, shrink=False, cov_estimator='ledoit_wolf'): 109 | 110 | alphas = [1e-3, 1e-2, 1e-1, 1] 111 | 112 | mu = X.mean(axis=0) 113 | Sigma = empirical_covariance(X) 114 | 115 | if shrink or not _is_posdef(Sigma): 116 | 117 | if cov_estimator == 'ledoit_wolf': 118 | Sigma_shrink = ledoit_wolf(X, assume_centered=True)[0] 119 | 120 | elif cov_estimator == 'graph_lasso': 121 | model = GraphicalLassoCV(alphas=alphas) 122 | Sigma_shrink = model.fit(X).covariance_ 123 | 124 | else: 125 | raise ValueError('{} is not a valid covariance estimated method' 126 | .format(cov_estimator)) 127 | 128 | return mu, Sigma_shrink 129 | 130 | return mu, Sigma 131 | 132 | 133 | def _s_equi(Sigma): 134 | """Estimate diagonal matrix of correlation between real and knockoff 135 | variables using equi-correlated equation 136 | 137 | Parameters 138 | ---------- 139 | Sigma : 2D ndarray (n_features, n_features) 140 | empirical covariance matrix calculated from original design matrix 141 | 142 | Returns 143 | ------- 144 | 1D ndarray (n_features, ) 145 | vector of diagonal values of estimated matrix diag{s} 146 | """ 147 | n_features = Sigma.shape[0] 148 | 149 | G = _cov_to_corr(Sigma) 150 | eig_value = np.linalg.eigvalsh(G) 151 | lambda_min = np.min(eig_value[0]) 152 | S = np.ones(n_features) * min(2 * lambda_min, 1) 153 | 154 | psd = False 155 | s_eps = 0 156 | 157 | while psd is False: 158 | # if all eigval > 0 then the matrix is psd 159 | psd = _is_posdef(2 * G - np.diag(S * (1 - s_eps))) 160 | if not psd: 161 | if s_eps == 0: 162 | s_eps = 1e-08 163 | else: 164 | s_eps *= 10 165 | 166 | S = S * (1 - s_eps) 167 | 168 | return S * np.diag(Sigma) 169 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | GITHUB_PAGES_BRANCH = gh-pages 11 | OUTPUTDIR = _build/html 12 | 13 | # User-friendly check for sphinx-build 14 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 15 | $(error The '$(SPHINXBUILD)' command was not found. 
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 16 | endif 17 | 18 | # Internal variables. 19 | PAPEROPT_a4 = -D latex_paper_size=a4 20 | PAPEROPT_letter = -D latex_paper_size=letter 21 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 22 | # the i18n builder cannot share the environment and doctrees with the others 23 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 24 | 25 | .PHONY: help 26 | help: 27 | @echo "Please use \`make ' where is one of" 28 | @echo " html-noplot to make standalone HTML files, without plotting anything" 29 | @echo " html to make standalone HTML files" 30 | @echo " dirhtml to make HTML files named index.html in directories" 31 | @echo " singlehtml to make a single large HTML file" 32 | @echo " pickle to make pickle files" 33 | @echo " htmlhelp to make HTML files and a HTML help project" 34 | @echo " qthelp to make HTML files and a qthelp project" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | @echo " coverage to run coverage check of the documentation (if enabled)" 41 | @echo " install to make the html and push it online" 42 | 43 | .PHONY: clean 44 | 45 | clean: 46 | rm -rf $(BUILDDIR)/* 47 | rm -rf auto_examples/ 48 | rm -rf generated/* 49 | rm -rf modules/* 50 | 51 | html-noplot: 52 | $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 53 | @echo 54 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 55 | 56 | .PHONY: html 57 | html: 58 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 61 | 62 | .PHONY: dirhtml 63 | dirhtml: 64 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 65 | @echo 66 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 67 | 68 | .PHONY: singlehtml 69 | singlehtml: 70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 71 | @echo 72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 73 | 74 | .PHONY: pickle 75 | pickle: 76 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 77 | @echo 78 | @echo "Build finished; now you can process the pickle files." 79 | 80 | .PHONY: htmlhelp 81 | htmlhelp: 82 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 83 | @echo 84 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 85 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
86 | 87 | .PHONY: qthelp 88 | qthelp: 89 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 90 | @echo 91 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 92 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 93 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/hidimstat.qhcp" 94 | @echo "To view the help file:" 95 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/hidimstat.qhc" 96 | 97 | .PHONY: latex 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | .PHONY: latexpdf 106 | latexpdf: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo "Running LaTeX files through pdflatex..." 109 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 110 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 111 | 112 | .PHONY: changes 113 | changes: 114 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 115 | @echo 116 | @echo "The overview file is in $(BUILDDIR)/changes." 117 | 118 | .PHONY: linkcheck 119 | linkcheck: 120 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 121 | @echo 122 | @echo "Link check complete; look for any errors in the above output " \ 123 | "or in $(BUILDDIR)/linkcheck/output.txt." 124 | 125 | .PHONY: doctest 126 | doctest: 127 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 128 | @echo "Testing of doctests in the sources finished, look at the " \ 129 | "results in $(BUILDDIR)/doctest/output.txt." 130 | 131 | .PHONY: coverage 132 | coverage: 133 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 134 | @echo "Testing of coverage in the sources finished, look at the " \ 135 | "results in $(BUILDDIR)/coverage/python.txt." 136 | 137 | install: 138 | touch $(OUTPUTDIR)/.nojekyll 139 | ghp-import -m "Generate Pelican site [ci skip]" -b $(GITHUB_PAGES_BRANCH) $(OUTPUTDIR) 140 | git push -f origin $(GITHUB_PAGES_BRANCH) 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HiDimStat: High-dimensional statistical inference tool for Python 2 | [![build][TravisCI]][travis] [![coverage][CodeCov]][cov] 3 | 4 | The HiDimStat package provides statistical inference methods to solve the 5 | problem of support recovery in the context of high-dimensional and 6 | spatially structured data. 7 | 8 | **Update on Oct 2024: this repository is no longer maintained.** Please refer to the new up-to-date github repo at https://github.com/mind-INRIA/hidimstat. 9 | 10 | 11 | ## Installation 12 | 13 | HiDimStat working only with Python 3, ideally Python 3.6+. For installation, 14 | run the following from terminal 15 | 16 | ```bash 17 | pip install hidimstat 18 | ``` 19 | 20 | Or if you want the latest version available (for example to contribute to 21 | the development of this project: 22 | 23 | ``` 24 | pip install -U git+https://github.com/ja-che/hidimstat.git 25 | ``` 26 | 27 | or 28 | 29 | ```bash 30 | git clone https://github.com/ja-che/hidimstat.git 31 | cd hidimstat 32 | pip install -e . 
33 | ``` 34 | 35 | ## Dependencies 36 | 37 | ``` 38 | joblib 39 | numpy 40 | scipy 41 | scikit-learn 42 | ``` 43 | 44 | To run examples it is neccessary to install `matplotlib`, and to run tests it 45 | is also needed to install `pytest`. 46 | 47 | ## Documentation & Examples 48 | 49 | All the documentation of HiDimStat is available at https://ja-che.github.io/hidimstat/. 50 | 51 | As of now in the `examples` folder there are three Python scripts that 52 | illustrate how to use the main HiDimStat functions. 53 | In each script we handle a different kind of dataset: 54 | ``plot_2D_simulation_example.py`` handles a simulated dataset with a 2D 55 | spatial structure, 56 | ``plot_fmri_data_example.py`` solves the decoding problem on Haxby fMRI dataset, 57 | ``plot_meg_data_example.py`` tackles the source localization problem on several 58 | MEG/EEG datasets. 59 | 60 | 61 | ```bash 62 | # For example run the following command in terminal 63 | python plot_2D_simulation_example.py 64 | ``` 65 | 66 | ## References 67 | 68 | The algorithms developed in this package have been detailed in several 69 | conference/journal articles that can be downloaded at 70 | https://ja-che.github.io/research.html. 71 | 72 | #### Main references: 73 | 74 | Ensemble of Clustered desparsified Lasso (ECDL): 75 | 76 | * Chevalier, J. A., Salmon, J., & Thirion, B. (2018). __Statistical inference 77 | with ensemble of clustered desparsified lasso__. In _International Conference 78 | on Medical Image Computing and Computer-Assisted Intervention_ 79 | (pp. 638-646). Springer, Cham. 80 | 81 | * Chevalier, J. A., Nguyen, T. B., Thirion, B., & Salmon, J. (2021). __Spatially relaxed inference on high-dimensional linear models__. arXiv preprint arXiv:2106.02590. 82 | 83 | Aggregation of multiple Knockoffs (AKO): 84 | 85 | * Nguyen T.-B., Chevalier J.-A., Thirion B., & Arlot S. (2020). __Aggregation 86 | of Multiple Knockoffs__. In _Proceedings of the 37th International Conference on 87 | Machine Learning_, Vienna, Austria, PMLR 119. 88 | 89 | Application to decoding (fMRI data): 90 | 91 | * Chevalier, J. A., Nguyen T.-B., Salmon, J., Varoquaux, G. & Thirion, B. (2021). __Decoding with confidence: Statistical control on decoder maps__. In _NeuroImage_, 234, 117921. 92 | 93 | Application to source localization (MEG/EEG data): 94 | 95 | * Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020). __Statistical control for spatio-temporal MEG/EEG source imaging with desparsified multi-task Lasso__. In _Proceedings of the 34th Conference on Neural Information Processing Systems (NeurIPS 2020)_, Vancouver, Canada. 96 | 97 | If you use our packages, we would appreciate citations to the relevant aforementioned papers. 98 | 99 | #### Other useful references: 100 | 101 | For de-sparsified(or de-biased) Lasso: 102 | 103 | * Javanmard, A., & Montanari, A. (2014). __Confidence intervals and hypothesis 104 | testing for high-dimensional regression__. _The Journal of Machine Learning 105 | Research_, 15(1), 2869-2909. 106 | 107 | * Zhang, C. H., & Zhang, S. S. (2014). __Confidence intervals for low dimensional 108 | parameters in high dimensional linear models__. _Journal of the Royal 109 | Statistical Society: Series B: Statistical Methodology_, 217-242. 110 | 111 | * Van de Geer, S., Bühlmann, P., Ritov, Y. A., & Dezeure, R. (2014). __On 112 | asymptotically optimal confidence regions and tests for high-dimensional 113 | models__. _The Annals of Statistics_, 42(3), 1166-1202. 
114 | 115 | For Knockoffs Inference: 116 | 117 | * Barber, R. F; Candès, E. J. (2015). __Controlling the false discovery rate 118 | via knockoffs__. _Annals of Statistics_. 43 , no. 5, 119 | 2055--2085. doi:10.1214/15-AOS1337. https://projecteuclid.org/euclid.aos/1438606853 120 | 121 | * Candès, E., Fan, Y., Janson, L., & Lv, J. (2018). __Panning for gold: Model-X 122 | knockoffs for high dimensional controlled variable selection__. _Journal of the 123 | Royal Statistical Society Series B_, 80(3), 551-577. 124 | 125 | ## License 126 | 127 | This project is licensed under the BSD 2-Clause License. 128 | 129 | ## Acknowledgments 130 | 131 | This project has been funded by Labex DigiCosme (ANR-11-LABEX-0045-DIGICOSME) 132 | as part of the program "Investissement d’Avenir" (ANR-11-IDEX-0003-02), by the 133 | Fast Big project (ANR-17-CE23-0011) and the KARAIB AI Chair 134 | (ANR-20-CHIA-0025-01). This study has also been supported by the European 135 | Union’s Horizon 2020 research and innovation program 136 | (Grant Agreement No. 945539, Human Brain Project SGA3). 137 | 138 | 139 | [TravisCI]: https://travis-ci.com/ja-che/hidimstat.svg?branch=main "travisCI status" 140 | [travis]: https://travis-ci.com/ja-che/hidimstat 141 | 142 | [CodeCov]: https://codecov.io/gh/ja-che/hidimstat/branch/main/graph/badge.svg "CodeCov status" 143 | [cov]: https://codecov.io/gh/ja-che/hidimstat 144 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. hidimstat documentation master file, created by 2 | sphinx-quickstart on Fri April 23 12:22:52 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | HiDimStat: High-dimensional statistical inference tool for Python 7 | ================================================================= 8 | |Build Status| |codecov| 9 | 10 | The HiDimStat package provides statistical inference methods to solve the 11 | problem of support recovery in the context of high-dimensional and 12 | spatially structured data. 13 | 14 | 15 | Installation 16 | ------------ 17 | 18 | HiDimStat working only with Python 3, ideally Python 3.6+. For installation, 19 | run the following from terminal:: 20 | 21 | pip install hidimstat 22 | 23 | Or if you want the latest version available (for example to contribute to 24 | the development of this project):: 25 | 26 | git clone https://github.com/ja-che/hidimstat.git 27 | cd hidimstat 28 | pip install -e . 29 | 30 | 31 | Dependencies 32 | ------------ 33 | 34 | HiDimStat depends on the following packages:: 35 | 36 | joblib 37 | numpy 38 | scipy 39 | scikit-learn 40 | 41 | 42 | To run examples it is neccessary to install ``matplotlib``, and to run tests it 43 | is also needed to install ``pytest``. 44 | 45 | 46 | Documentation & Examples 47 | ------------------------ 48 | 49 | Documentation about the main HiDimStat functions is available 50 | `here `_ and examples are available `here `_. 51 | 52 | As of now, there are three different examples (Python scripts) that 53 | illustrate how to use the main HiDimStat functions. 
54 | In each example we handle a different kind of dataset: 55 | ``plot_2D_simulation_example.py`` handles a simulated dataset with a 2D 56 | spatial structure, 57 | ``plot_fmri_data_example.py`` solves the decoding problem on Haxby fMRI dataset, 58 | ``plot_meg_data_example.py`` tackles the source localization problem on several 59 | MEG/EEG datasets. 60 | 61 | .. code-block:: 62 | 63 | # For example run the following command in terminal 64 | python plot_2D_simulation_example.py 65 | 66 | 67 | Build the documentation 68 | ----------------------- 69 | 70 | To build the documentation you will need to run: 71 | 72 | .. code-block:: 73 | 74 | pip install -U sphinx_gallery sphinx_bootstrap_theme 75 | cd doc 76 | make html 77 | 78 | 79 | References 80 | ---------- 81 | 82 | The algorithms developed in this package have been detailed in several 83 | conference/journal articles that can be downloaded at 84 | `https://ja-che.github.io/ `_. 85 | 86 | Main references 87 | ~~~~~~~~~~~~~~~ 88 | 89 | Ensemble of Clustered desparsified Lasso (ECDL): 90 | 91 | * Chevalier, J. A., Salmon, J., & Thirion, B. (2018). Statistical inference 92 | with ensemble of clustered desparsified lasso. In International Conference 93 | on Medical Image Computing and Computer-Assisted Intervention 94 | (pp. 638-646). Springer, Cham. 95 | 96 | * Chevalier, J. A., Nguyen, T. B., Thirion, B., & Salmon, J. (2021). 97 | Spatially relaxed inference on high-dimensional linear models. 98 | arXiv preprint arXiv:2106.02590. 99 | 100 | Aggregation of multiple Knockoffs (AKO): 101 | 102 | * Nguyen T.-B., Chevalier J.-A., Thirion B., & Arlot S. (2020). Aggregation 103 | of Multiple Knockoffs. In Proceedings of the 37th International Conference on 104 | Machine Learning, Vienna, Austria, PMLR 119. 105 | 106 | Application to decoding (fMRI data): 107 | 108 | * Chevalier, J. A., Nguyen T.-B., Salmon, J., Varoquaux, G. & Thirion, B. 109 | (2021). Decoding with confidence: Statistical control on decoder maps. 110 | In NeuroImage, 234, 117921. 111 | 112 | Application to source localization (MEG/EEG data): 113 | 114 | * Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020). 115 | Statistical control for spatio-temporal MEG/EEG source imaging with 116 | desparsified multi-task Lasso. In Proceedings of the 34th Conference on 117 | Neural Information Processing Systems (NeurIPS 2020), Vancouver, Canada. 118 | 119 | If you use our packages, we would appreciate citations to the relevant 120 | aforementioned papers. 121 | 122 | Other useful references 123 | ~~~~~~~~~~~~~~~~~~~~~~~ 124 | 125 | For de-sparsified(or de-biased) Lasso: 126 | 127 | * Javanmard, A., & Montanari, A. (2014). Confidence intervals and hypothesis 128 | testing for high-dimensional regression. The Journal of Machine Learning 129 | Research, 15(1), 2869-2909. 130 | 131 | * Zhang, C. H., & Zhang, S. S. (2014). Confidence intervals for low dimensional 132 | parameters in high dimensional linear models. Journal of the Royal 133 | Statistical Society: Series B: Statistical Methodology, 217-242. 134 | 135 | * Van de Geer, S., Bühlmann, P., Ritov, Y. A., & Dezeure, R. (2014). On 136 | asymptotically optimal confidence regions and tests for high-dimensional 137 | models. The Annals of Statistics, 42(3), 1166-1202. 138 | 139 | For Knockoffs Inference: 140 | 141 | * Barber, R. F; Candès, E. J. (2015). Controlling the false discovery rate 142 | via knockoffs. Annals of Statistics. 43 , no. 5, 143 | 2055--2085. doi:10.1214/15-AOS1337. 
https://projecteuclid.org/euclid.aos/1438606853 144 | 145 | * Candès, E., Fan, Y., Janson, L., & Lv, J. (2018). Panning for gold: Model-X 146 | knockoffs for high dimensional controlled variable selection. Journal of the 147 | Royal Statistical Society Series B, 80(3), 551-577. 148 | 149 | .. |Build Status| image:: https://travis-ci.com/ja-che/hidimstat.svg?branch=main 150 | :target: https://codecov.io/gh/ja-che/hidimstat 151 | 152 | .. |codecov| image:: https://codecov.io/gh/ja-che/hidimstat/branch/main/graph/badge.svg 153 | :target: https://codecov.io/gh/ja-che/hidimstat 154 | 155 | 156 | API 157 | --- 158 | 159 | .. toctree:: 160 | :maxdepth: 1 161 | 162 | api.rst 163 | -------------------------------------------------------------------------------- /hidimstat/test/test_scenario.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the scenario module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal, assert_equal 7 | 8 | from hidimstat.scenario import multivariate_1D_simulation 9 | from hidimstat.scenario import multivariate_simulation 10 | from hidimstat.scenario import multivariate_temporal_simulation 11 | 12 | ROI_SIZE_2D = 2 13 | SHAPE_2D = (12, 12) 14 | 15 | ROI_SIZE_3D = 2 16 | SHAPE_3D = (12, 12, 12) 17 | 18 | 19 | def test_multivariate_1D_simulation(): 20 | '''Test if the data has expected shape, if the input parameters 21 | are close to their empirical estimators, if the support size is 22 | correct and if the noise model is the generative model. The 23 | first test concerns a simulation with a 1D spatial structure, 24 | the second test concerns a simulation with a random structure''' 25 | 26 | n_samples = 100 27 | n_features = 500 28 | support_size = 10 29 | rho = 0.7 30 | sigma = 1.0 31 | 32 | # Test 1 33 | X, y, beta, noise = \ 34 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 35 | support_size=support_size, sigma=sigma, 36 | rho=rho, shuffle=False, seed=0) 37 | 38 | sigma_hat = np.std(noise) 39 | rho_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] 40 | 41 | assert_almost_equal(sigma_hat, sigma, decimal=1) 42 | assert_almost_equal(rho_hat, rho, decimal=1) 43 | assert_equal(X.shape, (n_samples, n_features)) 44 | assert_equal(np.count_nonzero(beta), support_size) 45 | assert_equal(y, np.dot(X, beta) + noise) 46 | 47 | # Test 2 48 | X, y, beta, noise = \ 49 | multivariate_1D_simulation() 50 | rho_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] 51 | assert_almost_equal(rho_hat, 0, decimal=1) 52 | 53 | 54 | def test_multivariate_simulation(): 55 | '''Test if the data has expected shape, if the input parameters 56 | are close to their empirical estimators, if the support has the 57 | expected size (from simple geometry) and if the noise model is 58 | the generative model. 
First test concerns a simulation with a 2D 59 | structure, second test concerns a simulation with a 3D structure.''' 60 | 61 | # Test 1 62 | n_samples = 100 63 | shape = SHAPE_2D 64 | roi_size = ROI_SIZE_2D 65 | sigma = 1.0 66 | smooth_X = 1.0 67 | rho_expected = 0.8 68 | return_shaped_data = True 69 | 70 | X, y, beta, noise, X_, w = \ 71 | multivariate_simulation(n_samples=n_samples, shape=shape, 72 | roi_size=roi_size, sigma=sigma, 73 | smooth_X=smooth_X, 74 | return_shaped_data=return_shaped_data, 75 | seed=0) 76 | 77 | sigma_hat = np.std(noise) 78 | rho_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] 79 | 80 | assert_almost_equal(sigma_hat, sigma, decimal=1) 81 | assert_almost_equal(rho_hat, rho_expected, decimal=2) 82 | assert_equal(X.shape, (n_samples, shape[0] * shape[1])) 83 | assert_equal(X_.shape, (n_samples, shape[0], shape[1])) 84 | assert_equal(np.count_nonzero(beta), 4 * (roi_size ** 2)) 85 | assert_equal(y, np.dot(X, beta) + noise) 86 | 87 | # Test 2 88 | shape = SHAPE_3D 89 | roi_size = ROI_SIZE_3D 90 | return_shaped_data = False 91 | 92 | X, y, beta, noise = \ 93 | multivariate_simulation(n_samples=n_samples, shape=shape, 94 | roi_size=roi_size, 95 | return_shaped_data=return_shaped_data, 96 | seed=0) 97 | 98 | assert_equal(X.shape, (n_samples, shape[0] * shape[1] * shape[2])) 99 | assert_equal(np.count_nonzero(beta), 5 * (roi_size ** 3)) 100 | 101 | 102 | def test_multivariate_temporal_simulation(): 103 | '''Test if the data has expected shape, if the input parameters 104 | are close to their empirical estimators, if the support size is 105 | correct and if the noise model is the generative model. The 106 | first test concerns a simulation with a 1D spatial structure 107 | and a temporal structure, the second test concerns a simulation 108 | with a random spatial structure and a temporal structure.''' 109 | 110 | n_samples = 30 111 | n_features = 50 112 | n_times = 10 113 | support_size = 2 114 | sigma = 1.0 115 | rho_noise = 0.9 116 | rho_data = 0.95 117 | 118 | # Test 1 119 | X, Y, beta, noise = \ 120 | multivariate_temporal_simulation(n_samples=n_samples, 121 | n_features=n_features, 122 | n_times=n_times, 123 | support_size=support_size, 124 | sigma=sigma, 125 | rho_noise=rho_noise, 126 | rho_data=rho_data) 127 | 128 | sigma_hat = np.std(noise[:, -1]) 129 | rho_noise_hat = np.corrcoef(noise[:, -1], noise[:, -2])[0, 1] 130 | 131 | assert_almost_equal(sigma_hat, sigma, decimal=1) 132 | assert_almost_equal(rho_noise_hat, rho_noise, decimal=1) 133 | assert_equal(X.shape, (n_samples, n_features)) 134 | assert_equal(Y.shape, (n_samples, n_times)) 135 | assert_equal(np.count_nonzero(beta), support_size * n_times) 136 | assert_equal(Y, np.dot(X, beta) + noise) 137 | 138 | # Test 2 139 | X, Y, beta, noise = \ 140 | multivariate_temporal_simulation(n_samples=n_samples, 141 | n_features=n_features, 142 | n_times=n_times, 143 | support_size=support_size, 144 | sigma=sigma, 145 | rho_noise=rho_noise, 146 | rho_data=rho_data, 147 | shuffle=False) 148 | 149 | rho_data_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] 150 | assert_almost_equal(rho_data_hat, rho_data, decimal=1) 151 | assert_equal(Y, np.dot(X, beta) + noise) 152 | -------------------------------------------------------------------------------- /hidimstat/test/test_stat_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the stat module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal, assert_equal 7 | 8 | from hidimstat.stat_tools import 
\ 9 | (_replace_infinity, pval_corr_from_pval, pval_from_scale, 10 | zscore_from_cb, pval_from_cb, two_sided_pval_from_zscore, 11 | two_sided_pval_from_cb, zscore_from_pval, 12 | pval_from_two_sided_pval_and_sign, two_sided_pval_from_pval) 13 | 14 | 15 | def test__replace_infinity(): 16 | 17 | x = np.asarray([10, np.inf, -np.inf]) 18 | 19 | # replace inf by the largest absolute value times two 20 | x_clean = _replace_infinity(x) 21 | expected = np.asarray([10, 20, -20]) 22 | assert_equal(x_clean, expected) 23 | 24 | # replace inf by 40 25 | x_clean = _replace_infinity(x, replace_val=40) 26 | expected = np.asarray([10, 40, -40]) 27 | assert_equal(x_clean, expected) 28 | 29 | # replace inf by the largest absolute value plus one 30 | x_clean = _replace_infinity(x, method='plus-one') 31 | expected = np.asarray([10, 11, -11]) 32 | assert_equal(x_clean, expected) 33 | 34 | 35 | def test_pval_corr_from_pval(): 36 | 37 | pval = np.asarray([1.0, 0.025, 0.5]) 38 | 39 | # Correction for multiple testing: 3 features tested simultaneously 40 | pval_corr = pval_corr_from_pval(pval) 41 | expected = np.asarray([1.0, 0.075, 0.5]) 42 | assert_almost_equal(pval_corr, expected, decimal=10) 43 | 44 | one_minus_pval = np.asarray([0.0, 0.975, 0.5]) 45 | 46 | # Correction for multiple testing: 3 features tested simultaneously 47 | one_minus_pval_corr = pval_corr_from_pval(one_minus_pval) 48 | expected = np.asarray([0.0, 0.925, 0.5]) 49 | assert_almost_equal(one_minus_pval_corr, expected, decimal=10) 50 | 51 | 52 | def test_pval_from_scale(): 53 | 54 | beta = np.asarray([-1.5, 1, 0]) 55 | scale = np.asarray([0.25, 0.5, 0.5]) 56 | 57 | # Computing p-value and one minus the p-value. 58 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 59 | pval_from_scale(beta, scale) 60 | expected = np.asarray([[1.0, 0.022, 0.5], [1.0, 0.068, 0.5], 61 | [0.0, 0.978, 0.5], [0.0, 0.932, 0.5]]) 62 | 63 | assert_almost_equal(pval, expected[0], decimal=2) 64 | assert_almost_equal(pval_corr, expected[1], decimal=2) 65 | assert_almost_equal(one_minus_pval, expected[2], decimal=2) 66 | assert_almost_equal(one_minus_pval_corr, expected[3], decimal=2) 67 | 68 | 69 | def test_zscore_from_cb(): 70 | 71 | cb_min = np.asarray([-2, 0, -1]) 72 | cb_max = np.asarray([-1, 2, 1]) 73 | 74 | # Computing z-scores from 95% confidence-intervals assuming Gaussianity 75 | zscore = zscore_from_cb(cb_min, cb_max) 76 | expected = np.asarray([-5.87, 1.96, 0]) 77 | 78 | assert_almost_equal(zscore, expected, decimal=2) 79 | 80 | 81 | def test_pval_from_cb(): 82 | 83 | cb_min = np.asarray([-2, 0, -1]) 84 | cb_max = np.asarray([-1, 2, 1]) 85 | 86 | # Computing p-value and one minus the p-value. 
87 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 88 | pval_from_cb(cb_min, cb_max) 89 | expected = np.asarray([[1.0, 0.025, 0.5], [1.0, 0.075, 0.5], 90 | [0.0, 0.975, 0.5], [0.0, 0.925, 0.5]]) 91 | 92 | assert_almost_equal(pval, expected[0], decimal=2) 93 | assert_almost_equal(pval_corr, expected[1], decimal=2) 94 | assert_almost_equal(one_minus_pval, expected[2], decimal=2) 95 | assert_almost_equal(one_minus_pval_corr, expected[3], decimal=2) 96 | 97 | 98 | def test_two_sided_pval_from_zscore(): 99 | 100 | zscore = np.asarray([-5.87, 1.96, 0]) 101 | 102 | # Computing two-sided pval from z-scores assuming Gaussianity 103 | two_sided_pval, two_sided_pval_corr = two_sided_pval_from_zscore(zscore) 104 | expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) 105 | 106 | assert_almost_equal(two_sided_pval, expected[0], decimal=2) 107 | assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) 108 | 109 | 110 | def test_two_sided_pval_from_cb(): 111 | 112 | cb_min = np.asarray([-2, 0, -1]) 113 | cb_max = np.asarray([-1, 2, 1]) 114 | 115 | # Computing two-sided pval from 95% confidence bounds assuming Gaussianity 116 | two_sided_pval, two_sided_pval_corr = \ 117 | two_sided_pval_from_cb(cb_min, cb_max) 118 | expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) 119 | 120 | assert_almost_equal(two_sided_pval, expected[0], decimal=2) 121 | assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) 122 | 123 | 124 | def test_zscore_from_pval(): 125 | 126 | pval = np.asarray([1.0, 0.025, 0.5, 0.975]) 127 | 128 | # Computing z-scores from p-value 129 | zscore = zscore_from_pval(pval) 130 | expected = _replace_infinity(np.asarray([-np.inf, 1.96, 0, -1.96]), 131 | replace_val=40, method='plus-one') 132 | 133 | assert_almost_equal(zscore, expected, decimal=2) 134 | 135 | pval = np.asarray([1.0, 0.025, 0.5, 0.975]) 136 | one_minus_pval = np.asarray([0.0, 0.975, 0.5, 0.025]) 137 | 138 | # Computing z-scores from p-value and one minus the p-value 139 | zscore = zscore_from_pval(pval, one_minus_pval) 140 | expected = _replace_infinity(np.asarray([-np.inf, 1.96, 0, -1.96]), 141 | replace_val=40, method='plus-one') 142 | 143 | assert_almost_equal(zscore, expected, decimal=2) 144 | 145 | 146 | def test_pval_from_two_sided_pval_and_sign(): 147 | 148 | two_sided_pval = np.asarray([0.025, 0.05, 0.5]) 149 | parameter_sign = np.asarray([-1.0, 1.0, -1.0]) 150 | 151 | # One-sided p-values from two-sided p-value and sign. 152 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 153 | pval_from_two_sided_pval_and_sign(two_sided_pval, parameter_sign) 154 | expected = np.asarray([[0.9875, 0.025, 0.75], [0.9625, 0.075, 0.5], 155 | [0.0125, 0.975, 0.25], [0.0375, 0.925, 0.5]]) 156 | 157 | assert_equal(pval, expected[0]) 158 | assert_almost_equal(pval_corr, expected[1]) 159 | assert_equal(one_minus_pval, expected[2]) 160 | assert_almost_equal(one_minus_pval_corr, expected[3]) 161 | 162 | 163 | def test_two_sided_pval_from_pval(): 164 | 165 | pval = np.asarray([1.0, 0.025, 0.5]) 166 | one_minus_pval = np.asarray([0.0, 0.975, 0.5]) 167 | 168 | # Two-sided p-value from one-side p-values. 
169 | two_sided_pval, two_sided_pval_corr = \ 170 | two_sided_pval_from_pval(pval, one_minus_pval) 171 | expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) 172 | 173 | assert_almost_equal(two_sided_pval, expected[0], decimal=2) 174 | assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) 175 | -------------------------------------------------------------------------------- /hidimstat/ensemble_clustered_inference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from joblib import Parallel, delayed 3 | 4 | from .multi_sample_split import aggregate_medians, aggregate_quantiles 5 | from .clustered_inference import clustered_inference 6 | 7 | 8 | def _ensembling(list_beta_hat, list_pval, list_pval_corr, list_one_minus_pval, 9 | list_one_minus_pval_corr, method='quantiles', gamma_min=0.2): 10 | 11 | beta_hat = np.asarray(list_beta_hat) 12 | list_pval = np.asarray(list_pval) 13 | list_pval_corr = np.asarray(list_pval_corr) 14 | list_one_minus_pval = np.asarray(list_one_minus_pval) 15 | list_one_minus_pval_corr = np.asarray(list_one_minus_pval_corr) 16 | 17 | beta_hat = np.mean(list_beta_hat, axis=0) 18 | 19 | if method == 'quantiles': 20 | 21 | pval = aggregate_quantiles(list_pval, gamma_min) 22 | pval_corr = aggregate_quantiles(list_pval_corr, gamma_min) 23 | one_minus_pval = aggregate_quantiles(list_one_minus_pval, gamma_min) 24 | one_minus_pval_corr = \ 25 | aggregate_quantiles(list_one_minus_pval_corr, gamma_min) 26 | 27 | elif method == 'medians': 28 | 29 | pval = aggregate_medians(list_pval) 30 | pval_corr = aggregate_medians(list_pval_corr) 31 | one_minus_pval = aggregate_medians(list_one_minus_pval) 32 | one_minus_pval_corr = aggregate_medians(list_one_minus_pval_corr) 33 | 34 | else: 35 | 36 | raise ValueError("Unknown ensembling method.") 37 | 38 | return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr 39 | 40 | 41 | def ensemble_clustered_inference(X_init, y, ward, n_clusters, 42 | train_size=0.3, groups=None, 43 | inference_method='desparsified-lasso', 44 | seed=0, ensembling_method='quantiles', 45 | gamma_min=0.2, n_bootstraps=25, n_jobs=1, 46 | memory=None, verbose=1, **kwargs): 47 | """Ensemble clustered inference algorithm 48 | 49 | Parameters 50 | ---------- 51 | X_init : ndarray, shape (n_samples, n_features) 52 | Original data (uncompressed). 53 | 54 | y : ndarray, shape (n_samples,) or (n_samples, n_times) 55 | Target. 56 | 57 | ward : sklearn.cluster.FeatureAgglomeration 58 | Scikit-learn object that computes Ward hierarchical clustering. 59 | 60 | n_clusters : int 61 | Number of clusters used for the compression. 62 | 63 | train_size : float, optional (default=0.3) 64 | Fraction of samples used to compute the clustering. 65 | If `train_size = 1`, clustering is not random since all the samples 66 | are used to compute the clustering. 67 | 68 | groups : ndarray, shape (n_samples,), optional (default=None) 69 | Group labels for every sample. If not None, `groups` is used to build 70 | the subsamples that serve for computing the clustering. 71 | 72 | inference_method : str, optional (default='desparsified-lasso') 73 | Method used for making the inference. 74 | Currently, the two methods available are 'desparsified-lasso' 75 | and 'group-desparsified-lasso'. Use 'desparsified-lasso' for 76 | non-temporal data and 'group-desparsified-lasso' for temporal data. 77 | 78 | seed: int, optional (default=0) 79 | Seed used for generating a the first random subsample of the data. 
80 | This seed controls the clustering randomness. 81 | 82 | ensembling_method : str, optional (default='quantiles') 83 | Method used for making the ensembling. Currently, the two methods 84 | available are 'quantiles' and 'median'. 85 | 86 | gamma_min : float, optional (default=0.2) 87 | Lowest gamma-quantile being considered to compute the adaptive 88 | quantile aggregation formula. This parameter is considered only if 89 | `ensembling_method` is 'quantiles'. 90 | 91 | n_bootstraps : int, optional (default=25) 92 | Number of clustered inference algorithm solutions to compute before 93 | making the ensembling. 94 | 95 | n_jobs : int or None, optional (default=1) 96 | Number of CPUs used to compute several clustered inference 97 | algorithms at the same time. 98 | 99 | memory : str, optional (default=None) 100 | Used to cache the output of the computation of the clustering 101 | and the inference. By default, no caching is done. If a string is 102 | given, it is the path to the caching directory. 103 | 104 | verbose: int, optional (default=1) 105 | The verbosity level. If `verbose > 0`, we print a message before 106 | runing the clustered inference. 107 | 108 | **kwargs: 109 | Arguments passed to the statistical inference function. 110 | 111 | Returns 112 | ------- 113 | beta_hat : ndarray, shape (n_features,) or (n_features, n_times) 114 | Estimated parameter vector or matrix. 115 | 116 | pval : ndarray, shape (n_features,) 117 | p-value, with numerically accurate values for 118 | positive effects (ie., for p-value close to zero). 119 | 120 | pval_corr : ndarray, shape (n_features,) 121 | p-value corrected for multiple testing. 122 | 123 | one_minus_pval : ndarray, shape (n_features,) 124 | One minus the p-value, with numerically accurate values 125 | for negative effects (ie., for p-value close to one). 126 | 127 | one_minus_pval_corr : ndarray, shape (n_features,) 128 | One minus the p-value corrected for multiple testing. 129 | 130 | References 131 | ---------- 132 | .. [1] Chevalier, J. A., Nguyen, T. B., Thirion, B., & Salmon, J. (2021). 133 | Spatially relaxed inference on high-dimensional linear models. 134 | arXiv preprint arXiv:2106.02590. 
135 | """ 136 | 137 | if memory is not None and not isinstance(memory, str): 138 | raise ValueError("'memory' must be None or a string corresponding " + 139 | "to the path of the caching directory.") 140 | 141 | # Clustered inference algorithms 142 | results = Parallel(n_jobs=n_jobs, verbose=verbose)( 143 | delayed(clustered_inference)(X_init, y, ward, n_clusters, 144 | train_size=train_size, groups=groups, 145 | method=inference_method, seed=i, 146 | n_jobs=1, memory=memory, 147 | verbose=verbose, **kwargs) 148 | for i in np.arange(seed, seed + n_bootstraps)) 149 | 150 | # Collecting results 151 | list_beta_hat = [] 152 | list_pval, list_pval_corr = [], [] 153 | list_one_minus_pval, list_one_minus_pval_corr = [], [] 154 | 155 | for i in range(n_bootstraps): 156 | 157 | list_beta_hat.append(results[i][0]) 158 | list_pval.append(results[i][1]) 159 | list_pval_corr.append(results[i][2]) 160 | list_one_minus_pval.append(results[i][3]) 161 | list_one_minus_pval_corr.append(results[i][4]) 162 | 163 | # Ensembling 164 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 165 | _ensembling(list_beta_hat, list_pval, list_pval_corr, 166 | list_one_minus_pval, list_one_minus_pval_corr, 167 | method=ensembling_method, gamma_min=gamma_min) 168 | 169 | return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr 170 | -------------------------------------------------------------------------------- /hidimstat/permutation_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from joblib import Parallel, delayed 3 | 4 | from sklearn.base import clone 5 | from sklearn.utils import _safe_indexing 6 | from sklearn.svm import LinearSVR 7 | from sklearn.model_selection import GridSearchCV 8 | from sklearn.pipeline import Pipeline 9 | 10 | from hidimstat.stat_tools import pval_from_two_sided_pval_and_sign 11 | 12 | 13 | def permutation_test_cv(X, y, n_permutations=1000, 14 | C=None, Cs=np.logspace(-7, 1, 9), 15 | seed=0, n_jobs=1, verbose=1): 16 | """Cross-validated permutation test shuffling the target 17 | 18 | Parameters 19 | ----------- 20 | X : ndarray, shape (n_samples, n_features) 21 | Data. 22 | 23 | y : ndarray, shape (n_samples,) 24 | Target. 25 | 26 | C : float or None, optional (default=None) 27 | If None, the linear SVR regularization parameter is set by cross-val 28 | running a grid search on the list of hyper-parameters contained in Cs. 29 | Otherwise, the regularization parameter is equal to C. 30 | The strength of the regularization is inversely proportional to C. 31 | 32 | Cs : ndarray, optional (default=np.logspace(-7, 1, 9)) 33 | If C is None, the linear SVR regularization parameter is set by 34 | cross-val running a grid search on the list of hyper-parameters 35 | contained in Cs. 36 | 37 | n_permutations : int, optional (default=1000) 38 | Number of permutations used to compute the survival function 39 | and cumulative distribution function scores. 40 | 41 | seed : int, optional (default=0) 42 | Determines the permutations used for shuffling the target 43 | 44 | n_jobs : int or None, optional (default=1) 45 | Number of CPUs to use during the cross validation. 46 | 47 | verbose: int, optional (default=1) 48 | The verbosity level: if non zero, progress messages are printed 49 | when computing the permutation stats in parralel. 50 | The frequency of the messages increases with the verbosity level. 
51 | 52 | Returns 53 | ------- 54 | pval_corr : ndarray, shape (n_features,) 55 | p-value corrected for multiple testing, with numerically accurate 56 | values for positive effects (ie., for p-value close to zero). 57 | 58 | one_minus_pval_corr : ndarray, shape (n_features,) 59 | One minus the corrected p-value, with numerically accurate 60 | values for negative effects (ie., for p-value close to one). 61 | """ 62 | 63 | if C is None: 64 | 65 | steps = [('SVR', LinearSVR())] 66 | pipeline = Pipeline(steps) 67 | parameters = {'SVR__C': Cs} 68 | grid = GridSearchCV(pipeline, param_grid=parameters, n_jobs=n_jobs) 69 | grid.fit(X, y) 70 | C = grid.best_params_['SVR__C'] 71 | estimator = LinearSVR(C=C) 72 | 73 | else: 74 | 75 | estimator = LinearSVR(C=C) 76 | 77 | pval_corr, one_minus_pval_corr = \ 78 | permutation_test(X, y, estimator, n_permutations=n_permutations, 79 | seed=seed, n_jobs=n_jobs, verbose=verbose) 80 | 81 | return pval_corr, one_minus_pval_corr 82 | 83 | 84 | def permutation_test(X, y, estimator, n_permutations=1000, 85 | seed=0, n_jobs=1, verbose=1): 86 | """Permutation test shuffling the target 87 | 88 | Parameters 89 | ----------- 90 | X : ndarray, shape (n_samples, n_features) 91 | Data. 92 | 93 | y : ndarray, shape (n_samples,) 94 | Target. 95 | 96 | n_permutations : int, optional (default=1000) 97 | Number of permutations used to compute the survival function 98 | and cumulative distribution function scores. 99 | 100 | seed : int, optional (default=0) 101 | Determines the permutations used for shuffling the target 102 | 103 | n_jobs : int or None, optional (default=1) 104 | Number of CPUs to use during the cross validation. 105 | 106 | verbose: int, optional (default=1) 107 | The verbosity level: if non zero, progress messages are printed 108 | when computing the permutation stats in parralel. 109 | The frequency of the messages increases with the verbosity level. 110 | 111 | Returns 112 | ------- 113 | pval_corr : ndarray, shape (n_features,) 114 | p-value corrected for multiple testing, with numerically accurate 115 | values for positive effects (ie., for p-value close to zero). 116 | 117 | one_minus_pval_corr : ndarray, shape (n_features,) 118 | One minus the corrected p-value, with numerically accurate 119 | values for negative effects (ie., for p-value close to one). 
120 | """ 121 | 122 | rng = np.random.default_rng(seed) 123 | 124 | stat = _permutation_test_stat(clone(estimator), X, y) 125 | 126 | permutation_stats = \ 127 | Parallel(n_jobs=n_jobs, verbose=verbose)( 128 | delayed(_permutation_test_stat)(clone(estimator), X, 129 | _shuffle(y, rng)) 130 | for _ in range(n_permutations)) 131 | 132 | permutation_stats = np.array(permutation_stats) 133 | two_sided_pval_corr = step_down_max_T(stat, permutation_stats) 134 | 135 | stat_sign = np.sign(stat) 136 | 137 | pval_corr, _, one_minus_pval_corr, _ = \ 138 | pval_from_two_sided_pval_and_sign(two_sided_pval_corr, stat_sign) 139 | 140 | return pval_corr, one_minus_pval_corr 141 | 142 | 143 | def _permutation_test_stat(estimator, X, y): 144 | """Fit estimator and get coef""" 145 | stat = estimator.fit(X, y).coef_ 146 | return stat 147 | 148 | 149 | def _shuffle(y, rng): 150 | """Shuffle vector""" 151 | indices = rng.permutation(len(y)) 152 | return _safe_indexing(y, indices) 153 | 154 | 155 | def step_down_max_T(stat, permutation_stats): 156 | """Step-down maxT algorithm for computing adjusted p-values 157 | 158 | Parameters 159 | ----------- 160 | stat : ndarray, shape (n_features,) 161 | Statistic computed on the original (unpermutted) problem. 162 | 163 | permutation_stats : ndarray, shape (n_permutations, n_features) 164 | Statistics computed on permutted problems. 165 | 166 | Returns 167 | ------- 168 | two_sided_pval_corr : ndarray, shape (n_features,) 169 | Two-sided p-values corrected for multiple testing. 170 | 171 | References 172 | ---------- 173 | .. [1] Westfall, P. H., & Young, S. S. (1993). Resampling-based multiple 174 | testing: Examples and methods for p-value adjustment (Vol. 279). 175 | John Wiley & Sons. 176 | """ 177 | 178 | n_permutations, n_features = np.shape(permutation_stats) 179 | 180 | index_ordered = np.argsort(np.abs(stat)) 181 | stat_ranked = np.empty(n_features) 182 | stat_ranked[index_ordered] = np.arange(n_features) 183 | stat_ranked = stat_ranked.astype(int) 184 | stat_sorted = np.copy(np.abs(stat)[index_ordered]) 185 | permutation_stats_ordered = \ 186 | np.copy(np.abs(permutation_stats)[:, index_ordered]) 187 | 188 | for i in range(1, n_features): 189 | permutation_stats_ordered[:, i] = \ 190 | np.maximum(permutation_stats_ordered[:, i - 1], 191 | permutation_stats_ordered[:, i]) 192 | 193 | two_sided_pval_corr = \ 194 | (np.sum(np.less_equal(stat_sorted, permutation_stats_ordered), axis=0) 195 | / n_permutations) 196 | 197 | for i in range(n_features - 1)[::-1]: 198 | two_sided_pval_corr[i] = \ 199 | np.maximum(two_sided_pval_corr[i], two_sided_pval_corr[i + 1]) 200 | 201 | two_sided_pval_corr = np.copy(two_sided_pval_corr[stat_ranked]) 202 | 203 | return two_sided_pval_corr 204 | -------------------------------------------------------------------------------- /hidimstat/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import ndimage 3 | 4 | ROI_SIZE_2D = 2 5 | SHAPE_2D = (12, 12) 6 | 7 | ROI_SIZE_3D = 2 8 | SHAPE_3D = (12, 12, 12) 9 | 10 | 11 | def multivariate_1D_simulation(n_samples=100, n_features=500, 12 | support_size=10, sigma=1.0, 13 | rho=0.0, shuffle=True, seed=0): 14 | """Generate 1D data with Toeplitz design matrix 15 | 16 | Parameters 17 | ----------- 18 | n_samples : int 19 | Number of samples. 20 | 21 | n_features : int 22 | Number of features. 23 | 24 | support_size : int 25 | Size of the support. 
26 | 27 | sigma : float 28 | Standard deviation of the additive White Gaussian noise. 29 | 30 | rho: float 31 | Level of correlation between neighboring features (if not `shuffle`). 32 | 33 | shuffle : bool 34 | Shuffle the features (breaking 1D data structure) if True. 35 | 36 | seed : int 37 | Seed used for generating design matrix and noise. 38 | 39 | Returns 40 | ------- 41 | X : ndarray, shape (n_samples, n_features) 42 | Design matrix. 43 | 44 | y : ndarray, shape (n_samples,) 45 | Target. 46 | 47 | beta : ndarray, shape (n_features,) 48 | Parameter vector. 49 | 50 | noise : ndarray, shape (n_samples,) 51 | Additive white Gaussian noise. 52 | """ 53 | 54 | rng = np.random.default_rng(seed) 55 | 56 | X = np.zeros((n_samples, n_features)) 57 | X[:, 0] = rng.standard_normal(n_samples) 58 | 59 | for i in np.arange(1, n_features): 60 | rand_vector = ((1 - rho ** 2) ** 0.5) * rng.standard_normal(n_samples) 61 | X[:, i] = rho * X[:, i - 1] + rand_vector 62 | 63 | if shuffle: 64 | rng.shuffle(X.T) 65 | 66 | beta = np.zeros(n_features) 67 | beta[0:support_size] = 1.0 68 | 69 | noise = sigma * rng.standard_normal(n_samples) 70 | y = np.dot(X, beta) + noise 71 | 72 | return X, y, beta, noise 73 | 74 | 75 | def generate_2D_weight(shape, roi_size): 76 | """Create a 2D weight map with four ROIs 77 | 78 | Parameters 79 | ----------- 80 | shape : tuple (n_x, n_z) 81 | Shape of the data in the simulation. 82 | 83 | roi_size : int 84 | Size of the edge of the ROIs. 85 | 86 | Returns 87 | ------- 88 | w : ndarray, shape (n_x, n_z) 89 | 2D weight map. 90 | """ 91 | 92 | w = np.zeros(shape + (5,)) 93 | w[0:roi_size, 0:roi_size, 0] = 1.0 94 | w[-roi_size:, -roi_size:, 1] = 1.0 95 | w[0:roi_size, -roi_size:, 2] = 1.0 96 | w[-roi_size:, 0:roi_size, 3] = 1.0 97 | 98 | return w 99 | 100 | 101 | def generate_3D_weight(shape, roi_size): 102 | """Create a 3D weight map with five ROIs 103 | 104 | Parameters 105 | ----------- 106 | shape : tuple (n_x, n_y, n_z) 107 | Shape of the data in the simulation. 108 | 109 | roi_size : int 110 | Size of the edge of the ROIs. 111 | 112 | Returns 113 | ------- 114 | w : ndarray, shape (n_x, n_y, n_z) 115 | 3D weight map. 116 | """ 117 | 118 | w = np.zeros(shape + (5,)) 119 | w[0:roi_size, 0:roi_size, 0:roi_size, 0] = -1.0 120 | w[-roi_size:, -roi_size:, 0:roi_size, 1] = 1.0 121 | w[0:roi_size, -roi_size:, -roi_size:, 2] = -1.0 122 | w[-roi_size:, 0:roi_size, -roi_size:, 3] = 1.0 123 | w[(shape[0] - roi_size) // 2:(shape[0] + roi_size) // 2, 124 | (shape[1] - roi_size) // 2:(shape[1] + roi_size) // 2, 125 | (shape[2] - roi_size) // 2:(shape[2] + roi_size) // 2, 4] = 1.0 126 | return w 127 | 128 | 129 | def multivariate_simulation(n_samples=100, 130 | shape=SHAPE_2D, 131 | roi_size=ROI_SIZE_2D, 132 | sigma=1.0, 133 | smooth_X=1.0, 134 | return_shaped_data=True, 135 | seed=0): 136 | """Generate a multivariate simulation with 2D or 3D data 137 | 138 | Parameters 139 | ----------- 140 | n_samples : int 141 | Number of samples. 142 | 143 | shape : tuple (n_x, n_y) or (n_x, n_y, n_z) 144 | Shape of the data in the simulation. 145 | 146 | roi_size : int 147 | Size of the edge of the ROIs. 148 | 149 | sigma : float 150 | Standard deviation of the additive white Gaussian noise. 151 | 152 | smooth_X : float 153 | Level of (data) smoothing using a Gaussian filter. 154 | 155 | return_shaped_data : bool 156 | If true, the function returns shaped data and weight map. 157 | 158 | seed : int 159 | Seed used for generating design matrix and noise. 
160 | 161 | Returns 162 | ------- 163 | X : ndarray, shape (n_samples, n_features) 164 | Design matrix. 165 | 166 | y : ndarray, shape (n_samples,) 167 | Target. 168 | beta: ndarray, shape (n_features,) 169 | Parameter vector (flattened weight map). 170 | 171 | noise: ndarray, shape (n_samples,) 172 | Additive white Gaussian noise. 173 | 174 | X_: ndarray, shape (n_samples, n_x, n_y) or (n_samples, n_x, n_y, n_z) 175 | Reshaped design matrix. 176 | 177 | w : ndarray, shape (n_x, n_y) or (n_x, n_y, n_z) 178 | 2D or 3D weight map. 179 | """ 180 | 181 | rng = np.random.default_rng(seed) 182 | 183 | if len(shape) == 2: 184 | w = generate_2D_weight(shape, roi_size) 185 | elif len(shape) == 3: 186 | w = generate_3D_weight(shape, roi_size) 187 | 188 | beta = w.sum(-1).ravel() 189 | X_ = rng.standard_normal((n_samples,) + shape) 190 | X = [] 191 | 192 | for i in np.arange(n_samples): 193 | Xi = ndimage.filters.gaussian_filter(X_[i], smooth_X) 194 | X.append(Xi.ravel()) 195 | 196 | X = np.asarray(X) 197 | X_ = X.reshape((n_samples,) + shape) 198 | 199 | noise = sigma * rng.standard_normal(n_samples) 200 | y = np.dot(X, beta) + noise 201 | 202 | if return_shaped_data: 203 | return X, y, beta, noise, X_, w 204 | 205 | return X, y, beta, noise 206 | 207 | 208 | def multivariate_temporal_simulation(n_samples=100, n_features=500, n_times=30, 209 | support_size=10, sigma=1.0, rho_noise=0.0, 210 | rho_data=0.0, shuffle=True, seed=0): 211 | """Generate 1D temporal data with constant design matrix 212 | 213 | Parameters 214 | ----------- 215 | n_samples : int 216 | Number of samples. 217 | 218 | n_features : int 219 | Number of features. 220 | 221 | n_times : int 222 | Number of time points. 223 | 224 | support_size: int 225 | Size of the row support. 226 | 227 | sigma : float 228 | Standard deviation of the noise at each time point. 229 | 230 | rho_noise : float 231 | Level of autocorrelation in the noise. 232 | 233 | rho_data: float 234 | Level of correlation between neighboring features (if not `shuffle`). 235 | 236 | shuffle : bool 237 | Shuffle the features (breaking 1D data structure) if True. 238 | 239 | seed : int 240 | Seed used for generating design matrix and noise. 241 | 242 | Returns 243 | ------- 244 | X: ndarray, shape (n_samples, n_features) 245 | Design matrix. 246 | 247 | Y : ndarray, shape (n_samples, n_times) 248 | Target. 249 | 250 | beta : ndarray, shape (n_features, n_times) 251 | Parameter matrix. 252 | 253 | noise : ndarray, shape (n_samples, n_times) 254 | Noise matrix. 
255 | """ 256 | 257 | rng = np.random.default_rng(seed) 258 | 259 | X = np.zeros((n_samples, n_features)) 260 | X[:, 0] = rng.standard_normal(n_samples) 261 | 262 | for i in np.arange(1, n_features): 263 | rand_vector = \ 264 | ((1 - rho_data ** 2) ** 0.5) * rng.standard_normal(n_samples) 265 | X[:, i] = rho_data * X[:, i - 1] + rand_vector 266 | 267 | if shuffle: 268 | rng.shuffle(X.T) 269 | 270 | beta = np.zeros((n_features, n_times)) 271 | beta[0:support_size, :] = 1.0 272 | 273 | noise = np.zeros((n_samples, n_times)) 274 | noise[:, 0] = rng.standard_normal(n_samples) 275 | 276 | for i in range(1, n_times): 277 | rand_vector = \ 278 | ((1 - rho_noise ** 2) ** 0.5) * rng.standard_normal(n_samples) 279 | noise[:, i] = rho_noise * noise[:, i - 1] + rand_vector 280 | 281 | noise = sigma * noise 282 | 283 | Y = np.dot(X, beta) + noise 284 | 285 | return X, Y, beta, noise 286 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # hidimstat documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Jun 1 00:35:01 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import os 16 | import sys 17 | import warnings 18 | import sphinx_gallery 19 | import sphinx_bootstrap_theme 20 | from distutils.version import LooseVersion 21 | import matplotlib 22 | 23 | # Disable agg warnings in doc 24 | warnings.filterwarnings("ignore", category=UserWarning, 25 | message='Matplotlib is currently using agg, which is a' 26 | ' non-GUI backend, so cannot show the figure.') 27 | 28 | # If extensions (or modules to document with autodoc) are in another directory, 29 | # add these directories to sys.path here. If the directory is relative to the 30 | # documentation root, use os.path.abspath to make it absolute, like shown here. 31 | #sys.path.insert(0, os.path.abspath('.')) 32 | 33 | 34 | # -- General configuration ------------------------------------------------ 35 | 36 | # If your documentation needs a minimal Sphinx version, state it here. 37 | #needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.autosummary', 45 | 'sphinx.ext.doctest', 46 | 'sphinx.ext.intersphinx', 47 | 'sphinx.ext.mathjax', 48 | 'sphinx_gallery.gen_gallery', 49 | 'numpydoc', 50 | ] 51 | 52 | if LooseVersion(sphinx_gallery.__version__) < LooseVersion('0.2'): 53 | raise ImportError('Must have at least version 0.2 of sphinx-gallery, got ' 54 | '%s' % (sphinx_gallery.__version__,)) 55 | 56 | matplotlib.use('agg') 57 | 58 | 59 | # Add any paths that contain templates here, relative to this directory. 60 | templates_path = ['_templates'] 61 | 62 | # The suffix(es) of source filenames. 63 | # You can specify multiple suffix as a list of string: 64 | # 65 | source_suffix = ['.rst', '.md'] 66 | # source_suffix = '.rst' 67 | 68 | # The master toctree document. 69 | master_doc = 'index' 70 | 71 | # General information about the project. 
72 | project = u'hidimstat' 73 | copyright = u'2020, Jerome-Alexis Chevalier & Binh Nguyen' 74 | author = u'Jerome-Alexis Chevalier & Binh Nguyen' 75 | 76 | # The version info for the project you're documenting, acts as replacement for 77 | # |version| and |release|, also used in various other places throughout the 78 | # built documents. 79 | # 80 | # The short X.Y version. 81 | from hidimstat import __version__ as version # noqa 82 | # The full version, including alpha/beta/rc tags. 83 | release = version 84 | 85 | # The language for content autogenerated by Sphinx. Refer to documentation 86 | # for a list of supported languages. 87 | # 88 | # This is also used if you do content translation via gettext catalogs. 89 | # Usually you set "language" from the command line for these cases. 90 | language = None 91 | 92 | # List of patterns, relative to source directory, that match files and 93 | # directories to ignore when looking for source files. 94 | # This patterns also effect to html_static_path and html_extra_path 95 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 96 | 97 | # The name of the Pygments (syntax highlighting) style to use. 98 | pygments_style = 'sphinx' 99 | 100 | # If true, `todo` and `todoList` produce output, else they produce nothing. 101 | todo_include_todos = False 102 | 103 | # generate autosummary even if no references 104 | autosummary_generate = True 105 | 106 | # remove warnings: "toctree contains reference to nonexisting document" 107 | numpydoc_show_class_members = False 108 | 109 | # -- Options for HTML output ---------------------------------------------- 110 | 111 | # The theme to use for HTML and HTML Help pages. See the documentation for 112 | # a list of builtin themes. 113 | html_theme = 'bootstrap' 114 | 115 | # Theme options are theme-specific and customize the look and feel of a theme 116 | # further. For a list of options available for each theme, see the 117 | # documentation. 118 | html_theme_options = { 119 | 'navbar_sidebarrel': False, 120 | 'navbar_pagenav': False, 121 | 'source_link_position': "", 122 | 'navbar_links': [ 123 | ("Examples", "auto_examples/index"), 124 | ("API", "api"), 125 | ("GitHub", "https://github.com/ja-che/hidimstat", True) 126 | ], 127 | 'bootswatch_theme': "flatly", 128 | 'bootstrap_version': "3", 129 | } 130 | 131 | # Add any paths that contain custom themes here, relative to this directory. 132 | html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() 133 | 134 | # Add any paths that contain custom static files (such as style sheets) here, 135 | # relative to this directory. They are copied after the builtin static files, 136 | # so a file named "default.css" will overwrite the builtin "default.css". 137 | html_static_path = ['_static'] 138 | 139 | 140 | # -- Options for HTMLHelp output ------------------------------------------ 141 | 142 | # Output file base name for HTML help builder. 143 | htmlhelp_basename = 'hidimstat_doc' 144 | 145 | 146 | # -- Options for LaTeX output --------------------------------------------- 147 | 148 | latex_elements = { 149 | # The paper size ('letterpaper' or 'a4paper'). 150 | # 151 | # 'papersize': 'letterpaper', 152 | 153 | # The font size ('10pt', '11pt' or '12pt'). 154 | # 155 | # 'pointsize': '10pt', 156 | 157 | # Additional stuff for the LaTeX preamble. 158 | # 159 | # 'preamble': '', 160 | 161 | # Latex figure (float) alignment 162 | # 163 | # 'figure_align': 'htbp', 164 | } 165 | 166 | # Grouping the document tree into LaTeX files. 
List of tuples 167 | # (source start file, target name, title, 168 | # author, documentclass [howto, manual, or own class]). 169 | latex_documents = [ 170 | (master_doc, 'hidimstat.tex', u'hidimstat Documentation', 171 | u'Jerome-Alexis Chevalier', 'manual'), 172 | ] 173 | 174 | 175 | # -- Options for manual page output --------------------------------------- 176 | 177 | # One entry per manual page. List of tuples 178 | # (source start file, name, description, authors, manual section). 179 | man_pages = [ 180 | (master_doc, 'hidimstat', u'Hidimstat Documentation', 181 | [author], 1) 182 | ] 183 | 184 | 185 | # -- Options for Texinfo output ------------------------------------------- 186 | 187 | # Grouping the document tree into Texinfo files. List of tuples 188 | # (source start file, target name, title, author, 189 | # dir menu entry, description, category) 190 | texinfo_documents = [ 191 | (master_doc, 'hidimstat', u'hidimstat Documentation', 192 | author, 'hidimstat', 'One line description of project.', 193 | 'Miscellaneous'), 194 | ] 195 | 196 | # -- Intersphinx configuration ----------------------------------------------- 197 | 198 | intersphinx_mapping = { 199 | 'python': ('https://docs.python.org/3', None), 200 | 'numpy': ('https://numpy.org/devdocs', None), 201 | 'scipy': ('https://scipy.github.io/devdocs', None), 202 | 'matplotlib': ('https://matplotlib.org', None), 203 | 'sklearn': ('https://scikit-learn.org/stable', None), 204 | 'numba': ('https://numba.pydata.org/numba-doc/latest', None), 205 | 'joblib': ('https://joblib.readthedocs.io/en/latest', None), 206 | 'pandas': ('https://pandas.pydata.org/pandas-docs/stable', None), 207 | 'seaborn': ('https://seaborn.pydata.org/', None), 208 | 'pyvista': ('https://docs.pyvista.org', None), 209 | } 210 | 211 | examples_dirs = ['../examples'] 212 | gallery_dirs = ['auto_examples'] 213 | import mne 214 | 215 | scrapers = ('matplotlib',) 216 | try: 217 | with warnings.catch_warnings(): 218 | warnings.filterwarnings("ignore", category=DeprecationWarning) 219 | import pyvista 220 | pyvista.OFF_SCREEN = False 221 | except Exception: 222 | pass 223 | else: 224 | brain_scraper = mne.viz._brain._BrainScraper() 225 | scrapers += (brain_scraper, 'pyvista') 226 | if any(x in scrapers for x in ('pyvista')): 227 | from traits.api import push_exception_handler 228 | push_exception_handler(reraise_exceptions=True) 229 | report_scraper = mne.report._ReportScraper() 230 | scrapers += (report_scraper,) 231 | else: 232 | report_scraper = None 233 | 234 | sphinx_gallery_conf = { 235 | 'doc_module': 'groupmne', 236 | 'reference_url': dict(groupmne=None), 237 | 'examples_dirs': examples_dirs, 238 | 'gallery_dirs': gallery_dirs, 239 | 'plot_gallery': 'True', 240 | 'thumbnail_size': (160, 112), 241 | 'min_reported_time': 1., 242 | 'backreferences_dir': os.path.join('generated'), 243 | 'abort_on_example_error': False, 244 | 'image_scrapers': scrapers, 245 | 'show_memory': True, 246 | # 'reference_url': { 247 | # 'numpy': 'http://docs.scipy.org/doc/numpy-1.9.1', 248 | # 'scipy': 'http://docs.scipy.org/doc/scipy-0.17.0/reference', 249 | # } 250 | } 251 | 252 | 253 | def setup(app): 254 | app.add_css_file('style.css') 255 | -------------------------------------------------------------------------------- /hidimstat/noise_std.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | from scipy.linalg import toeplitz, solve 4 | from sklearn.linear_model import LassoCV, 
MultiTaskLassoCV 5 | from sklearn.model_selection import KFold 6 | 7 | 8 | def reid(X, y, eps=1e-2, tol=1e-4, max_iter=1e4, n_jobs=1, seed=0): 9 | """Estimation of noise standard deviation using Reid procedure 10 | 11 | Parameters 12 | ----------- 13 | X : ndarray, shape (n_samples, n_features) 14 | Data. 15 | 16 | y : ndarray, shape (n_samples,) 17 | Target. 18 | 19 | eps: float, optional (default=1e-2) 20 | Length of the cross-validation path. 21 | eps=1e-2 means that alpha_min / alpha_max = 1e-2. 22 | 23 | tol : float, optional (default=1e-4) 24 | The tolerance for the optimization: if the updates are smaller 25 | than `tol`, the optimization code checks the dual gap for optimality 26 | and continues until it is smaller than `tol`. 27 | 28 | max_iter : int, optional (default=1e4) 29 | The maximum number of iterations. 30 | 31 | n_jobs : int or None, optional (default=1) 32 | Number of CPUs to use during the cross validation. 33 | 34 | seed: int, optional (default=0) 35 | Seed passed in the KFold object which is used to cross-validate 36 | LassoCV. This seed controls the partitioning randomness. 37 | 38 | Returns 39 | ------- 40 | sigma_hat : float 41 | Estimated noise standard deviation. 42 | 43 | beta_hat : array, shape (n_features,) 44 | Estimated parameter vector. 45 | 46 | References 47 | ---------- 48 | .. [1] Reid, S., Tibshirani, R., & Friedman, J. (2016). A study of error 49 | variance estimation in lasso regression. Statistica Sinica, 35-67. 50 | """ 51 | 52 | X = np.asarray(X) 53 | n_samples, n_features = X.shape 54 | 55 | if max_iter // 5 <= n_features: 56 | max_iter = n_features * 5 57 | print(f"'max_iter' has been increased to {max_iter}") 58 | 59 | cv = KFold(n_splits=5, shuffle=True, random_state=seed) 60 | 61 | clf_lasso_cv = \ 62 | LassoCV(eps=eps, normalize=False, fit_intercept=False, 63 | cv=cv, tol=tol, max_iter=max_iter, n_jobs=n_jobs) 64 | 65 | clf_lasso_cv.fit(X, y) 66 | beta_hat = clf_lasso_cv.coef_ 67 | residual = clf_lasso_cv.predict(X) - y 68 | coef_max = np.max(np.abs(beta_hat)) 69 | support = np.sum(np.abs(beta_hat) > tol * coef_max) 70 | 71 | # avoid dividing by 0 72 | support = min(support, n_samples - 1) 73 | 74 | sigma_hat = norm(residual) / np.sqrt(n_samples - support) 75 | 76 | return sigma_hat, beta_hat 77 | 78 | 79 | def group_reid(X, Y, fit_Y=True, stationary=True, method='simple', order=1, 80 | eps=1e-2, tol=1e-4, max_iter=1e4, n_jobs=1, seed=0): 81 | 82 | """Estimation of the covariance matrix using group Reid procedure 83 | 84 | Parameters 85 | ----------- 86 | X : ndarray, shape (n_samples, n_features) 87 | Data. 88 | 89 | Y : ndarray, shape (n_samples, n_times) 90 | Target. 91 | 92 | fit_Y : bool, optional (default=True) 93 | If True, Y will be regressed against X by MultiTaskLassoCV 94 | and the covariance matrix is estimated on the residuals. 95 | Otherwise, covariance matrix is estimated directly on Y. 96 | 97 | stationary : bool, optional (default=True) 98 | If True, noise is considered to have the same magnitude for each 99 | time step. Otherwise, magnitude of the noise is not constant. 100 | 101 | method : str, optional (default='simple') 102 | If 'simple', the correlation matrix is estimated by taking the 103 | median of the correlation between two consecutive time steps 104 | and the noise standard deviation for each time step is estimated 105 | by taking the median of the standard deviations for every time step. 
106 | If 'AR', the order of the AR model is given by `order` and 107 | Yule-Walker method is used to estimate the covariance matrix. 108 | 109 | order : int, optional (default=1) 110 | If `stationary=True` and `method=AR`, `order` gives the 111 | order of the estimated autoregressive model. `order` must 112 | be smaller than the number of time steps. 113 | 114 | eps : float, optional (default=1e-2) 115 | Length of the cross-validation path. 116 | eps=1e-2 means that alpha_min / alpha_max = 1e-2. 117 | 118 | tol : float, optional (default=1e-4) 119 | The tolerance for the optimization: if the updates are smaller 120 | than `tol`, the optimization code checks the dual gap for optimality 121 | and continues until it is smaller than `tol`. 122 | 123 | max_iter : int, optional (default=1e4) 124 | The maximum number of iterations. 125 | 126 | n_jobs : int or None, optional (default=1) 127 | Number of CPUs to use during the cross validation. 128 | 129 | seed: int, optional (default=0) 130 | Seed passed in the KFold object which is used to cross-validate 131 | LassoCV. This seed controls also the partitioning randomness. 132 | 133 | Returns 134 | ------- 135 | cov_hat : ndarray, shape (n_times, n_times) 136 | Estimated covariance matrix. 137 | 138 | beta_hat : ndarray, shape (n_features, n_times) 139 | Estimated parameter matrix. 140 | 141 | References 142 | ---------- 143 | .. [1] Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020). 144 | Statistical control for spatio-temporal MEG/EEG source imaging with 145 | desparsified multi-task Lasso. In NeurIPS 2020-34h Conference on 146 | Neural Information Processing Systems. 147 | """ 148 | 149 | X = np.asarray(X) 150 | n_samples, n_features = X.shape 151 | n_times = Y.shape[1] 152 | 153 | if method == 'simple': 154 | print('Group reid: simple cov estimation') 155 | else: 156 | print(f'Group reid: {method}{order} cov estimation') 157 | 158 | if (max_iter // 5) <= n_features: 159 | max_iter = n_features * 5 160 | print(f"'max_iter' has been increased to {max_iter}") 161 | 162 | cv = KFold(n_splits=5, shuffle=True, random_state=seed) 163 | 164 | if fit_Y: 165 | 166 | clf_mtlcv = \ 167 | MultiTaskLassoCV(eps=eps, normalize=False, fit_intercept=False, 168 | cv=cv, tol=tol, max_iter=max_iter, n_jobs=n_jobs) 169 | 170 | clf_mtlcv.fit(X, Y) 171 | beta_hat = clf_mtlcv.coef_ 172 | residual = clf_mtlcv.predict(X) - Y 173 | row_max = np.max(np.sum(np.abs(beta_hat), axis=0)) 174 | support = np.sum(np.sum(np.abs(beta_hat), axis=0) > tol * row_max) 175 | 176 | # avoid dividing by 0 177 | support = min(support, n_samples - 1) 178 | 179 | else: 180 | 181 | beta_hat = np.zeros((n_features, n_times)) 182 | residual = np.copy(Y) 183 | support = 0 184 | 185 | sigma_hat_raw = norm(residual, axis=0) / np.sqrt(n_samples - support) 186 | 187 | if stationary: 188 | sigma_hat = np.median(sigma_hat_raw) * np.ones(n_times) 189 | corr_emp = np.corrcoef(residual.T) 190 | else: 191 | sigma_hat = sigma_hat_raw 192 | residual_rescaled = residual / sigma_hat 193 | corr_emp = np.corrcoef(residual_rescaled.T) 194 | 195 | # Median method 196 | if not stationary or method == 'simple': 197 | 198 | rho_hat = np.median(np.diag(corr_emp, 1)) 199 | corr_hat = \ 200 | toeplitz(np.geomspace(1, rho_hat ** (n_times - 1), n_times)) 201 | cov_hat = np.outer(sigma_hat, sigma_hat) * corr_hat 202 | 203 | # Yule-Walker method 204 | elif stationary and method == 'AR': 205 | 206 | if order > n_times - 1: 207 | raise ValueError('The requested AR order is to high with ' + 208 | 'respect to the 
number of time steps.') 209 | 210 | rho_ar = np.zeros(order + 1) 211 | rho_ar[0] = 1 212 | 213 | for i in range(1, order + 1): 214 | rho_ar[i] = np.median(np.diag(corr_emp, i)) 215 | 216 | A = toeplitz(rho_ar[:-1]) 217 | coef_ar = solve(A, rho_ar[1:]) 218 | 219 | residual_estimate = np.zeros((n_samples, n_times - order)) 220 | 221 | for i in range(order): 222 | # time window used to estimate the residual from AR model 223 | start = order - i - 1 224 | end = - i - 1 225 | residual_estimate += coef_ar[i] * residual[:, start:end] 226 | 227 | residual_diff = residual[:, order:] - residual_estimate 228 | sigma_eps = np.median(norm(residual_diff, axis=0) / np.sqrt(n_samples)) 229 | 230 | rho_ar_full = np.zeros(n_times) 231 | rho_ar_full[:rho_ar.size] = rho_ar 232 | 233 | for i in range(order + 1, n_times): 234 | start = i - order 235 | end = i 236 | rho_ar_full[i] = np.dot(coef_ar[::-1], rho_ar_full[start:end]) 237 | 238 | corr_hat = toeplitz(rho_ar_full) 239 | sigma_hat[:] = sigma_eps / np.sqrt((1 - np.dot(coef_ar, rho_ar[1:]))) 240 | cov_hat = np.outer(sigma_hat, sigma_hat) * corr_hat 241 | 242 | else: 243 | raise ValueError('Unknown method for estimating the covariance matrix') 244 | 245 | return cov_hat, beta_hat 246 | 247 | 248 | def empirical_snr(X, y, beta, noise=None): 249 | """Compute the SNR for the linear model: y = X beta + noise 250 | 251 | Parameters 252 | ----------- 253 | X : ndarray or scipy.sparse matrix, shape (n_samples, n_features) 254 | Data. 255 | 256 | y : ndarray, shape (n_samples,) 257 | Target. 258 | 259 | beta : ndarray, shape (n_features,) 260 | True parameter vector. 261 | 262 | noise : ndarray, shape (n_samples,), optional (default=None) 263 | True error vector. 264 | 265 | Returns 266 | ------- 267 | snr_hat : float 268 | Empirical signal-to-noise ratio. 
269 | """ 270 | X = np.asarray(X) 271 | 272 | signal = np.dot(X, beta) 273 | 274 | if noise is None: 275 | noise = y - signal 276 | 277 | sig_signal = np.linalg.norm(signal - np.mean(signal)) 278 | sig_noise = np.linalg.norm(noise - np.mean(noise)) 279 | snr_hat = (sig_signal / sig_noise) ** 2 280 | 281 | return snr_hat 282 | -------------------------------------------------------------------------------- /hidimstat/clustered_inference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.utils import resample 3 | from sklearn.preprocessing import StandardScaler 4 | from sklearn.utils.validation import check_memory 5 | 6 | from .stat_tools import pval_from_cb 7 | from .desparsified_lasso import desparsified_lasso, desparsified_group_lasso 8 | 9 | 10 | def _subsampling(n_samples, train_size, groups=None, seed=0): 11 | """Random subsampling: computes a list of indices""" 12 | 13 | if groups is None: 14 | 15 | n_subsamples = int(n_samples * train_size) 16 | train_index = resample(np.arange(n_samples), n_samples=n_subsamples, 17 | replace=False, random_state=seed) 18 | 19 | else: 20 | 21 | unique_groups = np.unique(groups) 22 | n_groups = unique_groups.size 23 | n_subsample_groups = int(n_groups * train_size) 24 | train_group = resample(unique_groups, n_samples=n_subsample_groups, 25 | replace=False, random_state=seed) 26 | train_index = np.arange(n_samples)[np.isin(groups, train_group)] 27 | 28 | return train_index 29 | 30 | 31 | def _ward_clustering(X_init, ward, train_index): 32 | """Ward clustering applied to full X but computed from a subsample of X""" 33 | 34 | ward = ward.fit(X_init[train_index, :]) 35 | X_reduced = ward.transform(X_init) 36 | 37 | return X_reduced, ward 38 | 39 | 40 | def hd_inference(X, y, method, n_jobs=1, memory=None, verbose=0, **kwargs): 41 | """Wrap-up high-dimensional inference procedures 42 | 43 | Parameters 44 | ---------- 45 | X : ndarray, shape (n_samples, n_features) 46 | Data. 47 | 48 | y : ndarray, shape (n_samples,) or (n_samples, n_times) 49 | Target. 50 | 51 | method : str, optional (default='desparsified-lasso') 52 | Method used for making the inference. 53 | Currently the two methods available are 'desparsified-lasso' 54 | and 'group-desparsified-lasso'. Use 'desparsified-lasso' for 55 | non-temporal data and 'group-desparsified-lasso' for temporal data. 56 | 57 | n_jobs : int or None, optional (default=1) 58 | Number of CPUs to use during parallel steps such as inference. 59 | 60 | memory : str or joblib.Memory object, optional (default=None) 61 | Used to cache the output of the computation of the clustering 62 | and the inference. By default, no caching is done. If a string is 63 | given, it is the path to the caching directory. 64 | 65 | verbose: int, optional (default=1) 66 | The verbosity level. If `verbose > 0`, we print a message before 67 | runing the clustered inference. 68 | 69 | **kwargs: 70 | Arguments passed to the statistical inference function. 71 | 72 | Returns 73 | ------- 74 | beta_hat : ndarray, shape (n_features,) or (n_features, n_times) 75 | Estimated parameter vector or matrix. 76 | 77 | pval : ndarray, shape (n_features,) 78 | p-value, with numerically accurate values for 79 | positive effects (ie., for p-value close to zero). 80 | 81 | pval_corr : ndarray, shape (n_features,) 82 | p-value corrected for multiple testing. 
83 | 84 | one_minus_pval : ndarray, shape (n_features,) 85 | One minus the p-value, with numerically accurate values 86 | for negative effects (ie., for p-value close to one). 87 | 88 | one_minus_pval_corr : ndarray, shape (n_features,) 89 | One minus the p-value corrected for multiple testing. 90 | """ 91 | 92 | if method == 'desparsified-lasso': 93 | 94 | beta_hat, cb_min, cb_max = \ 95 | desparsified_lasso(X, y, confidence=0.95, n_jobs=n_jobs, 96 | memory=memory, verbose=verbose, **kwargs) 97 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 98 | pval_from_cb(cb_min, cb_max, confidence=0.95) 99 | 100 | elif method == 'desparsified-group-lasso': 101 | 102 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 103 | desparsified_group_lasso(X, y, n_jobs=n_jobs, memory=memory, 104 | verbose=verbose, **kwargs) 105 | 106 | else: 107 | 108 | raise ValueError('Unknow method') 109 | 110 | return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr 111 | 112 | 113 | def _degrouping(ward, beta_hat, pval, pval_corr, 114 | one_minus_pval, one_minus_pval_corr): 115 | """Assigning cluster-wise stats to features contained in the corresponding 116 | cluster and rescaling estimated parameter""" 117 | 118 | pval_degrouped = ward.inverse_transform(pval) 119 | pval_corr_degrouped = ward.inverse_transform(pval_corr) 120 | one_minus_pval_degrouped = ward.inverse_transform(one_minus_pval) 121 | one_minus_pval_corr_degrouped = ward.inverse_transform(one_minus_pval_corr) 122 | 123 | labels = ward.labels_ 124 | clusters_size = np.zeros(labels.size) 125 | 126 | for label in range(labels.max() + 1): 127 | cluster_size = np.sum(labels == label) 128 | clusters_size[labels == label] = cluster_size 129 | 130 | if len(beta_hat.shape) == 1: 131 | 132 | beta_hat_degrouped = ward.inverse_transform(beta_hat) / clusters_size 133 | 134 | elif len(beta_hat.shape) == 2: 135 | 136 | n_features = pval_degrouped.shape[0] 137 | n_times = beta_hat.shape[1] 138 | beta_hat_degrouped = np.zeros((n_features, n_times)) 139 | 140 | for i in range(n_times): 141 | 142 | beta_hat_degrouped[:, i] = \ 143 | ward.inverse_transform(beta_hat[:, i]) / clusters_size 144 | 145 | return (beta_hat_degrouped, pval_degrouped, pval_corr_degrouped, 146 | one_minus_pval_degrouped, one_minus_pval_corr_degrouped) 147 | 148 | 149 | def clustered_inference(X_init, y, ward, n_clusters, train_size=1.0, 150 | groups=None, method='desparsified-lasso', seed=0, 151 | n_jobs=1, memory=None, verbose=1, **kwargs): 152 | """Clustered inference algorithm 153 | 154 | Parameters 155 | ---------- 156 | X_init : ndarray, shape (n_samples, n_features) 157 | Original data (uncompressed). 158 | 159 | y : ndarray, shape (n_samples,) or (n_samples, n_times) 160 | Target. 161 | 162 | ward : sklearn.cluster.FeatureAgglomeration 163 | Scikit-learn object that computes Ward hierarchical clustering. 164 | 165 | n_clusters : int 166 | Number of clusters used for the compression. 167 | 168 | train_size : float, optional (default=1.0) 169 | Fraction of samples used to compute the clustering. 170 | If `train_size = 1`, clustering is not random since all the samples 171 | are used to compute the clustering. 172 | 173 | groups : ndarray, shape (n_samples,), optional (default=None) 174 | Group labels for every sample. If not None, `groups` is used to build 175 | the subsamples that serve for computing the clustering. 176 | 177 | method : str, optional (default='desparsified-lasso') 178 | Method used for making the inference. 
179 | Currently the two methods available are 'desparsified-lasso' 180 | and 'group-desparsified-lasso'. Use 'desparsified-lasso' for 181 | non-temporal data and 'group-desparsified-lasso' for temporal data. 182 | 183 | seed: int, optional (default=0) 184 | Seed used for generating a random subsample of the data. 185 | This seed controls the clustering randomness. 186 | 187 | n_jobs : int or None, optional (default=1) 188 | Number of CPUs to use during parallel steps such as inference. 189 | 190 | memory : str or joblib.Memory object, optional (default=None) 191 | Used to cache the output of the computation of the clustering 192 | and the inference. By default, no caching is done. If a string is 193 | given, it is the path to the caching directory. 194 | 195 | verbose: int, optional (default=1) 196 | The verbosity level. If `verbose > 0`, we print a message before 197 | runing the clustered inference. 198 | 199 | **kwargs: 200 | Arguments passed to the statistical inference function. 201 | 202 | Returns 203 | ------- 204 | beta_hat : ndarray, shape (n_features,) or (n_features, n_times) 205 | Estimated parameter vector or matrix. 206 | 207 | pval : ndarray, shape (n_features,) 208 | p-value, with numerically accurate values for 209 | positive effects (ie., for p-value close to zero). 210 | 211 | pval_corr : ndarray, shape (n_features,) 212 | p-value corrected for multiple testing. 213 | 214 | one_minus_pval : ndarray, shape (n_features,) 215 | One minus the p-value, with numerically accurate values 216 | for negative effects (ie., for p-value close to one). 217 | 218 | one_minus_pval_corr : ndarray, shape (n_features,) 219 | One minus the p-value corrected for multiple testing. 220 | 221 | References 222 | ---------- 223 | .. [1] Chevalier, J. A., Nguyen, T. B., Thirion, B., & Salmon, J. (2021). 224 | Spatially relaxed inference on high-dimensional linear models. 225 | arXiv preprint arXiv:2106.02590. 226 | """ 227 | 228 | memory = check_memory(memory) 229 | 230 | n_samples, n_features = X_init.shape 231 | 232 | if verbose > 0: 233 | 234 | print(f'Clustered inference: n_clusters = {n_clusters}, ' + 235 | f'inference method = {method}, seed = {seed}') 236 | 237 | # Sampling 238 | train_index = _subsampling(n_samples, train_size, groups=groups, seed=seed) 239 | 240 | # Clustering 241 | X, ward = memory.cache(_ward_clustering)(X_init, ward, train_index) 242 | 243 | # Preprocessing 244 | X = StandardScaler().fit_transform(X) 245 | y = y - np.mean(y) 246 | 247 | # Inference: computing reduced parameter vector and stats 248 | beta_hat_, pval_, pval_corr_, one_minus_pval_, one_minus_pval_corr_ = \ 249 | hd_inference(X, y, method, n_jobs=n_jobs, memory=memory, **kwargs) 250 | 251 | # De-grouping 252 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 253 | _degrouping(ward, beta_hat_, pval_, pval_corr_, one_minus_pval_, 254 | one_minus_pval_corr_) 255 | 256 | return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr 257 | -------------------------------------------------------------------------------- /examples/plot_fmri_data_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Support recovery on fMRI data 3 | ============================= 4 | 5 | This example compares several methods that estimate a decoder map support 6 | with statistical guarantees. More precisely, we aim at thresholding the 7 | weights of some estimated decoder maps according to the confidence we have 8 | that they are nonzero. 
Here, we work with the Haxby dataset and we focus on
9 | the 'face vs house' contrast. Thus, we consider the labeled activation maps
10 | of a given subject and try to produce a brain map that corresponds to the
11 | discriminative pattern that enables the decoding of the two conditions.
12 |
13 | In this example, we show that standard statistical methods (i.e., methods
14 | such as thresholding the SVR or Ridge decoder weights by permutation test, or
15 | the algorithm introduced by Gaonkar et al. [1]_) are not powerful when applied
16 | to the uncompressed problem (i.e., the original problem in which the activation
17 | maps are not reduced using compression techniques such as parcellation).
18 | This is notably due to the high dimensionality (too many voxels) and the
19 | structure of the data (too much correlation between neighboring voxels).
20 | We also present two methods that offer statistical guarantees but
21 | with a (small) spatial tolerance on the shape of the support:
22 | clustered desparsified lasso (CluDL) combines clustering (parcellation)
23 | and statistical inference; ensemble of clustered desparsified lasso (EnCluDL)
24 | adds a randomization step over the choice of clustering.
25 |
26 | EnCluDL is powerful and does not depend on a unique clustering choice.
27 | As shown in Chevalier et al. (2021) [2]_, for several tasks the estimated
28 | support (predictive regions) looks relevant.
29 |
30 | References
31 | ----------
32 | .. [1] Gaonkar, B., & Davatzikos, C. (2012, October). Deriving statistical
33 | significance maps for SVM based image classification and group
34 | comparisons. In International Conference on Medical Image Computing
35 | and Computer-Assisted Intervention (pp. 723-730). Springer, Berlin,
36 | Heidelberg.
37 |
38 | .. [2] Chevalier, J. A., Nguyen, T. B., Salmon, J., Varoquaux, G.,
39 | & Thirion, B. (2021). Decoding with confidence: Statistical
40 | control on decoder maps. NeuroImage, 234, 117921.
41 | """
42 |
43 | #############################################################################
44 | # Imports needed for this script
45 | # ------------------------------
46 | import numpy as np
47 | import pandas as pd
48 | from sklearn.utils import Bunch
49 | from sklearn.cluster import FeatureAgglomeration
50 | from sklearn.feature_extraction import image
51 | from sklearn.linear_model import Ridge
52 | from nilearn import datasets
53 | from nilearn.input_data import NiftiMasker
54 | from nilearn.image import mean_img
55 | from nilearn.plotting import plot_stat_map, show
56 |
57 | from hidimstat.stat_tools import zscore_from_pval, pval_from_scale
58 | from hidimstat.standardized_svr import standardized_svr
59 | from hidimstat.permutation_test import permutation_test, permutation_test_cv
60 | from hidimstat.adaptive_permutation_threshold import ada_svr
61 | from hidimstat.clustered_inference import clustered_inference
62 | from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference
63 |
64 |
65 | #############################################################################
66 | # Function to fetch and preprocess Haxby dataset
67 | # ----------------------------------------------
68 | def preprocess_haxby(subject=2, memory=None):
69 | '''Gathering and preprocessing Haxby dataset for a given subject.'''
70 |
71 | # Gathering data
72 | haxby_dataset = datasets.fetch_haxby(subjects=[subject])
73 | fmri_filename = haxby_dataset.func[0]
74 |
75 | behavioral = pd.read_csv(haxby_dataset.session_target[0], sep=" ")
76 |
77 | # conditions = pd.DataFrame.to_numpy(behavioral['labels'])
78 | conditions = behavioral['labels'].values
79 | session_label = behavioral['chunks'].values
80 |
81 | condition_mask = np.logical_or(conditions == 'face', conditions == 'house')
82 | groups = session_label[condition_mask]
83 |
84 | # Loading anatomical image (background image)
85 | if haxby_dataset.anat[0] is None:
86 | bg_img = None
87 | else:
88 | bg_img = mean_img(haxby_dataset.anat)
89 |
90 | # Building target where '1' corresponds to 'face' and '-1' to 'house'
91 | y = np.asarray((conditions[condition_mask] == 'face') * 2 - 1)
92 |
93 | # Loading mask
94 | mask_img = haxby_dataset.mask
95 | masker = NiftiMasker(mask_img=mask_img, standardize=True,
96 | smoothing_fwhm=None, memory=memory)
97 |
98 | # Computing masked data
99 | fmri_masked = masker.fit_transform(fmri_filename)
100 | X = np.asarray(fmri_masked)[condition_mask, :]
101 |
102 | return Bunch(X=X, y=y, groups=groups, bg_img=bg_img, masker=masker)
103 |
104 |
105 | #############################################################################
106 | # Gathering and preprocessing Haxby dataset for a given subject
107 | # -------------------------------------------------------------
108 | # The `preprocess_haxby` function performs the preprocessing of the Haxby
109 | # dataset: it outputs the preprocessed activation maps for the two conditions
110 | # 'face' or 'house' (contained in `X`), the conditions (in `y`),
111 | # the session labels (in `groups`) and the mask (in `masker`).
112 | # You may choose a subject in [1, 2, 3, 4, 5, 6]. By default subject=2.
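# As a quick illustration of what to expect from these outputs (a sketch, not
# part of the original example; exact sizes depend on the fetched data):
# `X` has one row per selected 'face'/'house' trial, `y` only contains the
# labels -1 and 1, and `groups` holds one session id per trial, so a
# hypothetical sanity check could read:
#
#     data_check = preprocess_haxby(subject=2)
#     assert data_check.X.shape[0] == data_check.y.size == data_check.groups.size
#     assert set(np.unique(data_check.y)) == {-1, 1}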
113 | data = preprocess_haxby(subject=2)
114 | X, y, groups, masker = data.X, data.y, data.groups, data.masker
115 | mask = masker.mask_img_.get_fdata().astype(bool)
116 |
117 | #############################################################################
118 | # Initializing FeatureAgglomeration object that performs the clustering
119 | # -------------------------------------------------------------------------
120 | # For fMRI data taking 500 clusters is generally a good default choice.
121 |
122 | n_clusters = 500
123 | # Deriving voxel connectivity.
124 | shape = mask.shape
125 | n_x, n_y, n_z = shape[0], shape[1], shape[2]
126 | connectivity = image.grid_to_graph(n_x=n_x, n_y=n_y, n_z=n_z, mask=mask)
127 | # Initializing FeatureAgglomeration object.
128 | ward = FeatureAgglomeration(n_clusters=n_clusters, connectivity=connectivity)
129 |
130 | #############################################################################
131 | # Making the inference with several algorithms
132 | # --------------------------------------------
133 |
134 | #############################################################################
135 | # First, we try to recover the discriminative pattern by computing
136 | # p-values from SVR decoder weights and a parametric approximation
137 | # of the distribution of these weights.
138 |
139 | # We precomputed the regularization parameter by CV (C = 0.1) to reduce the
140 | # computation time of the example.
141 | beta_hat, scale = standardized_svr(X, y, Cs=[0.1])
142 | pval_std_svr, _, one_minus_pval_std_svr, _ = pval_from_scale(beta_hat, scale)
143 |
144 | #############################################################################
145 | # Now, we compute p-values thanks to permutation tests applied to
146 | # 1/ the weights of the SVR decoder or 2/ the weights of the Ridge decoder.
147 |
148 | # To derive the p-values from the SVR decoder, you may change the next line to
149 | # `SVR_permutation_test_inference = True`. It should take around 15 minutes.
150 |
151 | SVR_permutation_test_inference = False
152 | if SVR_permutation_test_inference:
153 | # We computed the regularization parameter by CV (C = 0.1)
154 | pval_corr_svr_perm_test, one_minus_pval_corr_svr_perm_test = \
155 | permutation_test_cv(X, y, n_permutations=50, C=0.1)
156 |
157 | # Another method is to compute the p-values by permutation test from the
158 | # Ridge decoder. The solution provided by this method should be very close to
159 | # the previous one and the computation time is much shorter: around 20 seconds.
160 |
161 | estimator = Ridge()
162 | pval_corr_ridge_perm_test, one_minus_pval_corr_ridge_perm_test = \
163 | permutation_test(X, y, estimator=estimator, n_permutations=200)
164 |
165 | #############################################################################
166 | # Now, let us run the algorithm introduced by Gaonkar et al. (cf. References).
167 | # Since the estimator they derive is obtained by approximating the hard margin
168 | # SVM formulation, we refer to this method as "ada-SVR", which stands for
169 | # "Adaptive Permutation Threshold SVR". The function is ``ada_svr``.
170 | beta_hat, scale = ada_svr(X, y)
171 | pval_ada_svr, _, one_minus_pval_ada_svr, _ = pval_from_scale(beta_hat, scale)
172 |
173 | #############################################################################
174 | # Now, we run the clustered inference algorithm, which combines parcellation
175 | # and high-dimensional inference (cf. References).
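# Under the hood (see `hidimstat/clustered_inference.py`), `clustered_inference`
# roughly chains four steps: (i) optionally subsample the data (controlled by
# `train_size` and `seed`), (ii) fit the Ward clustering and compress `X` into
# `n_clusters` parcels, (iii) run the desparsified lasso on the reduced problem,
# and (iv) assign each cluster-wise statistic back to the voxels of the
# corresponding parcel. A schematic (non-executed) sketch of the reduced
# inference step, with hypothetical variable names, could look like:
#
#     X_reduced = ward.fit(X).transform(X)    # (n_samples, n_clusters)
#     beta_c, pval_c, _, _, _ = hd_inference(X_reduced, y,
#                                            method='desparsified-lasso')
#     pval_voxels = ward.inverse_transform(pval_c)   # back to voxel space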
176 | beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = \
177 | clustered_inference(X, y, ward, n_clusters)
178 |
179 | #############################################################################
180 | # Below, we run the ensemble clustered inference algorithm which adds a
181 | # randomization step over the clustered inference algorithm (cf. References).
182 | # To make the example as short as possible we take `n_bootstraps=5`,
183 | # which means that 5 different parcellations are considered and
184 | # then 5 statistical maps are produced and aggregated into one.
185 | # However, you might benefit from clustering randomization by taking
186 | # `n_bootstraps=25` or `n_bootstraps=100`; we also set `n_jobs=2`.
187 | beta_hat, pval_ecdl, _, one_minus_pval_ecdl, _ = \
188 | ensemble_clustered_inference(X, y, ward, n_clusters, groups=groups,
189 | n_bootstraps=5, n_jobs=2)
190 |
191 | #############################################################################
192 | # Plotting the results
193 | # --------------------
194 | # To allow a better visualization of the discriminative pattern we will plot
195 | # z-maps rather than p-value maps. Assuming a Gaussian distribution of the
196 | # estimators, we can recover a z-score from a p-value by using the
197 | # inverse survival function.
198 | #
199 | # First, we set the theoretical FWER target at 10%.
200 |
201 | n_samples, n_features = X.shape
202 | target_fwer = 0.1
203 |
204 | #############################################################################
205 | # We now translate the FWER target into a z-score target.
206 | # For the permutation test methods we do not need any additional correction
207 | # since the p-values are already adjusted for multiple testing.
208 |
209 | zscore_threshold_corr = zscore_from_pval((target_fwer / 2))
210 |
211 | #############################################################################
212 | # Other methods need to be corrected. We consider the Bonferroni correction.
213 | # For methods that do not reduce the feature space, the correction
214 | # consists in dividing by the number of features.
215 |
216 | correction = 1. / n_features
217 | zscore_threshold_no_clust = zscore_from_pval((target_fwer / 2) * correction)
218 |
219 | #############################################################################
220 | # For methods that parcellate the brain into groups of voxels, the correction
221 | # consists in dividing by the number of parcels (or clusters).
222 |
223 | correction_clust = 1. / n_clusters
224 | zscore_threshold_clust = zscore_from_pval((target_fwer / 2) * correction_clust)
225 |
226 | #############################################################################
227 | # Now, we can plot the thresholded z-score maps by translating the
228 | # p-value maps estimated previously into z-score maps and using the
229 | # suitable threshold. For better readability, we make a small function
230 | # called `plot_map` that wraps all these steps.
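# Under the Gaussian assumption used throughout, the mapping is simply
# z = Phi^{-1}(1 - p), i.e. `scipy.stats.norm.isf(p)`, which is what
# `zscore_from_pval` computes (see `hidimstat/stat_tools.py`). Hence,
# thresholding the z-map at `zscore_from_pval(alpha)` keeps the features whose
# one-sided p-value (or one minus the p-value, for negative effects) falls
# below `alpha`; here `alpha` is `target_fwer / 2` times the relevant
# Bonferroni correction. For instance (illustrative check only):
#
#     from scipy.stats import norm
#     assert np.isclose(zscore_from_pval(np.array([0.025]))[0], norm.isf(0.025))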
231 |
232 |
233 | def plot_map(pval, one_minus_pval, zscore_threshold, title=None,
234 | cut_coords=[-25, -40, -5], masker=masker, bg_img=data.bg_img):
235 |
236 | zscore = zscore_from_pval(pval, one_minus_pval)
237 | zscore_img = masker.inverse_transform(zscore)
238 | plot_stat_map(zscore_img, threshold=zscore_threshold, bg_img=bg_img,
239 | dim=-1, cut_coords=cut_coords, title=title)
240 |
241 |
242 | plot_map(pval_std_svr, one_minus_pval_std_svr, zscore_threshold_no_clust,
243 | title='SVR parametric threshold')
244 |
245 | if SVR_permutation_test_inference:
246 | plot_map(pval_corr_svr_perm_test, one_minus_pval_corr_svr_perm_test,
247 | zscore_threshold_corr, title='SVR permutation-test thresh.')
248 |
249 | plot_map(pval_corr_ridge_perm_test, one_minus_pval_corr_ridge_perm_test,
250 | zscore_threshold_corr, title='Ridge permutation-test thresh.')
251 |
252 | plot_map(pval_ada_svr, one_minus_pval_ada_svr, zscore_threshold_no_clust,
253 | title='SVR adaptive perm. thresh.')
254 |
255 | plot_map(pval_cdl, one_minus_pval_cdl, zscore_threshold_clust, 'CluDL')
256 |
257 | plot_map(pval_ecdl, one_minus_pval_ecdl, zscore_threshold_clust, 'EnCluDL')
258 |
259 | #############################################################################
260 | # Analysis of the results
261 | # -----------------------
262 | # As advocated in the introduction, the methods that do not reduce the original
263 | # problem are not satisfying since they are too conservative.
264 | # Among those methods, the only one that makes discoveries is the one that
265 | # thresholds the SVR decoder using a parametric approximation.
266 | # However, this method has no statistical guarantees and we can see that some
267 | # isolated voxels are discovered, which seems quite spurious.
268 | # The discriminative pattern derived from the clustered inference algorithm
269 | # (CluDL) shows that the method is less conservative.
270 | # However, some reasonable patterns are also included in this solution.
271 | # Finally, the solution provided by the ensemble clustered inference algorithm
272 | # (EnCluDL) seems realistic as we recover the visual cortex and do not make
273 | # spurious discoveries.
274 |
275 | show()
276 |
--------------------------------------------------------------------------------
/hidimstat/stat_tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.stats import norm
3 |
4 |
5 | def _replace_infinity(x, replace_val=None, method='times-two'):
6 | """Replace infinity by large value"""
7 |
8 | largest_non_inf = np.max(np.abs(x)[np.abs(x) != np.inf])
9 |
10 | if method == 'times-two':
11 | replace_val_min = largest_non_inf * 2
12 | elif method == 'plus-one':
13 | replace_val_min = largest_non_inf + 1
14 |
15 | if (replace_val is not None) and (replace_val < largest_non_inf):
16 | replace_val = replace_val_min
17 | elif replace_val is None:
18 | replace_val = replace_val_min
19 |
20 | x_new = np.copy(x)
21 | x_new[x_new == np.inf] = replace_val
22 | x_new[x_new == -np.inf] = -replace_val
23 |
24 | return x_new
25 |
26 |
27 | def pval_corr_from_pval(one_sided_pval):
28 | """Computing one-sided p-values corrected for multiple testing
29 | from simple testing one-sided p-values.
30 |
31 | Parameters
32 | ----------
33 | one_sided_pval : ndarray, shape (n_features,)
34 | One-sided p-values.
35 |
36 | Returns
37 | -------
38 | one_sided_pval_corr : ndarray, shape (n_features,)
39 | Corrected one-sided p-values.
40 | """ 41 | 42 | n_features = one_sided_pval.size 43 | 44 | one_sided_pval_corr = np.zeros(n_features) + 0.5 45 | 46 | ind = (one_sided_pval < 0.5) 47 | one_sided_pval_corr[ind] = \ 48 | np.minimum(one_sided_pval[ind] * n_features, 0.5) 49 | 50 | ind = (one_sided_pval > 0.5) 51 | one_sided_pval_corr[ind] = \ 52 | np.maximum(1 - (1 - one_sided_pval[ind]) * n_features, 0.5) 53 | 54 | return one_sided_pval_corr 55 | 56 | 57 | def pval_from_scale(beta, scale, distrib='norm', eps=1e-14): 58 | """Computing one-sided p-values from the value of the parameter 59 | and its scale. 60 | 61 | Parameters 62 | ---------- 63 | beta : ndarray, shape (n_features,) 64 | Value of the parameters. 65 | 66 | scale : ndarray, shape (n_features,) 67 | Value of the standard deviation of the parameters. 68 | 69 | distrib : str, opitonal (default='norm') 70 | Type of distribution assumed for the underlying estimator. 71 | 'norm' means normal and is the only value accepted at the moment. 72 | 73 | eps : float, optional 74 | Machine-precision regularization in the computation of the p-values. 75 | 76 | Returns 77 | ------- 78 | pval : ndarray, shape (n_features,) 79 | p-value, with numerically accurate values for 80 | positive effects (ie., for p-value close to zero). 81 | 82 | pval_corr : ndarray, shape (n_features,) 83 | p-value corrected for multiple testing. 84 | 85 | one_minus_pval : ndarray, shape (n_features,) 86 | One minus the p-value, with numerically accurate values 87 | for negative effects (ie., for p-value close to one). 88 | 89 | one_minus_pval_corr : ndarray, shape (n_features,) 90 | One minus the p-value corrected for multiple testing. 91 | """ 92 | 93 | n_features = beta.size 94 | 95 | index_no_nan = tuple([scale != 0.0]) 96 | 97 | pval = np.zeros(n_features) + 0.5 98 | one_minus_pval = np.zeros(n_features) + 0.5 99 | 100 | if distrib == 'norm': 101 | 102 | pval[index_no_nan] = \ 103 | norm.sf(beta[index_no_nan], scale=scale[index_no_nan]) 104 | one_minus_pval[index_no_nan] = \ 105 | norm.cdf(beta[index_no_nan], scale=scale[index_no_nan]) 106 | 107 | pval[pval > 1 - eps] = 1 - eps 108 | pval_corr = pval_corr_from_pval(pval) 109 | 110 | one_minus_pval[one_minus_pval > 1 - eps] = 1 - eps 111 | one_minus_pval_corr = pval_corr_from_pval(one_minus_pval) 112 | 113 | return pval, pval_corr, one_minus_pval, one_minus_pval_corr 114 | 115 | 116 | def zscore_from_cb(cb_min, cb_max, confidence=0.95, distrib='norm'): 117 | """Computing z-scores from confidence intervals. 118 | 119 | Parameters 120 | ---------- 121 | cb_min : ndarray, shape (n_features,) 122 | Value of the inferior confidence bound. 123 | 124 | cb_max : ndarray, shape (n_features,) 125 | Value of the superior confidence bound. 126 | 127 | confidence : float, optional (default=0.95) 128 | Confidence level used to compute the confidence intervals. 129 | Each value should be in the range [0, 1]. 130 | 131 | distrib : str, opitonal (default='norm') 132 | Type of distribution assumed for the underlying estimator. 133 | 'norm' means normal and is the only value accepted at the moment. 134 | 135 | Returns 136 | ------- 137 | zscore : ndarray, shape (n_features,) 138 | z-scores. 
139 | """ 140 | 141 | if distrib == 'norm': 142 | quantile = norm.ppf(1 - (1 - confidence) / 2) 143 | 144 | beta_hat = (cb_min + cb_max) / 2 145 | 146 | zscore = beta_hat / (cb_max - cb_min) * 2 * quantile 147 | 148 | return zscore 149 | 150 | 151 | def pval_from_cb(cb_min, cb_max, confidence=0.95, distrib='norm', eps=1e-14): 152 | """Computing one-sided p-values from confidence intervals. 153 | 154 | Parameters 155 | ---------- 156 | cb_min : ndarray, shape (n_features,) 157 | Value of the inferior confidence bound. 158 | 159 | cb_max : ndarray, shape (n_features,) 160 | Value of the superior confidence bound. 161 | 162 | confidence : float, optional (default=0.95) 163 | Confidence level used to compute the confidence intervals. 164 | Each value should be in the range [0, 1]. 165 | 166 | distrib : str, opitonal (default='norm') 167 | Type of distribution assumed for the underlying estimator. 168 | 'norm' means normal and is the only value accepted at the moment. 169 | 170 | eps : float, optional 171 | Machine-precision regularization in the computation of the p-values. 172 | 173 | Returns 174 | ------- 175 | pval : ndarray, shape (n_features,) 176 | p-value, with numerically accurate values for 177 | positive effects (ie., for p-value close to zero). 178 | 179 | pval_corr : ndarray, shape (n_features,) 180 | p-value corrected for multiple testing. 181 | 182 | one_minus_pval : ndarray, shape (n_features,) 183 | One minus the p-value, with numerically accurate values 184 | for negative effects (ie., for p-value close to one). 185 | 186 | one_minus_pval_corr : ndarray, shape (n_features,) 187 | One minus the p-value corrected for multiple testing. 188 | """ 189 | 190 | zscore = \ 191 | zscore_from_cb(cb_min, cb_max, confidence=confidence, distrib=distrib) 192 | 193 | if distrib == 'norm': 194 | 195 | pval = norm.sf(zscore) 196 | one_minus_pval = norm.cdf(zscore) 197 | 198 | pval[pval > 1 - eps] = 1 - eps 199 | pval_corr = pval_corr_from_pval(pval) 200 | 201 | one_minus_pval[one_minus_pval > 1 - eps] = 1 - eps 202 | one_minus_pval_corr = pval_corr_from_pval(one_minus_pval) 203 | 204 | return pval, pval_corr, one_minus_pval, one_minus_pval_corr 205 | 206 | 207 | def two_sided_pval_from_zscore(zscore, distrib='norm'): 208 | """Computing two-sided p-values from z-scores. 209 | 210 | Parameters 211 | ---------- 212 | zscore : ndarray, shape (n_features,) 213 | z-scores. 214 | 215 | distrib : str, opitonal (default='norm') 216 | Type of distribution assumed for the underlying estimator. 217 | 'norm' means normal and is the only value accepted at the moment. 218 | 219 | Returns 220 | ------- 221 | two_sided_pval : ndarray, shape (n_features,) 222 | Two-sided p-values (testing the null). 223 | 224 | two_sided_pval_corr : ndarray, shape (n_features,) 225 | Two-sided p-values corrected for multiple testing. 226 | """ 227 | n_features = zscore.size 228 | 229 | if distrib == 'norm': 230 | two_sided_pval = 2 * norm.sf(np.abs(zscore)) 231 | 232 | two_sided_pval_corr = np.minimum(1, two_sided_pval * n_features) 233 | 234 | return two_sided_pval, two_sided_pval_corr 235 | 236 | 237 | def two_sided_pval_from_cb(cb_min, cb_max, confidence=0.95, distrib='norm'): 238 | """Computing two-sided p-values from confidence intervals. 239 | 240 | Parameters 241 | ---------- 242 | cb_min : ndarray, shape (n_features,) 243 | Value of the inferior confidence bound. 244 | 245 | cb_max : ndarray, shape (n_features,) 246 | Value of the superior confidence bound. 
247 | 248 | confidence : float, optional (default=0.95) 249 | Confidence level used to compute the confidence intervals. 250 | Each value should be in the range [0, 1]. 251 | 252 | distrib : str, opitonal (default='norm') 253 | Type of distribution assumed for the underlying estimator. 254 | 'norm' means normal and is the only value accepted at the moment. 255 | 256 | Returns 257 | ------- 258 | two_sided_pval : ndarray, shape (n_features,) 259 | Two-sided p-values (testing the null). 260 | 261 | two_sided_pval_corr : ndarray, shape (n_features,) 262 | Two-sided p-values corrected for multiple testing. 263 | """ 264 | zscore = \ 265 | zscore_from_cb(cb_min, cb_max, confidence=confidence, distrib=distrib) 266 | 267 | two_sided_pval, two_sided_pval_corr = \ 268 | two_sided_pval_from_zscore(zscore, distrib=distrib) 269 | 270 | return two_sided_pval, two_sided_pval_corr 271 | 272 | 273 | def zscore_from_pval(pval, one_minus_pval=None, distrib='norm'): 274 | """Computing z-scores from one-sided p-values. 275 | 276 | Parameters 277 | ----------- 278 | pval : ndarray, shape (n_features,) 279 | p-value, with numerically accurate values for 280 | positive effects (ie., for p-value close to zero). 281 | 282 | one_minus_pval : ndarray, shape (n_features,), optional (default=None) 283 | One minus the p-value, with numerically accurate values 284 | for negative effects (ie., for p-value close to one). 285 | 286 | distrib : str, opitonal (default='norm') 287 | Type of distribution assumed for the underlying estimator. 288 | 'norm' means normal and is the only value accepted at the moment. 289 | 290 | Returns 291 | ------- 292 | zscore : ndarray, shape (n_features,) 293 | z-scores. 294 | """ 295 | 296 | if distrib == 'norm': 297 | 298 | zscore = norm.isf(pval) 299 | 300 | if one_minus_pval is not None: 301 | 302 | ind = (pval > 0.5) 303 | zscore[ind] = norm.ppf(one_minus_pval[ind]) 304 | 305 | zscore = _replace_infinity(zscore, replace_val=40, method='plus-one') 306 | 307 | return zscore 308 | 309 | 310 | def pval_from_two_sided_pval_and_sign(two_sided_pval, parameter_sign, 311 | eps=1e-14): 312 | """Computing one-sided p-values from two-sided p-value and parameter sign. 313 | 314 | Parameters 315 | ---------- 316 | two_sided_pval : ndarray, shape (n_features,) 317 | Two-sided p-values (testing the null). 318 | 319 | parameter_sign : ndarray, shape (n_features,) 320 | Estimated signs for the parameters. 321 | 322 | eps : float, optional 323 | Machine-precision regularization in the computation of the p-values. 324 | 325 | Returns 326 | ------- 327 | pval : ndarray, shape (n_features,) 328 | p-value, with numerically accurate values for 329 | positive effects (ie., for p-value close to zero). 330 | 331 | pval_corr : ndarray, shape (n_features,) 332 | p-value corrected for multiple testing. 333 | 334 | one_minus_pval : ndarray, shape (n_features,) 335 | One minus the p-value, with numerically accurate values 336 | for negative effects (ie., for p-value close to one). 337 | 338 | one_minus_pval_corr : ndarray, shape (n_features,) 339 | One minus the p-value corrected for multiple testing. 
340 | """ 341 | 342 | n_features = two_sided_pval.size 343 | 344 | pval = 0.5 * np.ones(n_features) 345 | one_minus_pval = 0.5 * np.ones(n_features) 346 | 347 | pval[parameter_sign > 0] = two_sided_pval[parameter_sign > 0] / 2 348 | pval[parameter_sign < 0] = 1 - two_sided_pval[parameter_sign < 0] / 2 349 | 350 | one_minus_pval[parameter_sign > 0] = \ 351 | 1 - two_sided_pval[parameter_sign > 0] / 2 352 | one_minus_pval[parameter_sign < 0] = \ 353 | two_sided_pval[parameter_sign < 0] / 2 354 | 355 | pval[pval > 1 - eps] = 1 - eps 356 | pval_corr = pval_corr_from_pval(pval) 357 | 358 | one_minus_pval[one_minus_pval > 1 - eps] = 1 - eps 359 | one_minus_pval_corr = pval_corr_from_pval(one_minus_pval) 360 | 361 | return pval, pval_corr, one_minus_pval, one_minus_pval_corr 362 | 363 | 364 | def two_sided_pval_from_pval(pval, one_minus_pval=None, distrib='norm'): 365 | """Computing two-sided p-value from one-sided p-values. 366 | 367 | Parameters 368 | ----------- 369 | pval : ndarray, shape (n_features,) 370 | p-value, with numerically accurate values for 371 | positive effects (ie., for p-value close to zero). 372 | 373 | one_minus_pval : ndarray, shape (n_features,), optional (default=None) 374 | One minus the p-value, with numerically accurate values 375 | for negative effects (ie., for p-value close to one). 376 | 377 | distrib : str, opitonal (default='norm') 378 | Type of distribution assumed for the underlying estimator. 379 | 'norm' means normal and is the only value accepted at the moment. 380 | 381 | Returns 382 | ------- 383 | two_sided_pval : ndarray, shape (n_features,) 384 | Two-sided p-values (testing the null). 385 | 386 | two_sided_pval_corr : ndarray, shape (n_features,) 387 | Two-sided p-values corrected for multiple testing. 388 | """ 389 | 390 | zscore = zscore_from_pval(pval, one_minus_pval, distrib=distrib) 391 | 392 | two_sided_pval, two_sided_pval_corr = \ 393 | two_sided_pval_from_zscore(zscore, distrib=distrib) 394 | 395 | return two_sided_pval, two_sided_pval_corr 396 | -------------------------------------------------------------------------------- /examples/plot_2D_simulation_example.py: -------------------------------------------------------------------------------- 1 | # Authors: Jerome-Alexis Chevalier 2 | """ 3 | Support recovery on simulated data (2D) 4 | ======================================= 5 | 6 | This example shows the advantages of spatially relaxed inference when 7 | dealing with high-dimensional spatial data. To do so, we compare several 8 | statistical methods that aim at recovering the support, i.e., predictive 9 | features. Among those methods some leverage the spatial structure of the 10 | data. For more details about the inference algorithms presented in this 11 | example or about the generative process used to simulate the data, 12 | please refer to Chevalier et al. (2021) [1]_. 13 | 14 | This example corresponds to the experiment described in details in 15 | Chevalier et al. (2021) [1]_. Shortly, to simulate the data, we draw 16 | ``n_samples`` i.i.d Gaussian vectors of size ``n_features`` and reshape them 17 | into squares (edges are equal to ``n_features ** (1/2)``). Then, to introduce 18 | some spatial structure, we apply a Gaussian filter that correlates features 19 | that are nearby. The 2D data are then flattened into a design matrix ``X`` to 20 | represent it as a regression setting and to ease the computation of the 21 | simulated target ``y`` (see below). 
Then, we construct the weight map ``w`` 22 | which has the same shape as the 2D data, as it contains four predictive 23 | regions in every corner of the square. Similarly as for the construction 24 | of ``X``, the map ``w`` is finally flattened into a vector ``beta``. Lastly, 25 | to derive the target ``y``, we draw a white Gaussian noise ``epsilon`` and 26 | use a linear generative model: ``y = X beta + epsilon``. 27 | 28 | The results of this experiment show that the methods that leverage the spatial 29 | structure of the data are relevant. More precisely, we show that clustered 30 | inference algorithms (e.g., CluDL) and ensembled clustered inference algorithms 31 | (e.g., EnCluDL) are more powerful than the standard inference methods (see also 32 | Chevalier et al. (2021) [1]_). Indeed, when the number of features is much 33 | greater than the number of samples, standard statistical methods are 34 | unlikely to recover the support. Then, the idea of clustered inference is to 35 | compress the data without breaking the spatial structure, leading to a 36 | compressed problem close to the original problem. This leads to a 37 | powerful spatially relaxed inference. Indeed, thanks to the dimension reduction 38 | the support recovery is feasible. However, due to the spatial compression, 39 | there is a limited (and quantifiable) spatial uncertainty concerning the shape 40 | of the estimated support. Finally, by considering several choices of 41 | spatial compression, ensembled clustered inference algorithms reduce 42 | significantly the spatial uncertainty compared to clustered inference 43 | algorithms which consider only one spatial compression. 44 | 45 | .. _References: 46 | 47 | References 48 | ---------- 49 | .. [1] Chevalier, J. A., Nguyen, T. B., Thirion, B., & Salmon, J. (2021). 50 | Spatially relaxed inference on high-dimensional linear models. 51 | arXiv preprint arXiv:2106.02590. 52 | """ 53 | 54 | ############################################################################# 55 | # Imports needed for this script 56 | # ------------------------------ 57 | import numpy as np 58 | import matplotlib.pyplot as plt 59 | from sklearn.feature_extraction import image 60 | from sklearn.cluster import FeatureAgglomeration 61 | 62 | from hidimstat.scenario import multivariate_simulation 63 | from hidimstat.stat_tools import zscore_from_pval, pval_from_cb 64 | from hidimstat.desparsified_lasso import desparsified_lasso 65 | from hidimstat.clustered_inference import clustered_inference 66 | from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference 67 | 68 | 69 | ############################################################################# 70 | # Specific plotting functions 71 | # --------------------------- 72 | # The functions below are used to plot the results and illustrate the concept 73 | # of spatial tolerance. If you are reading this example for the first time, 74 | # you can skip this section. 75 | # 76 | # The following function builds a 2D map with four active regions that are 77 | # enfolded by thin tolerance regions. 
78 | 79 | 80 | def weight_map_2D_extended(shape, roi_size, delta): 81 | '''Build weight map with visible tolerance region''' 82 | 83 | roi_size_extended = roi_size + delta 84 | 85 | w = np.zeros(shape + (5,)) 86 | w[0:roi_size, 0:roi_size, 0] = 0.5 87 | w[-roi_size:, -roi_size:, 1] = 0.5 88 | w[0:roi_size, -roi_size:, 2] = 0.5 89 | w[-roi_size:, 0:roi_size, 3] = 0.5 90 | w[0:roi_size_extended, 0:roi_size_extended, 0] += 0.5 91 | w[-roi_size_extended:, -roi_size_extended:, 1] += 0.5 92 | w[0:roi_size_extended, -roi_size_extended:, 2] += 0.5 93 | w[-roi_size_extended:, 0:roi_size_extended, 3] += 0.5 94 | 95 | for i in range(roi_size_extended): 96 | for j in range(roi_size_extended): 97 | if (i - roi_size) + (j - roi_size) >= delta: 98 | w[i, j, 0] = 0 99 | w[-i-1, -j-1, 1] = 0 100 | w[i, -j-1, 2] = 0 101 | w[-i-1, j, 3] = 0 102 | 103 | beta_extended = w.sum(-1).ravel() 104 | 105 | return beta_extended 106 | 107 | 108 | ############################################################################## 109 | # To generate a plot that exhibits the true support and the estimated 110 | # supports for every method, we define the two following functions: 111 | 112 | 113 | def add_one_subplot(ax, map, title): 114 | '''Add one subplot into the summary plot''' 115 | 116 | if map is not None: 117 | im = ax.imshow(map) 118 | im.set_clim(-1, 1) 119 | ax.tick_params( 120 | axis='both', 121 | which='both', 122 | bottom=False, 123 | top=False, 124 | left=False, 125 | labelbottom=False, 126 | labelleft=False) 127 | ax.set_title(title) 128 | else: 129 | ax.axis('off') 130 | ax.get_xaxis().set_visible(False) 131 | ax.get_yaxis().set_visible(False) 132 | 133 | 134 | def plot(maps, titles, save_fig=False): 135 | '''Make a summary plot from estimated supports''' 136 | 137 | fig, axes = plt.subplots(3, 2, figsize=(4, 6)) 138 | 139 | for i in range(3): 140 | for j in range(2): 141 | k = i * 2 + j 142 | add_one_subplot(axes[i][j], maps[k], titles[k]) 143 | 144 | fig.tight_layout() 145 | 146 | if save_fig: 147 | figname = 'figures/simu_2D.png' 148 | plt.savefig(figname) 149 | print(f'Save figure to {figname}') 150 | 151 | plt.show() 152 | 153 | 154 | ############################################################################## 155 | # Generating the data 156 | # ------------------- 157 | # 158 | # After setting the simulation parameters, we run the function that generates 159 | # the 2D scenario that we have briefly described in the first section of this 160 | # example. 161 | 162 | # simulation parameters 163 | n_samples = 100 164 | shape = (40, 40) 165 | n_features = shape[1] * shape[0] 166 | roi_size = 4 # size of the edge of the four predictive regions 167 | sigma = 2.0 # noise standard deviation 168 | smooth_X = 1.0 # level of spatial smoothing introduced by the Gaussian filter 169 | 170 | # generating the data 171 | X_init, y, beta, epsilon, _, _ = \ 172 | multivariate_simulation(n_samples, shape, roi_size, sigma, smooth_X, 173 | seed=1) 174 | 175 | ############################################################################## 176 | # Choosing inference parameters 177 | # ----------------------------- 178 | # 179 | # The choice of the number of clusters depends on several parameters, such as: 180 | # the structure of the data (a higher correlation between neighboring features 181 | # enable a greater dimension reduction, i.e. 
a smaller number of clusters), 182 | # the number of samples (small datasets require more dimension reduction) and 183 | # the required spatial tolerance (small clusters lead to limited spatial 184 | # uncertainty). Formally, "spatial tolerance" is defined by the largest 185 | # distance from the true support for which the occurence of a false discovery 186 | # is not statistically controlled (c.f. :ref:`References`). 187 | # Theoretically, the spatial tolerance ``delta`` is equal to the largest 188 | # cluster diameter. However this choice is conservative, notably in the case 189 | # of ensembled clustered inference. For these algorithms, we recommend to take 190 | # the average cluster radius. In this example, we choose ``n_clusters = 200``, 191 | # leading to a theoretical spatial tolerance ``delta = 6``. However, it 192 | # turns out that ``delta = 2``, the average cluster radius, would have been 193 | # sufficient for ensembled clustered inference algorithms (see Results). 194 | 195 | # hyper-parameters 196 | n_clusters = 200 197 | 198 | # inference parameters 199 | fwer_target = 0.1 200 | delta = 6 201 | 202 | # computation parameter 203 | n_jobs = 1 204 | 205 | ############################################################################## 206 | # Computing z-score thresholds for support estimation 207 | # --------------------------------------------------- 208 | # 209 | # Below, we translate the FWER target into z-score targets. 210 | # To compute the z-score targets we also take into account for the multiple 211 | # testing correction. To do so, we consider the Bonferroni correction. 212 | # For methods that do not reduce the feature space, the correction 213 | # consists in dividing the FWER target by the number of features. 214 | # For methods that group features into clusters, the correction 215 | # consists in dividing by the number of clusters. 216 | 217 | 218 | # computing the z-score thresholds for feature selection 219 | correction_no_cluster = 1. / n_features 220 | correction_cluster = 1. / n_clusters 221 | thr_c = zscore_from_pval((fwer_target / 2) * correction_cluster) 222 | thr_nc = zscore_from_pval((fwer_target / 2) * correction_no_cluster) 223 | 224 | ############################################################################# 225 | # Inference with several algorithms 226 | # --------------------------------- 227 | # 228 | # First, we compute a reference map that exhibits the true support and 229 | # the theoretical tolerance region. 230 | 231 | # compute true support with visible spatial tolerance 232 | beta_extended = weight_map_2D_extended(shape, roi_size, delta) 233 | 234 | ############################################################################# 235 | # Now, we compute the support estimated by a high-dimensional statistical 236 | # infernece method that does not leverage the data structure. This method 237 | # was introduced by Javanmard, A. et al. (2014), Zhang, C. H. et al. (2014) 238 | # and Van de Geer, S. et al.. (2014) (full references are available at 239 | # https://ja-che.github.io/hidimstat/). 240 | # and referred to as Desparsified Lasso. 
241 | 242 | # compute desparsified lasso 243 | beta_hat, cb_min, cb_max = desparsified_lasso(X_init, y, n_jobs=n_jobs) 244 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 245 | pval_from_cb(cb_min, cb_max) 246 | 247 | # compute estimated support (first method) 248 | zscore = zscore_from_pval(pval, one_minus_pval) 249 | selected_dl = zscore > thr_nc # use the "no clustering threshold" 250 | 251 | # compute estimated support (second method) 252 | selected_dl = np.logical_or(pval_corr < fwer_target / 2, 253 | one_minus_pval_corr < fwer_target / 2) 254 | 255 | ############################################################################# 256 | # Now, we compute the support estimated using a clustered inference algorithm 257 | # (c.f. :ref:`References`) called Clustered Desparsified Lasso (CluDL) since it 258 | # uses the Desparsified Lasso technique after clustering the data. 259 | 260 | # Define the FeatureAgglomeration object that performs the clustering. 261 | # This object is necessary to run the current algorithm and the following one. 262 | connectivity = image.grid_to_graph(n_x=shape[0], 263 | n_y=shape[1]) 264 | ward = FeatureAgglomeration(n_clusters=n_clusters, 265 | connectivity=connectivity, 266 | linkage='ward') 267 | 268 | # clustered desparsified lasso (CluDL) 269 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 270 | clustered_inference(X_init, y, ward, n_clusters) 271 | 272 | # compute estimated support (first method) 273 | zscore = zscore_from_pval(pval, one_minus_pval) 274 | selected_cdl = zscore > thr_c # use the "clustering threshold" 275 | 276 | # compute estimated support (second method) 277 | selected_cdl = np.logical_or(pval_corr < fwer_target / 2, 278 | one_minus_pval_corr < fwer_target / 2) 279 | 280 | ############################################################################# 281 | # Finally, we compute the support estimated by an ensembled clustered 282 | # inference algorithm (c.f. :ref:`References`). This algorithm is called 283 | # Ensemble of Clustered Desparsified Lasso (EnCluDL) since it runs several 284 | # CluDL algorithms with different clustering choices. The different CluDL 285 | # solutions are then aggregated into one. 286 | 287 | # ensemble of clustered desparsified lasso (EnCluDL) 288 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 289 | ensemble_clustered_inference(X_init, y, ward, 290 | n_clusters, train_size=0.3) 291 | 292 | # compute estimated support 293 | selected_ecdl = np.logical_or(pval_corr < fwer_target / 2, 294 | one_minus_pval_corr < fwer_target / 2) 295 | 296 | ############################################################################# 297 | # Results 298 | # ------- 299 | # 300 | # Now we plot the true support, the theoretical tolerance regions and 301 | # the estimated supports for every method. 
302 | 
303 | maps = []
304 | titles = []
305 | 
306 | maps.append(np.reshape(beta, shape))
307 | titles.append('True weights')
308 | 
309 | maps.append(np.reshape(beta_extended, shape))
310 | titles.append('True weights \nwith tolerance')
311 | 
312 | maps.append(np.reshape(selected_dl, shape))
313 | titles.append('Desparsified Lasso')
314 | 
315 | maps.append(None)
316 | titles.append(None)
317 | 
318 | maps.append(np.reshape(selected_cdl, shape))
319 | titles.append('CluDL')
320 | 
321 | maps.append(np.reshape(selected_ecdl, shape))
322 | titles.append('EnCluDL')
323 | 
324 | plot(maps, titles)
325 | 
326 | #############################################################################
327 | # Analysis of the results
328 | # -----------------------
329 | # As argued in the first section of this example, the standard method that
330 | # does not compress the problem is not relevant as it dramatically lacks power.
331 | # The support estimated from CluDL provides a more reasonable solution
332 | # since we recover the four regions. However, the shape of the estimated support
333 | # is a bit rough.
334 | # Finally, the solution provided by EnCluDL is more accurate since the shape
335 | # of the estimated support is closer to the true support.
336 | # Also, one can note that the theoretical spatial tolerance is quite
337 | # conservative. In practice, we argue that the statistical guarantees are valid
338 | # for a lower spatial tolerance thanks to the clustering randomization.
339 | 
--------------------------------------------------------------------------------
/hidimstat/desparsified_lasso.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from numpy.linalg import multi_dot
3 | from scipy import stats
4 | from scipy.linalg import inv
5 | from joblib import Parallel, delayed
6 | from sklearn.utils.validation import check_memory
7 | from sklearn.linear_model import Lasso
8 | 
9 | from .noise_std import reid, group_reid
10 | from .stat_tools import pval_from_two_sided_pval_and_sign
11 | 
12 | 
13 | def _compute_all_residuals(X, alphas, gram, max_iter=5000, tol=1e-3,
14 |                            method='lasso', n_jobs=1, verbose=0):
15 |     """Nodewise Lasso. Compute all the residuals: regressing each column of the
16 |     design matrix against the other columns"""
17 | 
18 |     n_samples, n_features = X.shape
19 | 
20 |     results = \
21 |         Parallel(n_jobs=n_jobs, verbose=verbose)(
22 |             delayed(_compute_residuals)
23 |             (X=X,
24 |              column_index=i,
25 |              alpha=alphas[i],
26 |              gram=gram,
27 |              max_iter=max_iter,
28 |              tol=tol,
29 |              method=method)
30 |             for i in range(n_features))
31 | 
32 |     results = np.asarray(results)
33 |     Z = np.stack(results[:, 0], axis=1)
34 |     omega_diag = np.stack(results[:, 1])
35 | 
36 |     return Z, omega_diag
37 | 
38 | 
39 | def _compute_residuals(X, column_index, alpha, gram, max_iter=5000,
40 |                        tol=1e-3, method='lasso'):
41 |     """Compute the residuals of the regression of a given column of the
42 |     design matrix against the other columns"""
43 | 
44 |     n_samples, n_features = X.shape
45 |     i = column_index
46 | 
47 |     X_new = np.delete(X, i, axis=1)
48 |     y = np.copy(X[:, i])
49 | 
50 |     if method == 'lasso':
51 | 
52 |         gram_ = np.delete(np.delete(gram, i, axis=0), i, axis=1)
53 |         clf = Lasso(alpha=alpha, precompute=gram_, max_iter=max_iter, tol=tol)
54 | 
55 |     else:
56 | 
57 |         raise ValueError("The only regression method available is 'lasso'")
58 | 
59 |     clf.fit(X_new, y)
60 |     z = y - clf.predict(X_new)
61 | 
62 |     omega_diag_i = n_samples * np.sum(z ** 2) / np.dot(y, z) ** 2
63 | 
64 |     return z, omega_diag_i
65 | 
66 | 
67 | def desparsified_lasso(X, y, dof_ajdustement=False,
68 |                        confidence=0.95, max_iter=5000, tol=1e-3,
69 |                        residual_method='lasso', alpha_max_fraction=0.01,
70 |                        n_jobs=1, memory=None, verbose=0):
71 | 
72 |     """Desparsified Lasso with confidence intervals
73 | 
74 |     Parameters
75 |     ----------
76 |     X : ndarray, shape (n_samples, n_features)
77 |         Data.
78 | 
79 |     y : ndarray, shape (n_samples,)
80 |         Target.
81 | 
82 |     dof_ajdustement : bool, optional (default=False)
83 |         If True, makes the degrees of freedom adjustment (cf. [4]_ and [5]_).
84 |         Otherwise, the original Desparsified Lasso estimator is computed
85 |         (cf. [1]_ and [2]_ and [3]_).
86 | 
87 |     confidence : float, optional (default=0.95)
88 |         Confidence level used to compute the confidence intervals.
89 |         Each value should be in the range [0, 1].
90 | 
91 |     max_iter : int, optional (default=5000)
92 |         The maximum number of iterations when regressing, by Lasso,
93 |         each column of the design matrix against the others.
94 | 
95 |     tol : float, optional (default=1e-3)
96 |         The tolerance for the optimization of the Lasso problems: if the
97 |         updates are smaller than `tol`, the optimization code checks the
98 |         dual gap for optimality and continues until it is smaller than `tol`.
99 | 
100 |     residual_method : str, optional (default='lasso')
101 |         Method used for computing the residuals of the Nodewise Lasso.
102 |         Currently the only method available is 'lasso'.
103 | 
104 |     alpha_max_fraction : float, optional (default=0.01)
105 |         Only used if residual_method='lasso'.
106 |         Then alpha = alpha_max_fraction * alpha_max.
107 | 
108 |     n_jobs : int or None, optional (default=1)
109 |         Number of CPUs to use during the Nodewise Lasso.
110 | 
111 |     memory : str or joblib.Memory object, optional (default=None)
112 |         Used to cache the output of the computation of the Nodewise Lasso.
113 |         By default, no caching is done. If a string is given, it is the path
114 |         to the caching directory.
115 | 
116 |     verbose : int, optional (default=0)
117 |         The verbosity level: if non zero, progress messages are printed
118 |         when computing the Nodewise Lasso in parallel.
119 |         The frequency of the messages increases with the verbosity level.
120 | 
121 |     Returns
122 |     -------
123 |     beta_hat : array, shape (n_features,)
124 |         Estimated parameter vector.
125 | 
126 |     cb_min : array, shape (n_features,)
127 |         Lower bound of the confidence intervals on the parameter vector.
128 | 
129 |     cb_max : array, shape (n_features,)
130 |         Upper bound of the confidence intervals on the parameter vector.
131 | 
132 |     Notes
133 |     -----
134 |     The columns of `X` and `y` are always centered; this ensures that
135 |     the intercepts of the Nodewise Lasso problems are all equal to zero
136 |     and the intercept of the noise model is also equal to zero. Since
137 |     the values of the intercepts are not of interest, the centering avoids
138 |     the consideration of unnecessary additional parameters.
139 |     Also, you may consider centering and scaling `X` beforehand, notably if
140 |     the data contained in `X` has not been prescaled from measurements.
141 | 
142 |     References
143 |     ----------
144 |     .. [1] Zhang, C. H., & Zhang, S. S. (2014). Confidence intervals for
145 |            low dimensional parameters in high dimensional linear models.
146 |            Journal of the Royal Statistical Society: Series B: Statistical
147 |            Methodology, 217-242.
148 | 
149 |     .. [2] Van de Geer, S., Bühlmann, P., Ritov, Y. A., & Dezeure, R. (2014).
150 |            On asymptotically optimal confidence regions and tests for
151 |            high-dimensional models. Annals of Statistics, 42(3), 1166-1202.
152 | 
153 |     .. [3] Javanmard, A., & Montanari, A. (2014). Confidence intervals and
154 |            hypothesis testing for high-dimensional regression. The Journal
155 |            of Machine Learning Research, 15(1), 2869-2909.
156 | 
157 |     .. [4] Bellec, P. C., & Zhang, C. H. (2019). De-biasing the lasso with
158 |            degrees-of-freedom adjustment. arXiv preprint arXiv:1902.08885.
159 | 
160 |     .. [5] Celentano, M., Montanari, A., & Wei, Y. (2020). The Lasso with
161 |            general Gaussian designs with applications to hypothesis testing.
162 |            arXiv preprint arXiv:2007.13716.
163 | """ 164 | 165 | X = np.asarray(X) 166 | 167 | n_samples, n_features = X.shape 168 | 169 | memory = check_memory(memory) 170 | 171 | y = y - np.mean(y) 172 | X = X - np.mean(X, axis=0) 173 | gram = np.dot(X.T, X) 174 | gram_nodiag = gram - np.diag(np.diag(gram)) 175 | 176 | list_alpha_max = np.max(np.abs(gram_nodiag), axis=0) / n_samples 177 | alphas = alpha_max_fraction * list_alpha_max 178 | 179 | # Calculating precision matrix (Nodewise Lasso) 180 | Z, omega_diag = memory.cache(_compute_all_residuals, ignore=['n_jobs'])( 181 | X, alphas, gram, max_iter=max_iter, tol=tol, 182 | method=residual_method, n_jobs=n_jobs, verbose=verbose) 183 | 184 | # Lasso regression 185 | sigma_hat, beta_lasso = reid(X, y, n_jobs=n_jobs) 186 | 187 | # Computing the degrees of freedom adjustement 188 | if dof_ajdustement: 189 | coef_max = np.max(np.abs(beta_lasso)) 190 | support = np.sum(np.abs(beta_lasso) > 0.01 * coef_max) 191 | support = min(support, n_samples - 1) 192 | dof_factor = n_samples / (n_samples - support) 193 | else: 194 | dof_factor = 1 195 | 196 | # Computing Desparsified Lasso estimator and confidence intervals 197 | beta_bias = dof_factor * np.dot(y.T, Z) / np.sum(X * Z, axis=0) 198 | 199 | P = ((Z.T.dot(X)).T / np.sum(X * Z, axis=0)).T 200 | P_nodiag = P - np.diag(np.diag(P)) 201 | Id = np.identity(n_features) 202 | P_nodiag = dof_factor * P_nodiag + (dof_factor - 1) * Id 203 | 204 | beta_hat = beta_bias - P_nodiag.dot(beta_lasso) 205 | 206 | omega_diag = omega_diag * dof_factor ** 2 207 | omega_invsqrt_diag = omega_diag ** (-0.5) 208 | 209 | quantile = stats.norm.ppf(1 - (1 - confidence) / 2) 210 | 211 | confint_radius = np.abs(quantile * sigma_hat / 212 | (np.sqrt(n_samples) * omega_invsqrt_diag)) 213 | cb_max = beta_hat + confint_radius 214 | cb_min = beta_hat - confint_radius 215 | 216 | return beta_hat, cb_min, cb_max 217 | 218 | 219 | def desparsified_group_lasso(X, Y, cov=None, test='chi2', 220 | max_iter=5000, tol=1e-3, residual_method='lasso', 221 | alpha_max_fraction=0.01, noise_method='AR', 222 | order=1, n_jobs=1, memory=None, verbose=0): 223 | """Desparsified Group Lasso 224 | 225 | Parameters 226 | ---------- 227 | X : ndarray, shape (n_samples, n_features) 228 | Data. 229 | 230 | Y : ndarray, shape (n_samples, n_times) 231 | Target. 232 | 233 | cov : ndarray, shape (n_times, n_times), optional (default=None) 234 | If None, a temporal covariance matrix of the noise is estimated. 235 | Otherwise, `cov` is the temporal covariance matrix of the noise. 236 | 237 | test : str, optional (default='chi2') 238 | Statistical test used to compute p-values. 'chi2' corresponds 239 | to a chi-squared test and 'F' corresponds to an F-test. 240 | 241 | max_iter : int, optional (default=5000) 242 | The maximum number of iterations when regressing, by Lasso, 243 | each column of the design matrix against the others. 244 | 245 | tol : float, optional (default=1e-3) 246 | The tolerance for the optimization of the Lasso problems: if the 247 | updates are smaller than `tol`, the optimization code checks the 248 | dual gap for optimality and continues until it is smaller than `tol`. 249 | 250 | residual_method : str, optional (default='lasso') 251 | Method used for computing the residuals of the Nodewise Lasso. 252 | Currently the only method available is 'lasso'. 253 | 254 | alpha_max_fraction : float, optional (default=0.01) 255 | Only used if method='lasso'. 256 | Then alpha = alpha_max_fraction * alpha_max. 
257 | 
258 |     noise_method : str, optional (default='AR')
259 |         If 'simple', the correlation matrix is estimated by taking the
260 |         median of the correlation between two consecutive time steps
261 |         and the noise standard deviation for each time step is estimated
262 |         by taking the median of the standard deviations for every time step.
263 |         If 'AR', the order of the AR model is given by `order` and
264 |         Yule-Walker method is used to estimate the covariance matrix.
265 | 
266 |     order : int, optional (default=1)
267 |         If `noise_method='AR'`, `order` gives the order of the estimated autoregressive
268 |         model. `order` must be smaller than the number of time steps.
269 | 
270 |     n_jobs : int or None, optional (default=1)
271 |         Number of CPUs to use during the Nodewise Lasso.
272 | 
273 |     memory : str or joblib.Memory object, optional (default=None)
274 |         Used to cache the output of the computation of the Nodewise Lasso.
275 |         By default, no caching is done. If a string is given, it is the path
276 |         to the caching directory.
277 | 
278 |     verbose : int, optional (default=0)
279 |         The verbosity level: if non zero, progress messages are printed
280 |         when computing the Nodewise Lasso in parallel.
281 |         The frequency of the messages increases with the verbosity level.
282 | 
283 |     Returns
284 |     -------
285 |     beta_hat : ndarray, shape (n_features, n_times)
286 |         Estimated parameter matrix.
287 | 
288 |     pval : ndarray, shape (n_features,)
289 |         p-value, with numerically accurate values for
290 |         positive effects (i.e., for p-value close to zero).
291 | 
292 |     pval_corr : ndarray, shape (n_features,)
293 |         p-value corrected for multiple testing.
294 | 
295 |     one_minus_pval : ndarray, shape (n_features,)
296 |         One minus the p-value, with numerically accurate values
297 |         for negative effects (i.e., for p-value close to one).
298 | 
299 |     one_minus_pval_corr : ndarray, shape (n_features,)
300 |         One minus the p-value corrected for multiple testing.
301 |     Notes
302 |     -----
303 |     The columns of `X` and the matrix `Y` are always centered; this ensures
304 |     that the intercepts of the Nodewise Lasso problems are all equal to zero
305 |     and the intercept of the noise model is also equal to zero. Since
306 |     the values of the intercepts are not of interest, the centering avoids
307 |     the consideration of unnecessary additional parameters.
308 |     Also, you may consider centering and scaling `X` beforehand, notably if
309 |     the data contained in `X` has not been prescaled from measurements.
310 | 
311 |     References
312 |     ----------
313 |     .. [1] Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020).
314 |            Statistical control for spatio-temporal MEG/EEG source imaging with
315 |            desparsified multi-task Lasso. In NeurIPS 2020 - 34th Conference on
316 |            Neural Information Processing Systems.
317 | """ 318 | 319 | X = np.asarray(X) 320 | 321 | n_samples, n_features = X.shape 322 | n_times = Y.shape[1] 323 | 324 | memory = check_memory(memory) 325 | 326 | if cov is not None and cov.shape != (n_times, n_times): 327 | raise ValueError(f'Shape of "cov" should be ({n_times}, {n_times}),' + 328 | f' the shape of "cov" was ({cov.shape}) instead') 329 | 330 | Y = Y - np.mean(Y) 331 | X = X - np.mean(X, axis=0) 332 | gram = np.dot(X.T, X) 333 | gram_nodiag = gram - np.diag(np.diag(gram)) 334 | 335 | list_alpha_max = np.max(np.abs(gram_nodiag), axis=0) / n_samples 336 | alphas = alpha_max_fraction * list_alpha_max 337 | 338 | # Calculating precision matrix (Nodewise Lasso) 339 | Z, omega_diag = memory.cache(_compute_all_residuals, ignore=['n_jobs'])( 340 | X, alphas, gram, max_iter=max_iter, tol=tol, 341 | method=residual_method, n_jobs=n_jobs, verbose=verbose) 342 | 343 | # Group Lasso regression 344 | cov_hat, beta_mtl = \ 345 | group_reid(X, Y, method=noise_method, order=order, n_jobs=n_jobs) 346 | 347 | if cov is not None: 348 | cov_hat = cov 349 | 350 | theta_hat = n_samples * inv(cov_hat) 351 | 352 | # Estimating the coefficient vector 353 | beta_bias = Y.T.dot(Z) / np.sum(X * Z, axis=0) 354 | 355 | beta_mtl = beta_mtl.T 356 | beta_bias = beta_bias.T 357 | 358 | P = (np.dot(X.T, Z) / np.sum(X * Z, axis=0)).T 359 | P_nodiag = P - np.diag(np.diag(P)) 360 | 361 | beta_hat = beta_bias - P_nodiag.dot(beta_mtl) 362 | 363 | if test == 'chi2': 364 | 365 | chi2_scores = \ 366 | np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) / omega_diag 367 | two_sided_pval = \ 368 | np.minimum(2 * stats.chi2.sf(chi2_scores, df=n_times), 1.0) 369 | 370 | if test == 'F': 371 | 372 | f_scores = (np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) / 373 | omega_diag / n_times) 374 | two_sided_pval = \ 375 | np.minimum(2 * stats.f.sf(f_scores, dfd=n_samples, dfn=n_times), 376 | 1.0) 377 | 378 | sign_beta = np.sign(np.sum(beta_hat, axis=1)) 379 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 380 | pval_from_two_sided_pval_and_sign(two_sided_pval, sign_beta) 381 | 382 | return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr 383 | --------------------------------------------------------------------------------
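A minimal usage sketch for `desparsified_lasso` together with the `stat_tools` helpers defined above. The import paths and function signatures are taken from the files in this repository; the simulated data (sample size, number of features, sparsity level, random seed) is an illustrative choice and not part of the package. The feature-selection rule mirrors the Bonferroni-corrected thresholding used in `examples/plot_2D_simulation_example.py`.

import numpy as np

from hidimstat.desparsified_lasso import desparsified_lasso
from hidimstat.stat_tools import pval_from_cb, zscore_from_pval

# Illustrative high-dimensional regression problem (shapes chosen arbitrarily):
# 100 samples, 200 features, 5 non-zero coefficients.
rng = np.random.RandomState(0)
n_samples, n_features, n_relevant = 100, 200, 5
X = rng.randn(n_samples, n_features)
beta = np.zeros(n_features)
beta[:n_relevant] = 2.0
y = X.dot(beta) + rng.randn(n_samples)

# Desparsified Lasso estimate with 95% confidence intervals.
beta_hat, cb_min, cb_max = desparsified_lasso(X, y, confidence=0.95)

# Convert the confidence bounds into one-sided p-values and their
# Bonferroni-corrected counterparts.
pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_cb(
    cb_min, cb_max, confidence=0.95)

# Select features at a family-wise error rate target of 0.1, testing both
# positive and negative effects (hence the division by two).
fwer_target = 0.1
selected = np.logical_or(pval_corr < fwer_target / 2,
                         one_minus_pval_corr < fwer_target / 2)

# z-scores corresponding to the one-sided p-values, as used for plotting
# in the examples above.
zscore = zscore_from_pval(pval, one_minus_pval)

print("selected features:", np.where(selected)[0])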