├── hidimstat
    ├── knockoffs
    │   ├── tests
    │   │   ├── __init__.py
    │   │   ├── test_stat_coef_diff.py
    │   │   ├── test_data_simulation.py
    │   │   ├── test_model_x_knockoff.py
    │   │   ├── test_generate_knockoff.py
    │   │   └── test_knockoff_aggregation.py
    │   ├── __init__.py
    │   ├── data_simulation.py
    │   ├── knockoffs.py
    │   ├── knockoff_aggregation.py
    │   ├── stat_coef_diff.py
    │   ├── utils.py
    │   └── gaussian_knockoff.py
    ├── version.py
    ├── setup.py
    ├── __init__.py
    ├── test
    │   ├── test_permutation_test.py
    │   ├── test_standardized_svr.py
    │   ├── test_multi_sample_split.py
    │   ├── test_adaptive_permutation_threshold.py
    │   ├── test_desparsified_lasso.py
    │   ├── test_clustered_inference.py
    │   ├── test_ensemble_clustered_inference.py
    │   ├── test_noise_std.py
    │   ├── test_scenario.py
    │   └── test_stat_tools.py
    ├── standardized_svr.py
    ├── adaptive_permutation_threshold.py
    ├── multi_sample_split.py
    ├── ensemble_clustered_inference.py
    ├── permutation_test.py
    ├── scenario.py
    ├── noise_std.py
    ├── clustered_inference.py
    ├── stat_tools.py
    └── desparsified_lasso.py
├── requirements.txt
├── examples
    ├── figures
    │   ├── fig1_nguyen_et_al.png
    │   ├── meg_somato_sLORETA.png
    │   └── meg_somato_cd-MTLasso.png
    ├── README.txt
    ├── plot_fmri_data_example.py
    └── plot_2D_simulation_example.py
├── doc
    ├── doc-requirements.txt
    ├── api.rst
    ├── _static
    │   └── style.css
    ├── Makefile
    ├── index.rst
    └── conf.py
├── .gitignore
├── .github
    └── workflows
    │   ├── circle_artifacts.yml
    │   └── deploy_ghpages.yml
├── MANIFEST.in
├── codecov.yml
├── .travis.yml
├── LICENSE
├── setup.py
├── .circleci
    └── config.yml
├── examples_not_exhibited
    └── plot_fig_1_nguyen_et_al.py
└── README.md

/hidimstat/knockoffs/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/hidimstat/version.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.1.0'
2 | 
--------------------------------------------------------------------------------
/hidimstat/knockoffs/tests/test_stat_coef_diff.py:
--------------------------------------------------------------------------------
1 | # To be done
2 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | joblib
3 | scipy
4 | scikit-learn
5 | 
--------------------------------------------------------------------------------
/examples/figures/fig1_nguyen_et_al.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ja-che/hidimstat/HEAD/examples/figures/fig1_nguyen_et_al.png
--------------------------------------------------------------------------------
/examples/figures/meg_somato_sLORETA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ja-che/hidimstat/HEAD/examples/figures/meg_somato_sLORETA.png
--------------------------------------------------------------------------------
/examples/figures/meg_somato_cd-MTLasso.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ja-che/hidimstat/HEAD/examples/figures/meg_somato_cd-MTLasso.png
--------------------------------------------------------------------------------
/examples/README.txt:
--------------------------------------------------------------------------------
1 | ..
_general_examples: 2 | 3 | Examples Gallery 4 | ================ 5 | 6 | .. contents:: Contents 7 | :local: 8 | :depth: 3 9 | -------------------------------------------------------------------------------- /doc/doc-requirements.txt: -------------------------------------------------------------------------------- 1 | joblib 2 | numpy 3 | numpydoc 4 | matplotlib 5 | pandas 6 | pillow 7 | scikit-learn 8 | scipy 9 | sphinx-bootstrap-theme 10 | sphinx-gallery 11 | mne 12 | pyvista 13 | pyvistaqt 14 | PyQt5 15 | nilearn 16 | memory_profiler 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Cache directories and files 2 | build 3 | dist 4 | doc 5 | examples/figures 6 | joblib 7 | .venv 8 | .pytest_cache 9 | .mypy_cache/ 10 | *.pyc 11 | __pycache__ 12 | *.egg-info 13 | .coverage 14 | 15 | # IDE specific folders 16 | .vscode 17 | 18 | .DS_Store 19 | coverage.xml 20 | 21 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/tests/test_data_simulation.py: -------------------------------------------------------------------------------- 1 | from hidimstat.knockoffs.data_simulation import simu_data 2 | 3 | n = 100 4 | p = 200 5 | seed = 42 6 | 7 | 8 | def test_simu_data(): 9 | X, y, _, _ = simu_data(n, p, seed=seed) 10 | 11 | assert X.shape == (n, p) 12 | assert y.size == n 13 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/__init__.py: -------------------------------------------------------------------------------- 1 | from .gaussian_knockoff import gaussian_knockoff_generation 2 | from .knockoffs import model_x_knockoff 3 | from .knockoff_aggregation import knockoff_aggregation 4 | from .stat_coef_diff import stat_coef_diff 5 | 6 | 7 | __all__ = [ 8 | 'gaussian_knockoff_generation', 9 | 'knockoff_aggregation', 10 | 'model_x_knockoff', 11 | 'stat_coef_diff', 12 | ] 13 | -------------------------------------------------------------------------------- /hidimstat/setup.py: -------------------------------------------------------------------------------- 1 | def configuration(parent_package='', top_path=None): 2 | from numpy.distutils.misc_util import Configuration 3 | 4 | config = Configuration('plotting', parent_package, top_path) 5 | 6 | config.add_subpackage('tests') 7 | 8 | return config 9 | 10 | 11 | if __name__ == '__main__': 12 | from numpy.distutils.core import setup 13 | setup(**configuration(top_path='').todict()) 14 | -------------------------------------------------------------------------------- /.github/workflows/circle_artifacts.yml: -------------------------------------------------------------------------------- 1 | on: [status] 2 | jobs: 3 | circleci_artifacts_redirector_job: 4 | runs-on: ubuntu-20.04 5 | name: Run CircleCI artifacts redirector 6 | steps: 7 | - name: GitHub Action step 8 | uses: larsoner/circleci-artifacts-redirector-action@master 9 | with: 10 | repo-token: ${{ secrets.GITHUB_TOKEN }} 11 | artifact-path: 0/dev/index.html 12 | circleci-jobs: build_docs 13 | job-title: Check the rendered docs here! 
14 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/tests/test_model_x_knockoff.py: -------------------------------------------------------------------------------- 1 | from hidimstat.knockoffs.data_simulation import simu_data 2 | from hidimstat.knockoffs import model_x_knockoff 3 | from hidimstat.knockoffs.utils import cal_fdp_power 4 | 5 | seed = 0 6 | fdr = 0.5 7 | 8 | 9 | def test_model_x_knockoff(): 10 | 11 | n = 300 12 | p = 100 13 | X, y, _, non_zero = simu_data(n, p, seed=seed) 14 | ko_result = model_x_knockoff(X, y, fdr=fdr, seed=seed+1) 15 | fdp, power = cal_fdp_power(ko_result, non_zero) 16 | 17 | assert fdp <= 0.2 18 | assert power > 0.7 19 | -------------------------------------------------------------------------------- /doc/api.rst: -------------------------------------------------------------------------------- 1 | .. _api_documentation: 2 | 3 | ================= 4 | API Documentation 5 | ================= 6 | 7 | Estimators 8 | ========== 9 | 10 | .. currentmodule:: hidimstat 11 | 12 | Functions 13 | ========= 14 | 15 | .. autosummary:: 16 | :toctree: generated/ 17 | 18 | ada_svr 19 | aggregate_quantiles 20 | clustered_inference 21 | desparsified_lasso 22 | ensemble_clustered_inference 23 | group_reid 24 | hd_inference 25 | multivariate_1D_simulation 26 | permutation_test_cv 27 | reid 28 | standardized_svr 29 | zscore_from_pval 30 | -------------------------------------------------------------------------------- /doc/_static/style.css: -------------------------------------------------------------------------------- 1 | 2 | blockquote p { 3 | font-size: 14px !important; 4 | } 5 | 6 | blockquote { 7 | margin: 0 0 4px !important; 8 | } 9 | 10 | code { 11 | color: #49759c !important; 12 | background-color: #f3f5f9 !important; 13 | } 14 | 15 | .alert-info { 16 | background-color: #adb8cb !important; 17 | border-color: #adb8cb !important; 18 | color: #2c3e50 !important; 19 | } 20 | 21 | .function dt { 22 | padding-top: 150px; 23 | margin-top: -150px; 24 | -webkit-background-clip: content-box; 25 | background-clip: content-box; 26 | } 27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include .circleci/config.yml 2 | include build_package.sh 3 | include *.txt 4 | include *.yml 5 | include LICENSE 6 | 7 | recursive-include doc *.css 8 | recursive-include doc *.py 9 | recursive-include doc *.rst 10 | recursive-include doc *.txt 11 | recursive-include doc Makefile 12 | recursive-include examples *.py 13 | recursive-include examples *.txt 14 | recursive-include examples_not_exhibited *.py 15 | recursive-include hidimstat *.py 16 | 17 | recursive-exclude doc/_build * 18 | recursive-exclude doc/generated * 19 | recursive-exclude doc/auto_examples * 20 | recursive-exclude examples/figures *.png 21 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | token: b7d1afb7-9730-4e21-882a-d0e893108def 3 | 4 | comment: false 5 | 6 | coverage: 7 | precision: 2 8 | round: down 9 | range: "70...100" 10 | status: 11 | project: 12 | default: 13 | # Commits pushed to master should not make the overall 14 | # project coverage decrease by more than 2%: 15 | target: auto 16 | threshold: 2% 17 | patch: 18 | default: 19 | # Be tolerant on slight code coverage diff on PRs to limit 20 
| # noisy red coverage status on github PRs. 21 | # Note The coverage stats are still uploaded 22 | # to codecov so that PR reviewers can see uncovered lines 23 | # in the github diff if they install the codecov browser 24 | # extension: 25 | # https://github.com/codecov/browser-extension 26 | target: auto 27 | threshold: 2% 28 | 29 | ignore: 30 | - "**/setup.py" 31 | 32 | -------------------------------------------------------------------------------- /hidimstat/__init__.py: -------------------------------------------------------------------------------- 1 | from .clustered_inference import clustered_inference, hd_inference 2 | from .desparsified_lasso import desparsified_lasso, desparsified_group_lasso 3 | from .ensemble_clustered_inference import ensemble_clustered_inference 4 | from .adaptive_permutation_threshold import ada_svr 5 | from .multi_sample_split import aggregate_quantiles 6 | from .noise_std import reid, group_reid 7 | from .permutation_test import permutation_test_cv 8 | from .scenario import multivariate_1D_simulation 9 | from .standardized_svr import standardized_svr 10 | from .stat_tools import zscore_from_pval 11 | from .version import __version__ 12 | 13 | __all__ = [ 14 | 'aggregate_quantiles', 15 | 'clustered_inference', 16 | 'desparsified_lasso', 17 | 'desparsified_group_lasso', 18 | 'ensemble_clustered_inference', 19 | 'ada_svr', 20 | 'group_reid', 21 | 'hd_inference', 22 | 'multivariate_1D_simulation', 23 | 'permutation_test_cv', 24 | 'reid', 25 | 'standardized_svr', 26 | 'zscore_from_pval', 27 | '__version__', 28 | ] 29 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/tests/test_generate_knockoff.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Authors: Binh Nguyen 3 | 4 | from hidimstat.knockoffs.data_simulation import simu_data 5 | from hidimstat.knockoffs.gaussian_knockoff import ( 6 | _estimate_distribution, gaussian_knockoff_generation) 7 | 8 | SEED = 42 9 | fdr = 0.1 10 | 11 | 12 | def test_estimate_distribution(): 13 | n = 100 14 | p = 50 15 | X, y, _, non_zero = simu_data(n, p, seed=SEED) 16 | mu, Sigma = _estimate_distribution(X, cov_estimator='ledoit_wolf') 17 | 18 | assert mu.size == p 19 | assert Sigma.shape == (p, p) 20 | 21 | mu, Sigma = _estimate_distribution(X, cov_estimator='graph_lasso') 22 | 23 | assert mu.size == p 24 | assert Sigma.shape == (p, p) 25 | 26 | 27 | def test_gaussian_knockoff_equi(): 28 | n = 100 29 | p = 50 30 | X, y, _, non_zero = simu_data(n, p, seed=SEED) 31 | mu, Sigma = _estimate_distribution(X, cov_estimator='ledoit_wolf') 32 | 33 | X_tilde = gaussian_knockoff_generation( 34 | X, mu, Sigma, method='equi', seed=SEED*2) 35 | 36 | assert X_tilde.shape == (n, p) 37 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/tests/test_knockoff_aggregation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hidimstat.knockoffs import knockoff_aggregation, model_x_knockoff 3 | from hidimstat.knockoffs.data_simulation import simu_data 4 | from hidimstat.knockoffs.utils import cal_fdp_power 5 | 6 | n = 500 7 | p = 100 8 | snr = 5 9 | n_bootstraps = 25 10 | fdr = 0.5 11 | X, y, _, non_zero_index = simu_data(n, p, snr=snr, seed=0) 12 | 13 | 14 | def test_knockoff_aggregation(): 15 | 16 | selected, aggregated_pval, pvals = knockoff_aggregation( 17 | X, y, fdr=fdr, n_bootstraps=n_bootstraps, 
verbose=True, random_state=0) 18 | 19 | fdp, power = cal_fdp_power(selected, non_zero_index) 20 | 21 | assert pvals.shape == (n_bootstraps, p) 22 | assert fdp < 0.5 23 | assert power > 0.1 24 | 25 | # Single AKO (or vanilla KO) 26 | selected = knockoff_aggregation( 27 | X, y, fdr=fdr, verbose=False, n_bootstraps=1, random_state=5) 28 | 29 | selected_ko = model_x_knockoff(X, y, fdr=fdr, seed=5) 30 | 31 | np.testing.assert_array_equal(selected, selected_ko) 32 | 33 | fdp, power = cal_fdp_power(selected, non_zero_index) 34 | 35 | assert fdp < 0.5 36 | assert power > 0.1 37 | -------------------------------------------------------------------------------- /hidimstat/test/test_permutation_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the permutation test module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal 7 | 8 | from hidimstat.scenario import multivariate_1D_simulation 9 | from hidimstat.permutation_test import permutation_test_cv 10 | 11 | 12 | def test_permutation_test(): 13 | '''Testing the procedure on a simulation with no structure and a support 14 | of size 1. Computing one-sided p-values, we want a low p-value 15 | for the first feature and p-values close to 0.5 for the others.''' 16 | 17 | n_samples, n_features = 20, 50 18 | support_size = 1 19 | sigma = 0.1 20 | rho = 0.0 21 | 22 | X_init, y, beta, noise = \ 23 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 24 | support_size=support_size, sigma=sigma, 25 | rho=rho, shuffle=False, seed=3) 26 | 27 | y = y - np.mean(y) 28 | X_init = X_init - np.mean(X_init, axis=0) 29 | 30 | pval_corr, one_minus_pval_corr = \ 31 | permutation_test_cv(X_init, y, n_permutations=100) 32 | 33 | expected = 0.5 * np.ones(n_features) 34 | expected[:support_size] = 0.0 35 | 36 | assert_almost_equal(pval_corr, expected, decimal=1) 37 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | notifications: 2 | email: false 3 | dist: bionic # ubuntu 18.04 4 | language: python 5 | os: linux 6 | jobs: 7 | include: 8 | - python: "3.6" 9 | env: ONLY_PYTEST=true 10 | 11 | before_install: 12 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 13 | - bash miniconda.sh -b -p $HOME/miniconda 14 | - export PATH="$HOME/miniconda/bin:$PATH" 15 | - conda config --set always_yes yes --set changeps1 no --set show_channel_urls yes 16 | - conda config --set channel_priority strict 17 | - conda config --set add_pip_as_python_dependency yes 18 | - conda config --remove channels defaults 19 | - conda config --add channels conda-forge 20 | - conda update -q conda 21 | 22 | install: 23 | - conda install --yes python=$TRAVIS_PYTHON_VERSION pip numpy scipy scikit-learn joblib pytest coverage -yq 24 | - pip install -U mne 25 | - pip install check-manifest flake8 26 | script: 27 | - set -e # exit at first failure otherwise test might fail but build still passes 28 | - check-manifest; 29 | - flake8 hidimstat examples; 30 | - if [ "$ONLY_PYTEST" = true ]; then 31 | coverage run -m pytest; 32 | coverage report; 33 | coverage html; 34 | fi 35 | - export CODECOV_TOKEN="b7d1afb7-9730-4e21-882a-d0e893108def" 36 | - bash <(curl -s https://codecov.io/bash) 37 | -------------------------------------------------------------------------------- /hidimstat/test/test_standardized_svr.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Test the standardized_svr module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal 7 | 8 | from hidimstat.scenario import multivariate_1D_simulation 9 | from hidimstat.stat_tools import pval_from_scale 10 | from hidimstat.standardized_svr import standardized_svr 11 | 12 | 13 | def test_standardized_svr(): 14 | '''Testing the procedure on a simulation with no structure and a support 15 | of size 1. Computing one-sided p-values, we want a low p-value 16 | for the first feature and p-values close to 0.5 for the others.''' 17 | 18 | n_samples, n_features = 20, 50 19 | support_size = 1 20 | sigma = 0.1 21 | rho = 0.0 22 | 23 | X_init, y, beta, noise = \ 24 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 25 | support_size=support_size, sigma=sigma, 26 | rho=rho, shuffle=False, seed=3) 27 | 28 | y = y - np.mean(y) 29 | X_init = X_init - np.mean(X_init, axis=0) 30 | 31 | beta_hat, scale_hat = standardized_svr(X_init, y) 32 | 33 | pval, pval_corr, _, _ = pval_from_scale(beta_hat, scale_hat) 34 | 35 | expected = 0.5 * np.ones(n_features) 36 | expected[:support_size] = 0.0 37 | 38 | assert_almost_equal(pval_corr, expected, decimal=1) 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2020, Jerome-Alexis Chevalier 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /hidimstat/test/test_multi_sample_split.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the multi_sample_split module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal, assert_equal 7 | 8 | from hidimstat.multi_sample_split import aggregate_medians, aggregate_quantiles 9 | 10 | 11 | def test_aggregate_medians(): 12 | '''Aggregated p-values is twice the median p-value. 
All p-values should 13 | be close to 0.04 and decreasing with respect to feature position.''' 14 | 15 | n_iter, n_features = 20, 5 16 | list_pval = (1.0 / (np.arange(n_iter * n_features) + 1)) 17 | list_pval = list_pval.reshape((n_iter, n_features)) 18 | list_pval[15:, :] = 3e-3 19 | 20 | pval = aggregate_medians(list_pval) 21 | expected = 0.04 * np.ones(n_features) 22 | 23 | assert_almost_equal(pval, expected, decimal=2) 24 | assert_equal(pval[-2] >= pval[-1], True) 25 | 26 | 27 | def test_aggregate_quantiles(): 28 | '''Aggregated p-values from adaptive quantiles formula. All p-values should 29 | be close to 0.04 and decreasing with respect to feature position.''' 30 | 31 | n_iter, n_features = 20, 5 32 | list_pval = (1.0 / (np.arange(n_iter * n_features) + 1)) 33 | list_pval = list_pval.reshape((n_iter, n_features)) 34 | list_pval[15:, :] = 3e-3 35 | 36 | pval = aggregate_quantiles(list_pval) 37 | expected = 0.03 * np.ones(n_features) 38 | 39 | assert_almost_equal(pval, expected, decimal=2) 40 | assert_equal(pval[-2] >= pval[-1], True) 41 | -------------------------------------------------------------------------------- /hidimstat/test/test_adaptive_permutation_threshold.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the adaptive_permutation_threshold module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal 7 | 8 | from hidimstat.scenario import multivariate_1D_simulation 9 | from hidimstat.stat_tools import pval_from_scale 10 | from hidimstat.adaptive_permutation_threshold import ada_svr 11 | 12 | 13 | def test_ada_svr(): 14 | '''Testing the procedure on a simulation with no structure and a support 15 | of size 1. Computing one-sided p-values, we want a low p-value 16 | for the first feature and p-values close to 0.5 for the others.''' 17 | 18 | n_samples, n_features = 20, 50 19 | support_size = 1 20 | sigma = 0.1 21 | rho = 0.0 22 | 23 | X_init, y, beta, noise = \ 24 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 25 | support_size=support_size, sigma=sigma, 26 | rho=rho, shuffle=False, seed=3) 27 | 28 | y = y - np.mean(y) 29 | X_init = X_init - np.mean(X_init, axis=0) 30 | 31 | beta_hat, scale_hat = ada_svr(X_init, y) 32 | 33 | pval, pval_corr, _, _ = pval_from_scale(beta_hat, scale_hat) 34 | 35 | expected = 0.5 * np.ones(n_features) 36 | expected[:support_size] = 0.0 37 | 38 | assert_almost_equal(pval[:support_size], expected[:support_size], 39 | decimal=1) 40 | assert_almost_equal(pval_corr[support_size:], expected[support_size:], 41 | decimal=1) 42 | -------------------------------------------------------------------------------- /hidimstat/standardized_svr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | from sklearn.svm import LinearSVR 4 | from sklearn.model_selection import GridSearchCV 5 | from sklearn.pipeline import Pipeline 6 | 7 | 8 | def standardized_svr(X, y, Cs=np.logspace(-7, 1, 9), n_jobs=1): 9 | """Cross-validated SVR 10 | 11 | Parameters 12 | ----------- 13 | X : ndarray, shape (n_samples, n_features) 14 | Data. 15 | 16 | y : ndarray, shape (n_samples,) 17 | Target. 18 | 19 | Cs : ndarray, optional (default=np.logspace(-7, 1, 9)) 20 | The linear SVR regularization parameter is set by cross-val running 21 | a grid search on the list of hyper-parameters contained in Cs. 
22 | 23 | n_jobs : int or None, optional (default=1) 24 | Number of CPUs to use during the cross validation. 25 | 26 | Returns 27 | ------- 28 | beta_hat : array, shape (n_features,) 29 | Estimated parameter vector. 30 | 31 | scale : ndarray, shape (n_features,) 32 | Value of the standard deviation of the parameters. 33 | """ 34 | 35 | n_samples, n_features = X.shape 36 | 37 | steps = [('SVR', LinearSVR())] 38 | pipeline = Pipeline(steps) 39 | parameters = {'SVR__C': Cs} 40 | 41 | grid = GridSearchCV(pipeline, param_grid=parameters, n_jobs=n_jobs) 42 | grid.fit(X, y) 43 | 44 | beta_hat = grid.best_estimator_.named_steps['SVR'].coef_ 45 | 46 | std = norm(beta_hat) / np.sqrt(n_features) 47 | scale = std * np.ones(beta_hat.size) 48 | 49 | return beta_hat, scale 50 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/data_simulation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.linalg import toeplitz 3 | 4 | 5 | def simu_data(n, p, rho=0.25, snr=2.0, sparsity=0.06, effect=1.0, seed=None): 6 | """Function to simulate data follow an autoregressive structure with Toeplitz 7 | covariance matrix 8 | 9 | Parameters 10 | ---------- 11 | n : int 12 | number of observations 13 | p : int 14 | number of variables 15 | sparsity : float, optional 16 | ratio of number of variables with non-zero coefficients over total 17 | coefficients 18 | rho : float, optional 19 | correlation parameter 20 | effect : float, optional 21 | signal magnitude, value of non-null coefficients 22 | seed : None or Int, optional 23 | random seed for generator 24 | 25 | Returns 26 | ------- 27 | X : ndarray, shape (n, p) 28 | Design matrix resulted from simulation 29 | y : ndarray, shape (n, ) 30 | Response vector resulted from simulation 31 | beta_true : ndarray, shape (n, ) 32 | Vector of true coefficient value 33 | non_zero : ndarray, shape (n, ) 34 | Vector of non zero coefficients index 35 | 36 | """ 37 | # Setup seed generator 38 | rng = np.random.default_rng(seed) 39 | 40 | # Number of non-null 41 | k = int(sparsity * p) 42 | 43 | # Generate the variables from a multivariate normal distribution 44 | mu = np.zeros(p) 45 | Sigma = toeplitz(rho ** np.arange(0, p)) # covariance matrix of X 46 | # X = np.dot(np.random.normal(size=(n, p)), cholesky(Sigma)) 47 | X = rng.multivariate_normal(mu, Sigma, size=(n)) 48 | # Generate the response from a linear model 49 | non_zero = rng.choice(p, k) 50 | beta_true = np.zeros(p) 51 | beta_true[non_zero] = effect 52 | eps = rng.standard_normal(size=n) 53 | prod_temp = np.dot(X, beta_true) 54 | noise_mag = np.linalg.norm(prod_temp) / (snr * np.linalg.norm(eps)) 55 | y = prod_temp + noise_mag * eps 56 | 57 | return X, y, beta_true, non_zero 58 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | from setuptools import find_packages 7 | 8 | PKG = 'hidimstat' 9 | DESCRIPTION = "High-dimensional statistical inference tools for Python" 10 | LONG_DESCRIPTION = open('README.md').read() 11 | MAINTAINER = 'Chevalier (ja-che) and Nguyen (tbng)' 12 | MAINTAINER_EMAIL = 'jerome-alexis_chevalier@hotmail.fr' 13 | URL = 'https://github.com/ja-che/hidimstat' 14 | DOWNLOAD_URL = 'https://github.com/ja-che/hidimstat' 15 | LICENSE = 'BSD' 16 | 17 | 18 | def load_version(): 19 | """Executes 
hidimstat/version.py in a globals dictionary and return it. 20 | Following format from Nilearn repo on github. 21 | """ 22 | # load all vars into globals, otherwise 23 | # the later function call using global vars doesn't work. 24 | globals_dict = {} 25 | with open(os.path.join('hidimstat', 'version.py')) as fp: 26 | exec(fp.read(), globals_dict) 27 | 28 | return globals_dict 29 | 30 | 31 | def setup_package(version): 32 | local_path = os.path.dirname(os.path.abspath(sys.argv[0])) 33 | 34 | os.chdir(local_path) 35 | sys.path.insert(0, local_path) 36 | 37 | from numpy.distutils.core import setup 38 | 39 | setup( 40 | packages=find_packages(exclude=['contrib', 'docs', 'tests']), 41 | name=PKG, 42 | maintainer=MAINTAINER, 43 | include_package_data=True, 44 | maintainer_email=MAINTAINER_EMAIL, 45 | description=DESCRIPTION, 46 | long_description=LONG_DESCRIPTION, 47 | long_description_content_type='text/markdown', 48 | license=LICENSE, 49 | url=URL, 50 | version=version, 51 | # download_url=DOWNLOAD_URL, 52 | zip_safe=False, # the package can run out of an .egg file 53 | classifiers=[ 54 | 'Programming Language :: Python', 55 | 'Programming Language :: Python :: 3.5', 56 | 'Development Status :: 3 - Alpha' 57 | ], 58 | ) 59 | 60 | 61 | _VERSION_GLOBALS = load_version() 62 | VERSION = _VERSION_GLOBALS['__version__'] 63 | 64 | if __name__ == "__main__": 65 | setup_package(VERSION) 66 | -------------------------------------------------------------------------------- /hidimstat/adaptive_permutation_threshold.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def ada_svr(X, y, rcond=1e-3): 5 | """Statistical inference procedure presented in Gaonkar et al. [1]_. 6 | 7 | Parameters 8 | ----------- 9 | X : ndarray, shape (n_samples, n_features) 10 | Data. 11 | 12 | y : ndarray, shape (n_samples,) 13 | Target. 14 | 15 | rcond : float, optional (default=1e-3) 16 | Cutoff for small singular values. Singular values smaller 17 | than `rcond` * largest_singular_value are set to zero. 18 | 19 | Returns 20 | ------- 21 | beta_hat : array, shape (n_features,) 22 | Estimated parameter vector. 23 | 24 | scale : ndarray, shape (n_features,) 25 | Value of the standard deviation of the parameters. 26 | 27 | References 28 | ---------- 29 | .. [1] Gaonkar, B., & Davatzikos, C. (2012, October). Deriving statistical 30 | significance maps for SVM based image classification and group 31 | comparisons. In International Conference on Medical Image Computing 32 | and Computer-Assisted Intervention (pp. 723-730). Springer, Berlin, 33 | Heidelberg. 
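    Examples
    --------
    A minimal sketch of the call signature (random inputs, for illustration
    only; see the accompanying tests for a complete scenario):

    >>> import numpy as np
    >>> from hidimstat import ada_svr
    >>> X = np.random.randn(20, 50)
    >>> y = np.random.randn(20)
    >>> beta_hat, scale = ada_svr(X, y)
    >>> beta_hat.shape, scale.shape
    ((50,), (50,))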
34 | """ 35 | 36 | X = np.asarray(X) 37 | n_samples, n_features = X.shape 38 | 39 | K = _manual_inverting(np.dot(X, X.T), rcond=rcond) 40 | sum_K = np.sum(K) 41 | 42 | L = - np.outer(np.sum(K, axis=0), np.sum(K, axis=1)) / sum_K 43 | C = np.dot(X.T, K + L) 44 | 45 | beta_hat = np.dot(C, y) 46 | 47 | scale = np.sqrt(np.sum(C ** 2, axis=1)) 48 | 49 | return beta_hat, scale 50 | 51 | 52 | def _manual_inverting(X, rcond=1e-3, full_rank=False): 53 | 'Inverting taking care of low eigenvalues to increase numerical stability' 54 | 55 | X = np.asarray(X) 56 | n_samples, n_features = X.shape 57 | 58 | if n_samples != n_features: 59 | raise ValueError('The matrix is not a square matrix') 60 | 61 | U, s, V = np.linalg.svd(X, full_matrices=False) 62 | rank = np.sum(s > rcond * s.max()) 63 | s_inv = np.zeros(np.size(s)) 64 | s_inv[:rank] = 1 / s[:rank] 65 | 66 | if full_rank: 67 | s_inv[rank:] = 1 / (rcond * s.max()) 68 | 69 | X_inv = np.linalg.multi_dot([U, np.diag(s_inv), V]) 70 | 71 | return X_inv 72 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | _xvfb: &xvfb 4 | name: Start Xvfb virtual framebuffer 5 | command: | 6 | echo "export DISPLAY=:99" >> $BASH_ENV 7 | /sbin/start-stop-daemon --start --quiet --pidfile /tmp/custom_xvfb_99.pid --make-pidfile --background --exec /usr/bin/Xvfb -- :99 -screen 0 1280x1024x24 -ac +extension GLX +render -noreset -nolisten tcp -nolisten unix 8 | 9 | jobs: 10 | build_docs: 11 | docker: 12 | - image: circleci/python:3.8.5-buster 13 | steps: 14 | - checkout 15 | - run: 16 | name: Set BASH_ENV 17 | command: | 18 | set -e 19 | echo "set -e" >> $BASH_ENV 20 | echo "export XDG_RUNTIME_DIR=/tmp/runtime-circleci" >> $BASH_ENV 21 | echo "export MNE_3D_BACKEND=pyvista" >> $BASH_ENV 22 | echo "export PYTHONUNBUFFERED=1" >> $BASH_ENV 23 | echo "BASH_ENV:" 24 | cat $BASH_ENV 25 | - run: 26 | <<: *xvfb 27 | - run: 28 | name: Install OpenGL 29 | command: | 30 | sudo apt-get update --allow-releaseinfo-change 31 | sudo apt-get install -y git libopenmpi-dev openmpi-bin 32 | sudo apt-get install libosmesa6 libglx-mesa0 libopengl0 libglx0 libdbus-1-3 \ 33 | libxkbcommon-x11-0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0 \ 34 | libxcb-render-util0 libxcb-shape0 libxcb-xfixes0 libxcb-xinerama0 35 | sudo ln -s /usr/lib/x86_64-linux-gnu/libxcb-util.so.0 /usr/lib/x86_64-linux-gnu/libxcb-util.so.1 36 | - run: 37 | name: Install dependencies 38 | command: | 39 | python -m pip install --progress-bar off --upgrade pip setuptools wheel 40 | python -m pip install --progress-bar off -r doc/doc-requirements.txt 41 | python -m pip install -e . 
42 | echo "localhost slots=50">hostfile 43 | - run: 44 | name: Check PyQt5 45 | command: LD_DEBUG=libs python -c "from PyQt5.QtWidgets import QApplication, QWidget; app = QApplication([])" 46 | - run: 47 | name: Check installation 48 | command: | 49 | which python 50 | QT_DEBUG_PLUGINS=1 mne sys_info 51 | python -c "import numpy; numpy.show_config()" 52 | LIBGL_DEBUG=verbose python -c "import pyvistaqt; pyvistaqt.BackgroundPlotter(show=True)" 53 | - run: 54 | name: make html 55 | no_output_timeout: 30m 56 | command: | 57 | cd doc; 58 | export OMP_NUM_THREADS=1; 59 | make html; 60 | - store_artifacts: 61 | path: doc/_build/html/ 62 | destination: dev 63 | 64 | workflows: 65 | version: 2 66 | 67 | default: 68 | jobs: 69 | - build_docs 70 | -------------------------------------------------------------------------------- /.github/workflows/deploy_ghpages.yml: -------------------------------------------------------------------------------- 1 | name: Deploy GitHub pages 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: main 7 | 8 | jobs: 9 | build_docs: 10 | runs-on: ubuntu-latest 11 | env: 12 | DISPLAY: ':99' 13 | defaults: 14 | run: 15 | shell: bash 16 | steps: 17 | - uses: actions/checkout@v2 18 | - run: | 19 | sudo apt-get update 20 | sudo apt-get install -y libgl1-mesa-glx 21 | sudo apt-get install -yqq libxkbcommon-x11-0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-randr0 libxcb-render-util0 libxcb-xinerama0 libxcb-xfixes0 libopengl0 22 | /sbin/start-stop-daemon --start --quiet --pidfile /tmp/custom_xvfb_99.pid --make-pidfile --background --exec /usr/bin/Xvfb -- :99 -screen 0 1400x900x24 -ac +extension GLX +render -noreset 23 | name: 'Setup xvfb' 24 | - run: | 25 | sudo apt-get install -y git libopenmpi-dev openmpi-bin 26 | pip install -r doc/doc-requirements.txt 27 | pip install --progress-bar off vtk==9.0.20210612.dev0 28 | pip install --progress-bar off https://github.com/sphinx-gallery/sphinx-gallery/zipball/master 29 | pip install -e . 30 | echo "localhost slots=50">hostfile 31 | name: 'Install dependencies' 32 | - run: | 33 | LD_DEBUG=libs python -c "from PyQt5.QtWidgets import QApplication, QWidget; app = QApplication([])" 34 | name: 'Check PyQt5' 35 | - run: | 36 | which python 37 | QT_DEBUG_PLUGINS=1 mne sys_info 38 | python -c "import numpy; numpy.show_config()" 39 | LIBGL_DEBUG=verbose python -c "import pyvistaqt; pyvistaqt.BackgroundPlotter(show=True)" 40 | name: 'Check installation' 41 | - run: sphinx-build -b html doc doc/_build/html 42 | name: 'Generate HTML docs' 43 | - name: Upload generated HTML as artifact 44 | uses: actions/upload-artifact@v2 45 | with: 46 | name: DocHTML 47 | path: doc/_build/html/ 48 | 49 | deploy_docs: 50 | if: ${{ github.ref == 'refs/heads/main' }} 51 | needs: 52 | build_docs 53 | runs-on: ubuntu-latest 54 | steps: 55 | - uses: actions/checkout@v2 56 | - name: Download artifacts 57 | uses: actions/download-artifact@v4.1.7 58 | with: 59 | name: DocHTML 60 | path: doc/_build/html/ 61 | - name: Commit to documentation branch 62 | run: | 63 | git clone --no-checkout --depth 1 https://github.com/${{ github.repository_owner }}/hidimstat.git --branch gh-pages --single-branch gh-pages 64 | cp -r doc/_build/html/* gh-pages/ 65 | cd gh-pages 66 | touch .nojekyll 67 | git config --local user.email "hidimstat@github.com" 68 | git config --local user.name "hidimstat GitHub Action" 69 | git add . 
70 | git commit -m "Update documentation" -a || true 71 | - name: Push changes 72 | uses: ad-m/github-push-action@v0.6.0 73 | with: 74 | branch: gh-pages 75 | directory: gh-pages 76 | github_token: ${{ secrets.GITHUB_TOKEN }} 77 | -------------------------------------------------------------------------------- /hidimstat/multi_sample_split.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def aggregate_medians(list_one_sided_pval): 5 | """Aggregation of survival function values taking twice the median 6 | 7 | Parameters 8 | ----------- 9 | list_one_sided_pval : ndarray, shape (n_iter, n_features) 10 | List of one-sided p-values. 11 | 12 | Returns 13 | ------- 14 | one_sided_pval : ndarray, shape (n_features,) 15 | Aggregated one-sided p-values. 16 | 17 | References 18 | ---------- 19 | .. [1] Meinshausen, N., Meier, L., & Bühlmann, P. (2009). P-values for 20 | high-dimensional regression. Journal of the American Statistical 21 | Association, 104(488), 1671-1681. 22 | """ 23 | 24 | n_iter, n_features = list_one_sided_pval.shape 25 | 26 | one_sided_pval = np.median(list_one_sided_pval, axis=0) 27 | one_sided_pval[one_sided_pval > 0.5] = \ 28 | np.maximum(0.5, 1 - (1 - one_sided_pval[one_sided_pval > 0.5]) * 2) 29 | one_sided_pval[one_sided_pval < 0.5] = \ 30 | np.minimum(0.5, one_sided_pval[one_sided_pval < 0.5] * 2) 31 | 32 | return one_sided_pval 33 | 34 | 35 | def aggregate_quantiles(list_one_sided_pval, gamma_min=0.2): 36 | """Aggregation of survival function values by adaptive quantile procedure 37 | 38 | Parameters 39 | ----------- 40 | list_one_sided_pval : ndarray, shape (n_iter, n_features) 41 | List of one-sided p-values. 42 | 43 | gamma_min : float, optional (default=0.2) 44 | Lowest gamma-quantile being considered to compute the adaptive 45 | quantile aggregation formula (cf. [1]_). 46 | 47 | Returns 48 | ------- 49 | one_sided_pval : ndarray, shape (n_features,) 50 | Aggregated one-sided p-values. 51 | 52 | References 53 | ---------- 54 | .. [1] Meinshausen, N., Meier, L., & Bühlmann, P. (2009). P-values for 55 | high-dimensional regression. Journal of the American Statistical 56 | Association, 104(488), 1671-1681. 
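    Examples
    --------
    A minimal sketch on synthetic p-values, shaped like those used in the
    accompanying tests (values are illustrative only):

    >>> import numpy as np
    >>> from hidimstat import aggregate_quantiles
    >>> list_pval = (1.0 / np.arange(1, 101)).reshape((20, 5))
    >>> aggregate_quantiles(list_pval).shape
    (5,)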
57 | """ 58 | 59 | n_iter, n_features = list_one_sided_pval.shape 60 | one_sided_pval = 0.5 * np.ones(n_features) 61 | 62 | m = n_iter + 1 63 | k = np.maximum(1, int(np.floor(gamma_min * n_iter))) 64 | r = 1 - np.log(gamma_min) 65 | seq = range(k, n_iter) 66 | 67 | ordered_pval = np.sort(list_one_sided_pval, axis=0) 68 | rev_ordered_pval = ordered_pval[::-1] 69 | 70 | for i in np.arange(n_features): 71 | 72 | adjusted_ordered_pval = \ 73 | min([ordered_pval[j, i] * m / (j + 1) for j in seq]) 74 | adjusted_ordered_pval = min(0.5, adjusted_ordered_pval) 75 | 76 | adjusted_rev_ordered_pval = \ 77 | max([1 - (1 - rev_ordered_pval[j, i]) * m / (j + 1) for j in seq]) 78 | adjusted_rev_ordered_pval = max(0.5, adjusted_rev_ordered_pval) 79 | 80 | if (1 - adjusted_rev_ordered_pval) < adjusted_ordered_pval: 81 | 82 | one_sided_pval[i] = \ 83 | np.maximum(0.5, 1 - (1 - adjusted_rev_ordered_pval) * r) 84 | 85 | else: 86 | 87 | one_sided_pval[i] = np.minimum(0.5, adjusted_ordered_pval * r) 88 | 89 | return one_sided_pval 90 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/knockoffs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Authors: Binh Nguyen 3 | """ 4 | Implementation of Model-X knockoffs inference procedure, introduced in 5 | Candes et. al. (2016) " Panning for Gold: Model-X Knockoffs for 6 | High-dimensional Controlled Variable Selection" 7 | 8 | """ 9 | import numpy as np 10 | from sklearn.preprocessing import StandardScaler 11 | from sklearn.utils.validation import check_memory 12 | 13 | from .gaussian_knockoff import (_estimate_distribution, 14 | gaussian_knockoff_generation) 15 | from .stat_coef_diff import _coef_diff_threshold, stat_coef_diff 16 | 17 | 18 | def model_x_knockoff(X, y, fdr=0.1, offset=1, method='equi', 19 | statistics='lasso_cv', shrink=False, centered=True, 20 | cov_estimator='ledoit_wolf', verbose=False, memory=None, 21 | n_jobs=1, seed=None): 22 | """Model-X Knockoff inference procedure to control False Discoveries Rate, 23 | based on Candes et. al. 
(2016) 24 | 25 | Parameters 26 | ---------- 27 | X : 2D ndarray (n_samples, n_features) 28 | design matrix 29 | 30 | y : 1D ndarray (n_samples, ) 31 | response vector 32 | 33 | fdr : float, optional 34 | desired controlled FDR level 35 | 36 | offset : int, 0 or 1, optional 37 | offset to calculate knockoff threshold, offset = 1 is equivalent to 38 | knockoff+ 39 | 40 | method : str, optional 41 | knockoff construction methods, either equi for equi-correlated knockoff 42 | or sdp for optimization scheme 43 | 44 | statistics : str, optional 45 | method to calculate knockoff test score 46 | 47 | shrink : bool, optional 48 | whether to shrink the empirical covariance matrix 49 | 50 | centered : bool, optional 51 | whether to standardize the data before doing the inference procedure 52 | 53 | cov_estimator : str, optional 54 | method of empirical covariance matrix estimation 55 | 56 | seed : int or None, optional 57 | random seed used to generate Gaussian knockoff variable 58 | 59 | Returns 60 | ------- 61 | selected : 1D array, int 62 | vector of index of selected variables 63 | 64 | test_score : 1D array, (n_features, ) 65 | vector of test statistic 66 | 67 | thres : float 68 | knockoff threshold 69 | 70 | X_tilde : 2D array, (n_samples, n_features) 71 | knockoff design matrix 72 | """ 73 | memory = check_memory(memory) 74 | 75 | if centered: 76 | X = StandardScaler().fit_transform(X) 77 | 78 | mu, Sigma = _estimate_distribution( 79 | X, shrink=shrink, cov_estimator=cov_estimator) 80 | 81 | X_tilde = gaussian_knockoff_generation(X, mu, Sigma, memory=memory, 82 | method=method, seed=seed) 83 | test_score = memory.cache( 84 | stat_coef_diff, ignore=['n_jobs', 'joblib_verbose'])( 85 | X, X_tilde, y, method=statistics, n_jobs=n_jobs) 86 | thres = _coef_diff_threshold(test_score, fdr=fdr, offset=offset) 87 | 88 | selected = np.where(test_score >= thres)[0] 89 | 90 | if verbose: 91 | return selected, test_score, thres, X_tilde 92 | 93 | return selected 94 | -------------------------------------------------------------------------------- /hidimstat/test/test_desparsified_lasso.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the desparsified_lasso module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal, assert_equal 7 | from scipy.linalg import toeplitz 8 | 9 | from hidimstat.scenario import multivariate_1D_simulation 10 | from hidimstat.scenario import multivariate_temporal_simulation 11 | from hidimstat.desparsified_lasso import desparsified_lasso 12 | from hidimstat.desparsified_lasso import desparsified_group_lasso 13 | 14 | 15 | def test_desparsified_lasso(): 16 | '''Testing the procedure on a simulation with no structure and 17 | a support of size 1. 
Computing 99% confidence bounds and checking 18 | that they contains the true parameter vector.''' 19 | 20 | n_samples, n_features = 50, 50 21 | support_size = 1 22 | sigma = 0.1 23 | rho = 0.0 24 | 25 | X, y, beta, noise = \ 26 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 27 | support_size=support_size, sigma=sigma, 28 | rho=rho, shuffle=False, seed=2) 29 | 30 | beta_hat, cb_min, cb_max = desparsified_lasso(X, y, confidence=0.99) 31 | 32 | assert_almost_equal(beta_hat, beta, decimal=1) 33 | assert_equal(cb_min < beta, True) 34 | assert_equal(cb_max > beta, True) 35 | 36 | beta_hat, cb_min, cb_max = \ 37 | desparsified_lasso(X, y, dof_ajdustement=True, confidence=0.99) 38 | 39 | assert_almost_equal(beta_hat, beta, decimal=1) 40 | assert_equal(cb_min < beta, True) 41 | assert_equal(cb_max > beta, True) 42 | 43 | 44 | def test_desparsified_group_lasso(): 45 | '''Testing the procedure on a simulation with no structure and 46 | a support of size 2. Computing one-sided p-values, we want 47 | low p-values for the features of the support and p-values 48 | close to 0.5 for the others.''' 49 | 50 | n_samples = 50 51 | n_features = 100 52 | n_times = 10 53 | support_size = 2 54 | sigma = 0.1 55 | rho = 0.9 56 | corr = toeplitz(np.geomspace(1, rho ** (n_times - 1), n_times)) 57 | cov = np.outer(sigma, sigma) * corr 58 | 59 | X, Y, beta, noise = \ 60 | multivariate_temporal_simulation(n_samples=n_samples, 61 | n_features=n_features, 62 | n_times=n_times, 63 | support_size=support_size, 64 | sigma=sigma, rho_noise=rho) 65 | 66 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 67 | desparsified_group_lasso(X, Y, cov=cov) 68 | 69 | expected_pval_corr = \ 70 | np.concatenate((np.zeros(support_size), 71 | 0.5 * np.ones(n_features - support_size))) 72 | 73 | assert_almost_equal(beta_hat, beta, decimal=1) 74 | assert_almost_equal(pval_corr, expected_pval_corr, decimal=1) 75 | 76 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 77 | desparsified_group_lasso(X, Y, test='F') 78 | 79 | assert_almost_equal(beta_hat, beta, decimal=1) 80 | assert_almost_equal(pval_corr, expected_pval_corr, decimal=1) 81 | 82 | # Testing error is raised when the covariance matrix has wrong shape 83 | bad_cov = np.delete(cov, 0, axis=1) 84 | np.testing.assert_raises(ValueError, desparsified_group_lasso, 85 | X=X, Y=Y, cov=bad_cov) 86 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/knockoff_aggregation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Authors: Binh Nguyen 3 | import numpy as np 4 | from joblib import Parallel, delayed 5 | from sklearn.preprocessing import StandardScaler 6 | from sklearn.utils import check_random_state 7 | from sklearn.utils.validation import check_memory 8 | 9 | from .gaussian_knockoff import (_estimate_distribution, 10 | gaussian_knockoff_generation) 11 | from .stat_coef_diff import stat_coef_diff 12 | from .utils import fdr_threshold, quantile_aggregation 13 | 14 | 15 | def knockoff_aggregation(X, y, centered=True, shrink=False, 16 | construct_method='equi', fdr=0.1, fdr_control='bhq', 17 | reshaping_function=None, offset=1, 18 | statistic='lasso_cv', cov_estimator='ledoit_wolf', 19 | joblib_verbose=0, n_bootstraps=25, n_jobs=1, 20 | adaptive_aggregation=False, gamma=0.5, gamma_min=0.05, 21 | verbose=False, memory=None, random_state=None): 22 | 23 | # unnecessary to have n_jobs > number of bootstraps 24 | 
n_jobs = min(n_bootstraps, n_jobs) 25 | 26 | if centered: 27 | X = StandardScaler().fit_transform(X) 28 | 29 | mu, Sigma = _estimate_distribution( 30 | X, shrink=shrink, cov_estimator=cov_estimator) 31 | 32 | mem = check_memory(memory) 33 | stat_coef_diff_cached = mem.cache(stat_coef_diff, 34 | ignore=['n_jobs', 'joblib_verbose']) 35 | 36 | if n_bootstraps == 1: 37 | X_tilde = gaussian_knockoff_generation( 38 | X, mu, Sigma, method=construct_method, 39 | memory=memory, seed=random_state) 40 | ko_stat = stat_coef_diff_cached(X, X_tilde, y, method=statistic) 41 | pvals = _empirical_pval(ko_stat, offset) 42 | threshold = fdr_threshold(pvals, fdr=fdr, 43 | method=fdr_control) 44 | selected = np.where(pvals <= threshold)[0] 45 | 46 | if verbose: 47 | return selected, pvals 48 | 49 | return selected 50 | 51 | if isinstance(random_state, (int, np.int32, np.int64)): 52 | rng = check_random_state(random_state) 53 | elif random_state is None: 54 | rng = check_random_state(0) 55 | else: 56 | raise TypeError('Wrong type for random_state') 57 | 58 | seed_list = rng.randint(1, np.iinfo(np.int32).max, n_bootstraps) 59 | parallel = Parallel(n_jobs, verbose=joblib_verbose) 60 | X_tildes = parallel(delayed(gaussian_knockoff_generation)( 61 | X, mu, Sigma, method=construct_method, memory=memory, 62 | seed=seed) for seed in seed_list) 63 | 64 | ko_stats = parallel(delayed(stat_coef_diff_cached)( 65 | X, X_tildes[i], y, method=statistic) for i in range(n_bootstraps)) 66 | 67 | pvals = np.array([_empirical_pval(ko_stats[i], offset) 68 | for i in range(n_bootstraps)]) 69 | 70 | aggregated_pval = quantile_aggregation( 71 | pvals, gamma=gamma, gamma_min=gamma_min, 72 | adaptive=adaptive_aggregation) 73 | 74 | threshold = fdr_threshold(aggregated_pval, fdr=fdr, method=fdr_control, 75 | reshaping_function=reshaping_function) 76 | selected = np.where(aggregated_pval <= threshold)[0] 77 | 78 | if verbose: 79 | return selected, aggregated_pval, pvals 80 | 81 | return selected 82 | 83 | 84 | def _empirical_pval(test_score, offset=1): 85 | 86 | pvals = [] 87 | n_features = test_score.size 88 | 89 | if offset not in (0, 1): 90 | raise ValueError("'offset' must be either 0 or 1") 91 | 92 | test_score_inv = -test_score 93 | for i in range(n_features): 94 | if test_score[i] <= 0: 95 | pvals.append(1) 96 | else: 97 | pvals.append( 98 | (offset + np.sum(test_score_inv >= test_score[i])) / 99 | n_features 100 | ) 101 | 102 | return np.array(pvals) 103 | -------------------------------------------------------------------------------- /hidimstat/test/test_clustered_inference.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the clustered_inference module 3 | """ 4 | 5 | import numpy as np 6 | from sklearn.cluster import FeatureAgglomeration 7 | from sklearn.feature_extraction import image 8 | from numpy.testing import assert_almost_equal 9 | 10 | from hidimstat.scenario import multivariate_1D_simulation 11 | from hidimstat.scenario import multivariate_temporal_simulation 12 | from hidimstat.clustered_inference import clustered_inference 13 | 14 | 15 | def test_clustered_inference(): 16 | '''Testing the procedure on two simulations with a 1D data structure and 17 | with n << p: the first test has no temporal dimension, the second has a 18 | temporal dimension. The support is connected and of size 10, it must be 19 | recovered with a small spatial tolerance parametrized by `margin_size`. 
20 | Computing one sided p-values, we want low p-values for the features of 21 | the support and p-values close to 0.5 for the others.''' 22 | 23 | # Scenario 1: data with no temporal dimension 24 | # ########################################### 25 | n_samples, n_features = 100, 2000 26 | support_size = 10 27 | sigma = 5.0 28 | rho = 0.95 29 | n_clusters = 200 30 | margin_size = 5 31 | interior_support = support_size - margin_size 32 | extended_support = support_size + margin_size 33 | 34 | X_init, y, beta, epsilon = \ 35 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 36 | support_size=support_size, sigma=sigma, 37 | rho=rho, shuffle=False, seed=2) 38 | 39 | y = y - np.mean(y) 40 | X_init = X_init - np.mean(X_init, axis=0) 41 | 42 | connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) 43 | ward = FeatureAgglomeration(n_clusters=n_clusters, 44 | connectivity=connectivity, 45 | linkage='ward') 46 | 47 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 48 | clustered_inference(X_init, y, ward, n_clusters) 49 | 50 | expected = 0.5 * np.ones(n_features) 51 | expected[:support_size] = 0.0 52 | 53 | assert_almost_equal(pval_corr[:interior_support], 54 | expected[:interior_support]) 55 | assert_almost_equal(pval_corr[extended_support:200], 56 | expected[extended_support:200], 57 | decimal=1) 58 | 59 | # Scenario 2: temporal data 60 | # ######################### 61 | n_samples, n_features, n_times = 200, 2000, 10 62 | support_size = 10 63 | sigma = 5.0 64 | rho_noise = 0.9 65 | rho_data = 0.9 66 | n_clusters = 200 67 | margin_size = 5 68 | interior_support = support_size - margin_size 69 | extended_support = support_size + margin_size 70 | 71 | X, Y, beta, noise = \ 72 | multivariate_temporal_simulation(n_samples=n_samples, 73 | n_features=n_features, 74 | n_times=n_times, 75 | support_size=support_size, 76 | sigma=sigma, 77 | rho_noise=rho_noise, 78 | rho_data=rho_data, 79 | shuffle=False) 80 | 81 | connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) 82 | ward = FeatureAgglomeration(n_clusters=n_clusters, 83 | connectivity=connectivity, 84 | linkage='ward') 85 | 86 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 87 | clustered_inference(X, Y, ward, n_clusters, 88 | method='desparsified-group-lasso') 89 | 90 | expected = 0.5 * np.ones(n_features) 91 | expected[:support_size] = 0.0 92 | 93 | assert_almost_equal(pval_corr[:interior_support], 94 | expected[:interior_support], 95 | decimal=3) 96 | assert_almost_equal(pval_corr[extended_support:], 97 | expected[extended_support:], 98 | decimal=1) 99 | -------------------------------------------------------------------------------- /examples_not_exhibited/plot_fig_1_nguyen_et_al.py: -------------------------------------------------------------------------------- 1 | # Authors: Binh Nguyen 2 | """ 3 | Work in Progress : Histogram of KO vs AKO performance 4 | ===================================================== 5 | 6 | Example: reproducing Figure 1 in:: 7 | 8 | Nguyen et al. (2020) Aggregation of Multiple Knockoffs 9 | https://arxiv.org/abs/2002.09269 10 | 11 | To reduce the script runtime it is desirable to increase n_jobs parameter. 
12 | """ 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | from hidimstat.knockoffs import knockoff_aggregation, model_x_knockoff 16 | from hidimstat.knockoffs.data_simulation import simu_data 17 | from hidimstat.knockoffs.utils import cal_fdp_power 18 | from joblib import Parallel, delayed 19 | from sklearn.preprocessing import StandardScaler 20 | 21 | color_blue = '#1f77b4' 22 | color_teal = '#1fbecf' 23 | 24 | 25 | def one_inference(n, p, snr, rho, sparsity, n_bootstraps=25, gamma=0.3, 26 | n_jobs=1, offset=1, fdr=0.1, seed=None): 27 | 28 | # Simulate data following autoregressive structure, seed is fixed to ensure 29 | # doing inference on only 1 simulation 30 | X, y, _, non_zero_index = simu_data(n=n, p=p, rho=rho, snr=snr, 31 | sparsity=sparsity, seed=42) 32 | X = StandardScaler().fit_transform(X) 33 | 34 | # Single knockoff -- has to do it 25 times to match the number of 35 | # bootstraps in AKO for fair comparison 36 | ko_fdps = [] 37 | ko_powers = [] 38 | 39 | for i in range(n_bootstraps): 40 | ko_selected = model_x_knockoff(X, y, fdr=fdr, offset=offset, 41 | n_jobs=n_jobs, seed=n_bootstraps*seed) 42 | ko_fdp, ko_power = cal_fdp_power(ko_selected, non_zero_index) 43 | ko_fdps.append(ko_fdp) 44 | ko_powers.append(ko_power) 45 | 46 | # Aggregated knockoff 47 | ako_selected = knockoff_aggregation(X, y, fdr=fdr, offset=offset, 48 | n_jobs=n_jobs, gamma=gamma, 49 | n_bootstraps=n_bootstraps, 50 | random_state=seed*2) 51 | 52 | ako_fdp, ako_power = cal_fdp_power(ako_selected, non_zero_index) 53 | 54 | return ko_fdps, ako_fdp, ko_powers, ako_power 55 | 56 | 57 | def plot(results, n_simu, fdr): 58 | 59 | ko_fdps = np.array([results[i][0] for i in range(n_simu)]).ravel() 60 | ako_fdps = np.array([results[i][1] for i in range(n_simu)]).ravel() 61 | ko_powers = np.array([results[i][2] for i in range(n_simu)]).ravel() 62 | ako_powers = np.array([results[i][3] for i in range(n_simu)]).ravel() 63 | 64 | # Plotting 65 | n_bins = 30 66 | fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(7, 4)) 67 | ax1.tick_params(labelsize=14) 68 | ax1.hist(ko_fdps, edgecolor='k', 69 | range=[0.0, 1.0], bins=n_bins, color=color_blue) 70 | ax1.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0) 71 | ax2.tick_params(labelsize=14) 72 | ax2.hist(ko_powers, edgecolor='k', 73 | range=[0.0, 1.0], bins=n_bins, color=color_blue) 74 | ax3.tick_params(labelsize=14) 75 | ax3.hist(ako_fdps, edgecolor='k', 76 | range=[0.0, 1.0], bins=n_bins, color=color_teal) 77 | ax3.axvline(x=fdr, linestyle='--', color='r', linewidth=1.0) 78 | ax4.tick_params(labelsize=14) 79 | ax4.hist(ako_powers, edgecolor='k', 80 | range=[0.0, 1.0], bins=n_bins, color=color_teal) 81 | plt.tight_layout() 82 | 83 | figname = 'figures/histogram_ko_vs_ako.png' 84 | plt.savefig(figname) 85 | print(f'Save figure to {figname}') 86 | 87 | 88 | def main(): 89 | # Simulation paramaters 90 | n, p = 50, 200 91 | snr = 3.0 92 | rho = 0.5 93 | sparsity = 0.06 94 | offset = 1 95 | fdr = 0.05 96 | gamma = 0.3 97 | n_bootstraps = 10 98 | n_simu = 10 99 | offset = 1 100 | 101 | results = Parallel(n_jobs=1)( 102 | delayed(one_inference)( 103 | n=n, p=p, snr=snr, rho=rho, sparsity=sparsity, 104 | n_jobs=1, n_bootstraps=n_bootstraps, fdr=fdr, 105 | offset=offset, gamma=gamma, seed=seed) 106 | for seed in range(n_simu)) 107 | 108 | # Plotting 109 | plot(results, n_simu, fdr) 110 | print('Done!') 111 | 112 | 113 | # if __name__ == '__main__': 114 | # main() 115 | -------------------------------------------------------------------------------- 
/hidimstat/knockoffs/stat_coef_diff.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Authors: Binh Nguyen 3 | 4 | import numpy as np 5 | from sklearn.linear_model import (LassoCV, LogisticRegressionCV) 6 | from sklearn.model_selection import KFold 7 | # from sklearn.linear_model._coordinate_descent import _alpha_grid 8 | # from sklearn.model_selection import GridSearchCV 9 | 10 | 11 | def stat_coef_diff(X, X_tilde, y, method='lasso_cv', n_splits=5, n_jobs=1, 12 | n_lambdas=10, n_iter=1000, group_reg=1e-3, l1_reg=1e-3, 13 | joblib_verbose=0, return_coef=False, solver='liblinear', 14 | seed=0): 15 | """Calculate test statistic by doing estimation with Cross-validation on 16 | concatenated design matrix [X X_tilde] to find coefficients [beta 17 | beta_tilda]. The test statistic is then: 18 | 19 | W_j = abs(beta_j) - abs(beta_tilda_j) 20 | 21 | with j = 1, ..., n_features 22 | 23 | Parameters 24 | ---------- 25 | X : 2D ndarray (n_samples, n_features) 26 | Original design matrix 27 | 28 | X_tilde : 2D ndarray (n_samples, n_features) 29 | Knockoff design matrix 30 | 31 | y : 1D ndarray (n_samples, ) 32 | Response vector 33 | 34 | loss : str, optional 35 | if the response vector is continuous, the loss used should be 36 | 'least_square', otherwise 37 | if the response vector is binary, it should be 'logistic' 38 | 39 | n_splits : int, optional 40 | number of cross-validation folds 41 | 42 | solver : str, optional 43 | solver used by sklearn function LogisticRegressionCV 44 | 45 | n_regu : int, optional 46 | number of regulation used in the regression problem 47 | 48 | return_coef : bool, optional 49 | return regression coefficient if set to True 50 | 51 | Returns 52 | ------- 53 | test_score : 1D ndarray (n_features, ) 54 | vector of test statistic 55 | 56 | coef: 1D ndarray (n_features * 2, ) 57 | coefficients of the estimation problem 58 | """ 59 | 60 | n_features = X.shape[1] 61 | X_ko = np.column_stack([X, X_tilde]) 62 | lambda_max = np.max(np.dot(X_ko.T, y)) / (2 * n_features) 63 | lambdas = np.linspace( 64 | lambda_max*np.exp(-n_lambdas), lambda_max, n_lambdas) 65 | 66 | cv = KFold(n_splits=5, shuffle=True, random_state=seed) 67 | 68 | estimator = { 69 | 'lasso_cv': LassoCV(alphas=lambdas, n_jobs=n_jobs, 70 | verbose=joblib_verbose, max_iter=1e4, cv=cv), 71 | 'logistic_l1': LogisticRegressionCV( 72 | penalty='l1', max_iter=1e4, 73 | solver=solver, cv=cv, 74 | n_jobs=n_jobs, tol=1e-8), 75 | 'logistic_l2': LogisticRegressionCV( 76 | penalty='l2', max_iter=1e4, n_jobs=n_jobs, 77 | verbose=joblib_verbose, cv=cv, tol=1e-8), 78 | } 79 | 80 | try: 81 | clf = estimator[method] 82 | except KeyError: 83 | print('{} is not a valid estimator'.format(method)) 84 | 85 | clf.fit(X_ko, y) 86 | 87 | try: 88 | coef = np.ravel(clf.coef_) 89 | except AttributeError: 90 | coef = np.ravel(clf.best_estimator_.coef_) # for GridSearchCV object 91 | 92 | test_score = np.abs(coef[:n_features]) - np.abs(coef[n_features:]) 93 | 94 | if return_coef: 95 | return test_score, coef 96 | 97 | return test_score 98 | 99 | 100 | def _coef_diff_threshold(test_score, fdr=0.1, offset=1): 101 | """Calculate the knockoff threshold based on the procedure stated in the 102 | article. 
103 | 104 | Parameters 105 | ---------- 106 | test_score : 1D ndarray, shape (n_features, ) 107 | vector of test statistic 108 | 109 | fdr : float, optional 110 | desired controlled FDR level 111 | 112 | offset : int, 0 or 1, optional 113 | offset equals 1 is the knockoff+ procedure 114 | 115 | Returns 116 | ------- 117 | thres : float or np.inf 118 | threshold level 119 | """ 120 | if offset not in (0, 1): 121 | raise ValueError("'offset' must be either 0 or 1") 122 | 123 | t_mesh = np.sort(np.abs(test_score[test_score != 0])) 124 | for t in t_mesh: 125 | false_pos = np.sum(test_score <= -t) 126 | selected = np.sum(test_score >= t) 127 | if (offset + false_pos) / np.maximum(selected, 1) <= fdr: 128 | return t 129 | 130 | return np.inf 131 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: Binh Nguyen & Jerome-Alexis Chevalier 3 | import numpy as np 4 | 5 | 6 | def quantile_aggregation(pvals, gamma=0.5, gamma_min=0.05, adaptive=False): 7 | if adaptive: 8 | return _adaptive_quantile_aggregation(pvals, gamma_min) 9 | else: 10 | return _fixed_quantile_aggregation(pvals, gamma) 11 | 12 | 13 | def fdr_threshold(pvals, fdr=0.1, method='bhq', reshaping_function=None): 14 | if method == 'bhq': 15 | return _bhq_threshold(pvals, fdr=fdr) 16 | elif method == 'bhy': 17 | return _bhy_threshold( 18 | pvals, fdr=fdr, reshaping_function=reshaping_function) 19 | else: 20 | raise ValueError( 21 | '{} is not support FDR control method'.format(method)) 22 | 23 | 24 | def cal_fdp_power(selected, non_zero_index, r_index=False): 25 | """ Calculate power and False Discovery Proportion 26 | 27 | Parameters 28 | ---------- 29 | selected: list index (in R format) of selected non-null variables 30 | non_zero_index: true index of non-null variables 31 | r_index : True if the index is taken from rpy2 inference 32 | 33 | Returns 34 | ------- 35 | fdp: False Discoveries Proportion 36 | power: percentage of correctly selected variables over total number of 37 | non-null variables 38 | 39 | """ 40 | # selected is the index list in R and will be different from index of 41 | # python by 1 unit 42 | 43 | if selected.size == 0: 44 | return 0.0, 0.0 45 | 46 | if r_index: 47 | selected = selected - 1 48 | 49 | true_positive = [i for i in selected if i in non_zero_index] 50 | false_positive = [i for i in selected if i not in non_zero_index] 51 | fdp = len(false_positive) / max(1, len(selected)) 52 | power = len(true_positive) / len(non_zero_index) 53 | 54 | return fdp, power 55 | 56 | 57 | def _bhq_threshold(pvals, fdr=0.1): 58 | """Standard Benjamini-Hochberg for controlling False discovery rate 59 | """ 60 | n_features = len(pvals) 61 | pvals_sorted = np.sort(pvals) 62 | selected_index = 2 * n_features 63 | for i in range(n_features - 1, -1, -1): 64 | if pvals_sorted[i] <= fdr * (i + 1) / n_features: 65 | selected_index = i 66 | break 67 | if selected_index <= n_features: 68 | return pvals_sorted[selected_index] 69 | else: 70 | return -1.0 71 | 72 | 73 | def _bhy_threshold(pvals, reshaping_function=None, fdr=0.1): 74 | """Benjamini-Hochberg-Yekutieli procedure for controlling FDR, with input 75 | shape function. 
Reference: Ramdas et al (2017) 76 | """ 77 | n_features = len(pvals) 78 | pvals_sorted = np.sort(pvals) 79 | selected_index = 2 * n_features 80 | # Default value for reshaping function -- defined in 81 | # Benjamini & Yekutieli (2001) 82 | if reshaping_function is None: 83 | temp = np.arange(n_features) 84 | sum_inverse = np.sum(1 / (temp + 1)) 85 | return _bhq_threshold(pvals, fdr / sum_inverse) 86 | else: 87 | for i in range(n_features - 1, -1, -1): 88 | if pvals_sorted[i] <= fdr * reshaping_function(i + 1) / n_features: 89 | selected_index = i 90 | break 91 | if selected_index <= n_features: 92 | return pvals_sorted[selected_index] 93 | else: 94 | return -1.0 95 | 96 | 97 | def _fixed_quantile_aggregation(pvals, gamma=0.5): 98 | """Quantile aggregation function based on Meinshausen et al (2008) 99 | 100 | Parameters 101 | ---------- 102 | pvals : 2D ndarray (n_bootstrap, n_test) 103 | p-value (adjusted) 104 | 105 | gamma : float 106 | Percentile value used for aggregation. 107 | 108 | Returns 109 | ------- 110 | 1D ndarray (n_tests, ) 111 | Vector of aggregated p-value 112 | """ 113 | converted_score = (1 / gamma) * ( 114 | np.percentile(pvals, q=100*gamma, axis=0)) 115 | 116 | return np.minimum(1, converted_score) 117 | 118 | 119 | def _adaptive_quantile_aggregation(pvals, gamma_min=0.05): 120 | """adaptive version of the quantile aggregation method, Meinshausen et al. 121 | (2008)""" 122 | gammas = np.arange(gamma_min, 1.05, 0.05) 123 | list_Q = np.array([ 124 | _fixed_quantile_aggregation(pvals, gamma) for gamma in gammas]) 125 | 126 | return np.minimum(1, (1 - np.log(gamma_min)) * list_Q.min(0)) 127 | -------------------------------------------------------------------------------- /hidimstat/test/test_ensemble_clustered_inference.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the ensemble_clustered_inference module 3 | """ 4 | 5 | import numpy as np 6 | from sklearn.cluster import FeatureAgglomeration 7 | from sklearn.feature_extraction import image 8 | from numpy.testing import assert_almost_equal 9 | 10 | from hidimstat.scenario import multivariate_1D_simulation 11 | from hidimstat.scenario import multivariate_temporal_simulation 12 | from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference 13 | 14 | 15 | def test_ensemble_clustered_inference(): 16 | '''Testing the procedure on a simulation with a 1D data structure 17 | and with n << p: the first test has no temporal dimension, the second has a 18 | temporal dimension. The support is connected and of size 10, it must be 19 | recovered with a small spatial tolerance parametrized by `margin_size`. 
20 | Computing one sided p-values, we want low p-values for the features of 21 | the support and p-values close to 0.5 for the others.''' 22 | 23 | # Scenario 1: data with no temporal dimension 24 | # ########################################### 25 | inference_method = 'desparsified-lasso' 26 | n_samples, n_features = 100, 2000 27 | support_size = 10 28 | sigma = 5.0 29 | rho = 0.95 30 | 31 | X_init, y, beta, epsilon = \ 32 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 33 | support_size=support_size, sigma=sigma, 34 | rho=rho, shuffle=False, seed=0) 35 | 36 | margin_size = 5 37 | n_clusters = 200 38 | n_bootstraps = 3 39 | 40 | y = y - np.mean(y) 41 | X_init = X_init - np.mean(X_init, axis=0) 42 | 43 | connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) 44 | ward = FeatureAgglomeration(n_clusters=n_clusters, 45 | connectivity=connectivity, 46 | linkage='ward') 47 | 48 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 49 | ensemble_clustered_inference(X_init, y, ward, n_clusters, 50 | n_bootstraps=n_bootstraps, 51 | inference_method=inference_method) 52 | 53 | expected = 0.5 * np.ones(n_features) 54 | expected[:support_size] = 0.0 55 | 56 | assert_almost_equal(pval_corr[:support_size-margin_size], 57 | expected[:support_size-margin_size]) 58 | assert_almost_equal(pval_corr[support_size+margin_size:], 59 | expected[support_size+margin_size:], 60 | decimal=1) 61 | 62 | # Scenario 2: temporal data 63 | # ######################### 64 | inference_method = 'desparsified-group-lasso' 65 | n_samples, n_features, n_times = 200, 2000, 10 66 | support_size = 10 67 | sigma = 5.0 68 | rho_noise = 0.9 69 | rho_data = 0.9 70 | n_clusters = 200 71 | margin_size = 5 72 | interior_support = support_size - margin_size 73 | extended_support = support_size + margin_size 74 | n_bootstraps = 4 75 | 76 | X, Y, beta, noise = \ 77 | multivariate_temporal_simulation(n_samples=n_samples, 78 | n_features=n_features, 79 | n_times=n_times, 80 | support_size=support_size, 81 | sigma=sigma, 82 | rho_noise=rho_noise, 83 | rho_data=rho_data, 84 | shuffle=False) 85 | 86 | connectivity = image.grid_to_graph(n_x=n_features, n_y=1, n_z=1) 87 | ward = FeatureAgglomeration(n_clusters=n_clusters, 88 | connectivity=connectivity, 89 | linkage='ward') 90 | 91 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 92 | ensemble_clustered_inference(X, Y, ward, n_clusters, 93 | n_bootstraps=n_bootstraps, 94 | inference_method=inference_method) 95 | 96 | expected = 0.5 * np.ones(n_features) 97 | expected[:support_size] = 0.0 98 | 99 | assert_almost_equal(pval_corr[:interior_support], 100 | expected[:interior_support], 101 | decimal=3) 102 | assert_almost_equal(pval_corr[extended_support:], 103 | expected[extended_support:], 104 | decimal=1) 105 | -------------------------------------------------------------------------------- /hidimstat/test/test_noise_std.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the noise_std module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal 7 | from scipy.linalg import toeplitz 8 | 9 | from hidimstat.scenario import multivariate_1D_simulation 10 | from hidimstat.scenario import multivariate_temporal_simulation 11 | from hidimstat.noise_std import reid, group_reid, empirical_snr 12 | 13 | 14 | def test_reid(): 15 | '''Estimating noise standard deviation in two scenarios. 16 | First scenario: no structure and a support of size 2. 
17 | Second scenario: no structure and an empty support.''' 18 | 19 | n_samples, n_features = 50, 30 20 | sigma = 2.0 21 | 22 | # First expe 23 | # ########## 24 | support_size = 2 25 | 26 | X, y, beta, noise = \ 27 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 28 | support_size=support_size, sigma=sigma, 29 | seed=0) 30 | 31 | # max_iter=1 to get a better coverage 32 | sigma_hat, _ = reid(X, y, tol=1e-3, max_iter=1) 33 | expected = sigma 34 | 35 | assert_almost_equal(sigma_hat / expected, 1.0, decimal=0) 36 | 37 | # Second expe 38 | # ########### 39 | support_size = 0 40 | 41 | X, y, beta, noise = \ 42 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 43 | support_size=support_size, sigma=sigma, 44 | seed=1) 45 | 46 | sigma_hat, _ = reid(X, y) 47 | expected = sigma 48 | 49 | assert_almost_equal(sigma_hat / expected, 1.0, decimal=1) 50 | 51 | 52 | def test_group_reid(): 53 | '''Estimating (temporal) noise covariance matrix in two scenarios. 54 | First scenario: no data structure and a support of size 2. 55 | Second scenario: no data structure and an empty support.''' 56 | 57 | n_samples = 30 58 | n_features = 50 59 | n_times = 10 60 | sigma = 1.0 61 | rho = 0.9 62 | corr = toeplitz(np.geomspace(1, rho ** (n_times - 1), n_times)) 63 | cov = np.outer(sigma, sigma) * corr 64 | 65 | # First expe 66 | # ########## 67 | support_size = 2 68 | 69 | X, Y, beta, noise = \ 70 | multivariate_temporal_simulation(n_samples=n_samples, 71 | n_features=n_features, 72 | n_times=n_times, 73 | support_size=support_size, 74 | sigma=sigma, 75 | rho_noise=rho) 76 | 77 | # max_iter=1 to get a better coverage 78 | cov_hat, _ = group_reid(X, Y, tol=1e-3, max_iter=1) 79 | error_ratio = cov_hat / cov 80 | 81 | assert_almost_equal(np.max(error_ratio), 1.0, decimal=0) 82 | assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=1) 83 | 84 | cov_hat, _ = group_reid(X, Y, method='AR') 85 | error_ratio = cov_hat / cov 86 | 87 | assert_almost_equal(np.max(error_ratio), 1.0, decimal=0) 88 | assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=0) 89 | 90 | # Second expe 91 | # ########### 92 | support_size = 0 93 | 94 | X, Y, beta, noise = \ 95 | multivariate_temporal_simulation(n_samples=n_samples, 96 | n_features=n_features, 97 | n_times=n_times, 98 | support_size=support_size, 99 | sigma=sigma, 100 | rho_noise=rho, 101 | seed=1) 102 | 103 | cov_hat, _ = group_reid(X, Y) 104 | error_ratio = cov_hat / cov 105 | 106 | assert_almost_equal(np.max(error_ratio), 1.0, decimal=0) 107 | assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=1) 108 | 109 | cov_hat, _ = group_reid(X, Y, fit_Y=False, stationary=False) 110 | error_ratio = cov_hat / cov 111 | 112 | assert_almost_equal(np.max(error_ratio), 1.0, decimal=0) 113 | assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=0) 114 | 115 | cov_hat, _ = group_reid(X, Y, method='AR') 116 | error_ratio = cov_hat / cov 117 | 118 | assert_almost_equal(np.max(error_ratio), 1.0, decimal=0) 119 | assert_almost_equal(np.log(np.min(error_ratio)), 0.0, decimal=1) 120 | 121 | 122 | def test_empirical_snr(): 123 | '''Computing empirical signal to noise ratio from the target `y`, 124 | the data `X` and the true parameter vector `beta` in a simple 125 | scenario with a 1D data structure.''' 126 | 127 | n_samples, n_features = 30, 30 128 | support_size = 10 129 | sigma = 2.0 130 | 131 | X, y, beta, noise = \ 132 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 133 | support_size=support_size, 
sigma=sigma, 134 | seed=0) 135 | 136 | snr = empirical_snr(X, y, beta) 137 | expected = 2.0 138 | 139 | assert_almost_equal(snr, expected, decimal=0) 140 | -------------------------------------------------------------------------------- /hidimstat/knockoffs/gaussian_knockoff.py: -------------------------------------------------------------------------------- 1 | """Generation of model-x knockoff following equi-correlated method or 2 | optimization scheme following Barber et al. (2015). Requires cvxopt. 3 | """ 4 | 5 | import warnings 6 | 7 | import numpy as np 8 | from sklearn.covariance import (GraphicalLassoCV, empirical_covariance, 9 | ledoit_wolf) 10 | from sklearn.utils.validation import check_memory 11 | 12 | 13 | def gaussian_knockoff_generation(X, mu, Sigma, method='equi', memory=None, 14 | seed=None): 15 | """Generate second-order knockoff variables using equi-correlated method. 16 | Reference: Candes et al. (2016), Barber et al. (2015) 17 | 18 | Parameters 19 | ---------- 20 | X: 2D ndarray (n_samples, n_features) 21 | original design matrix 22 | 23 | mu : 1D ndarray (n_features, ) 24 | vector of empirical mean values 25 | 26 | method: str 27 | method to generate gaussian knockoff 28 | 29 | Sigma : 2D ndarray (n_samples, n_features) 30 | empirical covariance matrix 31 | 32 | Returns 33 | ------- 34 | X_tilde : 2D ndarray (n_samples, n_features) 35 | knockoff design matrix 36 | """ 37 | memory = check_memory(memory) 38 | 39 | n_samples, n_features = X.shape 40 | if method == 'equi': 41 | Diag_s = np.diag(_s_equi(Sigma)) 42 | else: 43 | raise ValueError('{} is not a valid knockoff ' 44 | 'contriction method'.format(method)) 45 | 46 | Sigma_inv_s = np.linalg.solve(Sigma, Diag_s) 47 | 48 | # First part on the RHS of equation 1.4 in Barber & Candes (2015) 49 | Mu_tilde = X - np.dot(X - mu, Sigma_inv_s) 50 | # To calculate the Cholesky decomposition later on 51 | Sigma_tilde = 2 * Diag_s - Diag_s.dot(Sigma_inv_s.dot(Diag_s)) 52 | while not _is_posdef(Sigma_tilde): 53 | Sigma_tilde += 1e-10 * np.eye(n_features) 54 | warnings.warn( 55 | 'The conditional covariance matrix for knockoffs is not positive ' 56 | 'definite. 
Adding minor positive value to the matrix.') 57 | 58 | rng = np.random.RandomState(seed) 59 | U_tilde = rng.randn(n_samples, n_features) 60 | # Equation 1.4 in Barber & Candes (2015) 61 | X_tilde = Mu_tilde + np.dot(U_tilde, np.linalg.cholesky(Sigma_tilde)) 62 | 63 | return X_tilde 64 | 65 | 66 | def _is_posdef(X, tol=1e-14): 67 | """Check a matrix is positive definite by calculating eigenvalue of the 68 | matrix 69 | 70 | Parameters 71 | ---------- 72 | X : 2D ndarray, shape (n_samples x n_features) 73 | Matrix to check 74 | 75 | tol : float, optional 76 | minimum threshold for eigenvalue 77 | 78 | Returns 79 | ------- 80 | True or False 81 | """ 82 | eig_value = np.linalg.eigvalsh(X) 83 | return np.all(eig_value > tol) 84 | 85 | 86 | def _cov_to_corr(Sigma): 87 | """Convert covariance matrix to correlation matrix 88 | 89 | Parameters 90 | ---------- 91 | Sigma : 2D ndarray (n_features, n_features) 92 | Covariance matrix 93 | 94 | Returns 95 | ------- 96 | Corr_matrix : 2D ndarray (n_features, n_features) 97 | Transformed correlation matrix 98 | """ 99 | 100 | features_std = np.sqrt(np.diag(Sigma)) 101 | Scale = np.outer(features_std, features_std) 102 | 103 | Corr_matrix = Sigma / Scale 104 | 105 | return Corr_matrix 106 | 107 | 108 | def _estimate_distribution(X, shrink=False, cov_estimator='ledoit_wolf'): 109 | 110 | alphas = [1e-3, 1e-2, 1e-1, 1] 111 | 112 | mu = X.mean(axis=0) 113 | Sigma = empirical_covariance(X) 114 | 115 | if shrink or not _is_posdef(Sigma): 116 | 117 | if cov_estimator == 'ledoit_wolf': 118 | Sigma_shrink = ledoit_wolf(X, assume_centered=True)[0] 119 | 120 | elif cov_estimator == 'graph_lasso': 121 | model = GraphicalLassoCV(alphas=alphas) 122 | Sigma_shrink = model.fit(X).covariance_ 123 | 124 | else: 125 | raise ValueError('{} is not a valid covariance estimated method' 126 | .format(cov_estimator)) 127 | 128 | return mu, Sigma_shrink 129 | 130 | return mu, Sigma 131 | 132 | 133 | def _s_equi(Sigma): 134 | """Estimate diagonal matrix of correlation between real and knockoff 135 | variables using equi-correlated equation 136 | 137 | Parameters 138 | ---------- 139 | Sigma : 2D ndarray (n_features, n_features) 140 | empirical covariance matrix calculated from original design matrix 141 | 142 | Returns 143 | ------- 144 | 1D ndarray (n_features, ) 145 | vector of diagonal values of estimated matrix diag{s} 146 | """ 147 | n_features = Sigma.shape[0] 148 | 149 | G = _cov_to_corr(Sigma) 150 | eig_value = np.linalg.eigvalsh(G) 151 | lambda_min = np.min(eig_value[0]) 152 | S = np.ones(n_features) * min(2 * lambda_min, 1) 153 | 154 | psd = False 155 | s_eps = 0 156 | 157 | while psd is False: 158 | # if all eigval > 0 then the matrix is psd 159 | psd = _is_posdef(2 * G - np.diag(S * (1 - s_eps))) 160 | if not psd: 161 | if s_eps == 0: 162 | s_eps = 1e-08 163 | else: 164 | s_eps *= 10 165 | 166 | S = S * (1 - s_eps) 167 | 168 | return S * np.diag(Sigma) 169 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | GITHUB_PAGES_BRANCH = gh-pages 11 | OUTPUTDIR = _build/html 12 | 13 | # User-friendly check for sphinx-build 14 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 15 | $(error The '$(SPHINXBUILD)' command was not found. 
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 16 | endif 17 | 18 | # Internal variables. 19 | PAPEROPT_a4 = -D latex_paper_size=a4 20 | PAPEROPT_letter = -D latex_paper_size=letter 21 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 22 | # the i18n builder cannot share the environment and doctrees with the others 23 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 24 | 25 | .PHONY: help 26 | help: 27 | @echo "Please use \`make ' where is one of" 28 | @echo " html-noplot to make standalone HTML files, without plotting anything" 29 | @echo " html to make standalone HTML files" 30 | @echo " dirhtml to make HTML files named index.html in directories" 31 | @echo " singlehtml to make a single large HTML file" 32 | @echo " pickle to make pickle files" 33 | @echo " htmlhelp to make HTML files and a HTML help project" 34 | @echo " qthelp to make HTML files and a qthelp project" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | @echo " coverage to run coverage check of the documentation (if enabled)" 41 | @echo " install to make the html and push it online" 42 | 43 | .PHONY: clean 44 | 45 | clean: 46 | rm -rf $(BUILDDIR)/* 47 | rm -rf auto_examples/ 48 | rm -rf generated/* 49 | rm -rf modules/* 50 | 51 | html-noplot: 52 | $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 53 | @echo 54 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 55 | 56 | .PHONY: html 57 | html: 58 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 61 | 62 | .PHONY: dirhtml 63 | dirhtml: 64 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 65 | @echo 66 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 67 | 68 | .PHONY: singlehtml 69 | singlehtml: 70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 71 | @echo 72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 73 | 74 | .PHONY: pickle 75 | pickle: 76 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 77 | @echo 78 | @echo "Build finished; now you can process the pickle files." 79 | 80 | .PHONY: htmlhelp 81 | htmlhelp: 82 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 83 | @echo 84 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 85 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
86 | 87 | .PHONY: qthelp 88 | qthelp: 89 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 90 | @echo 91 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 92 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 93 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/hidimstat.qhcp" 94 | @echo "To view the help file:" 95 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/hidimstat.qhc" 96 | 97 | .PHONY: latex 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | .PHONY: latexpdf 106 | latexpdf: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo "Running LaTeX files through pdflatex..." 109 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 110 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 111 | 112 | .PHONY: changes 113 | changes: 114 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 115 | @echo 116 | @echo "The overview file is in $(BUILDDIR)/changes." 117 | 118 | .PHONY: linkcheck 119 | linkcheck: 120 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 121 | @echo 122 | @echo "Link check complete; look for any errors in the above output " \ 123 | "or in $(BUILDDIR)/linkcheck/output.txt." 124 | 125 | .PHONY: doctest 126 | doctest: 127 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 128 | @echo "Testing of doctests in the sources finished, look at the " \ 129 | "results in $(BUILDDIR)/doctest/output.txt." 130 | 131 | .PHONY: coverage 132 | coverage: 133 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 134 | @echo "Testing of coverage in the sources finished, look at the " \ 135 | "results in $(BUILDDIR)/coverage/python.txt." 136 | 137 | install: 138 | touch $(OUTPUTDIR)/.nojekyll 139 | ghp-import -m "Generate Pelican site [ci skip]" -b $(GITHUB_PAGES_BRANCH) $(OUTPUTDIR) 140 | git push -f origin $(GITHUB_PAGES_BRANCH) 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HiDimStat: High-dimensional statistical inference tool for Python 2 | [![build][TravisCI]][travis] [![coverage][CodeCov]][cov] 3 | 4 | The HiDimStat package provides statistical inference methods to solve the 5 | problem of support recovery in the context of high-dimensional and 6 | spatially structured data. 7 | 8 | **Update on Oct 2024: this repository is no longer maintained.** Please refer to the new up-to-date github repo at https://github.com/mind-INRIA/hidimstat. 9 | 10 | 11 | ## Installation 12 | 13 | HiDimStat working only with Python 3, ideally Python 3.6+. For installation, 14 | run the following from terminal 15 | 16 | ```bash 17 | pip install hidimstat 18 | ``` 19 | 20 | Or if you want the latest version available (for example to contribute to 21 | the development of this project: 22 | 23 | ``` 24 | pip install -U git+https://github.com/ja-che/hidimstat.git 25 | ``` 26 | 27 | or 28 | 29 | ```bash 30 | git clone https://github.com/ja-che/hidimstat.git 31 | cd hidimstat 32 | pip install -e . 
33 | ``` 34 | 35 | ## Dependencies 36 | 37 | ``` 38 | joblib 39 | numpy 40 | scipy 41 | scikit-learn 42 | ``` 43 | 44 | To run examples it is neccessary to install `matplotlib`, and to run tests it 45 | is also needed to install `pytest`. 46 | 47 | ## Documentation & Examples 48 | 49 | All the documentation of HiDimStat is available at https://ja-che.github.io/hidimstat/. 50 | 51 | As of now in the `examples` folder there are three Python scripts that 52 | illustrate how to use the main HiDimStat functions. 53 | In each script we handle a different kind of dataset: 54 | ``plot_2D_simulation_example.py`` handles a simulated dataset with a 2D 55 | spatial structure, 56 | ``plot_fmri_data_example.py`` solves the decoding problem on Haxby fMRI dataset, 57 | ``plot_meg_data_example.py`` tackles the source localization problem on several 58 | MEG/EEG datasets. 59 | 60 | 61 | ```bash 62 | # For example run the following command in terminal 63 | python plot_2D_simulation_example.py 64 | ``` 65 | 66 | ## References 67 | 68 | The algorithms developed in this package have been detailed in several 69 | conference/journal articles that can be downloaded at 70 | https://ja-che.github.io/research.html. 71 | 72 | #### Main references: 73 | 74 | Ensemble of Clustered desparsified Lasso (ECDL): 75 | 76 | * Chevalier, J. A., Salmon, J., & Thirion, B. (2018). __Statistical inference 77 | with ensemble of clustered desparsified lasso__. In _International Conference 78 | on Medical Image Computing and Computer-Assisted Intervention_ 79 | (pp. 638-646). Springer, Cham. 80 | 81 | * Chevalier, J. A., Nguyen, T. B., Thirion, B., & Salmon, J. (2021). __Spatially relaxed inference on high-dimensional linear models__. arXiv preprint arXiv:2106.02590. 82 | 83 | Aggregation of multiple Knockoffs (AKO): 84 | 85 | * Nguyen T.-B., Chevalier J.-A., Thirion B., & Arlot S. (2020). __Aggregation 86 | of Multiple Knockoffs__. In _Proceedings of the 37th International Conference on 87 | Machine Learning_, Vienna, Austria, PMLR 119. 88 | 89 | Application to decoding (fMRI data): 90 | 91 | * Chevalier, J. A., Nguyen T.-B., Salmon, J., Varoquaux, G. & Thirion, B. (2021). __Decoding with confidence: Statistical control on decoder maps__. In _NeuroImage_, 234, 117921. 92 | 93 | Application to source localization (MEG/EEG data): 94 | 95 | * Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020). __Statistical control for spatio-temporal MEG/EEG source imaging with desparsified multi-task Lasso__. In _Proceedings of the 34th Conference on Neural Information Processing Systems (NeurIPS 2020)_, Vancouver, Canada. 96 | 97 | If you use our packages, we would appreciate citations to the relevant aforementioned papers. 98 | 99 | #### Other useful references: 100 | 101 | For de-sparsified(or de-biased) Lasso: 102 | 103 | * Javanmard, A., & Montanari, A. (2014). __Confidence intervals and hypothesis 104 | testing for high-dimensional regression__. _The Journal of Machine Learning 105 | Research_, 15(1), 2869-2909. 106 | 107 | * Zhang, C. H., & Zhang, S. S. (2014). __Confidence intervals for low dimensional 108 | parameters in high dimensional linear models__. _Journal of the Royal 109 | Statistical Society: Series B: Statistical Methodology_, 217-242. 110 | 111 | * Van de Geer, S., Bühlmann, P., Ritov, Y. A., & Dezeure, R. (2014). __On 112 | asymptotically optimal confidence regions and tests for high-dimensional 113 | models__. _The Annals of Statistics_, 42(3), 1166-1202. 
114 | 115 | For Knockoffs Inference: 116 | 117 | * Barber, R. F; Candès, E. J. (2015). __Controlling the false discovery rate 118 | via knockoffs__. _Annals of Statistics_. 43 , no. 5, 119 | 2055--2085. doi:10.1214/15-AOS1337. https://projecteuclid.org/euclid.aos/1438606853 120 | 121 | * Candès, E., Fan, Y., Janson, L., & Lv, J. (2018). __Panning for gold: Model-X 122 | knockoffs for high dimensional controlled variable selection__. _Journal of the 123 | Royal Statistical Society Series B_, 80(3), 551-577. 124 | 125 | ## License 126 | 127 | This project is licensed under the BSD 2-Clause License. 128 | 129 | ## Acknowledgments 130 | 131 | This project has been funded by Labex DigiCosme (ANR-11-LABEX-0045-DIGICOSME) 132 | as part of the program "Investissement d’Avenir" (ANR-11-IDEX-0003-02), by the 133 | Fast Big project (ANR-17-CE23-0011) and the KARAIB AI Chair 134 | (ANR-20-CHIA-0025-01). This study has also been supported by the European 135 | Union’s Horizon 2020 research and innovation program 136 | (Grant Agreement No. 945539, Human Brain Project SGA3). 137 | 138 | 139 | [TravisCI]: https://travis-ci.com/ja-che/hidimstat.svg?branch=main "travisCI status" 140 | [travis]: https://travis-ci.com/ja-che/hidimstat 141 | 142 | [CodeCov]: https://codecov.io/gh/ja-che/hidimstat/branch/main/graph/badge.svg "CodeCov status" 143 | [cov]: https://codecov.io/gh/ja-che/hidimstat 144 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. hidimstat documentation master file, created by 2 | sphinx-quickstart on Fri April 23 12:22:52 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | HiDimStat: High-dimensional statistical inference tool for Python 7 | ================================================================= 8 | |Build Status| |codecov| 9 | 10 | The HiDimStat package provides statistical inference methods to solve the 11 | problem of support recovery in the context of high-dimensional and 12 | spatially structured data. 13 | 14 | 15 | Installation 16 | ------------ 17 | 18 | HiDimStat working only with Python 3, ideally Python 3.6+. For installation, 19 | run the following from terminal:: 20 | 21 | pip install hidimstat 22 | 23 | Or if you want the latest version available (for example to contribute to 24 | the development of this project):: 25 | 26 | git clone https://github.com/ja-che/hidimstat.git 27 | cd hidimstat 28 | pip install -e . 29 | 30 | 31 | Dependencies 32 | ------------ 33 | 34 | HiDimStat depends on the following packages:: 35 | 36 | joblib 37 | numpy 38 | scipy 39 | scikit-learn 40 | 41 | 42 | To run examples it is neccessary to install ``matplotlib``, and to run tests it 43 | is also needed to install ``pytest``. 44 | 45 | 46 | Documentation & Examples 47 | ------------------------ 48 | 49 | Documentation about the main HiDimStat functions is available 50 | `here `_ and examples are available `here `_. 51 | 52 | As of now, there are three different examples (Python scripts) that 53 | illustrate how to use the main HiDimStat functions. 
54 | In each example we handle a different kind of dataset: 55 | ``plot_2D_simulation_example.py`` handles a simulated dataset with a 2D 56 | spatial structure, 57 | ``plot_fmri_data_example.py`` solves the decoding problem on Haxby fMRI dataset, 58 | ``plot_meg_data_example.py`` tackles the source localization problem on several 59 | MEG/EEG datasets. 60 | 61 | .. code-block:: 62 | 63 | # For example run the following command in terminal 64 | python plot_2D_simulation_example.py 65 | 66 | 67 | Build the documentation 68 | ----------------------- 69 | 70 | To build the documentation you will need to run: 71 | 72 | .. code-block:: 73 | 74 | pip install -U sphinx_gallery sphinx_bootstrap_theme 75 | cd doc 76 | make html 77 | 78 | 79 | References 80 | ---------- 81 | 82 | The algorithms developed in this package have been detailed in several 83 | conference/journal articles that can be downloaded at 84 | `https://ja-che.github.io/ `_. 85 | 86 | Main references 87 | ~~~~~~~~~~~~~~~ 88 | 89 | Ensemble of Clustered desparsified Lasso (ECDL): 90 | 91 | * Chevalier, J. A., Salmon, J., & Thirion, B. (2018). Statistical inference 92 | with ensemble of clustered desparsified lasso. In International Conference 93 | on Medical Image Computing and Computer-Assisted Intervention 94 | (pp. 638-646). Springer, Cham. 95 | 96 | * Chevalier, J. A., Nguyen, T. B., Thirion, B., & Salmon, J. (2021). 97 | Spatially relaxed inference on high-dimensional linear models. 98 | arXiv preprint arXiv:2106.02590. 99 | 100 | Aggregation of multiple Knockoffs (AKO): 101 | 102 | * Nguyen T.-B., Chevalier J.-A., Thirion B., & Arlot S. (2020). Aggregation 103 | of Multiple Knockoffs. In Proceedings of the 37th International Conference on 104 | Machine Learning, Vienna, Austria, PMLR 119. 105 | 106 | Application to decoding (fMRI data): 107 | 108 | * Chevalier, J. A., Nguyen T.-B., Salmon, J., Varoquaux, G. & Thirion, B. 109 | (2021). Decoding with confidence: Statistical control on decoder maps. 110 | In NeuroImage, 234, 117921. 111 | 112 | Application to source localization (MEG/EEG data): 113 | 114 | * Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020). 115 | Statistical control for spatio-temporal MEG/EEG source imaging with 116 | desparsified multi-task Lasso. In Proceedings of the 34th Conference on 117 | Neural Information Processing Systems (NeurIPS 2020), Vancouver, Canada. 118 | 119 | If you use our packages, we would appreciate citations to the relevant 120 | aforementioned papers. 121 | 122 | Other useful references 123 | ~~~~~~~~~~~~~~~~~~~~~~~ 124 | 125 | For de-sparsified(or de-biased) Lasso: 126 | 127 | * Javanmard, A., & Montanari, A. (2014). Confidence intervals and hypothesis 128 | testing for high-dimensional regression. The Journal of Machine Learning 129 | Research, 15(1), 2869-2909. 130 | 131 | * Zhang, C. H., & Zhang, S. S. (2014). Confidence intervals for low dimensional 132 | parameters in high dimensional linear models. Journal of the Royal 133 | Statistical Society: Series B: Statistical Methodology, 217-242. 134 | 135 | * Van de Geer, S., Bühlmann, P., Ritov, Y. A., & Dezeure, R. (2014). On 136 | asymptotically optimal confidence regions and tests for high-dimensional 137 | models. The Annals of Statistics, 42(3), 1166-1202. 138 | 139 | For Knockoffs Inference: 140 | 141 | * Barber, R. F; Candès, E. J. (2015). Controlling the false discovery rate 142 | via knockoffs. Annals of Statistics. 43 , no. 5, 143 | 2055--2085. doi:10.1214/15-AOS1337. 
https://projecteuclid.org/euclid.aos/1438606853 144 | 145 | * Candès, E., Fan, Y., Janson, L., & Lv, J. (2018). Panning for gold: Model-X 146 | knockoffs for high dimensional controlled variable selection. Journal of the 147 | Royal Statistical Society Series B, 80(3), 551-577. 148 | 149 | .. |Build Status| image:: https://travis-ci.com/ja-che/hidimstat.svg?branch=main 150 | :target: https://codecov.io/gh/ja-che/hidimstat 151 | 152 | .. |codecov| image:: https://codecov.io/gh/ja-che/hidimstat/branch/main/graph/badge.svg 153 | :target: https://codecov.io/gh/ja-che/hidimstat 154 | 155 | 156 | API 157 | --- 158 | 159 | .. toctree:: 160 | :maxdepth: 1 161 | 162 | api.rst 163 | -------------------------------------------------------------------------------- /hidimstat/test/test_scenario.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the scenario module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal, assert_equal 7 | 8 | from hidimstat.scenario import multivariate_1D_simulation 9 | from hidimstat.scenario import multivariate_simulation 10 | from hidimstat.scenario import multivariate_temporal_simulation 11 | 12 | ROI_SIZE_2D = 2 13 | SHAPE_2D = (12, 12) 14 | 15 | ROI_SIZE_3D = 2 16 | SHAPE_3D = (12, 12, 12) 17 | 18 | 19 | def test_multivariate_1D_simulation(): 20 | '''Test if the data has expected shape, if the input parameters 21 | are close to their empirical estimators, if the support size is 22 | correct and if the noise model is the generative model. The 23 | first test concerns a simulation with a 1D spatial structure, 24 | the second test concerns a simulation with a random structure''' 25 | 26 | n_samples = 100 27 | n_features = 500 28 | support_size = 10 29 | rho = 0.7 30 | sigma = 1.0 31 | 32 | # Test 1 33 | X, y, beta, noise = \ 34 | multivariate_1D_simulation(n_samples=n_samples, n_features=n_features, 35 | support_size=support_size, sigma=sigma, 36 | rho=rho, shuffle=False, seed=0) 37 | 38 | sigma_hat = np.std(noise) 39 | rho_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] 40 | 41 | assert_almost_equal(sigma_hat, sigma, decimal=1) 42 | assert_almost_equal(rho_hat, rho, decimal=1) 43 | assert_equal(X.shape, (n_samples, n_features)) 44 | assert_equal(np.count_nonzero(beta), support_size) 45 | assert_equal(y, np.dot(X, beta) + noise) 46 | 47 | # Test 2 48 | X, y, beta, noise = \ 49 | multivariate_1D_simulation() 50 | rho_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] 51 | assert_almost_equal(rho_hat, 0, decimal=1) 52 | 53 | 54 | def test_multivariate_simulation(): 55 | '''Test if the data has expected shape, if the input parameters 56 | are close to their empirical estimators, if the support has the 57 | expected size (from simple geometry) and if the noise model is 58 | the generative model. 
First test concerns a simulation with a 2D 59 | structure, second test concerns a simulation with a 3D structure.''' 60 | 61 | # Test 1 62 | n_samples = 100 63 | shape = SHAPE_2D 64 | roi_size = ROI_SIZE_2D 65 | sigma = 1.0 66 | smooth_X = 1.0 67 | rho_expected = 0.8 68 | return_shaped_data = True 69 | 70 | X, y, beta, noise, X_, w = \ 71 | multivariate_simulation(n_samples=n_samples, shape=shape, 72 | roi_size=roi_size, sigma=sigma, 73 | smooth_X=smooth_X, 74 | return_shaped_data=return_shaped_data, 75 | seed=0) 76 | 77 | sigma_hat = np.std(noise) 78 | rho_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] 79 | 80 | assert_almost_equal(sigma_hat, sigma, decimal=1) 81 | assert_almost_equal(rho_hat, rho_expected, decimal=2) 82 | assert_equal(X.shape, (n_samples, shape[0] * shape[1])) 83 | assert_equal(X_.shape, (n_samples, shape[0], shape[1])) 84 | assert_equal(np.count_nonzero(beta), 4 * (roi_size ** 2)) 85 | assert_equal(y, np.dot(X, beta) + noise) 86 | 87 | # Test 2 88 | shape = SHAPE_3D 89 | roi_size = ROI_SIZE_3D 90 | return_shaped_data = False 91 | 92 | X, y, beta, noise = \ 93 | multivariate_simulation(n_samples=n_samples, shape=shape, 94 | roi_size=roi_size, 95 | return_shaped_data=return_shaped_data, 96 | seed=0) 97 | 98 | assert_equal(X.shape, (n_samples, shape[0] * shape[1] * shape[2])) 99 | assert_equal(np.count_nonzero(beta), 5 * (roi_size ** 3)) 100 | 101 | 102 | def test_multivariate_temporal_simulation(): 103 | '''Test if the data has expected shape, if the input parameters 104 | are close to their empirical estimators, if the support size is 105 | correct and if the noise model is the generative model. The 106 | first test concerns a simulation with a 1D spatial structure 107 | and a temporal structure, the second test concerns a simulation 108 | with a random spatial structure and a temporal structure.''' 109 | 110 | n_samples = 30 111 | n_features = 50 112 | n_times = 10 113 | support_size = 2 114 | sigma = 1.0 115 | rho_noise = 0.9 116 | rho_data = 0.95 117 | 118 | # Test 1 119 | X, Y, beta, noise = \ 120 | multivariate_temporal_simulation(n_samples=n_samples, 121 | n_features=n_features, 122 | n_times=n_times, 123 | support_size=support_size, 124 | sigma=sigma, 125 | rho_noise=rho_noise, 126 | rho_data=rho_data) 127 | 128 | sigma_hat = np.std(noise[:, -1]) 129 | rho_noise_hat = np.corrcoef(noise[:, -1], noise[:, -2])[0, 1] 130 | 131 | assert_almost_equal(sigma_hat, sigma, decimal=1) 132 | assert_almost_equal(rho_noise_hat, rho_noise, decimal=1) 133 | assert_equal(X.shape, (n_samples, n_features)) 134 | assert_equal(Y.shape, (n_samples, n_times)) 135 | assert_equal(np.count_nonzero(beta), support_size * n_times) 136 | assert_equal(Y, np.dot(X, beta) + noise) 137 | 138 | # Test 2 139 | X, Y, beta, noise = \ 140 | multivariate_temporal_simulation(n_samples=n_samples, 141 | n_features=n_features, 142 | n_times=n_times, 143 | support_size=support_size, 144 | sigma=sigma, 145 | rho_noise=rho_noise, 146 | rho_data=rho_data, 147 | shuffle=False) 148 | 149 | rho_data_hat = np.corrcoef(X[:, 19], X[:, 20])[0, 1] 150 | assert_almost_equal(rho_data_hat, rho_data, decimal=1) 151 | assert_equal(Y, np.dot(X, beta) + noise) 152 | -------------------------------------------------------------------------------- /hidimstat/test/test_stat_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test the stat module 3 | """ 4 | 5 | import numpy as np 6 | from numpy.testing import assert_almost_equal, assert_equal 7 | 8 | from hidimstat.stat_tools import 
\ 9 | (_replace_infinity, pval_corr_from_pval, pval_from_scale, 10 | zscore_from_cb, pval_from_cb, two_sided_pval_from_zscore, 11 | two_sided_pval_from_cb, zscore_from_pval, 12 | pval_from_two_sided_pval_and_sign, two_sided_pval_from_pval) 13 | 14 | 15 | def test__replace_infinity(): 16 | 17 | x = np.asarray([10, np.inf, -np.inf]) 18 | 19 | # replace inf by the largest absolute value times two 20 | x_clean = _replace_infinity(x) 21 | expected = np.asarray([10, 20, -20]) 22 | assert_equal(x_clean, expected) 23 | 24 | # replace inf by 40 25 | x_clean = _replace_infinity(x, replace_val=40) 26 | expected = np.asarray([10, 40, -40]) 27 | assert_equal(x_clean, expected) 28 | 29 | # replace inf by the largest absolute value plus one 30 | x_clean = _replace_infinity(x, method='plus-one') 31 | expected = np.asarray([10, 11, -11]) 32 | assert_equal(x_clean, expected) 33 | 34 | 35 | def test_pval_corr_from_pval(): 36 | 37 | pval = np.asarray([1.0, 0.025, 0.5]) 38 | 39 | # Correction for multiple testing: 3 features tested simultaneously 40 | pval_corr = pval_corr_from_pval(pval) 41 | expected = np.asarray([1.0, 0.075, 0.5]) 42 | assert_almost_equal(pval_corr, expected, decimal=10) 43 | 44 | one_minus_pval = np.asarray([0.0, 0.975, 0.5]) 45 | 46 | # Correction for multiple testing: 3 features tested simultaneously 47 | one_minus_pval_corr = pval_corr_from_pval(one_minus_pval) 48 | expected = np.asarray([0.0, 0.925, 0.5]) 49 | assert_almost_equal(one_minus_pval_corr, expected, decimal=10) 50 | 51 | 52 | def test_pval_from_scale(): 53 | 54 | beta = np.asarray([-1.5, 1, 0]) 55 | scale = np.asarray([0.25, 0.5, 0.5]) 56 | 57 | # Computing p-value and one minus the p-value. 58 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 59 | pval_from_scale(beta, scale) 60 | expected = np.asarray([[1.0, 0.022, 0.5], [1.0, 0.068, 0.5], 61 | [0.0, 0.978, 0.5], [0.0, 0.932, 0.5]]) 62 | 63 | assert_almost_equal(pval, expected[0], decimal=2) 64 | assert_almost_equal(pval_corr, expected[1], decimal=2) 65 | assert_almost_equal(one_minus_pval, expected[2], decimal=2) 66 | assert_almost_equal(one_minus_pval_corr, expected[3], decimal=2) 67 | 68 | 69 | def test_zscore_from_cb(): 70 | 71 | cb_min = np.asarray([-2, 0, -1]) 72 | cb_max = np.asarray([-1, 2, 1]) 73 | 74 | # Computing z-scores from 95% confidence-intervals assuming Gaussianity 75 | zscore = zscore_from_cb(cb_min, cb_max) 76 | expected = np.asarray([-5.87, 1.96, 0]) 77 | 78 | assert_almost_equal(zscore, expected, decimal=2) 79 | 80 | 81 | def test_pval_from_cb(): 82 | 83 | cb_min = np.asarray([-2, 0, -1]) 84 | cb_max = np.asarray([-1, 2, 1]) 85 | 86 | # Computing p-value and one minus the p-value. 
87 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 88 | pval_from_cb(cb_min, cb_max) 89 | expected = np.asarray([[1.0, 0.025, 0.5], [1.0, 0.075, 0.5], 90 | [0.0, 0.975, 0.5], [0.0, 0.925, 0.5]]) 91 | 92 | assert_almost_equal(pval, expected[0], decimal=2) 93 | assert_almost_equal(pval_corr, expected[1], decimal=2) 94 | assert_almost_equal(one_minus_pval, expected[2], decimal=2) 95 | assert_almost_equal(one_minus_pval_corr, expected[3], decimal=2) 96 | 97 | 98 | def test_two_sided_pval_from_zscore(): 99 | 100 | zscore = np.asarray([-5.87, 1.96, 0]) 101 | 102 | # Computing two-sided pval from z-scores assuming Gaussianity 103 | two_sided_pval, two_sided_pval_corr = two_sided_pval_from_zscore(zscore) 104 | expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) 105 | 106 | assert_almost_equal(two_sided_pval, expected[0], decimal=2) 107 | assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) 108 | 109 | 110 | def test_two_sided_pval_from_cb(): 111 | 112 | cb_min = np.asarray([-2, 0, -1]) 113 | cb_max = np.asarray([-1, 2, 1]) 114 | 115 | # Computing two-sided pval from 95% confidence bounds assuming Gaussianity 116 | two_sided_pval, two_sided_pval_corr = \ 117 | two_sided_pval_from_cb(cb_min, cb_max) 118 | expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) 119 | 120 | assert_almost_equal(two_sided_pval, expected[0], decimal=2) 121 | assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) 122 | 123 | 124 | def test_zscore_from_pval(): 125 | 126 | pval = np.asarray([1.0, 0.025, 0.5, 0.975]) 127 | 128 | # Computing z-scores from p-value 129 | zscore = zscore_from_pval(pval) 130 | expected = _replace_infinity(np.asarray([-np.inf, 1.96, 0, -1.96]), 131 | replace_val=40, method='plus-one') 132 | 133 | assert_almost_equal(zscore, expected, decimal=2) 134 | 135 | pval = np.asarray([1.0, 0.025, 0.5, 0.975]) 136 | one_minus_pval = np.asarray([0.0, 0.975, 0.5, 0.025]) 137 | 138 | # Computing z-scores from p-value and one minus the p-value 139 | zscore = zscore_from_pval(pval, one_minus_pval) 140 | expected = _replace_infinity(np.asarray([-np.inf, 1.96, 0, -1.96]), 141 | replace_val=40, method='plus-one') 142 | 143 | assert_almost_equal(zscore, expected, decimal=2) 144 | 145 | 146 | def test_pval_from_two_sided_pval_and_sign(): 147 | 148 | two_sided_pval = np.asarray([0.025, 0.05, 0.5]) 149 | parameter_sign = np.asarray([-1.0, 1.0, -1.0]) 150 | 151 | # One-sided p-values from two-sided p-value and sign. 152 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 153 | pval_from_two_sided_pval_and_sign(two_sided_pval, parameter_sign) 154 | expected = np.asarray([[0.9875, 0.025, 0.75], [0.9625, 0.075, 0.5], 155 | [0.0125, 0.975, 0.25], [0.0375, 0.925, 0.5]]) 156 | 157 | assert_equal(pval, expected[0]) 158 | assert_almost_equal(pval_corr, expected[1]) 159 | assert_equal(one_minus_pval, expected[2]) 160 | assert_almost_equal(one_minus_pval_corr, expected[3]) 161 | 162 | 163 | def test_two_sided_pval_from_pval(): 164 | 165 | pval = np.asarray([1.0, 0.025, 0.5]) 166 | one_minus_pval = np.asarray([0.0, 0.975, 0.5]) 167 | 168 | # Two-sided p-value from one-side p-values. 
169 | two_sided_pval, two_sided_pval_corr = \ 170 | two_sided_pval_from_pval(pval, one_minus_pval) 171 | expected = np.asarray([[0.0, 0.05, 1.0], [0.0, 0.15, 1.0]]) 172 | 173 | assert_almost_equal(two_sided_pval, expected[0], decimal=2) 174 | assert_almost_equal(two_sided_pval_corr, expected[1], decimal=2) 175 | -------------------------------------------------------------------------------- /hidimstat/ensemble_clustered_inference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from joblib import Parallel, delayed 3 | 4 | from .multi_sample_split import aggregate_medians, aggregate_quantiles 5 | from .clustered_inference import clustered_inference 6 | 7 | 8 | def _ensembling(list_beta_hat, list_pval, list_pval_corr, list_one_minus_pval, 9 | list_one_minus_pval_corr, method='quantiles', gamma_min=0.2): 10 | 11 | beta_hat = np.asarray(list_beta_hat) 12 | list_pval = np.asarray(list_pval) 13 | list_pval_corr = np.asarray(list_pval_corr) 14 | list_one_minus_pval = np.asarray(list_one_minus_pval) 15 | list_one_minus_pval_corr = np.asarray(list_one_minus_pval_corr) 16 | 17 | beta_hat = np.mean(list_beta_hat, axis=0) 18 | 19 | if method == 'quantiles': 20 | 21 | pval = aggregate_quantiles(list_pval, gamma_min) 22 | pval_corr = aggregate_quantiles(list_pval_corr, gamma_min) 23 | one_minus_pval = aggregate_quantiles(list_one_minus_pval, gamma_min) 24 | one_minus_pval_corr = \ 25 | aggregate_quantiles(list_one_minus_pval_corr, gamma_min) 26 | 27 | elif method == 'medians': 28 | 29 | pval = aggregate_medians(list_pval) 30 | pval_corr = aggregate_medians(list_pval_corr) 31 | one_minus_pval = aggregate_medians(list_one_minus_pval) 32 | one_minus_pval_corr = aggregate_medians(list_one_minus_pval_corr) 33 | 34 | else: 35 | 36 | raise ValueError("Unknown ensembling method.") 37 | 38 | return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr 39 | 40 | 41 | def ensemble_clustered_inference(X_init, y, ward, n_clusters, 42 | train_size=0.3, groups=None, 43 | inference_method='desparsified-lasso', 44 | seed=0, ensembling_method='quantiles', 45 | gamma_min=0.2, n_bootstraps=25, n_jobs=1, 46 | memory=None, verbose=1, **kwargs): 47 | """Ensemble clustered inference algorithm 48 | 49 | Parameters 50 | ---------- 51 | X_init : ndarray, shape (n_samples, n_features) 52 | Original data (uncompressed). 53 | 54 | y : ndarray, shape (n_samples,) or (n_samples, n_times) 55 | Target. 56 | 57 | ward : sklearn.cluster.FeatureAgglomeration 58 | Scikit-learn object that computes Ward hierarchical clustering. 59 | 60 | n_clusters : int 61 | Number of clusters used for the compression. 62 | 63 | train_size : float, optional (default=0.3) 64 | Fraction of samples used to compute the clustering. 65 | If `train_size = 1`, clustering is not random since all the samples 66 | are used to compute the clustering. 67 | 68 | groups : ndarray, shape (n_samples,), optional (default=None) 69 | Group labels for every sample. If not None, `groups` is used to build 70 | the subsamples that serve for computing the clustering. 71 | 72 | inference_method : str, optional (default='desparsified-lasso') 73 | Method used for making the inference. 74 | Currently, the two methods available are 'desparsified-lasso' 75 | and 'group-desparsified-lasso'. Use 'desparsified-lasso' for 76 | non-temporal data and 'group-desparsified-lasso' for temporal data. 77 | 78 | seed: int, optional (default=0) 79 | Seed used for generating a the first random subsample of the data. 
80 | This seed controls the clustering randomness. 81 | 82 | ensembling_method : str, optional (default='quantiles') 83 | Method used for making the ensembling. Currently, the two methods 84 | available are 'quantiles' and 'median'. 85 | 86 | gamma_min : float, optional (default=0.2) 87 | Lowest gamma-quantile being considered to compute the adaptive 88 | quantile aggregation formula. This parameter is considered only if 89 | `ensembling_method` is 'quantiles'. 90 | 91 | n_bootstraps : int, optional (default=25) 92 | Number of clustered inference algorithm solutions to compute before 93 | making the ensembling. 94 | 95 | n_jobs : int or None, optional (default=1) 96 | Number of CPUs used to compute several clustered inference 97 | algorithms at the same time. 98 | 99 | memory : str, optional (default=None) 100 | Used to cache the output of the computation of the clustering 101 | and the inference. By default, no caching is done. If a string is 102 | given, it is the path to the caching directory. 103 | 104 | verbose: int, optional (default=1) 105 | The verbosity level. If `verbose > 0`, we print a message before 106 | runing the clustered inference. 107 | 108 | **kwargs: 109 | Arguments passed to the statistical inference function. 110 | 111 | Returns 112 | ------- 113 | beta_hat : ndarray, shape (n_features,) or (n_features, n_times) 114 | Estimated parameter vector or matrix. 115 | 116 | pval : ndarray, shape (n_features,) 117 | p-value, with numerically accurate values for 118 | positive effects (ie., for p-value close to zero). 119 | 120 | pval_corr : ndarray, shape (n_features,) 121 | p-value corrected for multiple testing. 122 | 123 | one_minus_pval : ndarray, shape (n_features,) 124 | One minus the p-value, with numerically accurate values 125 | for negative effects (ie., for p-value close to one). 126 | 127 | one_minus_pval_corr : ndarray, shape (n_features,) 128 | One minus the p-value corrected for multiple testing. 129 | 130 | References 131 | ---------- 132 | .. [1] Chevalier, J. A., Nguyen, T. B., Thirion, B., & Salmon, J. (2021). 133 | Spatially relaxed inference on high-dimensional linear models. 134 | arXiv preprint arXiv:2106.02590. 
135 | """ 136 | 137 | if memory is not None and not isinstance(memory, str): 138 | raise ValueError("'memory' must be None or a string corresponding " + 139 | "to the path of the caching directory.") 140 | 141 | # Clustered inference algorithms 142 | results = Parallel(n_jobs=n_jobs, verbose=verbose)( 143 | delayed(clustered_inference)(X_init, y, ward, n_clusters, 144 | train_size=train_size, groups=groups, 145 | method=inference_method, seed=i, 146 | n_jobs=1, memory=memory, 147 | verbose=verbose, **kwargs) 148 | for i in np.arange(seed, seed + n_bootstraps)) 149 | 150 | # Collecting results 151 | list_beta_hat = [] 152 | list_pval, list_pval_corr = [], [] 153 | list_one_minus_pval, list_one_minus_pval_corr = [], [] 154 | 155 | for i in range(n_bootstraps): 156 | 157 | list_beta_hat.append(results[i][0]) 158 | list_pval.append(results[i][1]) 159 | list_pval_corr.append(results[i][2]) 160 | list_one_minus_pval.append(results[i][3]) 161 | list_one_minus_pval_corr.append(results[i][4]) 162 | 163 | # Ensembling 164 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 165 | _ensembling(list_beta_hat, list_pval, list_pval_corr, 166 | list_one_minus_pval, list_one_minus_pval_corr, 167 | method=ensembling_method, gamma_min=gamma_min) 168 | 169 | return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr 170 | -------------------------------------------------------------------------------- /hidimstat/permutation_test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from joblib import Parallel, delayed 3 | 4 | from sklearn.base import clone 5 | from sklearn.utils import _safe_indexing 6 | from sklearn.svm import LinearSVR 7 | from sklearn.model_selection import GridSearchCV 8 | from sklearn.pipeline import Pipeline 9 | 10 | from hidimstat.stat_tools import pval_from_two_sided_pval_and_sign 11 | 12 | 13 | def permutation_test_cv(X, y, n_permutations=1000, 14 | C=None, Cs=np.logspace(-7, 1, 9), 15 | seed=0, n_jobs=1, verbose=1): 16 | """Cross-validated permutation test shuffling the target 17 | 18 | Parameters 19 | ----------- 20 | X : ndarray, shape (n_samples, n_features) 21 | Data. 22 | 23 | y : ndarray, shape (n_samples,) 24 | Target. 25 | 26 | C : float or None, optional (default=None) 27 | If None, the linear SVR regularization parameter is set by cross-val 28 | running a grid search on the list of hyper-parameters contained in Cs. 29 | Otherwise, the regularization parameter is equal to C. 30 | The strength of the regularization is inversely proportional to C. 31 | 32 | Cs : ndarray, optional (default=np.logspace(-7, 1, 9)) 33 | If C is None, the linear SVR regularization parameter is set by 34 | cross-val running a grid search on the list of hyper-parameters 35 | contained in Cs. 36 | 37 | n_permutations : int, optional (default=1000) 38 | Number of permutations used to compute the survival function 39 | and cumulative distribution function scores. 40 | 41 | seed : int, optional (default=0) 42 | Determines the permutations used for shuffling the target 43 | 44 | n_jobs : int or None, optional (default=1) 45 | Number of CPUs to use during the cross validation. 46 | 47 | verbose: int, optional (default=1) 48 | The verbosity level: if non zero, progress messages are printed 49 | when computing the permutation stats in parralel. 50 | The frequency of the messages increases with the verbosity level. 
51 | 52 | Returns 53 | ------- 54 | pval_corr : ndarray, shape (n_features,) 55 | p-value corrected for multiple testing, with numerically accurate 56 | values for positive effects (ie., for p-value close to zero). 57 | 58 | one_minus_pval_corr : ndarray, shape (n_features,) 59 | One minus the corrected p-value, with numerically accurate 60 | values for negative effects (ie., for p-value close to one). 61 | """ 62 | 63 | if C is None: 64 | 65 | steps = [('SVR', LinearSVR())] 66 | pipeline = Pipeline(steps) 67 | parameters = {'SVR__C': Cs} 68 | grid = GridSearchCV(pipeline, param_grid=parameters, n_jobs=n_jobs) 69 | grid.fit(X, y) 70 | C = grid.best_params_['SVR__C'] 71 | estimator = LinearSVR(C=C) 72 | 73 | else: 74 | 75 | estimator = LinearSVR(C=C) 76 | 77 | pval_corr, one_minus_pval_corr = \ 78 | permutation_test(X, y, estimator, n_permutations=n_permutations, 79 | seed=seed, n_jobs=n_jobs, verbose=verbose) 80 | 81 | return pval_corr, one_minus_pval_corr 82 | 83 | 84 | def permutation_test(X, y, estimator, n_permutations=1000, 85 | seed=0, n_jobs=1, verbose=1): 86 | """Permutation test shuffling the target 87 | 88 | Parameters 89 | ----------- 90 | X : ndarray, shape (n_samples, n_features) 91 | Data. 92 | 93 | y : ndarray, shape (n_samples,) 94 | Target. 95 | 96 | n_permutations : int, optional (default=1000) 97 | Number of permutations used to compute the survival function 98 | and cumulative distribution function scores. 99 | 100 | seed : int, optional (default=0) 101 | Determines the permutations used for shuffling the target 102 | 103 | n_jobs : int or None, optional (default=1) 104 | Number of CPUs to use during the cross validation. 105 | 106 | verbose: int, optional (default=1) 107 | The verbosity level: if non zero, progress messages are printed 108 | when computing the permutation stats in parralel. 109 | The frequency of the messages increases with the verbosity level. 110 | 111 | Returns 112 | ------- 113 | pval_corr : ndarray, shape (n_features,) 114 | p-value corrected for multiple testing, with numerically accurate 115 | values for positive effects (ie., for p-value close to zero). 116 | 117 | one_minus_pval_corr : ndarray, shape (n_features,) 118 | One minus the corrected p-value, with numerically accurate 119 | values for negative effects (ie., for p-value close to one). 
120 | """ 121 | 122 | rng = np.random.default_rng(seed) 123 | 124 | stat = _permutation_test_stat(clone(estimator), X, y) 125 | 126 | permutation_stats = \ 127 | Parallel(n_jobs=n_jobs, verbose=verbose)( 128 | delayed(_permutation_test_stat)(clone(estimator), X, 129 | _shuffle(y, rng)) 130 | for _ in range(n_permutations)) 131 | 132 | permutation_stats = np.array(permutation_stats) 133 | two_sided_pval_corr = step_down_max_T(stat, permutation_stats) 134 | 135 | stat_sign = np.sign(stat) 136 | 137 | pval_corr, _, one_minus_pval_corr, _ = \ 138 | pval_from_two_sided_pval_and_sign(two_sided_pval_corr, stat_sign) 139 | 140 | return pval_corr, one_minus_pval_corr 141 | 142 | 143 | def _permutation_test_stat(estimator, X, y): 144 | """Fit estimator and get coef""" 145 | stat = estimator.fit(X, y).coef_ 146 | return stat 147 | 148 | 149 | def _shuffle(y, rng): 150 | """Shuffle vector""" 151 | indices = rng.permutation(len(y)) 152 | return _safe_indexing(y, indices) 153 | 154 | 155 | def step_down_max_T(stat, permutation_stats): 156 | """Step-down maxT algorithm for computing adjusted p-values 157 | 158 | Parameters 159 | ----------- 160 | stat : ndarray, shape (n_features,) 161 | Statistic computed on the original (unpermutted) problem. 162 | 163 | permutation_stats : ndarray, shape (n_permutations, n_features) 164 | Statistics computed on permutted problems. 165 | 166 | Returns 167 | ------- 168 | two_sided_pval_corr : ndarray, shape (n_features,) 169 | Two-sided p-values corrected for multiple testing. 170 | 171 | References 172 | ---------- 173 | .. [1] Westfall, P. H., & Young, S. S. (1993). Resampling-based multiple 174 | testing: Examples and methods for p-value adjustment (Vol. 279). 175 | John Wiley & Sons. 176 | """ 177 | 178 | n_permutations, n_features = np.shape(permutation_stats) 179 | 180 | index_ordered = np.argsort(np.abs(stat)) 181 | stat_ranked = np.empty(n_features) 182 | stat_ranked[index_ordered] = np.arange(n_features) 183 | stat_ranked = stat_ranked.astype(int) 184 | stat_sorted = np.copy(np.abs(stat)[index_ordered]) 185 | permutation_stats_ordered = \ 186 | np.copy(np.abs(permutation_stats)[:, index_ordered]) 187 | 188 | for i in range(1, n_features): 189 | permutation_stats_ordered[:, i] = \ 190 | np.maximum(permutation_stats_ordered[:, i - 1], 191 | permutation_stats_ordered[:, i]) 192 | 193 | two_sided_pval_corr = \ 194 | (np.sum(np.less_equal(stat_sorted, permutation_stats_ordered), axis=0) 195 | / n_permutations) 196 | 197 | for i in range(n_features - 1)[::-1]: 198 | two_sided_pval_corr[i] = \ 199 | np.maximum(two_sided_pval_corr[i], two_sided_pval_corr[i + 1]) 200 | 201 | two_sided_pval_corr = np.copy(two_sided_pval_corr[stat_ranked]) 202 | 203 | return two_sided_pval_corr 204 | -------------------------------------------------------------------------------- /hidimstat/scenario.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import ndimage 3 | 4 | ROI_SIZE_2D = 2 5 | SHAPE_2D = (12, 12) 6 | 7 | ROI_SIZE_3D = 2 8 | SHAPE_3D = (12, 12, 12) 9 | 10 | 11 | def multivariate_1D_simulation(n_samples=100, n_features=500, 12 | support_size=10, sigma=1.0, 13 | rho=0.0, shuffle=True, seed=0): 14 | """Generate 1D data with Toeplitz design matrix 15 | 16 | Parameters 17 | ----------- 18 | n_samples : int 19 | Number of samples. 20 | 21 | n_features : int 22 | Number of features. 23 | 24 | support_size : int 25 | Size of the support. 
26 | 27 | sigma : float 28 | Standard deviation of the additive White Gaussian noise. 29 | 30 | rho: float 31 | Level of correlation between neighboring features (if not `shuffle`). 32 | 33 | shuffle : bool 34 | Shuffle the features (breaking 1D data structure) if True. 35 | 36 | seed : int 37 | Seed used for generating design matrix and noise. 38 | 39 | Returns 40 | ------- 41 | X : ndarray, shape (n_samples, n_features) 42 | Design matrix. 43 | 44 | y : ndarray, shape (n_samples,) 45 | Target. 46 | 47 | beta : ndarray, shape (n_features,) 48 | Parameter vector. 49 | 50 | noise : ndarray, shape (n_samples,) 51 | Additive white Gaussian noise. 52 | """ 53 | 54 | rng = np.random.default_rng(seed) 55 | 56 | X = np.zeros((n_samples, n_features)) 57 | X[:, 0] = rng.standard_normal(n_samples) 58 | 59 | for i in np.arange(1, n_features): 60 | rand_vector = ((1 - rho ** 2) ** 0.5) * rng.standard_normal(n_samples) 61 | X[:, i] = rho * X[:, i - 1] + rand_vector 62 | 63 | if shuffle: 64 | rng.shuffle(X.T) 65 | 66 | beta = np.zeros(n_features) 67 | beta[0:support_size] = 1.0 68 | 69 | noise = sigma * rng.standard_normal(n_samples) 70 | y = np.dot(X, beta) + noise 71 | 72 | return X, y, beta, noise 73 | 74 | 75 | def generate_2D_weight(shape, roi_size): 76 | """Create a 2D weight map with four ROIs 77 | 78 | Parameters 79 | ----------- 80 | shape : tuple (n_x, n_z) 81 | Shape of the data in the simulation. 82 | 83 | roi_size : int 84 | Size of the edge of the ROIs. 85 | 86 | Returns 87 | ------- 88 | w : ndarray, shape (n_x, n_z) 89 | 2D weight map. 90 | """ 91 | 92 | w = np.zeros(shape + (5,)) 93 | w[0:roi_size, 0:roi_size, 0] = 1.0 94 | w[-roi_size:, -roi_size:, 1] = 1.0 95 | w[0:roi_size, -roi_size:, 2] = 1.0 96 | w[-roi_size:, 0:roi_size, 3] = 1.0 97 | 98 | return w 99 | 100 | 101 | def generate_3D_weight(shape, roi_size): 102 | """Create a 3D weight map with five ROIs 103 | 104 | Parameters 105 | ----------- 106 | shape : tuple (n_x, n_y, n_z) 107 | Shape of the data in the simulation. 108 | 109 | roi_size : int 110 | Size of the edge of the ROIs. 111 | 112 | Returns 113 | ------- 114 | w : ndarray, shape (n_x, n_y, n_z) 115 | 3D weight map. 116 | """ 117 | 118 | w = np.zeros(shape + (5,)) 119 | w[0:roi_size, 0:roi_size, 0:roi_size, 0] = -1.0 120 | w[-roi_size:, -roi_size:, 0:roi_size, 1] = 1.0 121 | w[0:roi_size, -roi_size:, -roi_size:, 2] = -1.0 122 | w[-roi_size:, 0:roi_size, -roi_size:, 3] = 1.0 123 | w[(shape[0] - roi_size) // 2:(shape[0] + roi_size) // 2, 124 | (shape[1] - roi_size) // 2:(shape[1] + roi_size) // 2, 125 | (shape[2] - roi_size) // 2:(shape[2] + roi_size) // 2, 4] = 1.0 126 | return w 127 | 128 | 129 | def multivariate_simulation(n_samples=100, 130 | shape=SHAPE_2D, 131 | roi_size=ROI_SIZE_2D, 132 | sigma=1.0, 133 | smooth_X=1.0, 134 | return_shaped_data=True, 135 | seed=0): 136 | """Generate a multivariate simulation with 2D or 3D data 137 | 138 | Parameters 139 | ----------- 140 | n_samples : int 141 | Number of samples. 142 | 143 | shape : tuple (n_x, n_y) or (n_x, n_y, n_z) 144 | Shape of the data in the simulation. 145 | 146 | roi_size : int 147 | Size of the edge of the ROIs. 148 | 149 | sigma : float 150 | Standard deviation of the additive white Gaussian noise. 151 | 152 | smooth_X : float 153 | Level of (data) smoothing using a Gaussian filter. 154 | 155 | return_shaped_data : bool 156 | If true, the function returns shaped data and weight map. 157 | 158 | seed : int 159 | Seed used for generating design matrix and noise. 
160 | 161 | Returns 162 | ------- 163 | X : ndarray, shape (n_samples, n_features) 164 | Design matrix. 165 | 166 | y : ndarray, shape (n_samples,) 167 | Target. 168 | beta: ndarray, shape (n_features,) 169 | Parameter vector (flattened weight map). 170 | 171 | noise: ndarray, shape (n_samples,) 172 | Additive white Gaussian noise. 173 | 174 | X_: ndarray, shape (n_samples, n_x, n_y) or (n_samples, n_x, n_y, n_z) 175 | Reshaped design matrix. 176 | 177 | w : ndarray, shape (n_x, n_y) or (n_x, n_y, n_z) 178 | 2D or 3D weight map. 179 | """ 180 | 181 | rng = np.random.default_rng(seed) 182 | 183 | if len(shape) == 2: 184 | w = generate_2D_weight(shape, roi_size) 185 | elif len(shape) == 3: 186 | w = generate_3D_weight(shape, roi_size) 187 | 188 | beta = w.sum(-1).ravel() 189 | X_ = rng.standard_normal((n_samples,) + shape) 190 | X = [] 191 | 192 | for i in np.arange(n_samples): 193 | Xi = ndimage.filters.gaussian_filter(X_[i], smooth_X) 194 | X.append(Xi.ravel()) 195 | 196 | X = np.asarray(X) 197 | X_ = X.reshape((n_samples,) + shape) 198 | 199 | noise = sigma * rng.standard_normal(n_samples) 200 | y = np.dot(X, beta) + noise 201 | 202 | if return_shaped_data: 203 | return X, y, beta, noise, X_, w 204 | 205 | return X, y, beta, noise 206 | 207 | 208 | def multivariate_temporal_simulation(n_samples=100, n_features=500, n_times=30, 209 | support_size=10, sigma=1.0, rho_noise=0.0, 210 | rho_data=0.0, shuffle=True, seed=0): 211 | """Generate 1D temporal data with constant design matrix 212 | 213 | Parameters 214 | ----------- 215 | n_samples : int 216 | Number of samples. 217 | 218 | n_features : int 219 | Number of features. 220 | 221 | n_times : int 222 | Number of time points. 223 | 224 | support_size: int 225 | Size of the row support. 226 | 227 | sigma : float 228 | Standard deviation of the noise at each time point. 229 | 230 | rho_noise : float 231 | Level of autocorrelation in the noise. 232 | 233 | rho_data: float 234 | Level of correlation between neighboring features (if not `shuffle`). 235 | 236 | shuffle : bool 237 | Shuffle the features (breaking 1D data structure) if True. 238 | 239 | seed : int 240 | Seed used for generating design matrix and noise. 241 | 242 | Returns 243 | ------- 244 | X: ndarray, shape (n_samples, n_features) 245 | Design matrix. 246 | 247 | Y : ndarray, shape (n_samples, n_times) 248 | Target. 249 | 250 | beta : ndarray, shape (n_features, n_times) 251 | Parameter matrix. 252 | 253 | noise : ndarray, shape (n_samples, n_times) 254 | Noise matrix. 
255 | """ 256 | 257 | rng = np.random.default_rng(seed) 258 | 259 | X = np.zeros((n_samples, n_features)) 260 | X[:, 0] = rng.standard_normal(n_samples) 261 | 262 | for i in np.arange(1, n_features): 263 | rand_vector = \ 264 | ((1 - rho_data ** 2) ** 0.5) * rng.standard_normal(n_samples) 265 | X[:, i] = rho_data * X[:, i - 1] + rand_vector 266 | 267 | if shuffle: 268 | rng.shuffle(X.T) 269 | 270 | beta = np.zeros((n_features, n_times)) 271 | beta[0:support_size, :] = 1.0 272 | 273 | noise = np.zeros((n_samples, n_times)) 274 | noise[:, 0] = rng.standard_normal(n_samples) 275 | 276 | for i in range(1, n_times): 277 | rand_vector = \ 278 | ((1 - rho_noise ** 2) ** 0.5) * rng.standard_normal(n_samples) 279 | noise[:, i] = rho_noise * noise[:, i - 1] + rand_vector 280 | 281 | noise = sigma * noise 282 | 283 | Y = np.dot(X, beta) + noise 284 | 285 | return X, Y, beta, noise 286 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # hidimstat documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Jun 1 00:35:01 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import os 16 | import sys 17 | import warnings 18 | import sphinx_gallery 19 | import sphinx_bootstrap_theme 20 | from distutils.version import LooseVersion 21 | import matplotlib 22 | 23 | # Disable agg warnings in doc 24 | warnings.filterwarnings("ignore", category=UserWarning, 25 | message='Matplotlib is currently using agg, which is a' 26 | ' non-GUI backend, so cannot show the figure.') 27 | 28 | # If extensions (or modules to document with autodoc) are in another directory, 29 | # add these directories to sys.path here. If the directory is relative to the 30 | # documentation root, use os.path.abspath to make it absolute, like shown here. 31 | #sys.path.insert(0, os.path.abspath('.')) 32 | 33 | 34 | # -- General configuration ------------------------------------------------ 35 | 36 | # If your documentation needs a minimal Sphinx version, state it here. 37 | #needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.autosummary', 45 | 'sphinx.ext.doctest', 46 | 'sphinx.ext.intersphinx', 47 | 'sphinx.ext.mathjax', 48 | 'sphinx_gallery.gen_gallery', 49 | 'numpydoc', 50 | ] 51 | 52 | if LooseVersion(sphinx_gallery.__version__) < LooseVersion('0.2'): 53 | raise ImportError('Must have at least version 0.2 of sphinx-gallery, got ' 54 | '%s' % (sphinx_gallery.__version__,)) 55 | 56 | matplotlib.use('agg') 57 | 58 | 59 | # Add any paths that contain templates here, relative to this directory. 60 | templates_path = ['_templates'] 61 | 62 | # The suffix(es) of source filenames. 63 | # You can specify multiple suffix as a list of string: 64 | # 65 | source_suffix = ['.rst', '.md'] 66 | # source_suffix = '.rst' 67 | 68 | # The master toctree document. 69 | master_doc = 'index' 70 | 71 | # General information about the project. 
72 | project = u'hidimstat' 73 | copyright = u'2020, Jerome-Alexis Chevalier & Binh Nguyen' 74 | author = u'Jerome-Alexis Chevalier & Binh Nguyen' 75 | 76 | # The version info for the project you're documenting, acts as replacement for 77 | # |version| and |release|, also used in various other places throughout the 78 | # built documents. 79 | # 80 | # The short X.Y version. 81 | from hidimstat import __version__ as version # noqa 82 | # The full version, including alpha/beta/rc tags. 83 | release = version 84 | 85 | # The language for content autogenerated by Sphinx. Refer to documentation 86 | # for a list of supported languages. 87 | # 88 | # This is also used if you do content translation via gettext catalogs. 89 | # Usually you set "language" from the command line for these cases. 90 | language = None 91 | 92 | # List of patterns, relative to source directory, that match files and 93 | # directories to ignore when looking for source files. 94 | # This patterns also effect to html_static_path and html_extra_path 95 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 96 | 97 | # The name of the Pygments (syntax highlighting) style to use. 98 | pygments_style = 'sphinx' 99 | 100 | # If true, `todo` and `todoList` produce output, else they produce nothing. 101 | todo_include_todos = False 102 | 103 | # generate autosummary even if no references 104 | autosummary_generate = True 105 | 106 | # remove warnings: "toctree contains reference to nonexisting document" 107 | numpydoc_show_class_members = False 108 | 109 | # -- Options for HTML output ---------------------------------------------- 110 | 111 | # The theme to use for HTML and HTML Help pages. See the documentation for 112 | # a list of builtin themes. 113 | html_theme = 'bootstrap' 114 | 115 | # Theme options are theme-specific and customize the look and feel of a theme 116 | # further. For a list of options available for each theme, see the 117 | # documentation. 118 | html_theme_options = { 119 | 'navbar_sidebarrel': False, 120 | 'navbar_pagenav': False, 121 | 'source_link_position': "", 122 | 'navbar_links': [ 123 | ("Examples", "auto_examples/index"), 124 | ("API", "api"), 125 | ("GitHub", "https://github.com/ja-che/hidimstat", True) 126 | ], 127 | 'bootswatch_theme': "flatly", 128 | 'bootstrap_version': "3", 129 | } 130 | 131 | # Add any paths that contain custom themes here, relative to this directory. 132 | html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() 133 | 134 | # Add any paths that contain custom static files (such as style sheets) here, 135 | # relative to this directory. They are copied after the builtin static files, 136 | # so a file named "default.css" will overwrite the builtin "default.css". 137 | html_static_path = ['_static'] 138 | 139 | 140 | # -- Options for HTMLHelp output ------------------------------------------ 141 | 142 | # Output file base name for HTML help builder. 143 | htmlhelp_basename = 'hidimstat_doc' 144 | 145 | 146 | # -- Options for LaTeX output --------------------------------------------- 147 | 148 | latex_elements = { 149 | # The paper size ('letterpaper' or 'a4paper'). 150 | # 151 | # 'papersize': 'letterpaper', 152 | 153 | # The font size ('10pt', '11pt' or '12pt'). 154 | # 155 | # 'pointsize': '10pt', 156 | 157 | # Additional stuff for the LaTeX preamble. 158 | # 159 | # 'preamble': '', 160 | 161 | # Latex figure (float) alignment 162 | # 163 | # 'figure_align': 'htbp', 164 | } 165 | 166 | # Grouping the document tree into LaTeX files. 
List of tuples 167 | # (source start file, target name, title, 168 | # author, documentclass [howto, manual, or own class]). 169 | latex_documents = [ 170 | (master_doc, 'hidimstat.tex', u'hidimstat Documentation', 171 | u'Jerome-Alexis Chevalier', 'manual'), 172 | ] 173 | 174 | 175 | # -- Options for manual page output --------------------------------------- 176 | 177 | # One entry per manual page. List of tuples 178 | # (source start file, name, description, authors, manual section). 179 | man_pages = [ 180 | (master_doc, 'hidimstat', u'Hidimstat Documentation', 181 | [author], 1) 182 | ] 183 | 184 | 185 | # -- Options for Texinfo output ------------------------------------------- 186 | 187 | # Grouping the document tree into Texinfo files. List of tuples 188 | # (source start file, target name, title, author, 189 | # dir menu entry, description, category) 190 | texinfo_documents = [ 191 | (master_doc, 'hidimstat', u'hidimstat Documentation', 192 | author, 'hidimstat', 'One line description of project.', 193 | 'Miscellaneous'), 194 | ] 195 | 196 | # -- Intersphinx configuration ----------------------------------------------- 197 | 198 | intersphinx_mapping = { 199 | 'python': ('https://docs.python.org/3', None), 200 | 'numpy': ('https://numpy.org/devdocs', None), 201 | 'scipy': ('https://scipy.github.io/devdocs', None), 202 | 'matplotlib': ('https://matplotlib.org', None), 203 | 'sklearn': ('https://scikit-learn.org/stable', None), 204 | 'numba': ('https://numba.pydata.org/numba-doc/latest', None), 205 | 'joblib': ('https://joblib.readthedocs.io/en/latest', None), 206 | 'pandas': ('https://pandas.pydata.org/pandas-docs/stable', None), 207 | 'seaborn': ('https://seaborn.pydata.org/', None), 208 | 'pyvista': ('https://docs.pyvista.org', None), 209 | } 210 | 211 | examples_dirs = ['../examples'] 212 | gallery_dirs = ['auto_examples'] 213 | import mne 214 | 215 | scrapers = ('matplotlib',) 216 | try: 217 | with warnings.catch_warnings(): 218 | warnings.filterwarnings("ignore", category=DeprecationWarning) 219 | import pyvista 220 | pyvista.OFF_SCREEN = False 221 | except Exception: 222 | pass 223 | else: 224 | brain_scraper = mne.viz._brain._BrainScraper() 225 | scrapers += (brain_scraper, 'pyvista') 226 | if any(x in scrapers for x in ('pyvista')): 227 | from traits.api import push_exception_handler 228 | push_exception_handler(reraise_exceptions=True) 229 | report_scraper = mne.report._ReportScraper() 230 | scrapers += (report_scraper,) 231 | else: 232 | report_scraper = None 233 | 234 | sphinx_gallery_conf = { 235 | 'doc_module': 'groupmne', 236 | 'reference_url': dict(groupmne=None), 237 | 'examples_dirs': examples_dirs, 238 | 'gallery_dirs': gallery_dirs, 239 | 'plot_gallery': 'True', 240 | 'thumbnail_size': (160, 112), 241 | 'min_reported_time': 1., 242 | 'backreferences_dir': os.path.join('generated'), 243 | 'abort_on_example_error': False, 244 | 'image_scrapers': scrapers, 245 | 'show_memory': True, 246 | # 'reference_url': { 247 | # 'numpy': 'http://docs.scipy.org/doc/numpy-1.9.1', 248 | # 'scipy': 'http://docs.scipy.org/doc/scipy-0.17.0/reference', 249 | # } 250 | } 251 | 252 | 253 | def setup(app): 254 | app.add_css_file('style.css') 255 | -------------------------------------------------------------------------------- /hidimstat/noise_std.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.linalg import norm 3 | from scipy.linalg import toeplitz, solve 4 | from sklearn.linear_model import LassoCV, 
MultiTaskLassoCV 5 | from sklearn.model_selection import KFold 6 | 7 | 8 | def reid(X, y, eps=1e-2, tol=1e-4, max_iter=1e4, n_jobs=1, seed=0): 9 | """Estimation of noise standard deviation using Reid procedure 10 | 11 | Parameters 12 | ----------- 13 | X : ndarray, shape (n_samples, n_features) 14 | Data. 15 | 16 | y : ndarray, shape (n_samples,) 17 | Target. 18 | 19 | eps: float, optional (default=1e-2) 20 | Length of the cross-validation path. 21 | eps=1e-2 means that alpha_min / alpha_max = 1e-2. 22 | 23 | tol : float, optional (default=1e-4) 24 | The tolerance for the optimization: if the updates are smaller 25 | than `tol`, the optimization code checks the dual gap for optimality 26 | and continues until it is smaller than `tol`. 27 | 28 | max_iter : int, optional (default=1e4) 29 | The maximum number of iterations. 30 | 31 | n_jobs : int or None, optional (default=1) 32 | Number of CPUs to use during the cross validation. 33 | 34 | seed: int, optional (default=0) 35 | Seed passed in the KFold object which is used to cross-validate 36 | LassoCV. This seed controls the partitioning randomness. 37 | 38 | Returns 39 | ------- 40 | sigma_hat : float 41 | Estimated noise standard deviation. 42 | 43 | beta_hat : array, shape (n_features,) 44 | Estimated parameter vector. 45 | 46 | References 47 | ---------- 48 | .. [1] Reid, S., Tibshirani, R., & Friedman, J. (2016). A study of error 49 | variance estimation in lasso regression. Statistica Sinica, 35-67. 50 | """ 51 | 52 | X = np.asarray(X) 53 | n_samples, n_features = X.shape 54 | 55 | if max_iter // 5 <= n_features: 56 | max_iter = n_features * 5 57 | print(f"'max_iter' has been increased to {max_iter}") 58 | 59 | cv = KFold(n_splits=5, shuffle=True, random_state=seed) 60 | 61 | clf_lasso_cv = \ 62 | LassoCV(eps=eps, normalize=False, fit_intercept=False, 63 | cv=cv, tol=tol, max_iter=max_iter, n_jobs=n_jobs) 64 | 65 | clf_lasso_cv.fit(X, y) 66 | beta_hat = clf_lasso_cv.coef_ 67 | residual = clf_lasso_cv.predict(X) - y 68 | coef_max = np.max(np.abs(beta_hat)) 69 | support = np.sum(np.abs(beta_hat) > tol * coef_max) 70 | 71 | # avoid dividing by 0 72 | support = min(support, n_samples - 1) 73 | 74 | sigma_hat = norm(residual) / np.sqrt(n_samples - support) 75 | 76 | return sigma_hat, beta_hat 77 | 78 | 79 | def group_reid(X, Y, fit_Y=True, stationary=True, method='simple', order=1, 80 | eps=1e-2, tol=1e-4, max_iter=1e4, n_jobs=1, seed=0): 81 | 82 | """Estimation of the covariance matrix using group Reid procedure 83 | 84 | Parameters 85 | ----------- 86 | X : ndarray, shape (n_samples, n_features) 87 | Data. 88 | 89 | Y : ndarray, shape (n_samples, n_times) 90 | Target. 91 | 92 | fit_Y : bool, optional (default=True) 93 | If True, Y will be regressed against X by MultiTaskLassoCV 94 | and the covariance matrix is estimated on the residuals. 95 | Otherwise, covariance matrix is estimated directly on Y. 96 | 97 | stationary : bool, optional (default=True) 98 | If True, noise is considered to have the same magnitude for each 99 | time step. Otherwise, magnitude of the noise is not constant. 100 | 101 | method : str, optional (default='simple') 102 | If 'simple', the correlation matrix is estimated by taking the 103 | median of the correlation between two consecutive time steps 104 | and the noise standard deviation for each time step is estimated 105 | by taking the median of the standard deviations for every time step. 
106 | If 'AR', the order of the AR model is given by `order` and 107 | Yule-Walker method is used to estimate the covariance matrix. 108 | 109 | order : int, optional (default=1) 110 | If `stationary=True` and `method=AR`, `order` gives the 111 | order of the estimated autoregressive model. `order` must 112 | be smaller than the number of time steps. 113 | 114 | eps : float, optional (default=1e-2) 115 | Length of the cross-validation path. 116 | eps=1e-2 means that alpha_min / alpha_max = 1e-2. 117 | 118 | tol : float, optional (default=1e-4) 119 | The tolerance for the optimization: if the updates are smaller 120 | than `tol`, the optimization code checks the dual gap for optimality 121 | and continues until it is smaller than `tol`. 122 | 123 | max_iter : int, optional (default=1e4) 124 | The maximum number of iterations. 125 | 126 | n_jobs : int or None, optional (default=1) 127 | Number of CPUs to use during the cross validation. 128 | 129 | seed: int, optional (default=0) 130 | Seed passed in the KFold object which is used to cross-validate 131 | LassoCV. This seed controls also the partitioning randomness. 132 | 133 | Returns 134 | ------- 135 | cov_hat : ndarray, shape (n_times, n_times) 136 | Estimated covariance matrix. 137 | 138 | beta_hat : ndarray, shape (n_features, n_times) 139 | Estimated parameter matrix. 140 | 141 | References 142 | ---------- 143 | .. [1] Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020). 144 | Statistical control for spatio-temporal MEG/EEG source imaging with 145 | desparsified multi-task Lasso. In NeurIPS 2020-34h Conference on 146 | Neural Information Processing Systems. 147 | """ 148 | 149 | X = np.asarray(X) 150 | n_samples, n_features = X.shape 151 | n_times = Y.shape[1] 152 | 153 | if method == 'simple': 154 | print('Group reid: simple cov estimation') 155 | else: 156 | print(f'Group reid: {method}{order} cov estimation') 157 | 158 | if (max_iter // 5) <= n_features: 159 | max_iter = n_features * 5 160 | print(f"'max_iter' has been increased to {max_iter}") 161 | 162 | cv = KFold(n_splits=5, shuffle=True, random_state=seed) 163 | 164 | if fit_Y: 165 | 166 | clf_mtlcv = \ 167 | MultiTaskLassoCV(eps=eps, normalize=False, fit_intercept=False, 168 | cv=cv, tol=tol, max_iter=max_iter, n_jobs=n_jobs) 169 | 170 | clf_mtlcv.fit(X, Y) 171 | beta_hat = clf_mtlcv.coef_ 172 | residual = clf_mtlcv.predict(X) - Y 173 | row_max = np.max(np.sum(np.abs(beta_hat), axis=0)) 174 | support = np.sum(np.sum(np.abs(beta_hat), axis=0) > tol * row_max) 175 | 176 | # avoid dividing by 0 177 | support = min(support, n_samples - 1) 178 | 179 | else: 180 | 181 | beta_hat = np.zeros((n_features, n_times)) 182 | residual = np.copy(Y) 183 | support = 0 184 | 185 | sigma_hat_raw = norm(residual, axis=0) / np.sqrt(n_samples - support) 186 | 187 | if stationary: 188 | sigma_hat = np.median(sigma_hat_raw) * np.ones(n_times) 189 | corr_emp = np.corrcoef(residual.T) 190 | else: 191 | sigma_hat = sigma_hat_raw 192 | residual_rescaled = residual / sigma_hat 193 | corr_emp = np.corrcoef(residual_rescaled.T) 194 | 195 | # Median method 196 | if not stationary or method == 'simple': 197 | 198 | rho_hat = np.median(np.diag(corr_emp, 1)) 199 | corr_hat = \ 200 | toeplitz(np.geomspace(1, rho_hat ** (n_times - 1), n_times)) 201 | cov_hat = np.outer(sigma_hat, sigma_hat) * corr_hat 202 | 203 | # Yule-Walker method 204 | elif stationary and method == 'AR': 205 | 206 | if order > n_times - 1: 207 | raise ValueError('The requested AR order is to high with ' + 208 | 'respect to the 
number of time steps.') 209 | 210 | rho_ar = np.zeros(order + 1) 211 | rho_ar[0] = 1 212 | 213 | for i in range(1, order + 1): 214 | rho_ar[i] = np.median(np.diag(corr_emp, i)) 215 | 216 | A = toeplitz(rho_ar[:-1]) 217 | coef_ar = solve(A, rho_ar[1:]) 218 | 219 | residual_estimate = np.zeros((n_samples, n_times - order)) 220 | 221 | for i in range(order): 222 | # time window used to estimate the residual from AR model 223 | start = order - i - 1 224 | end = - i - 1 225 | residual_estimate += coef_ar[i] * residual[:, start:end] 226 | 227 | residual_diff = residual[:, order:] - residual_estimate 228 | sigma_eps = np.median(norm(residual_diff, axis=0) / np.sqrt(n_samples)) 229 | 230 | rho_ar_full = np.zeros(n_times) 231 | rho_ar_full[:rho_ar.size] = rho_ar 232 | 233 | for i in range(order + 1, n_times): 234 | start = i - order 235 | end = i 236 | rho_ar_full[i] = np.dot(coef_ar[::-1], rho_ar_full[start:end]) 237 | 238 | corr_hat = toeplitz(rho_ar_full) 239 | sigma_hat[:] = sigma_eps / np.sqrt((1 - np.dot(coef_ar, rho_ar[1:]))) 240 | cov_hat = np.outer(sigma_hat, sigma_hat) * corr_hat 241 | 242 | else: 243 | raise ValueError('Unknown method for estimating the covariance matrix') 244 | 245 | return cov_hat, beta_hat 246 | 247 | 248 | def empirical_snr(X, y, beta, noise=None): 249 | """Compute the SNR for the linear model: y = X beta + noise 250 | 251 | Parameters 252 | ----------- 253 | X : ndarray or scipy.sparse matrix, shape (n_samples, n_features) 254 | Data. 255 | 256 | y : ndarray, shape (n_samples,) 257 | Target. 258 | 259 | beta : ndarray, shape (n_features,) 260 | True parameter vector. 261 | 262 | noise : ndarray, shape (n_samples,), optional (default=None) 263 | True error vector. 264 | 265 | Returns 266 | ------- 267 | snr_hat : float 268 | Empirical signal-to-noise ratio. 
269 | """ 270 | X = np.asarray(X) 271 | 272 | signal = np.dot(X, beta) 273 | 274 | if noise is None: 275 | noise = y - signal 276 | 277 | sig_signal = np.linalg.norm(signal - np.mean(signal)) 278 | sig_noise = np.linalg.norm(noise - np.mean(noise)) 279 | snr_hat = (sig_signal / sig_noise) ** 2 280 | 281 | return snr_hat 282 | -------------------------------------------------------------------------------- /hidimstat/clustered_inference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.utils import resample 3 | from sklearn.preprocessing import StandardScaler 4 | from sklearn.utils.validation import check_memory 5 | 6 | from .stat_tools import pval_from_cb 7 | from .desparsified_lasso import desparsified_lasso, desparsified_group_lasso 8 | 9 | 10 | def _subsampling(n_samples, train_size, groups=None, seed=0): 11 | """Random subsampling: computes a list of indices""" 12 | 13 | if groups is None: 14 | 15 | n_subsamples = int(n_samples * train_size) 16 | train_index = resample(np.arange(n_samples), n_samples=n_subsamples, 17 | replace=False, random_state=seed) 18 | 19 | else: 20 | 21 | unique_groups = np.unique(groups) 22 | n_groups = unique_groups.size 23 | n_subsample_groups = int(n_groups * train_size) 24 | train_group = resample(unique_groups, n_samples=n_subsample_groups, 25 | replace=False, random_state=seed) 26 | train_index = np.arange(n_samples)[np.isin(groups, train_group)] 27 | 28 | return train_index 29 | 30 | 31 | def _ward_clustering(X_init, ward, train_index): 32 | """Ward clustering applied to full X but computed from a subsample of X""" 33 | 34 | ward = ward.fit(X_init[train_index, :]) 35 | X_reduced = ward.transform(X_init) 36 | 37 | return X_reduced, ward 38 | 39 | 40 | def hd_inference(X, y, method, n_jobs=1, memory=None, verbose=0, **kwargs): 41 | """Wrap-up high-dimensional inference procedures 42 | 43 | Parameters 44 | ---------- 45 | X : ndarray, shape (n_samples, n_features) 46 | Data. 47 | 48 | y : ndarray, shape (n_samples,) or (n_samples, n_times) 49 | Target. 50 | 51 | method : str, optional (default='desparsified-lasso') 52 | Method used for making the inference. 53 | Currently the two methods available are 'desparsified-lasso' 54 | and 'group-desparsified-lasso'. Use 'desparsified-lasso' for 55 | non-temporal data and 'group-desparsified-lasso' for temporal data. 56 | 57 | n_jobs : int or None, optional (default=1) 58 | Number of CPUs to use during parallel steps such as inference. 59 | 60 | memory : str or joblib.Memory object, optional (default=None) 61 | Used to cache the output of the computation of the clustering 62 | and the inference. By default, no caching is done. If a string is 63 | given, it is the path to the caching directory. 64 | 65 | verbose: int, optional (default=1) 66 | The verbosity level. If `verbose > 0`, we print a message before 67 | runing the clustered inference. 68 | 69 | **kwargs: 70 | Arguments passed to the statistical inference function. 71 | 72 | Returns 73 | ------- 74 | beta_hat : ndarray, shape (n_features,) or (n_features, n_times) 75 | Estimated parameter vector or matrix. 76 | 77 | pval : ndarray, shape (n_features,) 78 | p-value, with numerically accurate values for 79 | positive effects (ie., for p-value close to zero). 80 | 81 | pval_corr : ndarray, shape (n_features,) 82 | p-value corrected for multiple testing. 
83 | 84 | one_minus_pval : ndarray, shape (n_features,) 85 | One minus the p-value, with numerically accurate values 86 | for negative effects (ie., for p-value close to one). 87 | 88 | one_minus_pval_corr : ndarray, shape (n_features,) 89 | One minus the p-value corrected for multiple testing. 90 | """ 91 | 92 | if method == 'desparsified-lasso': 93 | 94 | beta_hat, cb_min, cb_max = \ 95 | desparsified_lasso(X, y, confidence=0.95, n_jobs=n_jobs, 96 | memory=memory, verbose=verbose, **kwargs) 97 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 98 | pval_from_cb(cb_min, cb_max, confidence=0.95) 99 | 100 | elif method == 'desparsified-group-lasso': 101 | 102 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 103 | desparsified_group_lasso(X, y, n_jobs=n_jobs, memory=memory, 104 | verbose=verbose, **kwargs) 105 | 106 | else: 107 | 108 | raise ValueError('Unknow method') 109 | 110 | return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr 111 | 112 | 113 | def _degrouping(ward, beta_hat, pval, pval_corr, 114 | one_minus_pval, one_minus_pval_corr): 115 | """Assigning cluster-wise stats to features contained in the corresponding 116 | cluster and rescaling estimated parameter""" 117 | 118 | pval_degrouped = ward.inverse_transform(pval) 119 | pval_corr_degrouped = ward.inverse_transform(pval_corr) 120 | one_minus_pval_degrouped = ward.inverse_transform(one_minus_pval) 121 | one_minus_pval_corr_degrouped = ward.inverse_transform(one_minus_pval_corr) 122 | 123 | labels = ward.labels_ 124 | clusters_size = np.zeros(labels.size) 125 | 126 | for label in range(labels.max() + 1): 127 | cluster_size = np.sum(labels == label) 128 | clusters_size[labels == label] = cluster_size 129 | 130 | if len(beta_hat.shape) == 1: 131 | 132 | beta_hat_degrouped = ward.inverse_transform(beta_hat) / clusters_size 133 | 134 | elif len(beta_hat.shape) == 2: 135 | 136 | n_features = pval_degrouped.shape[0] 137 | n_times = beta_hat.shape[1] 138 | beta_hat_degrouped = np.zeros((n_features, n_times)) 139 | 140 | for i in range(n_times): 141 | 142 | beta_hat_degrouped[:, i] = \ 143 | ward.inverse_transform(beta_hat[:, i]) / clusters_size 144 | 145 | return (beta_hat_degrouped, pval_degrouped, pval_corr_degrouped, 146 | one_minus_pval_degrouped, one_minus_pval_corr_degrouped) 147 | 148 | 149 | def clustered_inference(X_init, y, ward, n_clusters, train_size=1.0, 150 | groups=None, method='desparsified-lasso', seed=0, 151 | n_jobs=1, memory=None, verbose=1, **kwargs): 152 | """Clustered inference algorithm 153 | 154 | Parameters 155 | ---------- 156 | X_init : ndarray, shape (n_samples, n_features) 157 | Original data (uncompressed). 158 | 159 | y : ndarray, shape (n_samples,) or (n_samples, n_times) 160 | Target. 161 | 162 | ward : sklearn.cluster.FeatureAgglomeration 163 | Scikit-learn object that computes Ward hierarchical clustering. 164 | 165 | n_clusters : int 166 | Number of clusters used for the compression. 167 | 168 | train_size : float, optional (default=1.0) 169 | Fraction of samples used to compute the clustering. 170 | If `train_size = 1`, clustering is not random since all the samples 171 | are used to compute the clustering. 172 | 173 | groups : ndarray, shape (n_samples,), optional (default=None) 174 | Group labels for every sample. If not None, `groups` is used to build 175 | the subsamples that serve for computing the clustering. 176 | 177 | method : str, optional (default='desparsified-lasso') 178 | Method used for making the inference. 
179 | Currently the two methods available are 'desparsified-lasso' 180 | and 'group-desparsified-lasso'. Use 'desparsified-lasso' for 181 | non-temporal data and 'group-desparsified-lasso' for temporal data. 182 | 183 | seed: int, optional (default=0) 184 | Seed used for generating a random subsample of the data. 185 | This seed controls the clustering randomness. 186 | 187 | n_jobs : int or None, optional (default=1) 188 | Number of CPUs to use during parallel steps such as inference. 189 | 190 | memory : str or joblib.Memory object, optional (default=None) 191 | Used to cache the output of the computation of the clustering 192 | and the inference. By default, no caching is done. If a string is 193 | given, it is the path to the caching directory. 194 | 195 | verbose: int, optional (default=1) 196 | The verbosity level. If `verbose > 0`, we print a message before 197 | runing the clustered inference. 198 | 199 | **kwargs: 200 | Arguments passed to the statistical inference function. 201 | 202 | Returns 203 | ------- 204 | beta_hat : ndarray, shape (n_features,) or (n_features, n_times) 205 | Estimated parameter vector or matrix. 206 | 207 | pval : ndarray, shape (n_features,) 208 | p-value, with numerically accurate values for 209 | positive effects (ie., for p-value close to zero). 210 | 211 | pval_corr : ndarray, shape (n_features,) 212 | p-value corrected for multiple testing. 213 | 214 | one_minus_pval : ndarray, shape (n_features,) 215 | One minus the p-value, with numerically accurate values 216 | for negative effects (ie., for p-value close to one). 217 | 218 | one_minus_pval_corr : ndarray, shape (n_features,) 219 | One minus the p-value corrected for multiple testing. 220 | 221 | References 222 | ---------- 223 | .. [1] Chevalier, J. A., Nguyen, T. B., Thirion, B., & Salmon, J. (2021). 224 | Spatially relaxed inference on high-dimensional linear models. 225 | arXiv preprint arXiv:2106.02590. 226 | """ 227 | 228 | memory = check_memory(memory) 229 | 230 | n_samples, n_features = X_init.shape 231 | 232 | if verbose > 0: 233 | 234 | print(f'Clustered inference: n_clusters = {n_clusters}, ' + 235 | f'inference method = {method}, seed = {seed}') 236 | 237 | # Sampling 238 | train_index = _subsampling(n_samples, train_size, groups=groups, seed=seed) 239 | 240 | # Clustering 241 | X, ward = memory.cache(_ward_clustering)(X_init, ward, train_index) 242 | 243 | # Preprocessing 244 | X = StandardScaler().fit_transform(X) 245 | y = y - np.mean(y) 246 | 247 | # Inference: computing reduced parameter vector and stats 248 | beta_hat_, pval_, pval_corr_, one_minus_pval_, one_minus_pval_corr_ = \ 249 | hd_inference(X, y, method, n_jobs=n_jobs, memory=memory, **kwargs) 250 | 251 | # De-grouping 252 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 253 | _degrouping(ward, beta_hat_, pval_, pval_corr_, one_minus_pval_, 254 | one_minus_pval_corr_) 255 | 256 | return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr 257 | -------------------------------------------------------------------------------- /examples/plot_fmri_data_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Support recovery on fMRI data 3 | ============================= 4 | 5 | This example compares several methods that estimate a decoder map support 6 | with statistical guarantees. More precisely, we aim at thresholding the 7 | weights of some estimated decoder maps according to the confidence we have 8 | that they are nonzero. 
Here, we work with the Haxby dataset and we focus on
9 | the 'face vs house' contrast. Thus, we consider the labeled activation maps
10 | of a given subject and try to produce a brain map that corresponds to the
11 | discriminative pattern that enables the decoding of the two conditions.
12 |
13 | In this example, we show that standard statistical methods (i.e., methods
14 | such as thresholding the SVR or Ridge decoder weights by permutation test, or
15 | the algorithm introduced by Gaonkar et al. [1]_) are not powerful when applied
16 | to the uncompressed problem (i.e., the original problem in which the activation
17 | maps are not reduced using compression techniques such as parcellation).
18 | This is notably due to the high dimensionality (too many voxels) and the
19 | structure of the data (too much correlation between neighboring voxels).
20 | We also present two methods that offer statistical guarantees but
21 | with a (small) spatial tolerance on the shape of the support:
22 | clustered desparsified lasso (CluDL) combines clustering (parcellation)
23 | and statistical inference; ensemble of clustered desparsified lasso (EnCluDL)
24 | adds a randomization step over the choice of clustering.
25 |
26 | EnCluDL is powerful and does not depend on a unique clustering choice.
27 | As shown in Chevalier et al. (2021) [2]_, for several tasks the estimated
28 | support (predictive regions) looks relevant.
29 |
30 | References
31 | ----------
32 | .. [1] Gaonkar, B., & Davatzikos, C. (2012, October). Deriving statistical
33 | significance maps for SVM based image classification and group
34 | comparisons. In International Conference on Medical Image Computing
35 | and Computer-Assisted Intervention (pp. 723-730). Springer, Berlin,
36 | Heidelberg.
37 |
38 | .. [2] Chevalier, J. A., Nguyen, T. B., Salmon, J., Varoquaux, G.,
39 | & Thirion, B. (2021). Decoding with confidence: Statistical
40 | control on decoder maps. NeuroImage, 234, 117921.
41 | """
42 |
43 | #############################################################################
44 | # Imports needed for this script
45 | # ------------------------------
46 | import numpy as np
47 | import pandas as pd
48 | from sklearn.utils import Bunch
49 | from sklearn.cluster import FeatureAgglomeration
50 | from sklearn.feature_extraction import image
51 | from sklearn.linear_model import Ridge
52 | from nilearn import datasets
53 | from nilearn.input_data import NiftiMasker
54 | from nilearn.image import mean_img
55 | from nilearn.plotting import plot_stat_map, show
56 |
57 | from hidimstat.stat_tools import zscore_from_pval, pval_from_scale
58 | from hidimstat.standardized_svr import standardized_svr
59 | from hidimstat.permutation_test import permutation_test, permutation_test_cv
60 | from hidimstat.adaptive_permutation_threshold import ada_svr
61 | from hidimstat.clustered_inference import clustered_inference
62 | from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference
63 |
64 |
65 | #############################################################################
66 | # Function to fetch and preprocess Haxby dataset
67 | # ----------------------------------------------
68 | def preprocess_haxby(subject=2, memory=None):
69 | '''Gathering and preprocessing Haxby dataset for a given subject.'''
70 |
71 | # Gathering data
72 | haxby_dataset = datasets.fetch_haxby(subjects=[subject])
73 | fmri_filename = haxby_dataset.func[0]
74 |
75 | behavioral = pd.read_csv(haxby_dataset.session_target[0], sep=" ")
76 |
77 | # conditions = pd.DataFrame.to_numpy(behavioral['labels'])
78 | conditions = behavioral['labels'].values
79 | session_label = behavioral['chunks'].values
80 |
81 | condition_mask = np.logical_or(conditions == 'face', conditions == 'house')
82 | groups = session_label[condition_mask]
83 |
84 | # Loading anatomical image (background image)
85 | if haxby_dataset.anat[0] is None:
86 | bg_img = None
87 | else:
88 | bg_img = mean_img(haxby_dataset.anat)
89 |
90 | # Building target where '1' corresponds to 'face' and '-1' to 'house'
91 | y = np.asarray((conditions[condition_mask] == 'face') * 2 - 1)
92 |
93 | # Loading mask
94 | mask_img = haxby_dataset.mask
95 | masker = NiftiMasker(mask_img=mask_img, standardize=True,
96 | smoothing_fwhm=None, memory=memory)
97 |
98 | # Computing masked data
99 | fmri_masked = masker.fit_transform(fmri_filename)
100 | X = np.asarray(fmri_masked)[condition_mask, :]
101 |
102 | return Bunch(X=X, y=y, groups=groups, bg_img=bg_img, masker=masker)
103 |
104 |
105 | #############################################################################
106 | # Gathering and preprocessing Haxby dataset for a given subject
107 | # -------------------------------------------------------------
108 | # The `preprocess_haxby` function performs the preprocessing of the Haxby
109 | # dataset: it outputs the preprocessed activation maps for the two conditions
110 | # 'face' or 'house' (contained in `X`), the conditions (in `y`),
111 | # the session labels (in `groups`) and the mask (in `masker`).
112 | # You may choose a subject in [1, 2, 3, 4, 5, 6]. By default subject=2.
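# As a quick illustration of what to expect from these outputs (a sketch, not
# part of the original example; exact sizes depend on the fetched data):
# `X` has one row per selected 'face'/'house' trial, `y` only contains the
# labels -1 and 1, and `groups` holds one session id per trial, so a
# hypothetical sanity check could read:
#
#     data_check = preprocess_haxby(subject=2)
#     assert data_check.X.shape[0] == data_check.y.size == data_check.groups.size
#     assert set(np.unique(data_check.y)) == {-1, 1}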
113 | data = preprocess_haxby(subject=2)
114 | X, y, groups, masker = data.X, data.y, data.groups, data.masker
115 | mask = masker.mask_img_.get_fdata().astype(bool)
116 |
117 | #############################################################################
118 | # Initializing FeatureAgglomeration object that performs the clustering
119 | # -------------------------------------------------------------------------
120 | # For fMRI data taking 500 clusters is generally a good default choice.
121 |
122 | n_clusters = 500
123 | # Deriving voxel connectivity.
124 | shape = mask.shape
125 | n_x, n_y, n_z = shape[0], shape[1], shape[2]
126 | connectivity = image.grid_to_graph(n_x=n_x, n_y=n_y, n_z=n_z, mask=mask)
127 | # Initializing FeatureAgglomeration object.
128 | ward = FeatureAgglomeration(n_clusters=n_clusters, connectivity=connectivity)
129 |
130 | #############################################################################
131 | # Making the inference with several algorithms
132 | # --------------------------------------------
133 |
134 | #############################################################################
135 | # First, we try to recover the discriminative pattern by computing
136 | # p-values from SVR decoder weights and a parametric approximation
137 | # of the distribution of these weights.
138 |
139 | # We precomputed the regularization parameter by CV (C = 0.1) to reduce the
140 | # computation time of the example.
141 | beta_hat, scale = standardized_svr(X, y, Cs=[0.1])
142 | pval_std_svr, _, one_minus_pval_std_svr, _ = pval_from_scale(beta_hat, scale)
143 |
144 | #############################################################################
145 | # Now, we compute p-values thanks to permutation tests applied to
146 | # 1/ the weights of the SVR decoder or 2/ the weights of the Ridge decoder.
147 |
148 | # To derive the p-values from the SVR decoder, you may change the next line to
149 | # `SVR_permutation_test_inference = True`. It should take around 15 minutes.
150 |
151 | SVR_permutation_test_inference = False
152 | if SVR_permutation_test_inference:
153 | # We computed the regularization parameter by CV (C = 0.1)
154 | pval_corr_svr_perm_test, one_minus_pval_corr_svr_perm_test = \
155 | permutation_test_cv(X, y, n_permutations=50, C=0.1)
156 |
157 | # Another method is to compute the p-values by permutation test from the
158 | # Ridge decoder. The solution provided by this method should be very close to
159 | # the previous one and the computation time is much shorter: around 20 seconds.
160 |
161 | estimator = Ridge()
162 | pval_corr_ridge_perm_test, one_minus_pval_corr_ridge_perm_test = \
163 | permutation_test(X, y, estimator=estimator, n_permutations=200)
164 |
165 | #############################################################################
166 | # Now, let us run the algorithm introduced by Gaonkar et al. (cf. References).
167 | # Since the estimator they derive is obtained by approximating the hard margin
168 | # SVM formulation, we refer to this method as "ada-SVR", which stands for
169 | # "Adaptive Permutation Threshold SVR". The function is ``ada_svr``.
170 | beta_hat, scale = ada_svr(X, y)
171 | pval_ada_svr, _, one_minus_pval_ada_svr, _ = pval_from_scale(beta_hat, scale)
172 |
173 | #############################################################################
174 | # Now, we run the clustered inference algorithm, which combines parcellation
175 | # and high-dimensional inference (cf. References).
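# Under the hood (see `hidimstat/clustered_inference.py`), `clustered_inference`
# roughly chains four steps: (i) optionally subsample the data (controlled by
# `train_size` and `seed`), (ii) fit the Ward clustering and compress `X` into
# `n_clusters` parcels, (iii) run the desparsified lasso on the reduced problem,
# and (iv) assign each cluster-wise statistic back to the voxels of the
# corresponding parcel. A schematic (non-executed) sketch of the reduced
# inference step, with hypothetical variable names, could look like:
#
#     X_reduced = ward.fit(X).transform(X)    # (n_samples, n_clusters)
#     beta_c, pval_c, _, _, _ = hd_inference(X_reduced, y,
#                                            method='desparsified-lasso')
#     pval_voxels = ward.inverse_transform(pval_c)   # back to voxel space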
176 | beta_hat, pval_cdl, _, one_minus_pval_cdl, _ = \
177 | clustered_inference(X, y, ward, n_clusters)
178 |
179 | #############################################################################
180 | # Below, we run the ensemble clustered inference algorithm which adds a
181 | # randomization step over the clustered inference algorithm (cf. References).
182 | # To make the example as short as possible we take `n_bootstraps=5`,
183 | # which means that 5 different parcellations are considered and
184 | # then 5 statistical maps are produced and aggregated into one.
185 | # However, you might benefit from clustering randomization by taking
186 | # `n_bootstraps=25` or `n_bootstraps=100`; we also set `n_jobs=2`.
187 | beta_hat, pval_ecdl, _, one_minus_pval_ecdl, _ = \
188 | ensemble_clustered_inference(X, y, ward, n_clusters, groups=groups,
189 | n_bootstraps=5, n_jobs=2)
190 |
191 | #############################################################################
192 | # Plotting the results
193 | # --------------------
194 | # To allow a better visualization of the discriminative pattern we will plot
195 | # z-maps rather than p-value maps. Assuming a Gaussian distribution of the
196 | # estimators, we can recover a z-score from a p-value by using the
197 | # inverse survival function.
198 | #
199 | # First, we set the theoretical FWER target at 10%.
200 |
201 | n_samples, n_features = X.shape
202 | target_fwer = 0.1
203 |
204 | #############################################################################
205 | # We now translate the FWER target into a z-score target.
206 | # For the permutation test methods we do not need any additional correction
207 | # since the p-values are already adjusted for multiple testing.
208 |
209 | zscore_threshold_corr = zscore_from_pval((target_fwer / 2))
210 |
211 | #############################################################################
212 | # Other methods need to be corrected. We consider the Bonferroni correction.
213 | # For methods that do not reduce the feature space, the correction
214 | # consists in dividing by the number of features.
215 |
216 | correction = 1. / n_features
217 | zscore_threshold_no_clust = zscore_from_pval((target_fwer / 2) * correction)
218 |
219 | #############################################################################
220 | # For methods that parcellate the brain into groups of voxels, the correction
221 | # consists in dividing by the number of parcels (or clusters).
222 |
223 | correction_clust = 1. / n_clusters
224 | zscore_threshold_clust = zscore_from_pval((target_fwer / 2) * correction_clust)
225 |
226 | #############################################################################
227 | # Now, we can plot the thresholded z-score maps by translating the
228 | # p-value maps estimated previously into z-score maps and using the
229 | # suitable threshold. For better readability, we make a small function
230 | # called `plot_map` that wraps all these steps.
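# Under the Gaussian assumption used throughout, the mapping is simply
# z = Phi^{-1}(1 - p), i.e. `scipy.stats.norm.isf(p)`, which is what
# `zscore_from_pval` computes (see `hidimstat/stat_tools.py`). Hence,
# thresholding the z-map at `zscore_from_pval(alpha)` keeps the features whose
# one-sided p-value (or one minus the p-value, for negative effects) falls
# below `alpha`; here `alpha` is `target_fwer / 2` times the relevant
# Bonferroni correction. For instance (illustrative check only):
#
#     from scipy.stats import norm
#     assert np.isclose(zscore_from_pval(np.array([0.025]))[0], norm.isf(0.025))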
231 |
232 |
233 | def plot_map(pval, one_minus_pval, zscore_threshold, title=None,
234 | cut_coords=[-25, -40, -5], masker=masker, bg_img=data.bg_img):
235 |
236 | zscore = zscore_from_pval(pval, one_minus_pval)
237 | zscore_img = masker.inverse_transform(zscore)
238 | plot_stat_map(zscore_img, threshold=zscore_threshold, bg_img=bg_img,
239 | dim=-1, cut_coords=cut_coords, title=title)
240 |
241 |
242 | plot_map(pval_std_svr, one_minus_pval_std_svr, zscore_threshold_no_clust,
243 | title='SVR parametric threshold')
244 |
245 | if SVR_permutation_test_inference:
246 | plot_map(pval_corr_svr_perm_test, one_minus_pval_corr_svr_perm_test,
247 | zscore_threshold_corr, title='SVR permutation-test thresh.')
248 |
249 | plot_map(pval_corr_ridge_perm_test, one_minus_pval_corr_ridge_perm_test,
250 | zscore_threshold_corr, title='Ridge permutation-test thresh.')
251 |
252 | plot_map(pval_ada_svr, one_minus_pval_ada_svr, zscore_threshold_no_clust,
253 | title='SVR adaptive perm. thresh.')
254 |
255 | plot_map(pval_cdl, one_minus_pval_cdl, zscore_threshold_clust, 'CluDL')
256 |
257 | plot_map(pval_ecdl, one_minus_pval_ecdl, zscore_threshold_clust, 'EnCluDL')
258 |
259 | #############################################################################
260 | # Analysis of the results
261 | # -----------------------
262 | # As advocated in the introduction, the methods that do not reduce the original
263 | # problem are not satisfying since they are too conservative.
264 | # Among those methods, the only one that makes discoveries is the one that
265 | # thresholds the SVR decoder using a parametric approximation.
266 | # However, this method has no statistical guarantees and we can see that some
267 | # isolated voxels are discovered, which seems quite spurious.
268 | # The discriminative pattern derived from the clustered inference algorithm
269 | # (CluDL) shows that the method is less conservative.
270 | # However, some reasonable patterns are also included in this solution.
271 | # Finally, the solution provided by the ensemble clustered inference algorithm
272 | # (EnCluDL) seems realistic as we recover the visual cortex and do not make
273 | # spurious discoveries.
274 |
275 | show()
276 |
--------------------------------------------------------------------------------
/hidimstat/stat_tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.stats import norm
3 |
4 |
5 | def _replace_infinity(x, replace_val=None, method='times-two'):
6 | """Replace infinity by large value"""
7 |
8 | largest_non_inf = np.max(np.abs(x)[np.abs(x) != np.inf])
9 |
10 | if method == 'times-two':
11 | replace_val_min = largest_non_inf * 2
12 | elif method == 'plus-one':
13 | replace_val_min = largest_non_inf + 1
14 |
15 | if (replace_val is not None) and (replace_val < largest_non_inf):
16 | replace_val = replace_val_min
17 | elif replace_val is None:
18 | replace_val = replace_val_min
19 |
20 | x_new = np.copy(x)
21 | x_new[x_new == np.inf] = replace_val
22 | x_new[x_new == -np.inf] = -replace_val
23 |
24 | return x_new
25 |
26 |
27 | def pval_corr_from_pval(one_sided_pval):
28 | """Computing one-sided p-values corrected for multiple testing
29 | from simple testing one-sided p-values.
30 |
31 | Parameters
32 | ----------
33 | one_sided_pval : ndarray, shape (n_features,)
34 | One-sided p-values.
35 |
36 | Returns
37 | -------
38 | one_sided_pval_corr : ndarray, shape (n_features,)
39 | Corrected one-sided p-values.
40 | """ 41 | 42 | n_features = one_sided_pval.size 43 | 44 | one_sided_pval_corr = np.zeros(n_features) + 0.5 45 | 46 | ind = (one_sided_pval < 0.5) 47 | one_sided_pval_corr[ind] = \ 48 | np.minimum(one_sided_pval[ind] * n_features, 0.5) 49 | 50 | ind = (one_sided_pval > 0.5) 51 | one_sided_pval_corr[ind] = \ 52 | np.maximum(1 - (1 - one_sided_pval[ind]) * n_features, 0.5) 53 | 54 | return one_sided_pval_corr 55 | 56 | 57 | def pval_from_scale(beta, scale, distrib='norm', eps=1e-14): 58 | """Computing one-sided p-values from the value of the parameter 59 | and its scale. 60 | 61 | Parameters 62 | ---------- 63 | beta : ndarray, shape (n_features,) 64 | Value of the parameters. 65 | 66 | scale : ndarray, shape (n_features,) 67 | Value of the standard deviation of the parameters. 68 | 69 | distrib : str, opitonal (default='norm') 70 | Type of distribution assumed for the underlying estimator. 71 | 'norm' means normal and is the only value accepted at the moment. 72 | 73 | eps : float, optional 74 | Machine-precision regularization in the computation of the p-values. 75 | 76 | Returns 77 | ------- 78 | pval : ndarray, shape (n_features,) 79 | p-value, with numerically accurate values for 80 | positive effects (ie., for p-value close to zero). 81 | 82 | pval_corr : ndarray, shape (n_features,) 83 | p-value corrected for multiple testing. 84 | 85 | one_minus_pval : ndarray, shape (n_features,) 86 | One minus the p-value, with numerically accurate values 87 | for negative effects (ie., for p-value close to one). 88 | 89 | one_minus_pval_corr : ndarray, shape (n_features,) 90 | One minus the p-value corrected for multiple testing. 91 | """ 92 | 93 | n_features = beta.size 94 | 95 | index_no_nan = tuple([scale != 0.0]) 96 | 97 | pval = np.zeros(n_features) + 0.5 98 | one_minus_pval = np.zeros(n_features) + 0.5 99 | 100 | if distrib == 'norm': 101 | 102 | pval[index_no_nan] = \ 103 | norm.sf(beta[index_no_nan], scale=scale[index_no_nan]) 104 | one_minus_pval[index_no_nan] = \ 105 | norm.cdf(beta[index_no_nan], scale=scale[index_no_nan]) 106 | 107 | pval[pval > 1 - eps] = 1 - eps 108 | pval_corr = pval_corr_from_pval(pval) 109 | 110 | one_minus_pval[one_minus_pval > 1 - eps] = 1 - eps 111 | one_minus_pval_corr = pval_corr_from_pval(one_minus_pval) 112 | 113 | return pval, pval_corr, one_minus_pval, one_minus_pval_corr 114 | 115 | 116 | def zscore_from_cb(cb_min, cb_max, confidence=0.95, distrib='norm'): 117 | """Computing z-scores from confidence intervals. 118 | 119 | Parameters 120 | ---------- 121 | cb_min : ndarray, shape (n_features,) 122 | Value of the inferior confidence bound. 123 | 124 | cb_max : ndarray, shape (n_features,) 125 | Value of the superior confidence bound. 126 | 127 | confidence : float, optional (default=0.95) 128 | Confidence level used to compute the confidence intervals. 129 | Each value should be in the range [0, 1]. 130 | 131 | distrib : str, opitonal (default='norm') 132 | Type of distribution assumed for the underlying estimator. 133 | 'norm' means normal and is the only value accepted at the moment. 134 | 135 | Returns 136 | ------- 137 | zscore : ndarray, shape (n_features,) 138 | z-scores. 
139 | """ 140 | 141 | if distrib == 'norm': 142 | quantile = norm.ppf(1 - (1 - confidence) / 2) 143 | 144 | beta_hat = (cb_min + cb_max) / 2 145 | 146 | zscore = beta_hat / (cb_max - cb_min) * 2 * quantile 147 | 148 | return zscore 149 | 150 | 151 | def pval_from_cb(cb_min, cb_max, confidence=0.95, distrib='norm', eps=1e-14): 152 | """Computing one-sided p-values from confidence intervals. 153 | 154 | Parameters 155 | ---------- 156 | cb_min : ndarray, shape (n_features,) 157 | Value of the inferior confidence bound. 158 | 159 | cb_max : ndarray, shape (n_features,) 160 | Value of the superior confidence bound. 161 | 162 | confidence : float, optional (default=0.95) 163 | Confidence level used to compute the confidence intervals. 164 | Each value should be in the range [0, 1]. 165 | 166 | distrib : str, opitonal (default='norm') 167 | Type of distribution assumed for the underlying estimator. 168 | 'norm' means normal and is the only value accepted at the moment. 169 | 170 | eps : float, optional 171 | Machine-precision regularization in the computation of the p-values. 172 | 173 | Returns 174 | ------- 175 | pval : ndarray, shape (n_features,) 176 | p-value, with numerically accurate values for 177 | positive effects (ie., for p-value close to zero). 178 | 179 | pval_corr : ndarray, shape (n_features,) 180 | p-value corrected for multiple testing. 181 | 182 | one_minus_pval : ndarray, shape (n_features,) 183 | One minus the p-value, with numerically accurate values 184 | for negative effects (ie., for p-value close to one). 185 | 186 | one_minus_pval_corr : ndarray, shape (n_features,) 187 | One minus the p-value corrected for multiple testing. 188 | """ 189 | 190 | zscore = \ 191 | zscore_from_cb(cb_min, cb_max, confidence=confidence, distrib=distrib) 192 | 193 | if distrib == 'norm': 194 | 195 | pval = norm.sf(zscore) 196 | one_minus_pval = norm.cdf(zscore) 197 | 198 | pval[pval > 1 - eps] = 1 - eps 199 | pval_corr = pval_corr_from_pval(pval) 200 | 201 | one_minus_pval[one_minus_pval > 1 - eps] = 1 - eps 202 | one_minus_pval_corr = pval_corr_from_pval(one_minus_pval) 203 | 204 | return pval, pval_corr, one_minus_pval, one_minus_pval_corr 205 | 206 | 207 | def two_sided_pval_from_zscore(zscore, distrib='norm'): 208 | """Computing two-sided p-values from z-scores. 209 | 210 | Parameters 211 | ---------- 212 | zscore : ndarray, shape (n_features,) 213 | z-scores. 214 | 215 | distrib : str, opitonal (default='norm') 216 | Type of distribution assumed for the underlying estimator. 217 | 'norm' means normal and is the only value accepted at the moment. 218 | 219 | Returns 220 | ------- 221 | two_sided_pval : ndarray, shape (n_features,) 222 | Two-sided p-values (testing the null). 223 | 224 | two_sided_pval_corr : ndarray, shape (n_features,) 225 | Two-sided p-values corrected for multiple testing. 226 | """ 227 | n_features = zscore.size 228 | 229 | if distrib == 'norm': 230 | two_sided_pval = 2 * norm.sf(np.abs(zscore)) 231 | 232 | two_sided_pval_corr = np.minimum(1, two_sided_pval * n_features) 233 | 234 | return two_sided_pval, two_sided_pval_corr 235 | 236 | 237 | def two_sided_pval_from_cb(cb_min, cb_max, confidence=0.95, distrib='norm'): 238 | """Computing two-sided p-values from confidence intervals. 239 | 240 | Parameters 241 | ---------- 242 | cb_min : ndarray, shape (n_features,) 243 | Value of the inferior confidence bound. 244 | 245 | cb_max : ndarray, shape (n_features,) 246 | Value of the superior confidence bound. 
247 | 248 | confidence : float, optional (default=0.95) 249 | Confidence level used to compute the confidence intervals. 250 | Each value should be in the range [0, 1]. 251 | 252 | distrib : str, opitonal (default='norm') 253 | Type of distribution assumed for the underlying estimator. 254 | 'norm' means normal and is the only value accepted at the moment. 255 | 256 | Returns 257 | ------- 258 | two_sided_pval : ndarray, shape (n_features,) 259 | Two-sided p-values (testing the null). 260 | 261 | two_sided_pval_corr : ndarray, shape (n_features,) 262 | Two-sided p-values corrected for multiple testing. 263 | """ 264 | zscore = \ 265 | zscore_from_cb(cb_min, cb_max, confidence=confidence, distrib=distrib) 266 | 267 | two_sided_pval, two_sided_pval_corr = \ 268 | two_sided_pval_from_zscore(zscore, distrib=distrib) 269 | 270 | return two_sided_pval, two_sided_pval_corr 271 | 272 | 273 | def zscore_from_pval(pval, one_minus_pval=None, distrib='norm'): 274 | """Computing z-scores from one-sided p-values. 275 | 276 | Parameters 277 | ----------- 278 | pval : ndarray, shape (n_features,) 279 | p-value, with numerically accurate values for 280 | positive effects (ie., for p-value close to zero). 281 | 282 | one_minus_pval : ndarray, shape (n_features,), optional (default=None) 283 | One minus the p-value, with numerically accurate values 284 | for negative effects (ie., for p-value close to one). 285 | 286 | distrib : str, opitonal (default='norm') 287 | Type of distribution assumed for the underlying estimator. 288 | 'norm' means normal and is the only value accepted at the moment. 289 | 290 | Returns 291 | ------- 292 | zscore : ndarray, shape (n_features,) 293 | z-scores. 294 | """ 295 | 296 | if distrib == 'norm': 297 | 298 | zscore = norm.isf(pval) 299 | 300 | if one_minus_pval is not None: 301 | 302 | ind = (pval > 0.5) 303 | zscore[ind] = norm.ppf(one_minus_pval[ind]) 304 | 305 | zscore = _replace_infinity(zscore, replace_val=40, method='plus-one') 306 | 307 | return zscore 308 | 309 | 310 | def pval_from_two_sided_pval_and_sign(two_sided_pval, parameter_sign, 311 | eps=1e-14): 312 | """Computing one-sided p-values from two-sided p-value and parameter sign. 313 | 314 | Parameters 315 | ---------- 316 | two_sided_pval : ndarray, shape (n_features,) 317 | Two-sided p-values (testing the null). 318 | 319 | parameter_sign : ndarray, shape (n_features,) 320 | Estimated signs for the parameters. 321 | 322 | eps : float, optional 323 | Machine-precision regularization in the computation of the p-values. 324 | 325 | Returns 326 | ------- 327 | pval : ndarray, shape (n_features,) 328 | p-value, with numerically accurate values for 329 | positive effects (ie., for p-value close to zero). 330 | 331 | pval_corr : ndarray, shape (n_features,) 332 | p-value corrected for multiple testing. 333 | 334 | one_minus_pval : ndarray, shape (n_features,) 335 | One minus the p-value, with numerically accurate values 336 | for negative effects (ie., for p-value close to one). 337 | 338 | one_minus_pval_corr : ndarray, shape (n_features,) 339 | One minus the p-value corrected for multiple testing. 
340 | """ 341 | 342 | n_features = two_sided_pval.size 343 | 344 | pval = 0.5 * np.ones(n_features) 345 | one_minus_pval = 0.5 * np.ones(n_features) 346 | 347 | pval[parameter_sign > 0] = two_sided_pval[parameter_sign > 0] / 2 348 | pval[parameter_sign < 0] = 1 - two_sided_pval[parameter_sign < 0] / 2 349 | 350 | one_minus_pval[parameter_sign > 0] = \ 351 | 1 - two_sided_pval[parameter_sign > 0] / 2 352 | one_minus_pval[parameter_sign < 0] = \ 353 | two_sided_pval[parameter_sign < 0] / 2 354 | 355 | pval[pval > 1 - eps] = 1 - eps 356 | pval_corr = pval_corr_from_pval(pval) 357 | 358 | one_minus_pval[one_minus_pval > 1 - eps] = 1 - eps 359 | one_minus_pval_corr = pval_corr_from_pval(one_minus_pval) 360 | 361 | return pval, pval_corr, one_minus_pval, one_minus_pval_corr 362 | 363 | 364 | def two_sided_pval_from_pval(pval, one_minus_pval=None, distrib='norm'): 365 | """Computing two-sided p-value from one-sided p-values. 366 | 367 | Parameters 368 | ----------- 369 | pval : ndarray, shape (n_features,) 370 | p-value, with numerically accurate values for 371 | positive effects (ie., for p-value close to zero). 372 | 373 | one_minus_pval : ndarray, shape (n_features,), optional (default=None) 374 | One minus the p-value, with numerically accurate values 375 | for negative effects (ie., for p-value close to one). 376 | 377 | distrib : str, opitonal (default='norm') 378 | Type of distribution assumed for the underlying estimator. 379 | 'norm' means normal and is the only value accepted at the moment. 380 | 381 | Returns 382 | ------- 383 | two_sided_pval : ndarray, shape (n_features,) 384 | Two-sided p-values (testing the null). 385 | 386 | two_sided_pval_corr : ndarray, shape (n_features,) 387 | Two-sided p-values corrected for multiple testing. 388 | """ 389 | 390 | zscore = zscore_from_pval(pval, one_minus_pval, distrib=distrib) 391 | 392 | two_sided_pval, two_sided_pval_corr = \ 393 | two_sided_pval_from_zscore(zscore, distrib=distrib) 394 | 395 | return two_sided_pval, two_sided_pval_corr 396 | -------------------------------------------------------------------------------- /examples/plot_2D_simulation_example.py: -------------------------------------------------------------------------------- 1 | # Authors: Jerome-Alexis Chevalier 2 | """ 3 | Support recovery on simulated data (2D) 4 | ======================================= 5 | 6 | This example shows the advantages of spatially relaxed inference when 7 | dealing with high-dimensional spatial data. To do so, we compare several 8 | statistical methods that aim at recovering the support, i.e., predictive 9 | features. Among those methods some leverage the spatial structure of the 10 | data. For more details about the inference algorithms presented in this 11 | example or about the generative process used to simulate the data, 12 | please refer to Chevalier et al. (2021) [1]_. 13 | 14 | This example corresponds to the experiment described in details in 15 | Chevalier et al. (2021) [1]_. Shortly, to simulate the data, we draw 16 | ``n_samples`` i.i.d Gaussian vectors of size ``n_features`` and reshape them 17 | into squares (edges are equal to ``n_features ** (1/2)``). Then, to introduce 18 | some spatial structure, we apply a Gaussian filter that correlates features 19 | that are nearby. The 2D data are then flattened into a design matrix ``X`` to 20 | represent it as a regression setting and to ease the computation of the 21 | simulated target ``y`` (see below). 
Then, we construct the weight map ``w`` 22 | which has the same shape as the 2D data, as it contains four predictive 23 | regions in every corner of the square. Similarly as for the construction 24 | of ``X``, the map ``w`` is finally flattened into a vector ``beta``. Lastly, 25 | to derive the target ``y``, we draw a white Gaussian noise ``epsilon`` and 26 | use a linear generative model: ``y = X beta + epsilon``. 27 | 28 | The results of this experiment show that the methods that leverage the spatial 29 | structure of the data are relevant. More precisely, we show that clustered 30 | inference algorithms (e.g., CluDL) and ensembled clustered inference algorithms 31 | (e.g., EnCluDL) are more powerful than the standard inference methods (see also 32 | Chevalier et al. (2021) [1]_). Indeed, when the number of features is much 33 | greater than the number of samples, standard statistical methods are 34 | unlikely to recover the support. Then, the idea of clustered inference is to 35 | compress the data without breaking the spatial structure, leading to a 36 | compressed problem close to the original problem. This leads to a 37 | powerful spatially relaxed inference. Indeed, thanks to the dimension reduction 38 | the support recovery is feasible. However, due to the spatial compression, 39 | there is a limited (and quantifiable) spatial uncertainty concerning the shape 40 | of the estimated support. Finally, by considering several choices of 41 | spatial compression, ensembled clustered inference algorithms reduce 42 | significantly the spatial uncertainty compared to clustered inference 43 | algorithms which consider only one spatial compression. 44 | 45 | .. _References: 46 | 47 | References 48 | ---------- 49 | .. [1] Chevalier, J. A., Nguyen, T. B., Thirion, B., & Salmon, J. (2021). 50 | Spatially relaxed inference on high-dimensional linear models. 51 | arXiv preprint arXiv:2106.02590. 52 | """ 53 | 54 | ############################################################################# 55 | # Imports needed for this script 56 | # ------------------------------ 57 | import numpy as np 58 | import matplotlib.pyplot as plt 59 | from sklearn.feature_extraction import image 60 | from sklearn.cluster import FeatureAgglomeration 61 | 62 | from hidimstat.scenario import multivariate_simulation 63 | from hidimstat.stat_tools import zscore_from_pval, pval_from_cb 64 | from hidimstat.desparsified_lasso import desparsified_lasso 65 | from hidimstat.clustered_inference import clustered_inference 66 | from hidimstat.ensemble_clustered_inference import ensemble_clustered_inference 67 | 68 | 69 | ############################################################################# 70 | # Specific plotting functions 71 | # --------------------------- 72 | # The functions below are used to plot the results and illustrate the concept 73 | # of spatial tolerance. If you are reading this example for the first time, 74 | # you can skip this section. 75 | # 76 | # The following function builds a 2D map with four active regions that are 77 | # enfolded by thin tolerance regions. 
78 | 79 | 80 | def weight_map_2D_extended(shape, roi_size, delta): 81 | '''Build weight map with visible tolerance region''' 82 | 83 | roi_size_extended = roi_size + delta 84 | 85 | w = np.zeros(shape + (5,)) 86 | w[0:roi_size, 0:roi_size, 0] = 0.5 87 | w[-roi_size:, -roi_size:, 1] = 0.5 88 | w[0:roi_size, -roi_size:, 2] = 0.5 89 | w[-roi_size:, 0:roi_size, 3] = 0.5 90 | w[0:roi_size_extended, 0:roi_size_extended, 0] += 0.5 91 | w[-roi_size_extended:, -roi_size_extended:, 1] += 0.5 92 | w[0:roi_size_extended, -roi_size_extended:, 2] += 0.5 93 | w[-roi_size_extended:, 0:roi_size_extended, 3] += 0.5 94 | 95 | for i in range(roi_size_extended): 96 | for j in range(roi_size_extended): 97 | if (i - roi_size) + (j - roi_size) >= delta: 98 | w[i, j, 0] = 0 99 | w[-i-1, -j-1, 1] = 0 100 | w[i, -j-1, 2] = 0 101 | w[-i-1, j, 3] = 0 102 | 103 | beta_extended = w.sum(-1).ravel() 104 | 105 | return beta_extended 106 | 107 | 108 | ############################################################################## 109 | # To generate a plot that exhibits the true support and the estimated 110 | # supports for every method, we define the two following functions: 111 | 112 | 113 | def add_one_subplot(ax, map, title): 114 | '''Add one subplot into the summary plot''' 115 | 116 | if map is not None: 117 | im = ax.imshow(map) 118 | im.set_clim(-1, 1) 119 | ax.tick_params( 120 | axis='both', 121 | which='both', 122 | bottom=False, 123 | top=False, 124 | left=False, 125 | labelbottom=False, 126 | labelleft=False) 127 | ax.set_title(title) 128 | else: 129 | ax.axis('off') 130 | ax.get_xaxis().set_visible(False) 131 | ax.get_yaxis().set_visible(False) 132 | 133 | 134 | def plot(maps, titles, save_fig=False): 135 | '''Make a summary plot from estimated supports''' 136 | 137 | fig, axes = plt.subplots(3, 2, figsize=(4, 6)) 138 | 139 | for i in range(3): 140 | for j in range(2): 141 | k = i * 2 + j 142 | add_one_subplot(axes[i][j], maps[k], titles[k]) 143 | 144 | fig.tight_layout() 145 | 146 | if save_fig: 147 | figname = 'figures/simu_2D.png' 148 | plt.savefig(figname) 149 | print(f'Save figure to {figname}') 150 | 151 | plt.show() 152 | 153 | 154 | ############################################################################## 155 | # Generating the data 156 | # ------------------- 157 | # 158 | # After setting the simulation parameters, we run the function that generates 159 | # the 2D scenario that we have briefly described in the first section of this 160 | # example. 161 | 162 | # simulation parameters 163 | n_samples = 100 164 | shape = (40, 40) 165 | n_features = shape[1] * shape[0] 166 | roi_size = 4 # size of the edge of the four predictive regions 167 | sigma = 2.0 # noise standard deviation 168 | smooth_X = 1.0 # level of spatial smoothing introduced by the Gaussian filter 169 | 170 | # generating the data 171 | X_init, y, beta, epsilon, _, _ = \ 172 | multivariate_simulation(n_samples, shape, roi_size, sigma, smooth_X, 173 | seed=1) 174 | 175 | ############################################################################## 176 | # Choosing inference parameters 177 | # ----------------------------- 178 | # 179 | # The choice of the number of clusters depends on several parameters, such as: 180 | # the structure of the data (a higher correlation between neighboring features 181 | # enable a greater dimension reduction, i.e. 
a smaller number of clusters), 182 | # the number of samples (small datasets require more dimension reduction) and 183 | # the required spatial tolerance (small clusters lead to limited spatial 184 | # uncertainty). Formally, "spatial tolerance" is defined by the largest 185 | # distance from the true support for which the occurence of a false discovery 186 | # is not statistically controlled (c.f. :ref:`References`). 187 | # Theoretically, the spatial tolerance ``delta`` is equal to the largest 188 | # cluster diameter. However this choice is conservative, notably in the case 189 | # of ensembled clustered inference. For these algorithms, we recommend to take 190 | # the average cluster radius. In this example, we choose ``n_clusters = 200``, 191 | # leading to a theoretical spatial tolerance ``delta = 6``. However, it 192 | # turns out that ``delta = 2``, the average cluster radius, would have been 193 | # sufficient for ensembled clustered inference algorithms (see Results). 194 | 195 | # hyper-parameters 196 | n_clusters = 200 197 | 198 | # inference parameters 199 | fwer_target = 0.1 200 | delta = 6 201 | 202 | # computation parameter 203 | n_jobs = 1 204 | 205 | ############################################################################## 206 | # Computing z-score thresholds for support estimation 207 | # --------------------------------------------------- 208 | # 209 | # Below, we translate the FWER target into z-score targets. 210 | # To compute the z-score targets we also take into account for the multiple 211 | # testing correction. To do so, we consider the Bonferroni correction. 212 | # For methods that do not reduce the feature space, the correction 213 | # consists in dividing the FWER target by the number of features. 214 | # For methods that group features into clusters, the correction 215 | # consists in dividing by the number of clusters. 216 | 217 | 218 | # computing the z-score thresholds for feature selection 219 | correction_no_cluster = 1. / n_features 220 | correction_cluster = 1. / n_clusters 221 | thr_c = zscore_from_pval((fwer_target / 2) * correction_cluster) 222 | thr_nc = zscore_from_pval((fwer_target / 2) * correction_no_cluster) 223 | 224 | ############################################################################# 225 | # Inference with several algorithms 226 | # --------------------------------- 227 | # 228 | # First, we compute a reference map that exhibits the true support and 229 | # the theoretical tolerance region. 230 | 231 | # compute true support with visible spatial tolerance 232 | beta_extended = weight_map_2D_extended(shape, roi_size, delta) 233 | 234 | ############################################################################# 235 | # Now, we compute the support estimated by a high-dimensional statistical 236 | # infernece method that does not leverage the data structure. This method 237 | # was introduced by Javanmard, A. et al. (2014), Zhang, C. H. et al. (2014) 238 | # and Van de Geer, S. et al.. (2014) (full references are available at 239 | # https://ja-che.github.io/hidimstat/). 240 | # and referred to as Desparsified Lasso. 
241 | 242 | # compute desparsified lasso 243 | beta_hat, cb_min, cb_max = desparsified_lasso(X_init, y, n_jobs=n_jobs) 244 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 245 | pval_from_cb(cb_min, cb_max) 246 | 247 | # compute estimated support (first method) 248 | zscore = zscore_from_pval(pval, one_minus_pval) 249 | selected_dl = zscore > thr_nc # use the "no clustering threshold" 250 | 251 | # compute estimated support (second method) 252 | selected_dl = np.logical_or(pval_corr < fwer_target / 2, 253 | one_minus_pval_corr < fwer_target / 2) 254 | 255 | ############################################################################# 256 | # Now, we compute the support estimated using a clustered inference algorithm 257 | # (c.f. :ref:`References`) called Clustered Desparsified Lasso (CluDL) since it 258 | # uses the Desparsified Lasso technique after clustering the data. 259 | 260 | # Define the FeatureAgglomeration object that performs the clustering. 261 | # This object is necessary to run the current algorithm and the following one. 262 | connectivity = image.grid_to_graph(n_x=shape[0], 263 | n_y=shape[1]) 264 | ward = FeatureAgglomeration(n_clusters=n_clusters, 265 | connectivity=connectivity, 266 | linkage='ward') 267 | 268 | # clustered desparsified lasso (CluDL) 269 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 270 | clustered_inference(X_init, y, ward, n_clusters) 271 | 272 | # compute estimated support (first method) 273 | zscore = zscore_from_pval(pval, one_minus_pval) 274 | selected_cdl = zscore > thr_c # use the "clustering threshold" 275 | 276 | # compute estimated support (second method) 277 | selected_cdl = np.logical_or(pval_corr < fwer_target / 2, 278 | one_minus_pval_corr < fwer_target / 2) 279 | 280 | ############################################################################# 281 | # Finally, we compute the support estimated by an ensembled clustered 282 | # inference algorithm (c.f. :ref:`References`). This algorithm is called 283 | # Ensemble of Clustered Desparsified Lasso (EnCluDL) since it runs several 284 | # CluDL algorithms with different clustering choices. The different CluDL 285 | # solutions are then aggregated into one. 286 | 287 | # ensemble of clustered desparsified lasso (EnCluDL) 288 | beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 289 | ensemble_clustered_inference(X_init, y, ward, 290 | n_clusters, train_size=0.3) 291 | 292 | # compute estimated support 293 | selected_ecdl = np.logical_or(pval_corr < fwer_target / 2, 294 | one_minus_pval_corr < fwer_target / 2) 295 | 296 | ############################################################################# 297 | # Results 298 | # ------- 299 | # 300 | # Now we plot the true support, the theoretical tolerance regions and 301 | # the estimated supports for every method. 
302 | 
303 | maps = []
304 | titles = []
305 | 
306 | maps.append(np.reshape(beta, shape))
307 | titles.append('True weights')
308 | 
309 | maps.append(np.reshape(beta_extended, shape))
310 | titles.append('True weights \nwith tolerance')
311 | 
312 | maps.append(np.reshape(selected_dl, shape))
313 | titles.append('Desparsified Lasso')
314 | 
315 | maps.append(None)
316 | titles.append(None)
317 | 
318 | maps.append(np.reshape(selected_cdl, shape))
319 | titles.append('CluDL')
320 | 
321 | maps.append(np.reshape(selected_ecdl, shape))
322 | titles.append('EnCluDL')
323 | 
324 | plot(maps, titles)
325 | 
326 | #############################################################################
327 | # Analysis of the results
328 | # -----------------------
329 | # As argued in the first section of this example, the standard method that
330 | # does not compress the problem is not relevant as it dramatically lacks power.
331 | # The support estimated from CluDL provides a more reasonable solution
332 | # since we recover the four regions. However, the shape of the estimated support
333 | # is a bit rough.
334 | # Finally, the solution provided by EnCluDL is more accurate since the shape
335 | # of the estimated support is closer to the true support.
336 | # Also, one can note that the theoretical spatial tolerance is quite
337 | # conservative. In practice, we argue that the statistical guarantees are valid
338 | # for a lower spatial tolerance thanks to the clustering randomization.
339 | 
--------------------------------------------------------------------------------
/hidimstat/desparsified_lasso.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from numpy.linalg import multi_dot
3 | from scipy import stats
4 | from scipy.linalg import inv
5 | from joblib import Parallel, delayed
6 | from sklearn.utils.validation import check_memory
7 | from sklearn.linear_model import Lasso
8 | 
9 | from .noise_std import reid, group_reid
10 | from .stat_tools import pval_from_two_sided_pval_and_sign
11 | 
12 | 
13 | def _compute_all_residuals(X, alphas, gram, max_iter=5000, tol=1e-3,
14 |                            method='lasso', n_jobs=1, verbose=0):
15 |     """Nodewise Lasso. Compute all the residuals: regressing each column of the
16 |     design matrix against the other columns"""
17 | 
18 |     n_samples, n_features = X.shape
19 | 
20 |     results = \
21 |         Parallel(n_jobs=n_jobs, verbose=verbose)(
22 |             delayed(_compute_residuals)
23 |             (X=X,
24 |              column_index=i,
25 |              alpha=alphas[i],
26 |              gram=gram,
27 |              max_iter=max_iter,
28 |              tol=tol,
29 |              method=method)
30 |             for i in range(n_features))
31 | 
32 |     results = np.asarray(results)
33 |     Z = np.stack(results[:, 0], axis=1)
34 |     omega_diag = np.stack(results[:, 1])
35 | 
36 |     return Z, omega_diag
37 | 
38 | 
39 | def _compute_residuals(X, column_index, alpha, gram, max_iter=5000,
40 |                        tol=1e-3, method='lasso'):
41 |     """Compute the residuals of the regression of a given column of the
42 |     design matrix against the other columns"""
43 | 
44 |     n_samples, n_features = X.shape
45 |     i = column_index
46 | 
47 |     X_new = np.delete(X, i, axis=1)
48 |     y = np.copy(X[:, i])
49 | 
50 |     if method == 'lasso':
51 | 
52 |         gram_ = np.delete(np.delete(gram, i, axis=0), i, axis=1)
53 |         clf = Lasso(alpha=alpha, precompute=gram_, max_iter=max_iter, tol=tol)
54 | 
55 |     else:
56 | 
57 |         raise ValueError("The only regression method available is 'lasso'")
58 | 
59 |     clf.fit(X_new, y)
60 |     z = y - clf.predict(X_new)
61 | 
62 |     omega_diag_i = n_samples * np.sum(z ** 2) / np.dot(y, z) ** 2
63 | 
64 |     return z, omega_diag_i
65 | 
66 | 
67 | def desparsified_lasso(X, y, dof_ajdustement=False,
68 |                        confidence=0.95, max_iter=5000, tol=1e-3,
69 |                        residual_method='lasso', alpha_max_fraction=0.01,
70 |                        n_jobs=1, memory=None, verbose=0):
71 | 
72 |     """Desparsified Lasso with confidence intervals
73 | 
74 |     Parameters
75 |     ----------
76 |     X : ndarray, shape (n_samples, n_features)
77 |         Data.
78 | 
79 |     y : ndarray, shape (n_samples,)
80 |         Target.
81 | 
82 |     dof_ajdustement : bool, optional (default=False)
83 |         If True, makes the degrees of freedom adjustment (cf. [4]_ and [5]_).
84 |         Otherwise, the original Desparsified Lasso estimator is computed
85 |         (cf. [1]_ and [2]_ and [3]_).
86 | 
87 |     confidence : float, optional (default=0.95)
88 |         Confidence level used to compute the confidence intervals.
89 |         Each value should be in the range [0, 1].
90 | 
91 |     max_iter : int, optional (default=5000)
92 |         The maximum number of iterations when regressing, by Lasso,
93 |         each column of the design matrix against the others.
94 | 
95 |     tol : float, optional (default=1e-3)
96 |         The tolerance for the optimization of the Lasso problems: if the
97 |         updates are smaller than `tol`, the optimization code checks the
98 |         dual gap for optimality and continues until it is smaller than `tol`.
99 | 
100 |     residual_method : str, optional (default='lasso')
101 |         Method used for computing the residuals of the Nodewise Lasso.
102 |         Currently the only method available is 'lasso'.
103 | 
104 |     alpha_max_fraction : float, optional (default=0.01)
105 |         Only used if residual_method='lasso'.
106 |         Then alpha = alpha_max_fraction * alpha_max.
107 | 
108 |     n_jobs : int or None, optional (default=1)
109 |         Number of CPUs to use during the Nodewise Lasso.
110 | 
111 |     memory : str or joblib.Memory object, optional (default=None)
112 |         Used to cache the output of the computation of the Nodewise Lasso.
113 |         By default, no caching is done. If a string is given, it is the path
114 |         to the caching directory.
115 | 
116 |     verbose : int, optional (default=0)
117 |         The verbosity level: if non zero, progress messages are printed
118 |         when computing the Nodewise Lasso in parallel.
119 |         The frequency of the messages increases with the verbosity level.
120 | 
121 |     Returns
122 |     -------
123 |     beta_hat : array, shape (n_features,)
124 |         Estimated parameter vector.
125 | 
126 |     cb_min : array, shape (n_features,)
127 |         Lower bound of the confidence intervals on the parameter vector.
128 | 
129 |     cb_max : array, shape (n_features,)
130 |         Upper bound of the confidence intervals on the parameter vector.
131 | 
132 |     Notes
133 |     -----
134 |     The columns of `X` and `y` are always centered; this ensures that
135 |     the intercepts of the Nodewise Lasso problems are all equal to zero
136 |     and the intercept of the noise model is also equal to zero. Since
137 |     the values of the intercepts are not of interest, the centering avoids
138 |     the consideration of unnecessary additional parameters.
139 |     Also, you may consider centering and scaling `X` beforehand, notably if
140 |     the data contained in `X` has not been prescaled from measurements.
141 | 
142 |     References
143 |     ----------
144 |     .. [1] Zhang, C. H., & Zhang, S. S. (2014). Confidence intervals for
145 |            low dimensional parameters in high dimensional linear models.
146 |            Journal of the Royal Statistical Society: Series B: Statistical
147 |            Methodology, 217-242.
148 | 
149 |     .. [2] Van de Geer, S., Bühlmann, P., Ritov, Y. A., & Dezeure, R. (2014).
150 |            On asymptotically optimal confidence regions and tests for
151 |            high-dimensional models. Annals of Statistics, 42(3), 1166-1202.
152 | 
153 |     .. [3] Javanmard, A., & Montanari, A. (2014). Confidence intervals and
154 |            hypothesis testing for high-dimensional regression. The Journal
155 |            of Machine Learning Research, 15(1), 2869-2909.
156 | 
157 |     .. [4] Bellec, P. C., & Zhang, C. H. (2019). De-biasing the lasso with
158 |            degrees-of-freedom adjustment. arXiv preprint arXiv:1902.08885.
159 | 
160 |     .. [5] Celentano, M., Montanari, A., & Wei, Y. (2020). The Lasso with
161 |            general Gaussian designs with applications to hypothesis testing.
162 |            arXiv preprint arXiv:2007.13716.
163 | """ 164 | 165 | X = np.asarray(X) 166 | 167 | n_samples, n_features = X.shape 168 | 169 | memory = check_memory(memory) 170 | 171 | y = y - np.mean(y) 172 | X = X - np.mean(X, axis=0) 173 | gram = np.dot(X.T, X) 174 | gram_nodiag = gram - np.diag(np.diag(gram)) 175 | 176 | list_alpha_max = np.max(np.abs(gram_nodiag), axis=0) / n_samples 177 | alphas = alpha_max_fraction * list_alpha_max 178 | 179 | # Calculating precision matrix (Nodewise Lasso) 180 | Z, omega_diag = memory.cache(_compute_all_residuals, ignore=['n_jobs'])( 181 | X, alphas, gram, max_iter=max_iter, tol=tol, 182 | method=residual_method, n_jobs=n_jobs, verbose=verbose) 183 | 184 | # Lasso regression 185 | sigma_hat, beta_lasso = reid(X, y, n_jobs=n_jobs) 186 | 187 | # Computing the degrees of freedom adjustement 188 | if dof_ajdustement: 189 | coef_max = np.max(np.abs(beta_lasso)) 190 | support = np.sum(np.abs(beta_lasso) > 0.01 * coef_max) 191 | support = min(support, n_samples - 1) 192 | dof_factor = n_samples / (n_samples - support) 193 | else: 194 | dof_factor = 1 195 | 196 | # Computing Desparsified Lasso estimator and confidence intervals 197 | beta_bias = dof_factor * np.dot(y.T, Z) / np.sum(X * Z, axis=0) 198 | 199 | P = ((Z.T.dot(X)).T / np.sum(X * Z, axis=0)).T 200 | P_nodiag = P - np.diag(np.diag(P)) 201 | Id = np.identity(n_features) 202 | P_nodiag = dof_factor * P_nodiag + (dof_factor - 1) * Id 203 | 204 | beta_hat = beta_bias - P_nodiag.dot(beta_lasso) 205 | 206 | omega_diag = omega_diag * dof_factor ** 2 207 | omega_invsqrt_diag = omega_diag ** (-0.5) 208 | 209 | quantile = stats.norm.ppf(1 - (1 - confidence) / 2) 210 | 211 | confint_radius = np.abs(quantile * sigma_hat / 212 | (np.sqrt(n_samples) * omega_invsqrt_diag)) 213 | cb_max = beta_hat + confint_radius 214 | cb_min = beta_hat - confint_radius 215 | 216 | return beta_hat, cb_min, cb_max 217 | 218 | 219 | def desparsified_group_lasso(X, Y, cov=None, test='chi2', 220 | max_iter=5000, tol=1e-3, residual_method='lasso', 221 | alpha_max_fraction=0.01, noise_method='AR', 222 | order=1, n_jobs=1, memory=None, verbose=0): 223 | """Desparsified Group Lasso 224 | 225 | Parameters 226 | ---------- 227 | X : ndarray, shape (n_samples, n_features) 228 | Data. 229 | 230 | Y : ndarray, shape (n_samples, n_times) 231 | Target. 232 | 233 | cov : ndarray, shape (n_times, n_times), optional (default=None) 234 | If None, a temporal covariance matrix of the noise is estimated. 235 | Otherwise, `cov` is the temporal covariance matrix of the noise. 236 | 237 | test : str, optional (default='chi2') 238 | Statistical test used to compute p-values. 'chi2' corresponds 239 | to a chi-squared test and 'F' corresponds to an F-test. 240 | 241 | max_iter : int, optional (default=5000) 242 | The maximum number of iterations when regressing, by Lasso, 243 | each column of the design matrix against the others. 244 | 245 | tol : float, optional (default=1e-3) 246 | The tolerance for the optimization of the Lasso problems: if the 247 | updates are smaller than `tol`, the optimization code checks the 248 | dual gap for optimality and continues until it is smaller than `tol`. 249 | 250 | residual_method : str, optional (default='lasso') 251 | Method used for computing the residuals of the Nodewise Lasso. 252 | Currently the only method available is 'lasso'. 253 | 254 | alpha_max_fraction : float, optional (default=0.01) 255 | Only used if method='lasso'. 256 | Then alpha = alpha_max_fraction * alpha_max. 
257 | 
258 |     noise_method : str, optional (default='AR')
259 |         If 'simple', the correlation matrix is estimated by taking the
260 |         median of the correlation between two consecutive time steps
261 |         and the noise standard deviation for each time step is estimated
262 |         by taking the median of the standard deviations for every time step.
263 |         If 'AR', the order of the AR model is given by `order` and
264 |         Yule-Walker method is used to estimate the covariance matrix.
265 | 
266 |     order : int, optional (default=1)
267 |         If `noise_method='AR'`, `order` gives the order of the estimated autoregressive
268 |         model. `order` must be smaller than the number of time steps.
269 | 
270 |     n_jobs : int or None, optional (default=1)
271 |         Number of CPUs to use during the Nodewise Lasso.
272 | 
273 |     memory : str or joblib.Memory object, optional (default=None)
274 |         Used to cache the output of the computation of the Nodewise Lasso.
275 |         By default, no caching is done. If a string is given, it is the path
276 |         to the caching directory.
277 | 
278 |     verbose : int, optional (default=0)
279 |         The verbosity level: if non zero, progress messages are printed
280 |         when computing the Nodewise Lasso in parallel.
281 |         The frequency of the messages increases with the verbosity level.
282 | 
283 |     Returns
284 |     -------
285 |     beta_hat : ndarray, shape (n_features, n_times)
286 |         Estimated parameter matrix.
287 | 
288 |     pval : ndarray, shape (n_features,)
289 |         p-value, with numerically accurate values for
290 |         positive effects (i.e., for p-value close to zero).
291 | 
292 |     pval_corr : ndarray, shape (n_features,)
293 |         p-value corrected for multiple testing.
294 | 
295 |     one_minus_pval : ndarray, shape (n_features,)
296 |         One minus the p-value, with numerically accurate values
297 |         for negative effects (i.e., for p-value close to one).
298 | 
299 |     one_minus_pval_corr : ndarray, shape (n_features,)
300 |         One minus the p-value corrected for multiple testing.
301 |     Notes
302 |     -----
303 |     The columns of `X` and the matrix `Y` are always centered; this ensures
304 |     that the intercepts of the Nodewise Lasso problems are all equal to zero
305 |     and the intercept of the noise model is also equal to zero. Since
306 |     the values of the intercepts are not of interest, the centering avoids
307 |     the consideration of unnecessary additional parameters.
308 |     Also, you may consider centering and scaling `X` beforehand, notably if
309 |     the data contained in `X` has not been prescaled from measurements.
310 | 
311 |     References
312 |     ----------
313 |     .. [1] Chevalier, J. A., Gramfort, A., Salmon, J., & Thirion, B. (2020).
314 |            Statistical control for spatio-temporal MEG/EEG source imaging with
315 |            desparsified multi-task Lasso. In NeurIPS 2020 - 34th Conference on
316 |            Neural Information Processing Systems.
317 | """ 318 | 319 | X = np.asarray(X) 320 | 321 | n_samples, n_features = X.shape 322 | n_times = Y.shape[1] 323 | 324 | memory = check_memory(memory) 325 | 326 | if cov is not None and cov.shape != (n_times, n_times): 327 | raise ValueError(f'Shape of "cov" should be ({n_times}, {n_times}),' + 328 | f' the shape of "cov" was ({cov.shape}) instead') 329 | 330 | Y = Y - np.mean(Y) 331 | X = X - np.mean(X, axis=0) 332 | gram = np.dot(X.T, X) 333 | gram_nodiag = gram - np.diag(np.diag(gram)) 334 | 335 | list_alpha_max = np.max(np.abs(gram_nodiag), axis=0) / n_samples 336 | alphas = alpha_max_fraction * list_alpha_max 337 | 338 | # Calculating precision matrix (Nodewise Lasso) 339 | Z, omega_diag = memory.cache(_compute_all_residuals, ignore=['n_jobs'])( 340 | X, alphas, gram, max_iter=max_iter, tol=tol, 341 | method=residual_method, n_jobs=n_jobs, verbose=verbose) 342 | 343 | # Group Lasso regression 344 | cov_hat, beta_mtl = \ 345 | group_reid(X, Y, method=noise_method, order=order, n_jobs=n_jobs) 346 | 347 | if cov is not None: 348 | cov_hat = cov 349 | 350 | theta_hat = n_samples * inv(cov_hat) 351 | 352 | # Estimating the coefficient vector 353 | beta_bias = Y.T.dot(Z) / np.sum(X * Z, axis=0) 354 | 355 | beta_mtl = beta_mtl.T 356 | beta_bias = beta_bias.T 357 | 358 | P = (np.dot(X.T, Z) / np.sum(X * Z, axis=0)).T 359 | P_nodiag = P - np.diag(np.diag(P)) 360 | 361 | beta_hat = beta_bias - P_nodiag.dot(beta_mtl) 362 | 363 | if test == 'chi2': 364 | 365 | chi2_scores = \ 366 | np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) / omega_diag 367 | two_sided_pval = \ 368 | np.minimum(2 * stats.chi2.sf(chi2_scores, df=n_times), 1.0) 369 | 370 | if test == 'F': 371 | 372 | f_scores = (np.diag(multi_dot([beta_hat, theta_hat, beta_hat.T])) / 373 | omega_diag / n_times) 374 | two_sided_pval = \ 375 | np.minimum(2 * stats.f.sf(f_scores, dfd=n_samples, dfn=n_times), 376 | 1.0) 377 | 378 | sign_beta = np.sign(np.sum(beta_hat, axis=1)) 379 | pval, pval_corr, one_minus_pval, one_minus_pval_corr = \ 380 | pval_from_two_sided_pval_and_sign(two_sided_pval, sign_beta) 381 | 382 | return beta_hat, pval, pval_corr, one_minus_pval, one_minus_pval_corr 383 | --------------------------------------------------------------------------------
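A minimal usage sketch for `desparsified_lasso` together with the `stat_tools` helpers defined above. The import paths and function signatures are taken from the files in this repository; the simulated data (sample size, number of features, sparsity level, random seed) is an illustrative choice and not part of the package. The feature-selection rule mirrors the Bonferroni-corrected thresholding used in `examples/plot_2D_simulation_example.py`.

import numpy as np

from hidimstat.desparsified_lasso import desparsified_lasso
from hidimstat.stat_tools import pval_from_cb, zscore_from_pval

# Illustrative high-dimensional regression problem (shapes chosen arbitrarily):
# 100 samples, 200 features, 5 non-zero coefficients.
rng = np.random.RandomState(0)
n_samples, n_features, n_relevant = 100, 200, 5
X = rng.randn(n_samples, n_features)
beta = np.zeros(n_features)
beta[:n_relevant] = 2.0
y = X.dot(beta) + rng.randn(n_samples)

# Desparsified Lasso estimate with 95% confidence intervals.
beta_hat, cb_min, cb_max = desparsified_lasso(X, y, confidence=0.95)

# Convert the confidence bounds into one-sided p-values and their
# Bonferroni-corrected counterparts.
pval, pval_corr, one_minus_pval, one_minus_pval_corr = pval_from_cb(
    cb_min, cb_max, confidence=0.95)

# Select features at a family-wise error rate target of 0.1, testing both
# positive and negative effects (hence the division by two).
fwer_target = 0.1
selected = np.logical_or(pval_corr < fwer_target / 2,
                         one_minus_pval_corr < fwer_target / 2)

# z-scores corresponding to the one-sided p-values, as used for plotting
# in the examples above.
zscore = zscore_from_pval(pval, one_minus_pval)

print("selected features:", np.where(selected)[0])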