├── .gitignore ├── .nojekyll ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── appveyor.yml ├── ci_scripts ├── appveyor │ └── run_with_env.cmd ├── circleci │ ├── build_doc.sh │ ├── checkout_merge_commit.sh │ └── push_doc.sh └── travis │ ├── install.sh │ ├── success.sh │ └── test.sh ├── circle.yml ├── doc ├── Makefile ├── api.rst ├── conf.py ├── index.rst ├── make.bat ├── randomized_lasso.rst └── stability_selection.rst ├── examples ├── README.txt ├── plot_randomized_lasso_path.py └── plot_stability_scores.py ├── requirements.txt ├── setup.cfg ├── setup.py └── stability_selection ├── __init__.py ├── bootstrap.py ├── randomized_lasso.py ├── stability_selection.py └── tests ├── __init__.py ├── test_common.py ├── test_randomized_lasso.py ├── test_stability_selection.py └── test_stratified_bootstrap.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # scikit-learn specific 10 | doc/_build/ 11 | doc/auto_examples/ 12 | doc/modules/generated/ 13 | doc/datasets/generated/ 14 | 15 | # Distribution / packaging 16 | 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *,cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | -------------------------------------------------------------------------------- /.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/stability-selection/e6e34da3601cc8215cd0b08d5c5f3a9dd3ccfe01/.nojekyll -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | cache: 4 | apt: true 5 | # We use three different cache directory 6 | # to work around a Travis bug with multi-platform cache 7 | directories: 8 | - $HOME/.cache/pip 9 | - $HOME/download 10 | env: 11 | global: 12 | # Directory where tests are run from 13 | - TEST_DIR=/tmp/test_dir/ 14 | - MODULE=stability_selection 15 | matrix: 16 | #- DISTRIB="conda" PYTHON_VERSION="2.7" 17 | # NUMPY_VERSION="1.7.1" SCIPY_VERSION="0.11.0" CYTHON_VERSION="0.21" 18 | - DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" 19 | NUMPY_VERSION="1.10.4" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.23.4" 20 | 21 | install: source ci_scripts/travis/install.sh 22 | script: bash ci_scripts/travis/test.sh 23 | after_success: source ci_scripts/travis/success.sh 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018, Thomas Huijskens. 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of project-template nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # stability-selection - A scikit-learn compatible implementation of stability selection 2 | 3 | [![Build Status](https://travis-ci.org/scikit-learn-contrib/stability-selection.svg?branch=master)](https://travis-ci.org/scikit-learn-contrib/stability-selection) 4 | [![Coverage Status](https://coveralls.io/repos/github/scikit-learn-contrib/stability-selection/badge.svg?branch=master)](https://coveralls.io/github/scikit-learn-contrib/stability-selection?branch=master) 5 | [![CircleCI](https://circleci.com/gh/scikit-learn-contrib/stability-selection.svg?style=svg)](https://circleci.com/gh/scikit-learn-contrib/stability-selection) 6 | 7 | **stability-selection** is a Python implementation of the stability selection feature selection algorithm, first proposed by [Meinshausen and Buhlmann](https://stat.ethz.ch/~nicolai/stability.pdf). 8 | 9 | The idea behind stability selection is to inject more noise into the original problem by generating bootstrap samples of the data, and to use a base feature selection algorithm (like the LASSO) to find out which features are important in every sampled version of the data. The results on each bootstrap sample are then aggregated to compute a *stability score* for each feature in the data. Features can then be selected by choosing an appropriate threshold for the stability scores. 10 | 11 | ## Installation 12 | 13 | To install the module, clone the repository 14 | ```bash 15 | git clone https://github.com/scikit-learn-contrib/stability-selection.git 16 | ``` 17 | Before installing the module you will need `numpy`, `matplotlib`, and `sklearn`. 
Install these modules separately, or install using the `requirements.txt` file: 18 | ```bash 19 | pip install -r requirements.txt 20 | ``` 21 | and execute the following in the project directory to install `stability-selection`: 22 | ```bash 23 | python setup.py install 24 | ``` 25 | 26 | ## Documentation and algorithmic details 27 | 28 | See the [documentation](https://thuijskens.github.io/stability-selection/docs/index.html) for details on the module, and the accompanying [blog post](https://thuijskens.github.io/2018/07/25/stability-selection/) for details on the algorithmic details. 29 | 30 | ## Example usage 31 | 32 | `stability-selection` implements a class `StabilitySelection`, that takes any scikit-learn compatible estimator that has either a ``feature_importances_`` or ``coef_`` attribute after fitting. Important other parameters are 33 | 34 | - `lambda_name`: the name of the penalization parameter of the base estimator (for example, `C` in the case of `LogisticRegression`). 35 | - `lambda_grid`: an array of values of the penalization parameter to iterate over. 36 | 37 | After instantiation, the algorithm can be run with the familiar `fit` and `transform` calls. 
38 | 39 | ### Basic example 40 | See below for an example: 41 | ```python 42 | import numpy as np 43 | 44 | from sklearn.linear_model import LogisticRegression 45 | from sklearn.pipeline import Pipeline 46 | from sklearn.preprocessing import StandardScaler 47 | from sklearn.utils import check_random_state 48 | from stability_selection import StabilitySelection 49 | 50 | 51 | def _generate_dummy_classification_data(p=1000, n=1000, k=5, random_state=123321): 52 | 53 | rng = check_random_state(random_state) 54 | 55 | X = rng.normal(loc=0.0, scale=1.0, size=(n, p)) 56 | betas = np.zeros(p) 57 | important_betas = np.sort(rng.choice(a=np.arange(p), size=k)) 58 | betas[important_betas] = rng.uniform(size=k) 59 | 60 | probs = 1 / (1 + np.exp(-1 * np.matmul(X, betas))) 61 | y = (probs > 0.5).astype(int) 62 | 63 | return X, y, important_betas 64 | 65 | ## This is all preparation of the dummy data set 66 | n, p, k = 500, 1000, 5 67 | 68 | X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) 69 | base_estimator = Pipeline([ 70 | ('scaler', StandardScaler()), 71 | ('model', LogisticRegression(penalty='l1')) 72 | ]) 73 | 74 | ## Here stability selection is instantiated and run 75 | selector = StabilitySelection(base_estimator=base_estimator, lambda_name='model__C', 76 | lambda_grid=np.logspace(-5, -1, 50)).fit(X, y) 77 | 78 | print(selector.get_support(indices=True)) 79 | ``` 80 | 81 | ### Bootstrapping strategies 82 | 83 | `stability-selection` uses bootstrapping without replacement by default (as proposed in the original paper), but does support different bootstrapping strategies. [Shah and Samworth] proposed *complementary pairs* bootstrapping, where the data set is bootstrapped in pairs, such that the intersection is empty but the union equals the original data set. `StabilitySelection` supports this through the `bootstrap_func` parameter. 
84 | 85 | This parameter can be: 86 | - A string, which must be one of 87 | - 'subsample': For subsampling without replacement (default). 88 | - 'complementary_pairs': For complementary pairs subsampling [2]. 89 | - 'stratified': For stratified bootstrapping in imbalanced 90 | classification. 91 | - A function that takes `y`, and a random state 92 | as inputs and returns a list of sample indices in the range 93 | `(0, len(y)-1)`. 94 | 95 | For example, the `StabilitySelection` call in the above example can be replaced with 96 | ```python 97 | selector = StabilitySelection(base_estimator=base_estimator, 98 | lambda_name='model__C', 99 | lambda_grid=np.logspace(-5, -1, 50), 100 | bootstrap_func='complementary_pairs') 101 | selector.fit(X, y) 102 | ``` 103 | to run stability selection with complementary pairs bootstrapping. 104 | 105 | ## Feedback and contributing 106 | 107 | Feedback and contributions are much appreciated. If you have any feedback, please post it on the [issue tracker](https://github.com/scikit-learn-contrib/stability-selection/issues). 108 | 109 | ## References 110 | 111 | [1]: Meinshausen, N. and Buhlmann, P., 2010. Stability selection. Journal of the Royal Statistical Society: 112 | Series B (Statistical Methodology), 72(4), pp.417-473. 113 | 114 | [2] Shah, R.D. and Samworth, R.J., 2013. Variable selection with 115 | error control: another look at stability selection. Journal 116 | of the Royal Statistical Society: Series B (Statistical Methodology), 117 | 75(1), pp.55-80. 
118 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # AppVeyor.com is a Continuous Integration service to build and run tests under 2 | # Windows 3 | 4 | 5 | environment: 6 | global: 7 | # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the 8 | # /E:ON and /V:ON options are not enabled in the batch script interpreter 9 | # See: http://stackoverflow.com/a/13751649/163740 10 | CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\ci_scripts\\appveyor\\run_with_env.cmd" 11 | WHEELHOUSE_UPLOADER_USERNAME: sklearn 12 | WHEELHOUSE_UPLOADER_SECRET: 13 | secure: XzK+Mi6Ba5frV2B/jHq7h4aD8/nox9SsI3T8Kub1L2XNevRSIurUEry3PdWESzRY 14 | MODULE: skltemplate 15 | PROJECT_NAME: sklearn-template 16 | CLOUD_STORAGE: CLOUDFILES 17 | CLOUD_CONTATINER: sklearn-template-trial 18 | 19 | matrix: 20 | - PYTHON: "C:\\Python27" 21 | PYTHON_VERSION: "2.7.8" 22 | PYTHON_ARCH: "32" 23 | MINICONDA: "C:\\Miniconda" 24 | 25 | - PYTHON: "C:\\Python27-x64" 26 | PYTHON_VERSION: "2.7.8" 27 | PYTHON_ARCH: "64" 28 | MINICONDA: "C:\\Miniconda-x64" 29 | 30 | - PYTHON: "C:\\Python35" 31 | PYTHON_VERSION: "3.5.0" 32 | PYTHON_ARCH: "32" 33 | MINICONDA: "C:\\Miniconda35" 34 | 35 | - PYTHON: "C:\\Python35-x64" 36 | PYTHON_VERSION: "3.5.0" 37 | PYTHON_ARCH: "64" 38 | MINICONDA: "C:\\Miniconda35-x64" 39 | 40 | install: 41 | # Miniconda is pre-installed in the worker build 42 | - "SET PATH=%MINICONDA%;%MINICONDA%\\Scripts;%PATH%" 43 | - "python -m pip install -U pip" 44 | 45 | # Check that we have the expected version and architecture for Python 46 | - "python --version" 47 | - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" 48 | - "pip --version" 49 | 50 | # Remove cygwin because it clashes with conda 51 | # see http://help.appveyor.com/discussions/problems/3712-git-remote-https-seems-to-be-broken 52 | - rmdir C:\\cygwin /s /q 53 | 54 | # Install the build and runtime 
dependencies of the project. 55 | - conda install --quiet --yes numpy scipy cython nose scikit-learn wheel 56 | - pip install wheelhouse_uploader nose-timer 57 | - "%CMD_IN_ENV% python setup.py bdist_wheel bdist_wininst" 58 | - ps: "ls dist" 59 | 60 | # Install the generated wheel package to test it 61 | - "pip install --pre --no-index --find-links dist %PROJECT_NAME%" 62 | 63 | 64 | # Not a .NET project, we build scikit-learn in the install step instead 65 | build: false 66 | 67 | 68 | artifacts: 69 | # Archive the generated wheel package in the ci.appveyor.com build report. 70 | - path: dist\* 71 | 72 | 73 | on_success: 74 | # Upload the generated wheel package to Rackspace 75 | # On Windows, Apache Libcloud cannot find a standard CA cert bundle so we 76 | # disable the ssl checks. 77 | - "python -m wheelhouse_uploader upload provider=%CLOUD_STORAGE% --no-ssl-check --local-folder=dist %CLOUD_CONTAINER%" 78 | 79 | 80 | test_script: 81 | # Change to a non-source folder to make sure we run the tests on the 82 | # installed library. 83 | - "mkdir empty_folder" 84 | - "cd empty_folder" 85 | 86 | - "python -c \"import nose; nose.main()\" --with-timer --timer-top-n 20 -s -v %MODULE%" 87 | 88 | # Move back to the project folder 89 | - "cd .." 90 | 91 | 92 | cache: 93 | # Use the appveyor cache to avoid re-downloading large archives such 94 | # the MKL numpy and scipy wheels mirrored on a rackspace cloud 95 | # container, speed up the appveyor jobs and reduce bandwidth 96 | # usage on our rackspace account. 
97 | - '%APPDATA%\pip\Cache' -------------------------------------------------------------------------------- /ci_scripts/appveyor/run_with_env.cmd: -------------------------------------------------------------------------------- 1 | :: To build extensions for 64 bit Python 3, we need to configure environment 2 | :: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: 3 | :: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) 4 | :: 5 | :: To build extensions for 64 bit Python 2, we need to configure environment 6 | :: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: 7 | :: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) 8 | :: 9 | :: 32 bit builds, and 64-bit builds for 3.5 and beyond, do not require specific 10 | :: environment configurations. 11 | :: 12 | :: Note: this script needs to be run with the /E:ON and /V:ON flags for the 13 | :: cmd interpreter, at least for (SDK v7.0) 14 | :: 15 | :: More details at: 16 | :: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows 17 | :: http://stackoverflow.com/a/13751649/163740 18 | :: 19 | :: Author: Olivier Grisel 20 | :: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 21 | :: 22 | :: Notes about batch files for Python people: 23 | :: 24 | :: Quotes in values are literally part of the values: 25 | :: SET FOO="bar" 26 | :: FOO is now five characters long: " b a r " 27 | :: If you don't want quotes, don't include them on the right-hand side. 28 | :: 29 | :: The CALL lines at the end of this file look redundant, but if you move them 30 | :: outside of the IF clauses, they do not run properly in the SET_SDK_64==Y 31 | :: case, I don't know why. 
32 | @ECHO OFF 33 | 34 | SET COMMAND_TO_RUN=%* 35 | SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows 36 | SET WIN_WDK=c:\Program Files (x86)\Windows Kits\10\Include\wdf 37 | 38 | :: Extract the major and minor versions, and allow for the minor version to be 39 | :: more than 9. This requires the version number to have two dots in it. 40 | SET MAJOR_PYTHON_VERSION=%PYTHON_VERSION:~0,1% 41 | IF "%PYTHON_VERSION:~3,1%" == "." ( 42 | SET MINOR_PYTHON_VERSION=%PYTHON_VERSION:~2,1% 43 | ) ELSE ( 44 | SET MINOR_PYTHON_VERSION=%PYTHON_VERSION:~2,2% 45 | ) 46 | 47 | :: Based on the Python version, determine what SDK version to use, and whether 48 | :: to set the SDK for 64-bit. 49 | IF %MAJOR_PYTHON_VERSION% == 2 ( 50 | SET WINDOWS_SDK_VERSION="v7.0" 51 | SET SET_SDK_64=Y 52 | ) ELSE ( 53 | IF %MAJOR_PYTHON_VERSION% == 3 ( 54 | SET WINDOWS_SDK_VERSION="v7.1" 55 | IF %MINOR_PYTHON_VERSION% LEQ 4 ( 56 | SET SET_SDK_64=Y 57 | ) ELSE ( 58 | SET SET_SDK_64=N 59 | IF EXIST "%WIN_WDK%" ( 60 | :: See: https://connect.microsoft.com/VisualStudio/feedback/details/1610302/ 61 | REN "%WIN_WDK%" 0wdf 62 | ) 63 | ) 64 | ) ELSE ( 65 | ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" 66 | EXIT 1 67 | ) 68 | ) 69 | 70 | IF %PYTHON_ARCH% == 64 ( 71 | IF %SET_SDK_64% == Y ( 72 | ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture 73 | SET DISTUTILS_USE_SDK=1 74 | SET MSSdk=1 75 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% 76 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release 77 | ECHO Executing: %COMMAND_TO_RUN% 78 | call %COMMAND_TO_RUN% || EXIT 1 79 | ) ELSE ( 80 | ECHO Using default MSVC build environment for 64 bit architecture 81 | ECHO Executing: %COMMAND_TO_RUN% 82 | call %COMMAND_TO_RUN% || EXIT 1 83 | ) 84 | ) ELSE ( 85 | ECHO Using default MSVC build environment for 32 bit architecture 86 | ECHO Executing: %COMMAND_TO_RUN% 87 | call 
%COMMAND_TO_RUN% || EXIT 1 88 | ) 89 | -------------------------------------------------------------------------------- /ci_scripts/circleci/build_doc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | set -e 4 | 5 | # Decide what kind of documentation build to run, and run it. 6 | # 7 | # If the last commit message has a "[doc skip]" marker, do not build 8 | # the doc. On the contrary if a "[doc build]" marker is found, build the doc 9 | # instead of relying on the subsequent rules. 10 | # 11 | # We always build the documentation for jobs that are not related to a specific 12 | # PR (e.g. a merge to master or a maintenance branch). 13 | # 14 | # If this is a PR, do a full build if there are some files in this PR that are 15 | # under the "doc/" or "examples/" folders, otherwise perform a quick build. 16 | # 17 | # If the inspection of the current commit fails for any reason, the default 18 | # behavior is to quick build the documentation. 
19 | 20 | get_build_type() { 21 | if [ -z "$CIRCLE_SHA1" ] 22 | then 23 | echo SKIP: undefined CIRCLE_SHA1 24 | return 25 | fi 26 | commit_msg=$(git log --format=%B -n 1 $CIRCLE_SHA1) 27 | if [ -z "$commit_msg" ] 28 | then 29 | echo QUICK BUILD: failed to inspect commit $CIRCLE_SHA1 30 | return 31 | fi 32 | if [[ "$commit_msg" =~ \[doc\ skip\] ]] 33 | then 34 | echo SKIP: [doc skip] marker found 35 | return 36 | fi 37 | if [[ "$commit_msg" =~ \[doc\ quick\] ]] 38 | then 39 | echo QUICK: [doc quick] marker found 40 | return 41 | fi 42 | if [[ "$commit_msg" =~ \[doc\ build\] ]] 43 | then 44 | echo BUILD: [doc build] marker found 45 | return 46 | fi 47 | if [ -z "$CI_PULL_REQUEST" ] 48 | then 49 | echo BUILD: not a pull request 50 | return 51 | fi 52 | git_range="origin/master...$CIRCLE_SHA1" 53 | git fetch origin master >&2 || (echo QUICK BUILD: failed to get changed filenames for $git_range; return) 54 | filenames=$(git diff --name-only $git_range) 55 | if [ -z "$filenames" ] 56 | then 57 | echo QUICK BUILD: no changed filenames for $git_range 58 | return 59 | fi 60 | if echo "$filenames" | grep -q -e ^examples/ 61 | then 62 | echo BUILD: detected examples/ filename modified in $git_range: $(echo "$filenames" | grep -e ^examples/ | head -n1) 63 | return 64 | fi 65 | echo QUICK BUILD: no examples/ filename modified in $git_range: 66 | echo "$filenames" 67 | } 68 | 69 | build_type=$(get_build_type) 70 | if [[ "$build_type" =~ ^SKIP ]] 71 | then 72 | exit 0 73 | fi 74 | 75 | MAKE_TARGET=html 76 | 77 | # deactivate circleci virtualenv and setup a miniconda env instead 78 | if [[ `type -t deactivate` ]]; then 79 | deactivate 80 | fi 81 | 82 | # Install dependencies with miniconda 83 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \ 84 | -O miniconda.sh 85 | chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH 86 | export PATH="$MINICONDA_PATH/bin:$PATH" 87 | conda update --yes --quiet conda 88 | 89 | # Configure the conda environment 
and put it in the path using the 90 | # provided versions 91 | conda create -n $CONDA_ENV_NAME --yes --quiet python=3 92 | source activate $CONDA_ENV_NAME 93 | 94 | conda install --yes pip numpy scipy scikit-learn pillow matplotlib sphinx \ 95 | sphinx_rtd_theme numpydoc 96 | pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git 97 | 98 | # Build and install imbalanced-learn in dev mode 99 | ls -l 100 | pip install -e . 101 | 102 | # The pipefail is requested to propagate exit code 103 | set -o pipefail && cd doc && make $MAKE_TARGET 2>&1 | tee ~/log.txt 104 | 105 | cd - 106 | set +o pipefail 107 | -------------------------------------------------------------------------------- /ci_scripts/circleci/checkout_merge_commit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Add `master` branch to the update list. 4 | # Otherwise CircleCI will give us a cached one. 5 | FETCH_REFS="+master:master" 6 | 7 | # Update PR refs for testing. 8 | if [[ -n "${CIRCLE_PR_NUMBER}" ]] 9 | then 10 | FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/head:pr/${CIRCLE_PR_NUMBER}/head" 11 | FETCH_REFS="${FETCH_REFS} +refs/pull/${CIRCLE_PR_NUMBER}/merge:pr/${CIRCLE_PR_NUMBER}/merge" 12 | fi 13 | 14 | # Retrieve the refs. 15 | git fetch -u origin ${FETCH_REFS} 16 | 17 | # Checkout the PR merge ref. 18 | if [[ -n "${CIRCLE_PR_NUMBER}" ]] 19 | then 20 | git checkout -qf "pr/${CIRCLE_PR_NUMBER}/merge" || ( 21 | echo Could not fetch merge commit. >&2 22 | echo There may be conflicts in merging PR \#${CIRCLE_PR_NUMBER} with master. >&2; 23 | exit 1) 24 | fi 25 | 26 | # Check for merge conflicts. 
27 | if [[ -n "${CIRCLE_PR_NUMBER}" ]] 28 | then 29 | git branch --merged | grep master > /dev/null 30 | git branch --merged | grep "pr/${CIRCLE_PR_NUMBER}/head" > /dev/null 31 | fi 32 | -------------------------------------------------------------------------------- /ci_scripts/circleci/push_doc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is meant to be called in the "deploy" step defined in 3 | # circle.yml. See https://circleci.com/docs/ for more details. 4 | # The behavior of the script is controlled by environment variable defined 5 | # in the circle.yml in the top level folder of the project. 6 | 7 | GENERATED_DOC_DIR=$1 8 | 9 | if [[ -z "$GENERATED_DOC_DIR" ]]; then 10 | echo "Need to pass directory of the generated doc as argument" 11 | echo "Usage: $0 " 12 | exit 1 13 | fi 14 | 15 | # Absolute path needed because we use cd further down in this script 16 | GENERATED_DOC_DIR=$(readlink -f $GENERATED_DOC_DIR) 17 | 18 | if [ "$CIRCLE_BRANCH" = "master" ] 19 | then 20 | dir=docs # NOTE: I needed to change this from dev to docs for gh-pages to work 21 | else 22 | # Strip off .X 23 | dir="${CIRCLE_BRANCH::-2}" 24 | fi 25 | 26 | MSG="Pushing the docs to $dir/ for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1" 27 | 28 | cd $HOME 29 | if [ ! 
-d $DOC_REPO ]; 30 | then git clone --depth 1 --no-checkout -b gh-pages "git@github.com:"$USERNAME"/"$DOC_REPO".git"; 31 | fi 32 | cd $DOC_REPO 33 | git config core.sparseCheckout true 34 | echo $dir > .git/info/sparse-checkout 35 | git checkout gh-pages 36 | git reset --hard origin/gh-pages 37 | git rm -rf $dir/ && rm -rf $dir/ 38 | cp -R $GENERATED_DOC_DIR $dir 39 | touch $dir/.nojekyll 40 | git config --global user.email $EMAIL 41 | git config --global user.name $USERNAME 42 | git config --global push.default matching 43 | git add -f $dir/ 44 | git commit -m "$MSG" $dir 45 | git push origin gh-pages 46 | 47 | echo $MSG 48 | -------------------------------------------------------------------------------- /ci_scripts/travis/install.sh: -------------------------------------------------------------------------------- 1 | # Deactivate the travis-provided virtual environment and setup a 2 | # conda-based environment instead 3 | deactivate 4 | 5 | # Use the miniconda installer for faster download / install of conda 6 | # itself 7 | pushd . 8 | cd 9 | mkdir -p download 10 | cd download 11 | echo "Cached in $HOME/download :" 12 | ls -l 13 | echo 14 | if [[ ! -f miniconda.sh ]] 15 | then 16 | wget http://repo.continuum.io/miniconda/Miniconda-3.6.0-Linux-x86_64.sh \ 17 | -O miniconda.sh 18 | fi 19 | chmod +x miniconda.sh && ./miniconda.sh -b 20 | cd .. 
21 | export PATH=/home/travis/miniconda/bin:$PATH 22 | conda update --yes conda 23 | popd 24 | 25 | # Configure the conda environment and put it in the path using the 26 | # provided versions 27 | conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ 28 | numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION cython=$CYTHON_VERSION 29 | 30 | source activate testenv 31 | 32 | 33 | if [[ "$COVERAGE" == "true" ]]; then 34 | pip install coverage coveralls 35 | fi 36 | 37 | python --version 38 | python -c "import numpy; print('numpy %s' % numpy.__version__)" 39 | python -c "import scipy; print('scipy %s' % scipy.__version__)" 40 | python setup.py develop 41 | -------------------------------------------------------------------------------- /ci_scripts/travis/success.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | if [[ "$COVERAGE" == "true" ]]; then 4 | # Need to run coveralls from a git checkout, so we copy .coverage 5 | # from TEST_DIR where nosetests has been run 6 | cp $TEST_DIR/.coverage $TRAVIS_BUILD_DIR 7 | cd $TRAVIS_BUILD_DIR 8 | # Ignore coveralls failures as the coveralls server is not 9 | # very reliable but we don't want travis to report a failure 10 | # in the github UI just because the coverage report failed to 11 | # be published. 
12 | coveralls || echo "Coveralls upload failed" 13 | fi -------------------------------------------------------------------------------- /ci_scripts/travis/test.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | # Get into a temp directory to run test from the installed scikit learn and 4 | # check if we do not leave artifacts 5 | mkdir -p $TEST_DIR 6 | 7 | cd $TEST_DIR 8 | 9 | if [[ "$COVERAGE" == "true" ]]; then 10 | nosetests -s --with-coverage --cover-package=$MODULE $MODULE 11 | else 12 | nosetests -s $MODULE 13 | fi 14 | -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | docker: 5 | - image: circleci/python:3.6.1 6 | environment: 7 | - USERNAME: "thuijskens" 8 | - DOC_REPO: "stability-selection" 9 | - DOC_URL: "docs" 10 | - EMAIL: "thomas_huijskens@hotmail.com" 11 | - MINICONDA_PATH: ~/miniconda 12 | - CONDA_ENV_NAME: testenv 13 | - PYTHON_VERSION: 3 14 | 15 | steps: 16 | - checkout 17 | - run: ./ci_scripts/circleci/checkout_merge_commit.sh 18 | - run: ./ci_scripts/circleci/build_doc.sh 19 | - store_artifacts: 20 | path: doc/_build/html 21 | destination: doc 22 | - store_artifacts: 23 | path: ~/log.txt 24 | - persist_to_workspace: 25 | root: doc/_build/html 26 | paths: . 
27 | - attach_workspace: 28 | at: doc/_build/html 29 | - run: ls -ltrh doc/_build/html 30 | - deploy: 31 | command: | 32 | if [[ "${CIRCLE_BRANCH}" =~ ^master$|^[0-9]+\.[0-9]+\.X$ ]]; then 33 | bash ./ci_scripts/circleci/push_doc.sh doc/_build/html 34 | fi 35 | filters: 36 | branches: 37 | ignore: 38 | - gh-pages 39 | 40 | 41 | workflows: 42 | version: 2 43 | build-doc-and-deploy: 44 | jobs: 45 | - build 46 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | -rm -rf $(BUILDDIR)/* 51 | -rm -rf auto_examples/ 52 | -rm -rf generated/* 53 | -rm -rf modules/generated/* 54 | 55 | html: 56 | # These two lines make the build a bit more lengthy, and the 57 | # the embedding of images more robust 58 | rm -rf $(BUILDDIR)/html/_images 59 | #rm -rf _build/doctrees/ 60 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
63 | 64 | dirhtml: 65 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 66 | @echo 67 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 68 | 69 | singlehtml: 70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 71 | @echo 72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 73 | 74 | pickle: 75 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 76 | @echo 77 | @echo "Build finished; now you can process the pickle files." 78 | 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | htmlhelp: 85 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 86 | @echo 87 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 88 | ".hhp project file in $(BUILDDIR)/htmlhelp." 89 | 90 | qthelp: 91 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 92 | @echo 93 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 94 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 95 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/project-template.qhcp" 96 | @echo "To view the help file:" 97 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/project-template.qhc" 98 | 99 | devhelp: 100 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 101 | @echo 102 | @echo "Build finished." 103 | @echo "To view the help file:" 104 | @echo "# mkdir -p $$HOME/.local/share/devhelp/project-template" 105 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/project-template" 106 | @echo "# devhelp" 107 | 108 | epub: 109 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 110 | @echo 111 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 112 | 113 | latex: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo 116 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
117 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 118 | "(use \`make latexpdf' here to do that automatically)." 119 | 120 | latexpdf: 121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 122 | @echo "Running LaTeX files through pdflatex..." 123 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 124 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 125 | 126 | latexpdfja: 127 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 128 | @echo "Running LaTeX files through platex and dvipdfmx..." 129 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 130 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 131 | 132 | text: 133 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 134 | @echo 135 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 136 | 137 | man: 138 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 139 | @echo 140 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 141 | 142 | texinfo: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo 145 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 146 | @echo "Run \`make' in that directory to run these through makeinfo" \ 147 | "(use \`make info' here to do that automatically)." 148 | 149 | info: 150 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 151 | @echo "Running Texinfo files through makeinfo..." 152 | make -C $(BUILDDIR)/texinfo info 153 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 154 | 155 | gettext: 156 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 157 | @echo 158 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 159 | 160 | changes: 161 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 162 | @echo 163 | @echo "The overview file is in $(BUILDDIR)/changes." 
164 | 165 | linkcheck: 166 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 167 | @echo 168 | @echo "Link check complete; look for any errors in the above output " \ 169 | "or in $(BUILDDIR)/linkcheck/output.txt." 170 | 171 | doctest: 172 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 173 | @echo "Testing of doctests in the sources finished, look at the " \ 174 | "results in $(BUILDDIR)/doctest/output.txt." 175 | 176 | xml: 177 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 178 | @echo 179 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 180 | 181 | pseudoxml: 182 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 183 | @echo 184 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 185 | -------------------------------------------------------------------------------- /doc/api.rst: -------------------------------------------------------------------------------- 1 | API Documentation 2 | ================= 3 | 4 | * :doc:`stability_selection` 5 | * :doc:`randomized_lasso` -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # project-template documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Jan 18 14:44:12 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | import sphinx_rtd_theme 19 | 20 | # If extensions (or modules to document with autodoc) are in another directory, 21 | # add these directories to sys.path here. 
If the directory is relative to the 22 | # documentation root, use os.path.abspath to make it absolute, like shown here. 23 | #sys.path.insert(0, os.path.abspath('.')) 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Try to override the matplotlib configuration as early as possible 28 | try: 29 | import gen_rst 30 | except: 31 | pass 32 | # -- General configuration ------------------------------------------------ 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | #needs_sphinx = '1.0' 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | extensions = [ 41 | 'sphinx.ext.autodoc', 42 | 'sphinx.ext.autosummary', 43 | 'sphinx.ext.doctest', 44 | 'sphinx.ext.intersphinx', 45 | 'sphinx.ext.todo', 46 | 'numpydoc', 47 | 'sphinx.ext.ifconfig', 48 | 'sphinx.ext.viewcode', 49 | 'sphinx_gallery.gen_gallery', 50 | 'sphinx.ext.mathjax' 51 | ] 52 | 53 | numpydoc_show_class_members = False 54 | 55 | sphinx_gallery_conf = { 56 | # path to your examples scripts 57 | 'examples_dirs' : '../examples', 58 | # path where to save gallery generated examples 59 | 'gallery_dirs' : 'auto_examples', 60 | 'backreferences_dir': os.path.join('generated'), 61 | } 62 | 63 | # Add any paths that contain templates here, relative to this directory. 64 | templates_path = ['_templates'] 65 | 66 | # The suffix of source filenames. 67 | source_suffix = '.rst' 68 | 69 | # The encoding of source files. 70 | #source_encoding = 'utf-8-sig' 71 | 72 | # Generate the plots for the gallery 73 | plot_gallery = True 74 | 75 | # The master toctree document. 76 | master_doc = 'index' 77 | 78 | # General information about the project. 
79 | project = u'stability-selection' 80 | copyright = u'2018, Thomas Huijskens' 81 | 82 | # The version info for the project you're documenting, acts as replacement for 83 | # |version| and |release|, also used in various other places throughout the 84 | # built documents. 85 | # 86 | # The short X.Y version. 87 | version = '0.1' 88 | # The full version, including alpha/beta/rc tags. 89 | release = '0.1.0' 90 | 91 | # The language for content autogenerated by Sphinx. Refer to documentation 92 | # for a list of supported languages. 93 | #language = None 94 | 95 | # There are two options for replacing |today|: either, you set today to some 96 | # non-false value, then it is used: 97 | #today = '' 98 | # Else, today_fmt is used as the format for a strftime call. 99 | #today_fmt = '%B %d, %Y' 100 | 101 | # List of patterns, relative to source directory, that match files and 102 | # directories to ignore when looking for source files. 103 | exclude_patterns = ['_build'] 104 | 105 | # The reST default role (used for this markup: `text`) to use for all 106 | # documents. 107 | #default_role = None 108 | 109 | # If true, '()' will be appended to :func: etc. cross-reference text. 110 | #add_function_parentheses = True 111 | 112 | # If true, the current module name will be prepended to all description 113 | # unit titles (such as .. function::). 114 | #add_module_names = True 115 | 116 | # If true, sectionauthor and moduleauthor directives will be shown in the 117 | # output. They are ignored by default. 118 | #show_authors = False 119 | 120 | # The name of the Pygments (syntax highlighting) style to use. 121 | pygments_style = 'sphinx' 122 | 123 | # A list of ignored prefixes for module index sorting. 124 | #modindex_common_prefix = [] 125 | 126 | # If true, keep warnings as "system message" paragraphs in the built documents. 
127 | #keep_warnings = False 128 | 129 | 130 | # -- Options for HTML output ---------------------------------------------- 131 | 132 | # The theme to use for HTML and HTML Help pages. See the documentation for 133 | # a list of builtin themes. 134 | html_theme = 'sphinx_rtd_theme' 135 | 136 | # Theme options are theme-specific and customize the look and feel of a theme 137 | # further. For a list of options available for each theme, see the 138 | # documentation. 139 | #html_theme_options = {} 140 | 141 | # Add any paths that contain custom themes here, relative to this directory. 142 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 143 | 144 | # The name for this set of Sphinx documents. If None, it defaults to 145 | # " v documentation". 146 | #html_title = None 147 | 148 | # A shorter title for the navigation bar. Default is the same as html_title. 149 | #html_short_title = None 150 | 151 | # The name of an image file (relative to this directory) to place at the top 152 | # of the sidebar. 153 | #html_logo = None 154 | 155 | # The name of an image file (within the static path) to use as favicon of the 156 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 157 | # pixels large. 158 | #html_favicon = None 159 | 160 | # Add any paths that contain custom static files (such as style sheets) here, 161 | # relative to this directory. They are copied after the builtin static files, 162 | # so a file named "default.css" will overwrite the builtin "default.css". 163 | html_static_path = ['_static'] 164 | 165 | # Add any extra paths that contain custom files (such as robots.txt or 166 | # .htaccess) here, relative to this directory. These files are copied 167 | # directly to the root of the documentation. 168 | #html_extra_path = [] 169 | 170 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 171 | # using the given strftime format. 
172 | #html_last_updated_fmt = '%b %d, %Y' 173 | 174 | # If true, SmartyPants will be used to convert quotes and dashes to 175 | # typographically correct entities. 176 | #html_use_smartypants = True 177 | 178 | # Custom sidebar templates, maps document names to template names. 179 | #html_sidebars = {} 180 | 181 | # Additional templates that should be rendered to pages, maps page names to 182 | # template names. 183 | #html_additional_pages = {} 184 | 185 | # If false, no module index is generated. 186 | #html_domain_indices = True 187 | 188 | # If false, no index is generated. 189 | #html_use_index = True 190 | 191 | # If true, the index is split into individual pages for each letter. 192 | #html_split_index = False 193 | 194 | # If true, links to the reST sources are added to the pages. 195 | #html_show_sourcelink = True 196 | 197 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 198 | #html_show_sphinx = True 199 | 200 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 201 | #html_show_copyright = True 202 | 203 | # If true, an OpenSearch description file will be output, and all pages will 204 | # contain a tag referring to it. The value of this option must be the 205 | # base URL from which the finished HTML is served. 206 | #html_use_opensearch = '' 207 | 208 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 209 | #html_file_suffix = None 210 | 211 | # Output file base name for HTML help builder. 212 | htmlhelp_basename = 'project-templatedoc' 213 | 214 | 215 | # -- Options for LaTeX output --------------------------------------------- 216 | 217 | latex_elements = { 218 | # The paper size ('letterpaper' or 'a4paper'). 219 | #'papersize': 'letterpaper', 220 | 221 | # The font size ('10pt', '11pt' or '12pt'). 222 | #'pointsize': '10pt', 223 | 224 | # Additional stuff for the LaTeX preamble. 225 | #'preamble': '', 226 | } 227 | 228 | # Grouping the document tree into LaTeX files. 
List of tuples 229 | # (source start file, target name, title, 230 | # author, documentclass [howto, manual, or own class]). 231 | latex_documents = [ 232 | ('index', 'stability-selection.tex', u'stability-selection Documentation', 233 | u'Thomas Huijskens', 'manual'), 234 | ] 235 | 236 | # The name of an image file (relative to this directory) to place at the top of 237 | # the title page. 238 | #latex_logo = None 239 | 240 | # For "manual" documents, if this is true, then toplevel headings are parts, 241 | # not chapters. 242 | #latex_use_parts = False 243 | 244 | # If true, show page references after internal links. 245 | #latex_show_pagerefs = False 246 | 247 | # If true, show URL addresses after external links. 248 | #latex_show_urls = False 249 | 250 | # Documents to append as an appendix to all manuals. 251 | #latex_appendices = [] 252 | 253 | # If false, no module index is generated. 254 | #latex_domain_indices = True 255 | 256 | 257 | # -- Options for manual page output --------------------------------------- 258 | 259 | # One entry per manual page. List of tuples 260 | # (source start file, name, description, authors, manual section). 261 | man_pages = [ 262 | ('index', 'stability-selection', u'stability-selection Documentation', 263 | [u'Thomas Huijskens'], 1) 264 | ] 265 | 266 | # If true, show URL addresses after external links. 267 | #man_show_urls = False 268 | 269 | 270 | # -- Options for Texinfo output ------------------------------------------- 271 | 272 | # Grouping the document tree into Texinfo files. 
List of tuples 273 | # (source start file, target name, title, author, 274 | # dir menu entry, description, category) 275 | texinfo_documents = [ 276 | ('index', 'stability-selection', u'stability-selection Documentation', 277 | u'Thomas Huijskens', 'stability-selection', 'scikit-learn compatible implementation of stability selection.', 278 | 'Miscellaneous'), 279 | ] 280 | 281 | # def generate_example_rst(app, what, name, obj, options, lines): 282 | # # generate empty examples files, so that we don't get 283 | # # inclusion errors if there are no examples for a class / module 284 | # examples_path = os.path.join(app.srcdir, "modules", "generated", 285 | # "%s.examples" % name) 286 | # if not os.path.exists(examples_path): 287 | # # touch file 288 | # open(examples_path, 'w').close() 289 | # 290 | # 291 | # def setup(app): 292 | # app.connect('autodoc-process-docstring', generate_example_rst) 293 | 294 | # Documents to append as an appendix to all manuals. 295 | #texinfo_appendices = [] 296 | 297 | # If false, no module index is generated. 298 | #texinfo_domain_indices = True 299 | 300 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 301 | #texinfo_show_urls = 'footnote' 302 | 303 | # If true, do not generate a @detailmenu in the "Top" node's menu. 304 | #texinfo_no_detailmenu = False 305 | 306 | 307 | # Example configuration for intersphinx: refer to the Python standard library. 308 | intersphinx_mapping = {'http://docs.python.org/': None} 309 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. project-template documentation master file, created by 2 | sphinx-quickstart on Mon Jan 18 14:44:12 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to stability-selection's documentation! 
7 | =============================================== 8 | 9 | This project contains an implementation of the stability selection algorithm. 10 | 11 | Stability selection is a technique that aims to enhance and improve existing feature 12 | selection algorithms. For a generic feature selection algorithm, we have a tuning 13 | parameter :math:`\lambda \in \Lambda` that controls the amount of regularisation. Examples 14 | of such algorithms are: 15 | 16 | 1. :math:`\ell_1`-penalized regression (penalization parameter :math:`\lambda`). 17 | 2. Orthogonal matching pursuit (number of steps in forward selection). 18 | 3. Boosting (:math:`\ell_1` penalty) 19 | 20 | What these structure learning algorithms have in common is a parameter :math:`\lambda \in \Lambda` 21 | that controls the amount of regularisation. For every value of :math:`\lambda`, we obtain a structure 22 | estimate :math:`S^\lambda \subseteq \{1, \ldots, p\}`, which indicates which variables to select. We are 23 | interested in determining whether there exists a :math:`\lambda` such that :math:`S^\lambda` is identical to 24 | :math:`S` with high probability, and how to achieve the right amount of regularisation. 25 | 26 | 27 | Stability selection works as follows: 28 | 29 | 1. Define a candidate set of regularization parameters :math:`\Lambda` and a subsample number :math:`N`. 30 | 2. For each value :math:`\lambda \in \Lambda` do: 31 | 32 | a. For each :math:`i` in :math:`\{1, \ldots, N\}`, do: 33 | 34 | i. Generate a bootstrap sample of the original data :math:`X^{n \times p}` of size :math:`\frac{n}{2}`. 35 | ii. Run the selection algorithm (LASSO) on the bootstrap sample with regularization parameter :math:`\lambda`. 36 | 37 | b. Given the selection sets from each subsample, calculate the empirical selection probability for each model component: 38 | 39 | :math:`\hat{\Pi}^\lambda_k = \mathbb{P}[k \in \hat{S}^\lambda] = \frac{1}{N} \sum_{i = 1}^N \mathbb{I}_{\{k \in \hat{S}_i^\lambda\}}.` 40 | 41 | c. 
The selection probability for component :math:`k` is its probability of being selected by the algorithm. 42 | 43 | 3. Given the selection probabilities for each component and for each value of :math:`\lambda`, construct the 44 | stable set according to the following definition: 45 | 46 | :math:`\hat{S}^{\text{stable}} = \{k : \max_{\lambda \in \Lambda} \hat{\Pi}_k^\lambda \geq \pi_\text{thr}\}.` 47 | 48 | where :math:`\pi_\text{thr}` is a predefined threshold. 49 | 50 | This algorithm identifies a set of “stable” variables that are selected with high probability. 51 | 52 | 53 | .. toctree:: 54 | :maxdepth: 2 55 | 56 | api 57 | stability_selection 58 | randomized_lasso 59 | auto_examples/index 60 | ... 61 | 62 | See the `README `_ 63 | for more information. 64 | 65 | 66 | Indices and tables 67 | ================== 68 | 69 | * :ref:`genindex` 70 | * :ref:`modindex` 71 | * :ref:`search` 72 | 73 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. 
qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. 
The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\project-template.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\project-template.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 
145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 
214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /doc/randomized_lasso.rst: -------------------------------------------------------------------------------- 1 | Randomized LASSO 2 | ================ 3 | 4 | The documentation of the randomized_lasso module. 5 | 6 | .. automodule:: stability_selection.randomized_lasso 7 | :members: -------------------------------------------------------------------------------- /doc/stability_selection.rst: -------------------------------------------------------------------------------- 1 | Stability selection 2 | =================== 3 | 4 | The documentation of the stability_selection module. 5 | 6 | .. automodule:: stability_selection.stability_selection 7 | :members: -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | .. _general_examples: 2 | 3 | General examples 4 | ================ 5 | 6 | Introductory examples. 
7 | -------------------------------------------------------------------------------- /examples/plot_randomized_lasso_path.py: -------------------------------------------------------------------------------- 1 | """ 2 | =========================== 3 | Randomized LASSO example 4 | =========================== 5 | 6 | An example plot of the stability scores for each variable after fitting :class:`stability_selection.StabilitySelection` 7 | with :class:`stability_selection.RandomizedLasso` 8 | """ 9 | 10 | import numpy as np 11 | 12 | from sklearn.utils import check_random_state 13 | from stability_selection import StabilitySelection, RandomizedLasso, plot_stability_path 14 | 15 | 16 | def generate_experiment_data(n=200, p=200, rho=0.6, random_state=3245): 17 | rng = check_random_state(random_state) 18 | 19 | sigma = np.eye(p) 20 | sigma[0, 2] = rho 21 | sigma[2, 0] = rho 22 | sigma[1, 2] = rho 23 | sigma[2, 1] = rho 24 | 25 | X = rng.multivariate_normal(mean=np.zeros(p), cov=sigma, size=(n,)) 26 | beta = np.zeros(p) 27 | beta[:2] = 1.0 28 | epsilon = rng.normal(0.0, 0.25, size=(n,)) 29 | 30 | y = np.matmul(X, beta) + epsilon 31 | 32 | return X, y 33 | 34 | 35 | if __name__ == '__main__': 36 | n, p = 200, 200 37 | rho = 0.6 38 | 39 | X, y = generate_experiment_data() 40 | lambda_grid = np.linspace(0.001, 0.5, num=100) 41 | 42 | for weakness in [0.2, 0.5, 1.0]: 43 | estimator = RandomizedLasso(weakness=weakness) 44 | selector = StabilitySelection(base_estimator=estimator, lambda_name='alpha', 45 | lambda_grid=lambda_grid, threshold=0.9, verbose=1) 46 | selector.fit(X, y) 47 | 48 | fig, ax = plot_stability_path(selector) 49 | fig.show() 50 | 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /examples/plot_stability_scores.py: -------------------------------------------------------------------------------- 1 | """ 2 | =========================== 3 | Plotting stability scores 4 | =========================== 5 | 6 
| An example plot of the stability scores for each variable after fitting :class:`stability_selection.stability_selection.StabilitySelection` 7 | """ 8 | import numpy as np 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.pipeline import Pipeline 12 | from sklearn.preprocessing import StandardScaler 13 | from sklearn.utils import check_random_state 14 | from stability_selection import StabilitySelection, plot_stability_path 15 | 16 | 17 | def _generate_dummy_classification_data(p=1000, n=1000, k=5, random_state=123321): 18 | 19 | rng = check_random_state(random_state) 20 | 21 | X = rng.normal(loc=0.0, scale=1.0, size=(n, p)) 22 | betas = np.zeros(p) 23 | important_betas = np.sort(rng.choice(a=np.arange(p), size=k)) 24 | betas[important_betas] = rng.uniform(size=k) 25 | 26 | probs = 1 / (1 + np.exp(-1 * np.matmul(X, betas))) 27 | y = (probs > 0.5).astype(int) 28 | 29 | return X, y, important_betas 30 | 31 | 32 | if __name__ == '__main__': 33 | n, p, k = 500, 1000, 5 34 | 35 | X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) 36 | 37 | base_estimator = Pipeline([ 38 | ('scaler', StandardScaler()), 39 | ('model', LogisticRegression(penalty='l1')) 40 | ]) 41 | selector = StabilitySelection(base_estimator=base_estimator, lambda_name='model__C', 42 | lambda_grid=np.logspace(-5, -1, 50)) 43 | selector.fit(X, y) 44 | 45 | fig, ax = plot_stability_path(selector) 46 | fig.show() 47 | 48 | selected_variables = selector.get_support(indices=True) 49 | selected_scores = selector.stability_scores_.max(axis=1) 50 | 51 | print('Selected variables are:') 52 | print('-----------------------') 53 | 54 | for idx, (variable, score) in enumerate(zip(selected_variables, selected_scores[selected_variables])): 55 | print('Variable %d: [%d], score %.3f' % (idx + 1, variable, score)) 56 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | nose>=1.1.2 2 | scikit-learn>=0.19 3 | matplotlib>=2.0.0 4 | numpy>=1.8.0 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | [aliases] 5 | # python2.7 has upgraded unittest and it is no longer compatible with some 6 | # of our tests, so we run all through nose 7 | test = nosetests 8 | 9 | [nosetests] 10 | # nosetests skips test files with the executable bit by default 11 | # which can silently hide failing tests. 12 | # There are no executable scripts within this project 13 | # so let's turn the --exe flag on to avoid skipping tests by 14 | # mistake. 15 | exe = 1 16 | cover-html = 1 17 | cover-html-dir = coverage 18 | cover-package = stability_selection 19 | 20 | detailed-errors = 1 21 | with-doctest = 1 22 | doctest-tests = 1 23 | doctest-extension = rst 24 | doctest-fixtures = _fixture 25 | ignore-files=^setup\.py$ -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from setuptools import setup, find_packages 4 | 5 | with open('requirements.txt') as f: 6 | INSTALL_REQUIRES = [l.strip() for l in f.readlines() if l] 7 | 8 | 9 | try: 10 | import numpy 11 | except ImportError: 12 | print('numpy is required during installation') 13 | sys.exit(1) 14 | 15 | try: 16 | import scipy 17 | except ImportError: 18 | print('scipy is required during installation') 19 | sys.exit(1) 20 | 21 | setup(name='stability-selection', 22 | version='0.0.1', 23 | description='A scikit-learn compatible implementation of stability selection for feature selection', 24 | author='Thomas Huijskens', 25 | packages=find_packages(), 26 |
install_requires=INSTALL_REQUIRES, 27 | author_email='thomas_huijskens@hotmail.com', 28 | ) 29 | -------------------------------------------------------------------------------- /stability_selection/__init__.py: -------------------------------------------------------------------------------- 1 | from .stability_selection import StabilitySelection, plot_stability_path 2 | from .randomized_lasso import RandomizedLasso, RandomizedLogisticRegression 3 | 4 | __all__ = [ 5 | 'StabilitySelection', 'plot_stability_path', 'RandomizedLasso', 6 | 'RandomizedLogisticRegression' 7 | ] 8 | -------------------------------------------------------------------------------- /stability_selection/bootstrap.py: -------------------------------------------------------------------------------- 1 | """ 2 | =============================== 3 | Bootstrap helper functions 4 | =============================== 5 | 6 | This module contains helper functions for stability_selection.py 7 | that do bootstrap sampling 8 | """ 9 | 10 | import numpy as np 11 | 12 | from sklearn.utils.random import sample_without_replacement 13 | from sklearn.utils.multiclass import type_of_target 14 | 15 | 16 | __all__ = [ 17 | 'bootstrap_without_replacement', 18 | 'complementary_pairs_bootstrap', 19 | 'stratified_bootstrap' 20 | ] 21 | 22 | 23 | def bootstrap_without_replacement(y, n_subsamples, random_state=None): 24 | """ 25 | Bootstrap without replacement, irrespective of label. It is a wrapper around 26 | sklearn.utils.random.sample_without_replacement. 27 | 28 | Parameters 29 | ---------- 30 | y : array of size [n_subsamples,] 31 | True labels 32 | n_subsamples : int 33 | Number of subsamples in the bootstrap sample 34 | random_state : int, RandomState instance or None, optional, default=None 35 | Pseudo random number generator state used for random uniform sampling 36 | from lists of possible values instead of scipy.stats distributions. 
37 | If int, random_state is the seed used by the random number generator; 38 | If RandomState instance, random_state is the random number generator; 39 | If None, the random number generator is the RandomState instance used 40 | by `np.random`. 41 | 42 | Returns 43 | ------- 44 | out : array of size [n_subsamples,] 45 | The sampled subsets of integer. The subset of selected integer might 46 | not be randomized, see the method argument. 47 | """ 48 | n_samples = y.shape[0] 49 | return sample_without_replacement(n_samples, n_subsamples, 50 | random_state=random_state) 51 | 52 | 53 | def complementary_pairs_bootstrap(y, n_subsamples, random_state=None): 54 | """ 55 | Complementary pairs bootstrap. Two subsamples A and B are generated, such 56 | that |A| = n_subsamples, the union of A and B equals {0, ..., n_samples - 1}, 57 | and the intersection of A and B is the empty set. Samples irrespective of 58 | label. 59 | 60 | Parameters 61 | ---------- 62 | y : array of size [n_subsamples,] 63 | True labels 64 | n_subsamples : int 65 | Number of subsamples in the bootstrap sample 66 | random_state : int, RandomState instance or None, optional, default=None 67 | Pseudo random number generator state used for random uniform sampling 68 | from lists of possible values instead of scipy.stats distributions. 69 | If int, random_state is the seed used by the random number generator; 70 | If RandomState instance, random_state is the random number generator; 71 | If None, the random number generator is the RandomState instance used 72 | by `np.random`. 73 | 74 | Returns 75 | ------- 76 | A : array of size [n_subsamples,] 77 | The sampled subsets of integer. The subset of selected integer 78 | might not be randomized, see the method argument. 79 | B : array of size [n_samples - n_subsamples,] 80 | The complement of A. 
81 | """ 82 | n_samples = y.shape[0] 83 | subsample = bootstrap_without_replacement(y, n_subsamples, random_state) 84 | complementary_subsample = np.setdiff1d(np.arange(n_samples), subsample) 85 | 86 | return subsample, complementary_subsample 87 | 88 | 89 | def stratified_bootstrap(y, n_subsamples, random_state=None): 90 | """ 91 | Bootstrap without replacement, performed separately for each group in y. 92 | 93 | Parameters 94 | ---------- 95 | y : array of size [n_subsamples,] 96 | True labels 97 | n_subsamples : int 98 | Number of subsamples in the bootstrap sample 99 | random_state : int, RandomState instance or None, optional, default=None 100 | Pseudo random number generator state used for random uniform sampling 101 | from lists of possible values instead of scipy.stats distributions. 102 | If int, random_state is the seed used by the random number generator; 103 | If RandomState instance, random_state is the random number generator; 104 | If None, the random number generator is the RandomState instance used 105 | by `np.random`. 106 | 107 | Returns 108 | ------- 109 | out : array of size [n_subsamples,] 110 | The sampled subsets of integer. The subset of selected integer might 111 | not be randomized, see the method argument. 112 | """ 113 | type_of_target_y = type_of_target(y) 114 | allowed_target_types = ('binary', 'multiclass') 115 | if type_of_target_y not in allowed_target_types: 116 | raise ValueError( 117 | 'Supported target types are: {}. Got {!r} instead.'.format( 118 | allowed_target_types, type_of_target_y)) 119 | 120 | unique_y, y_counts = np.unique(y, return_counts=True) 121 | y_counts_relative = y_counts / y_counts.sum() 122 | y_n_samples = np.int32(np.round(y_counts_relative * n_subsamples)) 123 | 124 | # the above should return grouped subsamples which approximately sum up 125 | # to n_subsamples but may not work out exactly due to rounding errors. 
126 | # If this is the case, adjust the count of the largest class 127 | if y_n_samples.sum() != n_subsamples: 128 | delta = n_subsamples - y_n_samples.sum() 129 | majority_class = np.argmax(y_counts) 130 | y_n_samples[majority_class] += delta 131 | 132 | all_selected = np.array([], dtype=np.int32) 133 | for i, u in enumerate(unique_y): 134 | indices = np.where(y == u)[0] 135 | selected_indices = indices[bootstrap_without_replacement(indices, 136 | y_n_samples[i], 137 | random_state)] 138 | all_selected = np.concatenate((all_selected, selected_indices)) 139 | 140 | return all_selected 141 | -------------------------------------------------------------------------------- /stability_selection/randomized_lasso.py: -------------------------------------------------------------------------------- 1 | """ 2 | =========================== 3 | Randomized LASSO estimators 4 | =========================== 5 | 6 | This module contains implementations of randomized logistic regression 7 | and randomized LASSO regression [1]_ . 8 | 9 | References 10 | ---------- 11 | .. [1] Meinshausen, N. and Buhlmann, P., 2010. Stability selection. 12 | Journal of the Royal Statistical Society: Series B 13 | (Statistical Methodology), 72(4), pp.417-473. 
14 | """ 15 | import numpy as np 16 | 17 | from scipy import sparse 18 | from scipy.sparse import issparse 19 | 20 | from sklearn.linear_model import LogisticRegression, Lasso 21 | from sklearn.linear_model.base import _preprocess_data 22 | from sklearn.utils import check_X_y, check_random_state 23 | 24 | __all__ = ['RandomizedLogisticRegression', 'RandomizedLasso'] 25 | 26 | 27 | def _rescale_data(X, weights): 28 | if issparse(X): 29 | size = weights.shape[0] 30 | weight_dia = sparse.dia_matrix((1 - weights, 0), (size, size)) 31 | X_rescaled = X * weight_dia 32 | else: 33 | X_rescaled = X * (1 - weights) 34 | 35 | return X_rescaled 36 | 37 | 38 | class RandomizedLogisticRegression(LogisticRegression): 39 | """ 40 | Randomized version of scikit-learns LogisticRegression class. 41 | 42 | Randomized LASSO is a generalization of the LASSO. The LASSO 43 | penalises the absolute value of the coefficients with a penalty 44 | term proportional to `C`, but the randomized LASSO changes the 45 | penalty to a randomly chosen value in the range `[C, C/weakness]`. 46 | 47 | Parameters 48 | ---------- 49 | weakness : float 50 | Weakness value for randomized LASSO. Must be in (0, 1]. 51 | 52 | See also 53 | -------- 54 | sklearn.linear_model.LogisticRegression : learns logistic regression 55 | models using the same algorithm. 
56 | """ 57 | def __init__(self, weakness=0.5, tol=1e-4, C=1.0, 58 | fit_intercept=True, intercept_scaling=1, class_weight=None, 59 | random_state=None, solver='liblinear', max_iter=100, 60 | multi_class='ovr', verbose=0, warm_start=False, n_jobs=1): 61 | self.weakness = weakness 62 | super(RandomizedLogisticRegression, self).__init__( 63 | penalty='l1', dual=False, tol=tol, C=C, fit_intercept=fit_intercept, 64 | intercept_scaling=intercept_scaling, class_weight=class_weight, 65 | random_state=random_state, solver=solver, max_iter=max_iter, 66 | multi_class=multi_class, verbose=verbose, warm_start=warm_start, 67 | n_jobs=n_jobs) 68 | 69 | def fit(self, X, y, sample_weight=None): 70 | """Fit the model according to the given training data. 71 | 72 | Parameters 73 | ---------- 74 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 75 | The training input samples. 76 | 77 | y : array-like, shape = [n_samples] 78 | The target values. 79 | 80 | sample_weight : array-like, shape (n_samples,) optional 81 | Array of weights that are assigned to individual samples. 82 | If not provided, then each sample is given unit weight. 83 | """ 84 | if not isinstance(self.weakness, float) or not (0.0 < self.weakness <= 1.0): 85 | raise ValueError('weakness should be a float in (0, 1], got %s' % self.weakness) 86 | 87 | X, y = check_X_y(X, y, accept_sparse='csr', dtype=[np.float64, np.float32], 88 | order="C") 89 | 90 | n_features = X.shape[1] 91 | weakness = 1. - self.weakness 92 | random_state = check_random_state(self.random_state) 93 | 94 | weights = weakness * random_state.randint(0, 1 + 1, size=(n_features,)) 95 | X_rescaled = _rescale_data(X, weights) 96 | return super(RandomizedLogisticRegression, self).fit(X_rescaled, y, sample_weight) 97 | 98 | 99 | class RandomizedLasso(Lasso): 100 | """ 101 | Randomized version of scikit-learns Lasso class. 102 | 103 | Randomized LASSO is a generalization of the LASSO. 
The LASSO penalises 104 | the absolute value of the coefficients with a penalty term proportional 105 | to `alpha`, but the randomized LASSO changes the penalty to a randomly 106 | chosen value in the range `[alpha, alpha/weakness]`. 107 | 108 | Parameters 109 | ---------- 110 | weakness : float 111 | Weakness value for randomized LASSO. Must be in (0, 1]. 112 | 113 | See also 114 | -------- 115 | sklearn.linear_model.LogisticRegression : learns logistic regression models 116 | using the same algorithm. 117 | """ 118 | def __init__(self, weakness=0.5, alpha=1.0, fit_intercept=True, normalize=False, 119 | precompute=False, copy_X=True, max_iter=1000, 120 | tol=1e-4, warm_start=False, positive=False, 121 | random_state=None, selection='cyclic'): 122 | self.weakness = weakness 123 | super(RandomizedLasso, self).__init__( 124 | alpha=alpha, fit_intercept=fit_intercept, 125 | normalize=normalize, precompute=precompute, copy_X=copy_X, 126 | max_iter=max_iter, tol=tol, warm_start=warm_start, 127 | positive=positive, random_state=random_state, 128 | selection=selection) 129 | 130 | def fit(self, X, y): 131 | """Fit the model according to the given training data. 132 | 133 | Parameters 134 | ---------- 135 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 136 | The training input samples. 137 | 138 | y : array-like, shape = [n_samples] 139 | The target values. 140 | """ 141 | if not isinstance(self.weakness, float) or not (0.0 < self.weakness <= 1.0): 142 | raise ValueError('weakness should be a float in (0, 1], got %s' % self.weakness) 143 | 144 | X, y = check_X_y(X, y, accept_sparse=True) 145 | 146 | n_features = X.shape[1] 147 | weakness = 1. 
- self.weakness 148 | random_state = check_random_state(self.random_state) 149 | 150 | weights = weakness * random_state.randint(0, 1 + 1, size=(n_features,)) 151 | 152 | # TODO: I am afraid this will do double normalization if set to true 153 | #X, y, _, _ = _preprocess_data(X, y, self.fit_intercept, normalize=self.normalize, copy=False, 154 | # sample_weight=None, return_mean=False) 155 | 156 | # TODO: Check if this is a problem if it happens before standardization 157 | X_rescaled = _rescale_data(X, weights) 158 | return super(RandomizedLasso, self).fit(X_rescaled, y) 159 | -------------------------------------------------------------------------------- /stability_selection/stability_selection.py: -------------------------------------------------------------------------------- 1 | """ 2 | =============================== 3 | Stability selection transformer 4 | =============================== 5 | 6 | This module contains a scikit-learn compatible implementation of 7 | stability selection [1]_ . 8 | 9 | References 10 | ---------- 11 | .. [1] Meinshausen, N. and Buhlmann, P., 2010. Stability selection. 12 | Journal of the Royal Statistical Society: Series B 13 | (Statistical Methodology), 72(4), pp.417-473. 14 | 15 | .. [2] Shah, R.D. and Samworth, R.J., 2013. Variable selection with 16 | error control: another look at stability selection. Journal 17 | of the Royal Statistical Society: Series B (Statistical Methodology), 18 | 75(1), pp.55-80. 
19 | """ 20 | 21 | from warnings import warn 22 | 23 | import matplotlib.pyplot as plt 24 | import numpy as np 25 | from sklearn.base import BaseEstimator, TransformerMixin, clone 26 | from sklearn.externals.joblib import Parallel, delayed 27 | from sklearn.feature_selection import SelectFromModel 28 | from sklearn.linear_model import LogisticRegression 29 | from sklearn.pipeline import Pipeline 30 | from sklearn.utils import check_array, check_random_state, check_X_y, safe_mask 31 | from sklearn.utils.validation import check_is_fitted 32 | 33 | from .bootstrap import (bootstrap_without_replacement, 34 | complementary_pairs_bootstrap, stratified_bootstrap) 35 | 36 | __all__ = ['StabilitySelection', 'plot_stability_path'] 37 | 38 | BOOTSTRAP_FUNC_MAPPING = { 39 | 'subsample': bootstrap_without_replacement, 40 | 'complementary_pairs': complementary_pairs_bootstrap, 41 | 'stratified': stratified_bootstrap 42 | } 43 | 44 | 45 | def _return_estimator_from_pipeline(pipeline): 46 | """Returns the final estimator in a Pipeline, or the estimator 47 | if it is not""" 48 | if isinstance(pipeline, Pipeline): 49 | return pipeline._final_estimator 50 | else: 51 | return pipeline 52 | 53 | 54 | def _bootstrap_generator(n_bootstrap_iterations, bootstrap_func, y, 55 | n_subsamples, random_state=None): 56 | for _ in range(n_bootstrap_iterations): 57 | subsample = bootstrap_func(y, n_subsamples, random_state) 58 | if isinstance(subsample, tuple): 59 | for item in subsample: 60 | yield item 61 | else: 62 | yield subsample 63 | 64 | 65 | def _fit_bootstrap_sample(base_estimator, X, y, lambda_name, lambda_value, 66 | threshold=None): 67 | """ 68 | Fits base_estimator on a bootstrap sample of the original data, 69 | and returns a mas of the variables that are selected by the fitted model. 
70 | 71 | Parameters 72 | ---------- 73 | base_estimator : Estimator 74 | Estimator to be fitted on the data 75 | 76 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 77 | The training input samples. 78 | 79 | y : array-like, shape = [n_samples] 80 | The target values. 81 | 82 | lambda_name : str 83 | Name of the penalization parameter of base_estimator 84 | 85 | lambda_value : float 86 | Value of the penalization parameter 87 | 88 | threshold : string, float, optional default None 89 | The threshold value to use for feature selection. Features whose 90 | importance is greater or equal are kept while the others are 91 | discarded. If "median" (resp. "mean"), then the ``threshold`` value is 92 | the median (resp. the mean) of the feature importances. A scaling 93 | factor (e.g., "1.25*mean") may also be used. If None and if the 94 | estimator has a parameter penalty set to l1, either explicitly 95 | or implicitly (e.g, Lasso), the threshold used is 1e-5. 96 | Otherwise, "mean" is used by default. 97 | 98 | Returns 99 | ------- 100 | selected_variables : array-like, shape = [n_features] 101 | Boolean mask of selected variables. 102 | """ 103 | 104 | base_estimator.set_params(**{lambda_name: lambda_value}) 105 | base_estimator.fit(X, y) 106 | 107 | # TODO: Reconsider if we really want to use SelectFromModel here or not 108 | selector_model = _return_estimator_from_pipeline(base_estimator) 109 | variable_selector = SelectFromModel(estimator=selector_model, 110 | threshold=threshold, 111 | prefit=True) 112 | return variable_selector.get_support() 113 | 114 | 115 | def plot_stability_path(stability_selection, threshold_highlight=None, 116 | **kwargs): 117 | """Plots stability path. 118 | 119 | Parameters 120 | ---------- 121 | stability_selection : StabilitySelection 122 | Fitted instance of StabilitySelection. 
123 | 124 | threshold_highlight : float 125 | Threshold defining the cutoff for the stability scores for the 126 | variables that need to be highlighted. 127 | 128 | kwargs : dict 129 | Arguments passed to matplotlib plot function. 130 | """ 131 | check_is_fitted(stability_selection, 'stability_scores_') 132 | 133 | threshold = stability_selection.threshold if threshold_highlight is None else threshold_highlight 134 | paths_to_highlight = stability_selection.get_support(threshold=threshold) 135 | 136 | x_grid = stability_selection.lambda_grid / np.max(stability_selection.lambda_grid) 137 | 138 | fig, ax = plt.subplots(1, 1, **kwargs) 139 | if not paths_to_highlight.all(): 140 | ax.plot(x_grid, stability_selection.stability_scores_[~paths_to_highlight].T, 141 | 'k:', linewidth=0.5) 142 | 143 | if paths_to_highlight.any(): 144 | ax.plot(x_grid, stability_selection.stability_scores_[paths_to_highlight].T, 145 | 'r-', linewidth=0.5) 146 | 147 | if threshold is not None: 148 | ax.plot(x_grid, threshold * np.ones_like(stability_selection.lambda_grid), 149 | 'b--', linewidth=0.5) 150 | 151 | ax.set_ylabel('Stability score') 152 | ax.set_xlabel('Lambda / max(Lambda)') 153 | 154 | fig.tight_layout() 155 | 156 | return fig, ax 157 | 158 | 159 | class StabilitySelection(BaseEstimator, TransformerMixin): 160 | """Stability selection [1]_ fits the estimator `base_estimator` on 161 | bootstrap samples of the original data set, for different values of 162 | the regularization parameter for `base_estimator`. Variables that 163 | reliably get selected by the model in these bootstrap samples are 164 | considered to be stable variables. 165 | 166 | Parameters 167 | ---------- 168 | base_estimator : object. 169 | The base estimator used for stability selection. The estimator 170 | must have either a ``feature_importances_`` or ``coef_`` 171 | attribute after fitting. 172 | 173 | lambda_name : str. 174 | The name of the penalization parameter for the estimator 175 | `base_estimator`. 
176 | 177 | lambda_grid : array-like. 178 | Grid of values of the penalization parameter to iterate over. 179 | 180 | n_bootstrap_iterations : integer. 181 | Number of bootstrap samples to create. 182 | 183 | sample_fraction : float, optional 184 | The fraction of samples to be used in each bootstrap sample. 185 | Should be between 0 and 1. If 1, all samples are used. 186 | 187 | threshold : float. 188 | Threshold defining the minimum cutoff value for the stability scores. 189 | 190 | bootstrap_func : str or callable fun (default=bootstrap_without_replacement) 191 | The function used to subsample the data. This parameter can be: 192 | - A string, which must be one of 193 | - 'subsample': For subsampling without replacement. 194 | - 'complementary_pairs': For complementary pairs subsampling [2]_ . 195 | - 'stratified': For stratified bootstrapping in imbalanced 196 | classification. 197 | - A function that takes y, and a random state 198 | as inputs and returns a list of sample indices in the range 199 | (0, len(y)-1). By default, indices are uniformly subsampled. 200 | 201 | bootstrap_threshold : string, float, optional default None 202 | The threshold value to use for feature selection. Features whose 203 | importance is greater or equal are kept while the others are 204 | discarded. If "median" (resp. "mean"), then the ``threshold`` value is 205 | the median (resp. the mean) of the feature importances. A scaling 206 | factor (e.g., "1.25*mean") may also be used. If None and if the 207 | estimator has a parameter penalty set to l1, either explicitly 208 | or implicitly (e.g, Lasso), the threshold used is 1e-5. 209 | Otherwise, "mean" is used by default. 210 | 211 | verbose : integer. 212 | Controls the verbosity: the higher, the more messages. 213 | 214 | n_jobs : int, default=1 215 | Number of jobs to run in parallel. 216 | 217 | pre_dispatch : int, or string, optional 218 | Controls the number of jobs that get dispatched during parallel 219 | execution. 
Reducing this number can be useful to avoid an 220 | explosion of memory consumption when more jobs get dispatched 221 | than CPUs can process. This parameter can be: 222 | - None, in which case all the jobs are immediately 223 | created and spawned. Use this for lightweight and 224 | fast-running jobs, to avoid delays due to on-demand 225 | spawning of the jobs 226 | - An int, giving the exact number of total jobs that are 227 | spawned 228 | - A string, giving an expression as a function of n_jobs, 229 | as in '2*n_jobs' 230 | 231 | random_state : int, RandomState instance or None, optional, default=None 232 | Pseudo random number generator state used for random uniform sampling 233 | from lists of possible values instead of scipy.stats distributions. 234 | If int, random_state is the seed used by the random number generator; 235 | If RandomState instance, random_state is the random number generator; 236 | If None, the random number generator is the RandomState instance used 237 | by `np.random`. 238 | 239 | Attributes 240 | ---------- 241 | stability_scores_ : array, shape = [n_features, n_alphas] 242 | Array of stability scores for each feature for each value of the 243 | penalization parameter. 244 | 245 | References 246 | ---------- 247 | 248 | .. [1] Meinshausen, N. and Buhlmann, P., 2010. Stability selection. 249 | Journal of the Royal Statistical Society: Series B 250 | (Statistical Methodology), 72(4), pp.417-473. 251 | .. [2] Shah, R.D. and Samworth, R.J., 2013. Variable selection with 252 | error control: another look at stability selection. Journal 253 | of the Royal Statistical Society: Series B (Statistical Methodology), 254 | 75(1), pp.55-80. 
255 | """ 256 | def __init__(self, base_estimator=LogisticRegression(penalty='l1'), lambda_name='C', 257 | lambda_grid=np.logspace(-5, -2, 25), n_bootstrap_iterations=100, 258 | sample_fraction=0.5, threshold=0.6, bootstrap_func=bootstrap_without_replacement, 259 | bootstrap_threshold=None, verbose=0, n_jobs=1, pre_dispatch='2*n_jobs', 260 | random_state=None): 261 | self.base_estimator = base_estimator 262 | self.lambda_name = lambda_name 263 | self.lambda_grid = lambda_grid 264 | self.n_bootstrap_iterations = n_bootstrap_iterations 265 | self.sample_fraction = sample_fraction 266 | self.threshold = threshold 267 | self.bootstrap_func = bootstrap_func 268 | self.bootstrap_threshold = bootstrap_threshold 269 | self.verbose = verbose 270 | self.n_jobs = n_jobs 271 | self.pre_dispatch = pre_dispatch 272 | self.random_state = random_state 273 | 274 | def _validate_input(self): 275 | if not isinstance(self.n_bootstrap_iterations, int) or self.n_bootstrap_iterations <= 0: 276 | raise ValueError('n_bootstrap_iterations should be a positive integer, got %s' % 277 | self.n_bootstrap_iterations) 278 | 279 | if not isinstance(self.sample_fraction, float) or not (0.0 < self.sample_fraction <= 1.0): 280 | raise ValueError('sample_fraction should be a float in (0, 1], got %s' % self.sample_fraction) 281 | 282 | if not isinstance(self.threshold, float) or not (0.0 < self.threshold <= 1.0): 283 | raise ValueError('threshold should be a float in (0, 1], got %s' % self.threshold) 284 | 285 | if self.lambda_name not in self.base_estimator.get_params().keys(): 286 | raise ValueError('lambda_name is set to %s, but base_estimator %s ' 287 | 'does not have a parameter ' 288 | 'with that name' % (self.lambda_name, 289 | self.base_estimator.__class__.__name__)) 290 | 291 | if isinstance(self.bootstrap_func, str): 292 | if self.bootstrap_func not in BOOTSTRAP_FUNC_MAPPING.keys(): 293 | raise ValueError('bootstrap_func is set to %s, but must be one of ' 294 | '%s or a callable' % 295 | 
(self.bootstrap_func, BOOTSTRAP_FUNC_MAPPING.keys())) 296 | 297 | self.bootstrap_func = BOOTSTRAP_FUNC_MAPPING[self.bootstrap_func] 298 | elif not callable(self.bootstrap_func): 299 | raise ValueError('bootstrap_func must be one of %s or a callable' % 300 | BOOTSTRAP_FUNC_MAPPING.keys()) 301 | 302 | def fit(self, X, y): 303 | """Fit the stability selection model on the given data. 304 | 305 | Parameters 306 | ---------- 307 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 308 | The training input samples. 309 | 310 | y : array-like, shape = [n_samples] 311 | The target values. 312 | """ 313 | 314 | self._validate_input() 315 | 316 | X, y = check_X_y(X, y, accept_sparse='csr') 317 | 318 | n_samples, n_variables = X.shape 319 | n_subsamples = np.floor(self.sample_fraction * n_samples).astype(int) 320 | n_lambdas = self.lambda_grid.shape[0] 321 | 322 | base_estimator = clone(self.base_estimator) 323 | random_state = check_random_state(self.random_state) 324 | stability_scores = np.zeros((n_variables, n_lambdas)) 325 | 326 | for idx, lambda_value in enumerate(self.lambda_grid): 327 | if self.verbose > 0: 328 | print("Fitting estimator for lambda = %.5f (%d / %d) on %d bootstrap samples" % 329 | (lambda_value, idx + 1, n_lambdas, self.n_bootstrap_iterations)) 330 | 331 | bootstrap_samples = _bootstrap_generator(self.n_bootstrap_iterations, 332 | self.bootstrap_func, y, 333 | n_subsamples, random_state=random_state) 334 | 335 | selected_variables = Parallel( 336 | n_jobs=self.n_jobs, verbose=self.verbose, 337 | pre_dispatch=self.pre_dispatch 338 | )(delayed(_fit_bootstrap_sample)(clone(base_estimator), 339 | X=X[safe_mask(X, subsample), :], 340 | y=y[subsample], 341 | lambda_name=self.lambda_name, 342 | lambda_value=lambda_value, 343 | threshold=self.bootstrap_threshold) 344 | for subsample in bootstrap_samples) 345 | 346 | stability_scores[:, idx] = np.vstack(selected_variables).mean(axis=0) 347 | 348 | self.stability_scores_ = stability_scores 349 | 
return self 350 | 351 | def get_support(self, indices=False, threshold=None): 352 | """Get a mask, or integer index, of the features selected 353 | 354 | Parameters 355 | ---------- 356 | indices : boolean (default False) 357 | If True, the return value will be an array of integers, 358 | rather than a boolean mask. 359 | 360 | threshold: float. 361 | Threshold defining the minimum cutoff value for the 362 | stability scores. 363 | 364 | Returns 365 | ------- 366 | support : array 367 | An index that selects the retained features from a feature vector. 368 | If `indices` is False, this is a boolean array of shape 369 | [# input features], in which an element is True iff its 370 | corresponding feature is selected for retention. If `indices` is 371 | True, this is an integer array of shape [# output features] whose 372 | values are indices into the input feature vector. 373 | """ 374 | 375 | if threshold is not None and (not isinstance(threshold, float) 376 | or not (0.0 < threshold <= 1.0)): 377 | raise ValueError('threshold should be a float in (0, 1], ' 378 | 'got %s' % self.threshold) 379 | 380 | cutoff = self.threshold if threshold is None else threshold 381 | mask = (self.stability_scores_.max(axis=1) > cutoff) 382 | 383 | return mask if not indices else np.where(mask)[0] 384 | 385 | def transform(self, X, threshold=None): 386 | """Reduce X to the selected features. 387 | 388 | Parameters 389 | ---------- 390 | X : array of shape [n_samples, n_features] 391 | The input samples. 392 | 393 | threshold: float. 394 | Threshold defining the minimum cutoff value for the 395 | stability scores. 396 | 397 | Returns 398 | ------- 399 | X_r : array of shape [n_samples, n_selected_features] 400 | The input samples with only the selected features. 
401 | """ 402 | X = check_array(X, accept_sparse='csr') 403 | mask = self.get_support(threshold=threshold) 404 | 405 | check_is_fitted(self, 'stability_scores_') 406 | 407 | if len(mask) != X.shape[1]: 408 | raise ValueError("X has a different shape than during fitting.") 409 | 410 | if not mask.any(): 411 | warn("No features were selected: either the data is" 412 | " too noisy or the selection test too strict.", 413 | UserWarning) 414 | return np.empty(0).reshape((X.shape[0], 0)) 415 | 416 | return X[:, safe_mask(X, mask)] 417 | -------------------------------------------------------------------------------- /stability_selection/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/stability-selection/e6e34da3601cc8215cd0b08d5c5f3a9dd3ccfe01/stability_selection/tests/__init__.py -------------------------------------------------------------------------------- /stability_selection/tests/test_common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from nose.tools import raises 4 | from numpy.testing import assert_array_equal 5 | from sklearn.utils.estimator_checks import check_estimator 6 | from stability_selection import StabilitySelection 7 | 8 | 9 | def test_transformer(): 10 | # With defaults this can fail because in the low sample size case 11 | # some of the bootstrap samples can have zero cases of the positive class 12 | return check_estimator(StabilitySelection(n_bootstrap_iterations=10, sample_fraction=1.0)) 13 | 14 | 15 | @raises(ValueError) 16 | def test_check_string_threshold(): 17 | StabilitySelection(threshold='wrong_value')._validate_input() 18 | 19 | 20 | @raises(ValueError) 21 | def test_check_threshold_too_large(): 22 | StabilitySelection(threshold=1.5)._validate_input() 23 | 24 | 25 | @raises(ValueError) 26 | def test_check_threshold_too_small(): 27 | 
StabilitySelection(threshold=0.0)._validate_input() 28 | 29 | 30 | @raises(ValueError) 31 | def test_check_threshold_too_small(): 32 | StabilitySelection().get_support(threshold='wrong_value') 33 | 34 | 35 | @raises(ValueError) 36 | def test_check_arguments(): 37 | StabilitySelection(threshold='wrong_value')._validate_input() 38 | 39 | 40 | @raises(ValueError) 41 | def test_check_wrong_lambda_name(): 42 | StabilitySelection(lambda_name='alpha')._validate_input() 43 | 44 | 45 | @raises(ValueError) 46 | def test_check_wrong_lambda_name(): 47 | StabilitySelection(n_bootstrap_iterations=-1)._validate_input() 48 | 49 | 50 | def test_automatic_lambda_grid(): 51 | selector = StabilitySelection() 52 | selector._validate_input() 53 | assert_array_equal(np.logspace(-5, -2, 25), selector.lambda_grid) 54 | 55 | 56 | @raises(ValueError) 57 | def test_bootstrap_func(): 58 | StabilitySelection(bootstrap_func='nonexistent')._validate_input() 59 | 60 | 61 | @raises(ValueError) 62 | def test_callable_bootstrap_func(): 63 | StabilitySelection(bootstrap_func=0.5)._validate_input() 64 | 65 | 66 | @raises(ValueError) 67 | def test_sample_fraction(): 68 | StabilitySelection(sample_fraction=0.0)._validate_input() 69 | 70 | 71 | @raises(ValueError) 72 | def test_lambda_name(): 73 | StabilitySelection(lambda_name='n_estimators')._validate_input() 74 | -------------------------------------------------------------------------------- /stability_selection/tests/test_randomized_lasso.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.testing import assert_almost_equal 3 | 4 | from nose.tools import raises 5 | from sklearn.utils.estimator_checks import check_estimator 6 | from sklearn.utils import check_random_state 7 | from scipy.sparse import csr_matrix 8 | 9 | from stability_selection import StabilitySelection, RandomizedLasso, \ 10 | RandomizedLogisticRegression 11 | 12 | 13 | def generate_experiment_data(n=200, p=200, 
rho=0.6, random_state=3245): 14 | rng = check_random_state(random_state) 15 | 16 | sigma = np.eye(p) 17 | sigma[0, 2] = rho 18 | sigma[2, 0] = rho 19 | sigma[1, 2] = rho 20 | sigma[2, 1] = rho 21 | 22 | X = rng.multivariate_normal(mean=np.zeros(p), cov=sigma, size=(n,)) 23 | beta = np.zeros(p) 24 | beta[:2] = 1.0 25 | epsilon = rng.normal(0.0, 0.25, size=(n,)) 26 | 27 | y = np.matmul(X, beta) + epsilon 28 | 29 | return X, y 30 | 31 | 32 | def test_estimator(): 33 | check_estimator(RandomizedLasso) 34 | check_estimator(RandomizedLogisticRegression) 35 | 36 | 37 | @raises(ValueError) 38 | def test_logistic_weakness(): 39 | n, p = 200, 200 40 | rho = 0.6 41 | 42 | X, y = generate_experiment_data(n, p, rho) 43 | RandomizedLogisticRegression(weakness=0.0).fit(X, y) 44 | 45 | 46 | @raises(ValueError) 47 | def test_logistic_weakness(): 48 | n, p = 200, 200 49 | rho = 0.6 50 | 51 | X, y = generate_experiment_data(n, p, rho) 52 | RandomizedLasso(weakness=0.0).fit(X, y) 53 | 54 | 55 | def test_randomized_lasso(): 56 | n, p = 200, 200 57 | rho = 0.6 58 | weakness = 0.2 59 | 60 | X, y = generate_experiment_data(n, p, rho) 61 | lambda_grid = np.linspace(0.01, 0.5, num=100) 62 | 63 | estimator = RandomizedLasso(weakness=weakness) 64 | selector = StabilitySelection(base_estimator=estimator, lambda_name='alpha', 65 | lambda_grid=lambda_grid, threshold=0.9, verbose=1) 66 | selector.fit(X, y) 67 | 68 | chosen_betas = selector.get_support(indices=True) 69 | 70 | assert_almost_equal(np.array([0, 1]), chosen_betas) 71 | 72 | 73 | def test_issparse(): 74 | n, p = 200, 200 75 | rho = 0.6 76 | weakness = 0.2 77 | 78 | X, y = generate_experiment_data(n, p, rho) 79 | lambda_grid = np.linspace(0.01, 0.5, num=100) 80 | 81 | estimator = RandomizedLasso(weakness=weakness) 82 | selector = StabilitySelection(base_estimator=estimator, lambda_name='alpha', 83 | lambda_grid=lambda_grid, threshold=0.9, verbose=1) 84 | selector.fit(csr_matrix(X), y) 85 | 
-------------------------------------------------------------------------------- /stability_selection/tests/test_stability_selection.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import numpy as np 4 | from numpy.testing import assert_almost_equal 5 | from nose.tools import raises 6 | 7 | from sklearn.linear_model import Lasso 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.pipeline import Pipeline 10 | from sklearn.utils import check_random_state 11 | from stability_selection import StabilitySelection, plot_stability_path 12 | 13 | 14 | def _generate_dummy_regression_data(p=1000, n=1000, k=5, 15 | random_state=123321): 16 | rng = check_random_state(random_state) 17 | 18 | X = rng.normal(loc=0.0, scale=1.0, size=(n, p)) 19 | betas = np.zeros(p) 20 | important_betas = np.sort(rng.choice(a=np.arange(p), size=k)) 21 | betas[important_betas] = rng.uniform(size=k) 22 | 23 | y = np.matmul(X, betas) 24 | 25 | return X, y, important_betas 26 | 27 | 28 | def _generate_dummy_classification_data(p=1000, n=1000, k=5, 29 | random_state=123321): 30 | 31 | rng = check_random_state(random_state) 32 | 33 | X = rng.normal(loc=0.0, scale=1.0, size=(n, p)) 34 | betas = np.zeros(p) 35 | important_betas = np.sort(rng.choice(a=np.arange(p), size=k)) 36 | betas[important_betas] = rng.uniform(size=k) 37 | 38 | probs = 1 / (1 + np.exp(-1 * np.matmul(X, betas))) 39 | y = (probs > 0.5).astype(int)  # deterministic thresholding of the sigmoid -- labels carry no sampling noise 40 | 41 | return X, y, important_betas 42 | 43 | 44 | def test_stability_selection_classification(): 45 | n, p, k = 1000, 1000, 5 46 | 47 | X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) 48 | selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1) 49 | selector.fit(X, y) 50 | 51 | chosen_betas = selector.get_support(indices=True) 52 | X_r = selector.transform(X) 53 | 54 | assert_almost_equal(important_betas, chosen_betas) 55 | assert(X_r.shape == (n, k)) 56 |
assert(selector.stability_scores_.shape == (p, selector.lambda_grid.shape[0])) 57 | 58 | 59 | def test_stability_selection_regression(): 60 | n, p, k = 500, 1000, 5 61 | 62 | X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) 63 | 64 | base_estimator = Pipeline([ 65 | ('scaler', StandardScaler()), 66 | ('model', Lasso()) 67 | ]) 68 | 69 | lambdas_grid = np.logspace(-1, 1, num=10) 70 | 71 | selector = StabilitySelection(base_estimator=base_estimator, 72 | lambda_name='model__alpha', 73 | lambda_grid=lambdas_grid) 74 | selector.fit(X, y) 75 | 76 | chosen_betas = selector.get_support(indices=True) 77 | 78 | assert_almost_equal(important_betas, chosen_betas) 79 | 80 | 81 | def test_with_complementary_pairs_bootstrap(): 82 | n, p, k = 500, 1000, 5 83 | 84 | X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) 85 | 86 | base_estimator = Pipeline([ 87 | ('scaler', StandardScaler()), 88 | ('model', Lasso()) 89 | ]) 90 | 91 | lambdas_grid = np.logspace(-1, 1, num=10) 92 | 93 | selector = StabilitySelection(base_estimator=base_estimator, 94 | lambda_name='model__alpha', 95 | lambda_grid=lambdas_grid, 96 | bootstrap_func='complementary_pairs') 97 | selector.fit(X, y) 98 | 99 | chosen_betas = selector.get_support(indices=True) 100 | 101 | assert_almost_equal(important_betas, chosen_betas) 102 | 103 | 104 | def test_with_stratified_bootstrap(): 105 | n, p, k = 1000, 1000, 5 106 | 107 | X, y, important_betas = _generate_dummy_classification_data(n=n, k=k) 108 | selector = StabilitySelection(lambda_grid=np.logspace(-5, -1, 25), verbose=1, 109 | bootstrap_func='stratified') 110 | selector.fit(X, y) 111 | 112 | chosen_betas = selector.get_support(indices=True) 113 | assert_almost_equal(important_betas, chosen_betas) 114 | 115 | 116 | @raises(ValueError) 117 | def test_different_shape(): 118 | n, p, k = 100, 200, 5  # NOTE(review): p is unused here -- the generator is called without p, so the data width is the default p=1000 119 | 120 | X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) 121 | 122 | base_estimator = Pipeline([ 123 | ('scaler',
StandardScaler()), 124 | ('model', Lasso()) 125 | ]) 126 | 127 | lambdas_grid = np.logspace(-1, 1, num=10) 128 | 129 | selector = StabilitySelection(base_estimator=base_estimator, 130 | lambda_name='model__alpha', 131 | lambda_grid=lambdas_grid) 132 | selector.fit(X, y) 133 | selector.transform(X[:, :-2]) 134 | 135 | 136 | def test_no_features(): 137 | n, p, k = 100, 200, 0  # NOTE(review): p is unused -- data width comes from the generator default p=1000 138 | 139 | X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) 140 | 141 | base_estimator = Pipeline([ 142 | ('scaler', StandardScaler()), 143 | ('model', Lasso()) 144 | ]) 145 | 146 | lambdas_grid = np.logspace(-1, 1, num=10) 147 | 148 | selector = StabilitySelection(base_estimator=base_estimator, 149 | lambda_name='model__alpha', 150 | lambda_grid=lambdas_grid) 151 | selector.fit(X, y) 152 | 153 | assert_almost_equal(selector.transform(X), 154 | np.empty(0).reshape((X.shape[0], 0))) 155 | 156 | 157 | def test_stability_plot(): 158 | n, p, k = 500, 200, 5 159 | 160 | X, y, important_betas = _generate_dummy_regression_data(n=n, k=k) 161 | 162 | base_estimator = Pipeline([ 163 | ('scaler', StandardScaler()), 164 | ('model', Lasso()) 165 | ]) 166 | 167 | lambdas_grid = np.logspace(-1, 1, num=10) 168 | 169 | selector = StabilitySelection(base_estimator=base_estimator, 170 | lambda_name='model__alpha', 171 | lambda_grid=lambdas_grid) 172 | selector.fit(X, y) 173 | 174 | plot_stability_path(selector, threshold_highlight=0.5) 175 | -------------------------------------------------------------------------------- /stability_selection/tests/test_stratified_bootstrap.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nose.tools import raises 3 | 4 | from stability_selection.bootstrap import stratified_bootstrap 5 | 6 | 7 | @raises(ValueError) 8 | def test_check_not_classification(): 9 | y = np.linspace(0, 1, 21) 10 | stratified_bootstrap(y, 10, random_state=0) 11 | 12 | 13 | def test_stratified_bootstrap(): 14 |
zero_to_one_ratio = 3 15 | n_ones = 10 16 | 17 | y = np.array(n_ones * ([0] * zero_to_one_ratio + [1])) 18 | for n_subsamples in [4, 8, 12, 16, 20]: 19 | sample_idx = stratified_bootstrap(y, n_subsamples, random_state=0) 20 | samples = y[sample_idx] 21 | 22 | assert(len(samples) == n_subsamples) 23 | 24 | n_ones = (samples == 1).sum() 25 | n_zeros = (samples == 0).sum() 26 | assert(n_zeros == n_ones * zero_to_one_ratio) 27 | 28 | 29 | def test_random_state(): 30 | zero_to_one_ratio = 3 31 | n_ones = 10 32 | 33 | y = np.array(n_ones * ([0] * zero_to_one_ratio + [1])) 34 | 35 | samples0 = np.sort(stratified_bootstrap(y, 12, random_state=0)) 36 | samples0b = np.sort(stratified_bootstrap(y, 12, random_state=0)) 37 | samples1 = np.sort(stratified_bootstrap(y, 12, random_state=1)) 38 | 39 | assert(np.array_equal(samples0, samples0b)) 40 | assert(not np.array_equal(samples0, samples1)) 41 | --------------------------------------------------------------------------------