├── .coveragerc ├── .gitignore ├── .gitmodules ├── .readthedocs.yml ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.rst ├── THANKS ├── TODO ├── _config.yml ├── appveyor.yml ├── constraints.txt ├── core.py ├── cythexts.py ├── dev-requirements.txt ├── doc-requirements.txt ├── doc ├── Makefile ├── adjusted_MLE │ ├── __init__.py │ ├── sampler_based_quantiles.py │ └── tests │ │ ├── __init__.py │ │ ├── comparison_metrics.py │ │ ├── risk_comparisons.py │ │ ├── test_compare_sampler_mle.py │ │ ├── test_cv_MLE_inference.py │ │ └── test_risk.py ├── examples │ ├── compute_coverages.rst │ ├── conditional_sampling.py │ ├── hiv_approx_ci.py │ └── power_comparison.py ├── learning_examples │ ├── BH │ │ ├── gbm_targets_BH.py │ │ ├── gbm_targets_BH_larger.py │ │ ├── gbm_targets_BH_smallB.py │ │ ├── keras_targets_BH.py │ │ ├── keras_targets_BH_marginal.py │ │ ├── logit_targets_BH.py │ │ ├── logit_targets_BH_marginal.py │ │ ├── logit_targets_BH_single.py │ │ └── random_forest_targets_BH.py │ ├── HIV │ │ ├── CV.py │ │ ├── HIV_scale_CV.py │ │ ├── NRTI_DATA.txt │ │ ├── fixed.py │ │ ├── lambda_1se.py │ │ ├── stability_CV.py │ │ ├── stability_CV_6000.py │ │ ├── stability_CV_6000_null.py │ │ └── stability_selection.py │ ├── bootstrap │ │ ├── test_boot.py │ │ └── test_boot_scale1.py │ ├── calibration │ │ └── lasso_calibration.py │ ├── cross_inference │ │ └── cross_inference.py │ ├── keras │ │ ├── keras_example.py │ │ ├── keras_targets.py │ │ ├── keras_targets_BH_strong.py │ │ ├── keras_targets_BH_weak.py │ │ ├── keras_targets_medium.py │ │ └── keras_targets_small.py │ ├── knockoffs │ │ ├── knockoff_followup.py │ │ ├── knockoff_kernel.py │ │ └── knockoff_kernel_multi.py │ ├── lasso │ │ └── lasso_example.py │ ├── lasso_CV │ │ ├── followup.py │ │ ├── lasso_exact_CV_null.py │ │ └── lasso_example_CV.py │ ├── multi_target │ │ ├── additive_targets.py │ │ ├── additive_targets_small.py │ │ ├── followup_multi.py │ │ ├── gbm2.py │ │ ├── gbm_targets.py │ │ ├── gbm_targets_small.py │ │ ├── lasso_example_multi.py │ │ ├── lasso_example_multi_CV.py │ │ ├── lasso_example_multi_CV_random.py │ │ ├── lasso_example_multi_CV_stronger.py │ │ ├── lasso_example_multi_bigger.py │ │ ├── lasso_example_multi_gbm.py │ │ ├── lasso_example_multi_gbm_sk.py │ │ ├── lasso_example_multi_random.py │ │ ├── lasso_example_multi_random_gbm.py │ │ ├── lasso_example_multi_random_rf.py │ │ ├── lasso_example_multi_rf.py │ │ ├── lasso_example_multi_rf_sk.py │ │ ├── lee_multi.py │ │ ├── lee_multi_500.py │ │ └── lee_multi_bigger.py │ ├── parametric │ │ ├── lasso_selected.py │ │ ├── lasso_selected_resid.py │ │ ├── probit_step.py │ │ └── probit_step_both.py │ ├── riboflavin │ │ ├── CV.py │ │ └── CV_smaller.py │ ├── stability │ │ ├── stability_selection.py │ │ ├── stability_selection_harder.py │ │ └── stability_selection_harder_big.py │ └── standalone │ │ ├── basic_example.py │ │ ├── cleaner_basic_example.py │ │ ├── full_model_example.py │ │ ├── regression_example.py │ │ └── replicate_basic_example.py ├── license.rst ├── notebooks │ ├── Group LASSO Jacobian.Rmd │ ├── Group LASSO Jacobian.ipynb │ ├── UMPU.ipynb │ ├── isotonic.ipynb │ ├── lasso.ipynb │ ├── learning │ │ ├── Different pivots.ipynb │ │ ├── Multiple events in algorithm.ipynb │ │ ├── Multiple events not monotone.ipynb │ │ ├── Multiple randomization with fitting.ipynb │ │ ├── Multiple randomization with fitting_boot.ipynb │ │ ├── Multiple randomization.ipynb │ │ ├── Non convex region II.ipynb │ │ ├── Non convex region.ipynb │ │ ├── simple_example_pivots.pdf │ │ └── simple_example_sel_prob.pdf │ ├── 
pca_rank1.ipynb │ ├── quadratic_decisions.ipynb │ ├── reduced_covtest.ipynb │ ├── screening.ipynb │ ├── selection_objects.ipynb │ └── spacings.ipynb └── source │ ├── _static │ ├── logo.png │ └── selection.css │ ├── _templates │ └── layout.html │ ├── algorithms │ ├── covtest.Rmd │ ├── covtest.ipynb │ ├── index.rst │ ├── spacings.rst │ └── spacings_files │ │ ├── spacings_23_0.png │ │ ├── spacings_25_0.png │ │ ├── spacings_27_0.png │ │ ├── spacings_29_0.png │ │ ├── spacings_31_0.png │ │ ├── spacings_3_0.png │ │ ├── spacings_4_0.png │ │ ├── spacings_5_0.png │ │ ├── spacings_6_0.png │ │ ├── spacings_7_0.png │ │ └── spacings_9_0.png │ ├── conf.py │ ├── docattribute.rst │ ├── documentation.rst │ ├── download.rst │ ├── index.rst │ ├── learning │ ├── Learning1.Rmd │ ├── Learning1.ipynb │ ├── Learning2.Rmd │ ├── Learning2.ipynb │ └── index.rst │ ├── license.rst │ ├── links_names.txt │ ├── randomized │ ├── index.rst │ ├── lasso.Rmd │ └── lasso.ipynb │ └── sphinxext │ └── math_dollar.py ├── figs ├── pictures.r └── voronoi_figs.py ├── lasso_example_null_CV.py ├── requirements.txt ├── sandbox ├── SPRT.ipynb ├── absurd.py ├── bayesian │ ├── __init__.py │ ├── crime_data_attempt.py │ ├── crime_data_set.py │ ├── dual_bayesian.py │ ├── dual_lasso_test.py │ ├── hiv_inference.py │ ├── lasso_selection.py │ ├── logistic_bayesian.py │ ├── mixed_model.py │ ├── ms_lasso_2stage.py │ ├── random_reduced_lasso_bayesian_model.py │ ├── random_reduced_lasso_test.py │ ├── random_reduced_logistic_test.py │ ├── read_file.py │ ├── reduced_forward_stepwise_test.py │ ├── reduced_lasso_bayesian_model.py │ └── reduced_marginal_screening.py ├── inference_hiv_data.py ├── isotonic.py ├── kmeans.py ├── multi_forward_step.py ├── multistep.ipynb ├── randomized2.py ├── randomized_tests │ ├── test_estimation.py │ ├── test_greedy_step.py │ ├── test_marginalize_subgrad.py │ ├── test_multiple_queries.py │ ├── test_multiple_queries_CI.py │ ├── test_nonrandomized.py │ ├── test_randomization_to_zero.py │ ├── test_reconstruction.py │ ├── test_scaling.py │ ├── test_threshold_score.py │ └── test_without_screening.py ├── sample_splitting.ipynb ├── sample_splitting.py ├── sample_splitting_alex.py ├── sample_splitting_alex_null.py ├── tensorflow_test.py ├── test_cover.py ├── test_isotonic.py ├── test_variance.py └── variance_estimation.py ├── selectinf ├── __init__.py ├── _version.py ├── algorithms │ ├── __init__.py │ ├── api.py │ ├── change_point.py │ ├── covtest.py │ ├── cv.py │ ├── cv_glmnet.py │ ├── debiased_lasso.py │ ├── debiased_lasso_utils.pyx │ ├── forward_step.py │ ├── lasso.py │ ├── pca.py │ ├── screening.py │ ├── softmax.py │ ├── sqrt_lasso.py │ ├── stopping_rules.py │ └── tests │ │ ├── __init__.py │ │ ├── test_IC.py │ │ ├── test_ROSI.py │ │ ├── test_change_point.py │ │ ├── test_compareR.py │ │ ├── test_covtest.py │ │ ├── test_data_carving.py │ │ ├── test_debiased_lasso.py │ │ ├── test_forward_step.py │ │ ├── test_lasso.py │ │ ├── test_screening.py │ │ ├── test_softmax.py │ │ └── test_sqrt_lasso.py ├── api.py ├── base.py ├── constraints │ ├── __init__.py │ ├── affine.py │ ├── api.py │ ├── base.py │ ├── estimation.py │ ├── intervals.py │ ├── quadratic.py │ ├── quasi_affine.py │ └── tests │ │ ├── __init__.py │ │ ├── test_affine.py │ │ ├── test_estimation.py │ │ ├── test_quadratic_tests.py │ │ ├── test_quasi.py │ │ └── test_unknown_sigma.py ├── distributions │ ├── __init__.py │ ├── api.py │ ├── chain.py │ ├── chisq.py │ ├── discrete_family.py │ ├── discrete_multiparameter.py │ ├── intervals.py │ ├── pvalue.py │ └── tests │ │ ├── __init__.py 
│ │ ├── test_chains.py │ │ ├── test_discreteExFam.py │ │ └── test_multiparameter.py ├── glm.py ├── info.py ├── learning │ ├── Rfitters.py │ ├── Rutils.py │ ├── __init__.py │ ├── core.py │ ├── fitters.py │ ├── keras_fit.py │ ├── learners.py │ ├── samplers.py │ └── utils.py ├── randomized │ ├── __init__.py │ ├── api.py │ ├── cv_view.py │ ├── group_lasso.py │ ├── lasso.py │ ├── modelQ.py │ ├── query.py │ ├── randomization.py │ ├── sandbox │ │ ├── M_estimator_group_lasso.py │ │ ├── M_estimator_nonrandom.py │ │ ├── convenience.py │ │ ├── general_lasso.py │ │ ├── greedy_step.py │ │ ├── group_lasso.py │ │ └── lasso_iv.py │ ├── screening.py │ ├── selective_MLE_utils.pyx │ ├── slope.py │ └── tests │ │ ├── __init__.py │ │ ├── sandbox │ │ ├── test_Mest.py │ │ ├── test_convenience.py │ │ ├── test_cv.py │ │ ├── test_cv_corrected_nonrandomized_lasso.py │ │ ├── test_cv_glmnet.py │ │ ├── test_cv_lee_et_al.py │ │ ├── test_decompose_subgrad.py │ │ ├── test_fixedX.py │ │ ├── test_full_lasso.py │ │ ├── test_general_lasso.py │ │ ├── test_general_lasso_pval.py │ │ ├── test_intervals.py │ │ ├── test_lasso_iv.py │ │ ├── test_multiple_splits.py │ │ ├── test_opt_weighted_intervals.py │ │ ├── test_optimization_sampler.py │ │ ├── test_sampling.py │ │ ├── test_split.py │ │ ├── test_split_compare.py │ │ └── test_sqrt_lasso.py │ │ ├── test_BH.py │ │ ├── test_group_lasso.py │ │ ├── test_lasso.py │ │ ├── test_marginal_screening.py │ │ ├── test_modelQ.py │ │ ├── test_multiple_queries.py │ │ ├── test_naive.py │ │ ├── test_randomization.py │ │ ├── test_selective_MLE.py │ │ ├── test_selective_MLE_high.py │ │ ├── test_selective_MLE_onedim.py │ │ ├── test_slope.py │ │ ├── test_slope_subgrad.py │ │ ├── test_split_lasso.py │ │ └── test_topK.py ├── reduced_optimization │ └── tests │ │ └── __init__.py ├── sampling │ ├── __init__.py │ ├── api.py │ ├── langevin.py │ ├── sequential.py │ ├── sqrt_lasso.pyx │ ├── tests │ │ ├── __init__.py │ │ ├── plots_fs.py │ │ ├── test_fstep_langevin.py │ │ ├── test_kfstep.py │ │ ├── test_pca_langevin.py │ │ ├── test_sample_sphere.py │ │ └── test_sequential.py │ ├── truncnorm.pyx │ └── truncnorm_quadratic.pyx ├── sandbox │ ├── approx_ci │ │ ├── __init__.py │ │ ├── ci_approx_density.py │ │ ├── ci_approx_greedy_step.py │ │ ├── selection_map.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_glm.py │ │ │ ├── test_greedy_step.py │ │ │ └── test_threshold_score.py │ └── bayesian │ │ ├── __init__.py │ │ ├── barrier.py │ │ ├── credible_intervals.py │ │ ├── dual_lasso.py │ │ ├── estimator.py │ │ ├── forward_stepwise_reduced.py │ │ ├── initial_soln.py │ │ ├── lasso_reduced.py │ │ ├── marginal_screening_reduced.py │ │ ├── ms_lasso_2stage_reduced.py │ │ ├── par_carved_reduced.py │ │ ├── par_random_lasso_reduced.py │ │ ├── random_lasso_reduced.py │ │ └── tests │ │ ├── __init__.py │ │ ├── test_carved_lasso.py │ │ ├── test_dual_lasso.py │ │ ├── test_fs.py │ │ ├── test_lasso.py │ │ └── test_ms_lasso_2stage.py ├── src_C │ ├── #sample_preparation.pyx# │ ├── HmcSampler.cpp │ ├── HmcSampler.h │ ├── logfile.txt │ ├── preparation_Eig_Vect.cpp │ ├── preparation_Eig_Vect.h │ ├── sample_preparation.cpp │ ├── sample_preparation.pyx │ └── setup.py ├── tests │ ├── __init__.py │ ├── decorators.py │ ├── flags.py │ ├── instance.py │ ├── test_instance.py │ └── tests.py ├── truncated │ ├── F.py │ ├── T.py │ ├── __init__.py │ ├── api.py │ ├── base.py │ ├── chi.py │ ├── gaussian.py │ └── tests │ │ ├── __init__.py │ │ ├── test_truncated.py │ │ └── test_truncatedFT.py └── utils │ ├── __init__.py │ └── tools.py ├── setup.cfg ├── 
setup.py ├── setup_helpers.py ├── tools ├── apigen.py ├── build_modref_templates.py ├── gitwash_dumper.py ├── nbtools.py ├── noseall_with_coverage └── strip_notebook.py ├── umpu ├── UMAU.pdf ├── umpu.r └── umpuWriteup.tex └── versioneer.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = selection 4 | include = */selection/* 5 | omit = 6 | */setup.py 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | */*pyc 2 | */*~ 3 | */*/*~ 4 | */*/*/*~ 5 | */*.out 6 | */*.aux 7 | */*.bbl 8 | */*.blg 9 | */*.vrb 10 | */*.synctex* 11 | */*.toc 12 | */*.snm 13 | */*.odt 14 | */*.ps 15 | */*.eps 16 | */*.dvi 17 | */*.log 18 | */*.nav 19 | */*.bak 20 | */*.vrb 21 | */*.pyc 22 | */*/*.pyc 23 | *.pyc 24 | selectinf/*.so 25 | selectinf/*.c 26 | selectinf/*/*.so 27 | selectinf/*/*.c 28 | build 29 | *ipynb_checkpoints 30 | */*ipynb_checkpoints 31 | .idea/* 32 | */.idea/* 33 | */*/.idea/* 34 | *.log 35 | *~ 36 | .*sw* 37 | */*~ 38 | *pyc 39 | */*pyc 40 | *.pdf 41 | *.csv 42 | doc/source/api/generated/* 43 | docs/source/api/generated/* 44 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "travis-tools"] 2 | path = travis-tools 3 | url = https://github.com/matthew-brett/travis-tools.git 4 | [submodule "C-software"] 5 | path = C-software 6 | url = https://github.com/selective-inference/C-software.git 7 | 8 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | builder: html 11 | configuration: doc/source/conf.py 12 | 13 | # Build documentation with MkDocs 14 | #mkdocs: 15 | # configuration: mkdocs.yml 16 | 17 | # Optionally build your docs in additional formats such as PDF and ePub 18 | #formats: all 19 | 20 | # Optionally set the version of Python and requirements required to build your docs 21 | python: 22 | version: 3.6 23 | install: 24 | - requirements: requirements.txt 25 | - requirements: doc-requirements.txt 26 | - method: setuptools 27 | path: . 28 | 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Selective Inference development team 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 
15 | 16 | * The names of any contributors to this software 17 | may not be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHOR LICENSE Makefile* MANIFEST.in setup* README.* 2 | include Changelog TODO 3 | recursive-include doc * 4 | recursive-include tools * 5 | # setup utilities 6 | include setup_helpers.py 7 | include cythexts.py 8 | recursive-include fake_pyrex * 9 | include versioneer.py 10 | include selection/_version.py 11 | include C-software/src/*.h -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | 2 | The selection project 3 | ===================== 4 | 5 | This project contains software for selective inference, with an emphasis on 6 | selective inference in regression. 7 | 8 | Some key references 9 | ------------------- 10 | 11 | - ``A significance test for the lasso``: http://arxiv.org/abs/1301.7161 12 | - ``Tests in adaptive regression via the Kac-Rice formula``: 13 | http://arxiv.org/abs/1308.3020 14 | - ``Post-selection adaptive inference for Least Angle Regression and the Lasso``: 15 | http://arxiv.org/abs/1401.3889 16 | - ``Exact post-selection inference with the lasso``: 17 | http://arxiv.org/abs/1311.6238 18 | - ``Exact Post Model Selection Inference for Marginal Screening``: 19 | http://arxiv.org/abs/1402.5596 20 | 21 | Install 22 | ------- 23 | 24 | .. code:: bash 25 | 26 | git submodule init # travis-tools and C-software 27 | git submodule update 28 | pip install -r requirements.txt 29 | python setup.py install 30 | 31 | Potential speedups 32 | ------------------ 33 | 34 | - We can condition on “parts” of each draw of the sampler; in 35 | particular, if we condition on the projection of the rejection 36 | ``sample - center`` onto a direction, then resampling along the ray can be 37 | sped up for some procedures, such as the LASSO, possibly at some cost in power. 38 | 39 | - Learning a higher-dimensional function could perhaps save some time, though 40 | the proper conditioning has to be checked.
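Minimal usage sketch
--------------------

The scripts under ``doc/learning_examples`` all follow the same pattern: simulate (or load) data, wrap the sufficient statistic in a sampler, run a selection algorithm on draws from that sampler, and hand everything to ``full_model_inference``. The sketch below condenses ``doc/learning_examples/lasso/lasso_example.py`` (reproduced later in this repository); treat it as an illustration of the pattern rather than a reference implementation. Note that the package directory here is ``selectinf`` while these older examples import from ``selection``; adjust the import root to whichever is installed.

.. code:: python

    import functools
    import numpy as np
    import regreg.api as rr

    from selection.tests.instance import gaussian_instance
    from selection.learning.utils import full_model_inference
    from selection.learning.core import normal_sampler, logit_fit

    # a sparse Gaussian regression instance: design X, response y, true effects
    n, p, sigma = 200, 100, 2
    X, y, truth = gaussian_instance(n=n, p=p, s=10, signal=(0.5, 1),
                                    equicorrelated=False, rho=0.5, sigma=sigma,
                                    random_signs=True, scale=False)[:3]

    # Gaussian model for the sufficient statistic X^T y
    S = X.T.dot(y)
    sampler = normal_sampler(S, sigma**2 * X.T.dot(X))

    def meta_algorithm(XTX, lam, sampler):
        # LASSO at a fixed lambda, fit to a draw of the sufficient statistic
        p = XTX.shape[0]
        loss = rr.quadratic_loss((p,), Q=XTX)
        loss.quadratic = rr.identity_quadratic(0, 0, -sampler(scale=0.), 0)
        problem = rr.simple_problem(loss, rr.l1norm(p, lagrange=lam))
        soln = problem.solve(max_its=100, tol=1.e-10)
        return set(np.nonzero(soln != 0)[0])

    selection_algorithm = functools.partial(meta_algorithm, X.T.dot(X),
                                            4. * np.sqrt(n))

    # learn the selection probability and form selective p-values/intervals
    df = full_model_inference(X, y, truth, selection_algorithm, sampler,
                              success_params=(1, 1), B=2000,
                              fit_probability=logit_fit, fit_args={'df': 20},
                              how_many=1)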
41 | 42 | -------------------------------------------------------------------------------- /THANKS: -------------------------------------------------------------------------------- 1 | Selective Inference Team 2 | ------------------------ 3 | 4 | Contributors to this project include: 5 | 6 | Yuval Benjamini 7 | Leonard Blier 8 | Will Fithian 9 | Jason Lee 10 | Joshua Loftus 11 | Stephen Reid 12 | Dennis Sun 13 | Yuekai Sun 14 | Jonathan Taylor 15 | Xiaoying Tian 16 | Ryan Tibshirani 17 | Robert Tibshirani 18 | 19 | 20 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | - Marginalize group LASSO 2 | - SLOPE: randomized and non-randomized 3 | - selective debiased LASSO 4 | - randomized sqrt LASSO 5 | - alternate randomization 6 | - user's choice of model for non-randomized -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /constraints.txt: -------------------------------------------------------------------------------- 1 | rpy2<2.9 2 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | # Requirements for developing regreg 2 | # Check these dependencies against regreg/info.py 3 | -r requirements.txt 4 | nose 5 | -------------------------------------------------------------------------------- /doc-requirements.txt: -------------------------------------------------------------------------------- 1 | # Requirements for building docs 2 | # Check these dependencies against doc/conf.py 3 | -r dev-requirements.txt 4 | sphinx>=1.4 5 | numpydoc 6 | matplotlib 7 | texext 8 | nb2plots 9 | rpy2 10 | seaborn 11 | statsmodels 12 | tensorflow 13 | keras 14 | nbsphinx 15 | -------------------------------------------------------------------------------- /doc/adjusted_MLE/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/adjusted_MLE/__init__.py -------------------------------------------------------------------------------- /doc/adjusted_MLE/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/adjusted_MLE/tests/__init__.py -------------------------------------------------------------------------------- /doc/adjusted_MLE/tests/test_risk.py: -------------------------------------------------------------------------------- 1 | import numpy as np, os, itertools 2 | import pandas as pd 3 | 4 | from rpy2 import robjects 5 | import rpy2.robjects.numpy2ri 6 | rpy2.robjects.numpy2ri.activate() 7 | import rpy2.robjects.pandas2ri 8 | from rpy2.robjects.packages import importr 9 | 10 | from .comparison_metrics import (sim_xy, 11 | glmnet_lasso, 12 | relative_risk) 13 | from .risk_comparisons import risk_comparison 14 | 15 | def output_file(n=200, 16 | p=500, 17 | rho=0.35, 18 | s=5, 19 | beta_type=1, 20 | snr_values=np.array([0.10, 0.15, 0.20, 0.25, 0.30, 21 | 0.35, 0.42, 0.71, 1.22, 2.07]), 22 | 
tuning_nonrand="lambda.1se", 23 | tuning_rand="lambda.1se", 24 | randomizing_scale=np.sqrt(0.50), 25 | ndraw=50, 26 | outpath = None): 27 | 28 | df_risk = pd.DataFrame() 29 | if n > p: 30 | full_dispersion = True 31 | else: 32 | full_dispersion = False 33 | 34 | snr_list = [] 35 | for snr in snr_values: 36 | snr_list.append(snr) 37 | relative_risk = np.squeeze(risk_comparison(n=n, 38 | p=p, 39 | nval=n, 40 | rho=rho, 41 | s=s, 42 | beta_type=beta_type, 43 | snr=snr, 44 | randomizer_scale=randomizing_scale, 45 | full_dispersion=full_dispersion, 46 | tuning_nonrand =tuning_nonrand, 47 | tuning_rand=tuning_rand, ndraw = ndraw)) 48 | 49 | df_risk = df_risk.append(pd.DataFrame(data=relative_risk.reshape((1, 6)), columns=['sel-MLE', 'ind-est', 'rand-LASSO', 50 | 'rel-rand-LASSO', 'rel-LASSO','LASSO']), ignore_index=True) 51 | 52 | df_risk['n'] = n 53 | df_risk['p'] = p 54 | df_risk['s'] = s 55 | df_risk['rho'] = rho 56 | df_risk['beta-type'] = beta_type 57 | df_risk['snr'] = pd.Series(np.asarray(snr_list)) 58 | df_risk['target'] = "selected" 59 | 60 | if outpath is None: 61 | outpath = os.path.dirname(__file__) 62 | 63 | outfile_risk_csv = os.path.join(outpath, "dims_" + str(n) + "_" + str(p) + "_risk_betatype" + str(beta_type) + "_rho_" + str(rho) + ".csv") 64 | outfile_risk_html = os.path.join(outpath, "dims_" + str(n) + "_" + str(p) + "_risk_betatype" + str(beta_type) + "_rho_" + str(rho) + ".html") 65 | df_risk.to_csv(outfile_risk_csv, index=False) 66 | df_risk.to_html(outfile_risk_html) 67 | 68 | -------------------------------------------------------------------------------- /doc/examples/conditional_sampling.py: -------------------------------------------------------------------------------- 1 | """ 2 | We demonstrate that our optimization variables have 3 | the correct distribution given the data. 
4 | """ 5 | 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from statsmodels.distributions import ECDF 9 | 10 | from selection.randomized.tests.test_sampling import test_conditional_law 11 | 12 | def main(ndraw=50000, burnin=5000, remove_atom=False, unpenalized=True, stepsize=1.e-2): 13 | 14 | fig_idx = 0 15 | for (rand, 16 | mcmc_opt, 17 | mcmc_omega, 18 | truncated_opt, 19 | truncated_omega) in test_conditional_law(ndraw=ndraw, burnin=burnin, stepsize=stepsize, unpenalized=unpenalized): 20 | 21 | fig_idx += 1 22 | fig = plt.figure(num=fig_idx, figsize=(8,8)) 23 | 24 | plt.clf() 25 | idx = 0 26 | for i in range(mcmc_opt.shape[1]): 27 | plt.subplot(3,3,idx+1) 28 | 29 | mcmc_ = mcmc_opt[:, i] 30 | truncated_ = truncated_opt[:, i] 31 | 32 | xval = np.linspace(min(mcmc_.min(), truncated_.min()), 33 | max(mcmc_.max(), truncated_.max()), 34 | 200) 35 | 36 | if remove_atom: 37 | mcmc_ = mcmc_[mcmc_ < np.max(mcmc_)] 38 | mcmc_ = mcmc_[mcmc_ > np.min(mcmc_)] 39 | 40 | plt.plot(xval, ECDF(mcmc_)(xval), label='MCMC') 41 | plt.plot(xval, ECDF(truncated_)(xval), label='truncated') 42 | idx += 1 43 | if idx == 1: 44 | plt.legend(loc='lower right') 45 | 46 | fig.suptitle(' '.join([rand, "opt"])) 47 | 48 | fig_idx += 1 49 | fig = plt.figure(num=fig_idx, figsize=(8,8)) 50 | plt.clf() 51 | idx = 0 52 | for i in range(mcmc_opt.shape[1]): 53 | plt.subplot(3,3,idx+1) 54 | 55 | mcmc_ = mcmc_omega[:, i] 56 | truncated_ = truncated_omega[:, i] 57 | 58 | xval = np.linspace(min(mcmc_.min(), truncated_.min()), 59 | max(mcmc_.max(), truncated_.max()), 60 | 200) 61 | 62 | if remove_atom: 63 | mcmc_ = mcmc_[mcmc_ < np.max(mcmc_)] 64 | mcmc_ = mcmc_[mcmc_ > np.min(mcmc_)] 65 | plt.plot(xval, ECDF(mcmc_)(xval), label='MCMC') 66 | plt.plot(xval, ECDF(truncated_)(xval), label='truncated') 67 | idx += 1 68 | if idx == 1: 69 | plt.legend(loc='lower right') 70 | 71 | fig.suptitle(' '.join([rand, "omega"])) 72 | plt.show() 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /doc/learning_examples/calibration/lasso_calibration.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import normal_sampler, logit_fit 12 | 13 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): 14 | 15 | # description of statistical problem 16 | 17 | X, y, truth = gaussian_instance(n=n, 18 | p=p, 19 | s=s, 20 | equicorrelated=False, 21 | rho=0.5, 22 | sigma=sigma, 23 | signal=signal, 24 | random_signs=True, 25 | scale=False)[:3] 26 | 27 | dispersion = sigma**2 28 | 29 | S = X.T.dot(y) 30 | covS = dispersion * X.T.dot(X) 31 | smooth_sampler = normal_sampler(S, covS) 32 | 33 | def meta_algorithm(XTX, XTXi, lam, sampler): 34 | 35 | p = XTX.shape[0] 36 | success = np.zeros(p) 37 | 38 | loss = rr.quadratic_loss((p,), Q=XTX) 39 | pen = rr.l1norm(p, lagrange=lam) 40 | 41 | scale = 0. 
42 | noisy_S = sampler(scale=scale) 43 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 44 | problem = rr.simple_problem(loss, pen) 45 | soln = problem.solve(max_its=50, tol=1.e-6) 46 | success += soln != 0 47 | return set(np.nonzero(success)[0]) 48 | 49 | XTX = X.T.dot(X) 50 | XTXi = np.linalg.inv(XTX) 51 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 52 | dispersion = np.linalg.norm(resid)**2 / (n-p) 53 | 54 | lam = 4. * np.sqrt(n) 55 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 56 | 57 | # run selection algorithm 58 | 59 | 60 | return full_model_inference(X, 61 | y, 62 | truth, 63 | selection_algorithm, 64 | smooth_sampler, 65 | success_params=(1, 1), 66 | B=B, 67 | fit_probability=logit_fit, 68 | fit_args={'df':20}) 69 | 70 | if __name__ == "__main__": 71 | import statsmodels.api as sm 72 | import matplotlib.pyplot as plt 73 | import pandas as pd 74 | 75 | csvfile = 'lasso_calibration.csv' 76 | outbase = csvfile[:-4] 77 | 78 | for i in range(2000): 79 | for B in np.random.choice([50, 100, 500, 1000, 1500, 2000], 1, replace=True): 80 | df = simulate(B=B) 81 | 82 | if df is not None and i > 0: 83 | 84 | try: # concatenate to disk 85 | df = pd.concat([df, pd.read_csv(csvfile)]) 86 | except FileNotFoundError: 87 | pass 88 | df.to_csv(csvfile, index=False) 89 | 90 | if len(df['pivot']) > 0: 91 | pivot_ax, length_ax = pivot_plot(df, outbase) 92 | 93 | -------------------------------------------------------------------------------- /doc/learning_examples/cross_inference/cross_inference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from selection.learning.core import cross_inference 4 | from selection.learning.keras_fit import keras_fit 5 | 6 | data = np.load('lasso_multi_learning.npz') 7 | learning_data = (data['T'][:2000], data['Y'][:2000]) 8 | 9 | result = cross_inference(learning_data, 10 | data['nuisance'], 11 | data['direction'], 12 | keras_fit, 13 | fit_args={'epochs':3, 'sizes':[10]*5, 'dropout':0., 'activation':'relu'}) 14 | -------------------------------------------------------------------------------- /doc/learning_examples/keras/keras_targets.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import split_sampler, keras_fit 12 | from selection.learning.learners import mixture_learner 13 | mixture_learner.scales = [1]*10 + [1.5,2,3,4,5,10] 14 | 15 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): 16 | 17 | # description of statistical problem 18 | 19 | X, y, truth = gaussian_instance(n=n, 20 | p=p, 21 | s=s, 22 | equicorrelated=False, 23 | rho=0.5, 24 | sigma=sigma, 25 | signal=signal, 26 | random_signs=True, 27 | scale=False)[:3] 28 | 29 | XTX = X.T.dot(X) 30 | XTXi = np.linalg.inv(XTX) 31 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 32 | dispersion = np.linalg.norm(resid)**2 / (n-p) 33 | 34 | S = X.T.dot(y) 35 | covS = dispersion * X.T.dot(X) 36 | splitting_sampler = split_sampler(X * y[:, None], covS) 37 | 38 | def meta_algorithm(XTX, XTXi, dispersion, lam, sampler): 39 | 40 | p = XTX.shape[0] 41 | success = np.zeros(p) 42 | 43 | loss = rr.quadratic_loss((p,), Q=XTX) 44 | pen = rr.l1norm(p, lagrange=lam) 45 | 46 | scale = 
0.5 47 | noisy_S = sampler(scale=scale) 48 | soln = XTXi.dot(noisy_S) 49 | solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion)) 50 | return set(np.nonzero(np.fabs(solnZ) > 2.1)[0]) 51 | 52 | lam = 4. * np.sqrt(n) 53 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion, lam) 54 | 55 | # run selection algorithm 56 | 57 | return full_model_inference(X, 58 | y, 59 | truth, 60 | selection_algorithm, 61 | splitting_sampler, 62 | success_params=(5, 7), 63 | B=B, 64 | fit_probability=keras_fit, 65 | fit_args={'epochs':30, 'sizes':[100, 100], 'activation':'relu'}) 66 | 67 | 68 | if __name__ == "__main__": 69 | import statsmodels.api as sm 70 | import matplotlib.pyplot as plt 71 | import pandas as pd 72 | 73 | for i in range(500): 74 | df = simulate(B=10000) 75 | csvfile = 'keras_targets.csv' 76 | outbase = csvfile[:-4] 77 | 78 | if df is not None and i > 0: 79 | 80 | try: # concatenate to disk 81 | df = pd.concat([df, pd.read_csv(csvfile)]) 82 | except FileNotFoundError: 83 | pass 84 | df.to_csv(csvfile, index=False) 85 | 86 | if len(df['pivot']) > 0: 87 | pivot_ax, length_ax = pivot_plot(df, outbase) 88 | 89 | 90 | -------------------------------------------------------------------------------- /doc/learning_examples/keras/keras_targets_medium.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import split_sampler, keras_fit 12 | from selection.learning.learners import mixture_learner 13 | mixture_learner.scales = [1]*10 + [1.5,2,3,4,5,10] 14 | 15 | def simulate(n=200, p=50, s=5, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): 16 | 17 | # description of statistical problem 18 | 19 | X, y, truth = gaussian_instance(n=n, 20 | p=p, 21 | s=s, 22 | equicorrelated=False, 23 | rho=0.5, 24 | sigma=sigma, 25 | signal=signal, 26 | random_signs=True, 27 | scale=False)[:3] 28 | 29 | XTX = X.T.dot(X) 30 | XTXi = np.linalg.inv(XTX) 31 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 32 | dispersion = np.linalg.norm(resid)**2 / (n-p) 33 | 34 | S = X.T.dot(y) 35 | covS = dispersion * X.T.dot(X) 36 | splitting_sampler = split_sampler(X * y[:, None], covS) 37 | 38 | def meta_algorithm(XTX, XTXi, dispersion, lam, sampler): 39 | 40 | p = XTX.shape[0] 41 | success = np.zeros(p) 42 | 43 | loss = rr.quadratic_loss((p,), Q=XTX) 44 | pen = rr.l1norm(p, lagrange=lam) 45 | 46 | scale = 0.5 47 | noisy_S = sampler(scale=scale) 48 | soln = XTXi.dot(noisy_S) 49 | solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion)) 50 | return set(np.nonzero(np.fabs(solnZ) > 2.1)[0]) 51 | 52 | lam = 4. 
* np.sqrt(n) 53 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion, lam) 54 | 55 | # run selection algorithm 56 | 57 | return full_model_inference(X, 58 | y, 59 | truth, 60 | selection_algorithm, 61 | splitting_sampler, 62 | success_params=(5, 7), 63 | B=B, 64 | fit_probability=keras_fit, 65 | fit_args={'epochs':30, 'sizes':[100, 100], 'activation':'relu'}) 66 | 67 | 68 | if __name__ == "__main__": 69 | import statsmodels.api as sm 70 | import matplotlib.pyplot as plt 71 | import pandas as pd 72 | 73 | for i in range(500): 74 | df = simulate(B=10000) 75 | csvfile = 'keras_targets_medium.csv' 76 | outbase = csvfile[:-4] 77 | 78 | if df is not None and i > 0: 79 | 80 | try: # concatenate to disk 81 | df = pd.concat([df, pd.read_csv(csvfile)]) 82 | except FileNotFoundError: 83 | pass 84 | df.to_csv(csvfile, index=False) 85 | 86 | if len(df['pivot']) > 0: 87 | pivot_ax, length_ax = pivot_plot(df, outbase) 88 | -------------------------------------------------------------------------------- /doc/learning_examples/keras/keras_targets_small.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import split_sampler, keras_fit 12 | from selection.learning.learners import mixture_learner 13 | mixture_learner.scales = [1]*10 + [1.5,2,3,4,5,10] 14 | 15 | def simulate(n=100, p=10, s=5, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): 16 | 17 | # description of statistical problem 18 | 19 | X, y, truth = gaussian_instance(n=n, 20 | p=p, 21 | s=s, 22 | equicorrelated=False, 23 | rho=0.5, 24 | sigma=sigma, 25 | signal=signal, 26 | random_signs=True, 27 | scale=False)[:3] 28 | 29 | XTX = X.T.dot(X) 30 | XTXi = np.linalg.inv(XTX) 31 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 32 | dispersion = np.linalg.norm(resid)**2 / (n-p) 33 | 34 | S = X.T.dot(y) 35 | covS = dispersion * X.T.dot(X) 36 | splitting_sampler = split_sampler(X * y[:, None], covS) 37 | 38 | def meta_algorithm(XTX, XTXi, dispersion, lam, sampler): 39 | 40 | p = XTX.shape[0] 41 | success = np.zeros(p) 42 | 43 | loss = rr.quadratic_loss((p,), Q=XTX) 44 | pen = rr.l1norm(p, lagrange=lam) 45 | 46 | scale = 0.5 47 | noisy_S = sampler(scale=scale) 48 | soln = XTXi.dot(noisy_S) 49 | solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion)) 50 | return set(np.nonzero(np.fabs(solnZ) > 2.1)[0]) 51 | 52 | lam = 4. 
* np.sqrt(n) 53 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, dispersion, lam) 54 | 55 | # run selection algorithm 56 | 57 | return full_model_inference(X, 58 | y, 59 | truth, 60 | selection_algorithm, 61 | splitting_sampler, 62 | success_params=(5, 7), 63 | B=B, 64 | fit_probability=keras_fit, 65 | fit_args={'epochs':30, 'sizes':[100, 100], 'activation':'relu'}) 66 | 67 | 68 | if __name__ == "__main__": 69 | import statsmodels.api as sm 70 | import matplotlib.pyplot as plt 71 | import pandas as pd 72 | 73 | for i in range(500): 74 | df = simulate(B=10000) 75 | csvfile = 'keras_targets_small.csv' 76 | outbase = csvfile[:-4] 77 | 78 | if df is not None and i > 0: 79 | 80 | try: # concatenate to disk 81 | df = pd.concat([df, pd.read_csv(csvfile)]) 82 | except FileNotFoundError: 83 | pass 84 | df.to_csv(csvfile, index=False) 85 | 86 | if len(df['pivot']) > 0: 87 | pivot_ax, length_ax = pivot_plot(df, outbase) 88 | -------------------------------------------------------------------------------- /doc/learning_examples/knockoffs/knockoff_kernel.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import normal_sampler, logit_fit 12 | 13 | def simulate(n=1000, p=50, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=1000): 14 | 15 | # description of statistical problem 16 | 17 | np.random.seed(seed) 18 | X, y, truth = gaussian_instance(n=n, 19 | p=p, 20 | s=s, 21 | equicorrelated=False, 22 | rho=0.5, 23 | sigma=sigma, 24 | signal=signal, 25 | random_signs=True, 26 | scale=False, 27 | center=False)[:3] 28 | 29 | dispersion = sigma**2 30 | 31 | S = X.T.dot(y) 32 | covS = dispersion * X.T.dot(X) 33 | smooth_sampler = normal_sampler(S, covS) 34 | 35 | def meta_algorithm(X, XTXi, resid, sampler): 36 | 37 | n, p = X.shape 38 | 39 | rho = 0.8 40 | S = sampler(scale=0.) 
# deterministic with scale=0 41 | ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X 42 | Xnew = rho * X + np.sqrt(1 - rho**2) * np.random.standard_normal(X.shape) 43 | 44 | X_full = np.hstack([X, Xnew]) 45 | beta_full = np.linalg.pinv(X_full).dot(ynew) 46 | winners = np.fabs(beta_full)[:p] > np.fabs(beta_full)[p:] 47 | return set(np.nonzero(winners)[0]) 48 | 49 | XTX = X.T.dot(X) 50 | XTXi = np.linalg.inv(XTX) 51 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 52 | dispersion = np.linalg.norm(resid)**2 / (n-p) 53 | 54 | selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) 55 | 56 | # run selection algorithm 57 | 58 | return full_model_inference(X, 59 | y, 60 | truth, 61 | selection_algorithm, 62 | smooth_sampler, 63 | success_params=(8, 10), 64 | B=B, 65 | fit_probability=logit_fit, 66 | fit_args={'df':20}, 67 | how_many=1) 68 | 69 | if __name__ == "__main__": 70 | import statsmodels.api as sm 71 | import matplotlib.pyplot as plt 72 | import pandas as pd 73 | 74 | iseed = int(np.fabs(np.random.standard_normal() * 50000)) 75 | for i in range(500): 76 | df = simulate(seed=i + iseed, B=2000) 77 | csvfile = 'knockoff_kernel.csv' 78 | outbase = csvfile[:-4] 79 | 80 | if df is not None and i > 0: 81 | 82 | try: # concatenate to disk 83 | df = pd.concat([df, pd.read_csv(csvfile)]) 84 | except FileNotFoundError: 85 | pass 86 | df.to_csv(csvfile, index=False) 87 | 88 | if len(df['pivot']) > 0: 89 | pivot_ax, length_ax = pivot_plot(df, outbase) 90 | -------------------------------------------------------------------------------- /doc/learning_examples/lasso/lasso_example.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import normal_sampler, logit_fit 12 | 13 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000): 14 | 15 | # description of statistical problem 16 | 17 | X, y, truth = gaussian_instance(n=n, 18 | p=p, 19 | s=s, 20 | equicorrelated=False, 21 | rho=0.5, 22 | sigma=sigma, 23 | signal=signal, 24 | random_signs=True, 25 | scale=False)[:3] 26 | 27 | dispersion = sigma**2 28 | 29 | S = X.T.dot(y) 30 | covS = dispersion * X.T.dot(X) 31 | sampler = normal_sampler(S, covS) 32 | 33 | def meta_algorithm(XTX, XTXi, lam, sampler): 34 | 35 | p = XTX.shape[0] 36 | success = np.zeros(p) 37 | 38 | loss = rr.quadratic_loss((p,), Q=XTX) 39 | pen = rr.l1norm(p, lagrange=lam) 40 | 41 | scale = 0. 42 | noisy_S = sampler(scale=scale) 43 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 44 | problem = rr.simple_problem(loss, pen) 45 | soln = problem.solve(max_its=100, tol=1.e-10) 46 | success += soln != 0 47 | return set(np.nonzero(success)[0]) 48 | 49 | XTX = X.T.dot(X) 50 | XTXi = np.linalg.inv(XTX) 51 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 52 | dispersion = np.linalg.norm(resid)**2 / (n-p) 53 | 54 | lam = 4. 
* np.sqrt(n) 55 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 56 | 57 | # run selection algorithm 58 | 59 | 60 | return full_model_inference(X, 61 | y, 62 | truth, 63 | selection_algorithm, 64 | sampler, 65 | success_params=(1, 1), 66 | B=B, 67 | fit_probability=logit_fit, 68 | fit_args={'df':20}, 69 | how_many=1) 70 | 71 | 72 | if __name__ == "__main__": 73 | import statsmodels.api as sm 74 | import matplotlib.pyplot as plt 75 | import pandas as pd 76 | 77 | for i in range(500): 78 | df = simulate() 79 | csvfile = 'lasso_exact.csv' 80 | outbase = csvfile[:-4] 81 | 82 | if df is not None and i > 0: 83 | 84 | try: # concatenate to disk 85 | df = pd.concat([df, pd.read_csv(csvfile)]) 86 | except FileNotFoundError: 87 | pass 88 | df.to_csv(csvfile, index=False) 89 | 90 | if len(df['pivot']) > 0: 91 | pivot_ax, length_ax = pivot_plot(df, outbase) 92 | 93 | -------------------------------------------------------------------------------- /doc/learning_examples/lasso_CV/lasso_exact_CV_null.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import split_sampler, probit_fit 12 | from selection.learning.Rutils import lasso_glmnet 13 | 14 | def simulate(n=200, p=100, s=10, signal=(0, 0), sigma=2, alpha=0.1): 15 | 16 | # description of statistical problem 17 | 18 | X, y, truth = gaussian_instance(n=n, 19 | p=p, 20 | s=s, 21 | equicorrelated=False, 22 | rho=0.5, 23 | sigma=sigma, 24 | signal=signal, 25 | random_signs=True, 26 | scale=False)[:3] 27 | 28 | XTX = X.T.dot(X) 29 | XTXi = np.linalg.inv(XTX) 30 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 31 | dispersion = np.linalg.norm(resid)**2 / (n-p) 32 | 33 | S = X.T.dot(y) 34 | covS = dispersion * X.T.dot(X) 35 | splitting_sampler = split_sampler(X * y[:, None], covS) 36 | 37 | def meta_algorithm(X, XTXi, resid, sampler): 38 | 39 | S = sampler(scale=0.) 
# deterministic with scale=0 40 | ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X 41 | G = lasso_glmnet(X, ynew, *[None]*4) 42 | select = G.select() 43 | return set(list(select[0])) 44 | 45 | selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) 46 | 47 | # run selection algorithm 48 | 49 | return full_model_inference(X, 50 | y, 51 | truth, 52 | selection_algorithm, 53 | splitting_sampler, 54 | success_params=(1, 1), 55 | B=2000, 56 | fit_probability=probit_fit, 57 | fit_args={'df':20}, 58 | how_many=1) 59 | 60 | if __name__ == "__main__": 61 | import statsmodels.api as sm 62 | import matplotlib.pyplot as plt 63 | import pandas as pd 64 | 65 | for i in range(500): 66 | df = simulate() 67 | csvfile = 'lasso_exact_CV_null.csv' 68 | outbase = csvfile[:-4] 69 | 70 | if df is not None and i > 0: 71 | 72 | try: # concatenate to disk 73 | df = pd.concat([df, pd.read_csv(csvfile)]) 74 | except FileNotFoundError: 75 | pass 76 | df.to_csv(csvfile, index=False) 77 | 78 | if len(df['pivot']) > 0: 79 | pivot_ax, length_ax = pivot_plot(df, outbase) 80 | -------------------------------------------------------------------------------- /doc/learning_examples/lasso_CV/lasso_example_CV.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import split_sampler, probit_fit 12 | from selection.learning.Rutils import lasso_glmnet 13 | 14 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1): 15 | 16 | # description of statistical problem 17 | 18 | X, y, truth = gaussian_instance(n=n, 19 | p=p, 20 | s=s, 21 | equicorrelated=False, 22 | rho=0.5, 23 | sigma=sigma, 24 | signal=signal, 25 | random_signs=True, 26 | scale=False)[:3] 27 | 28 | dispersion = sigma**2 29 | 30 | S = X.T.dot(y) 31 | covS = dispersion * X.T.dot(X) 32 | splitting_sampler = split_sampler(X * y[:, None], covS) 33 | 34 | 35 | def meta_algorithm(X, XTXi, resid, sampler): 36 | 37 | S = sampler(scale=0.) 
# deterministic with scale=0 38 | ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X 39 | G = lasso_glmnet(X, ynew, *[None]*4) 40 | select = G.select() 41 | return set(list(select[0])) 42 | 43 | XTX = X.T.dot(X) 44 | XTXi = np.linalg.inv(XTX) 45 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 46 | dispersion = np.linalg.norm(resid)**2 / (n-p) 47 | 48 | selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) 49 | 50 | # run selection algorithm 51 | 52 | return full_model_inference(X, 53 | y, 54 | truth, 55 | selection_algorithm, 56 | splitting_sampler, 57 | success_params=(1, 1), 58 | B=2000, 59 | fit_probability=probit_fit, 60 | fit_args={'df':20}, 61 | how_many=1) 62 | 63 | if __name__ == "__main__": 64 | import statsmodels.api as sm 65 | import matplotlib.pyplot as plt 66 | import pandas as pd 67 | 68 | for i in range(500): 69 | df = simulate() 70 | csvfile = 'lasso_exact_CV.csv' 71 | outbase = csvfile[:-4] 72 | 73 | if df is not None and i > 0: 74 | 75 | try: # concatenate to disk 76 | df = pd.concat([df, pd.read_csv(csvfile)]) 77 | except FileNotFoundError: 78 | pass 79 | df.to_csv(csvfile, index=False) 80 | 81 | if len(df['pivot']) > 0: 82 | pivot_ax, length_ax = pivot_plot(df, outbase) 83 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/additive_targets.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | 11 | from selection.learning.utils import full_model_inference, pivot_plot 12 | from selection.learning.core import normal_sampler, split_sampler, logit_fit 13 | 14 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): 15 | 16 | # description of statistical problem 17 | 18 | X, y, truth = gaussian_instance(n=n, 19 | p=p, 20 | s=s, 21 | equicorrelated=False, 22 | rho=0.5, 23 | sigma=sigma, 24 | signal=signal, 25 | random_signs=True, 26 | scale=False)[:3] 27 | 28 | dispersion = sigma**2 29 | 30 | S = X.T.dot(y) 31 | covS = dispersion * X.T.dot(X) 32 | smooth_sampler = normal_sampler(S, covS) 33 | splitting_sampler = split_sampler(X * y[:, None], covS) 34 | 35 | def meta_algorithm(XTX, XTXi, lam, sampler): 36 | 37 | p = XTX.shape[0] 38 | success = np.zeros(p) 39 | 40 | loss = rr.quadratic_loss((p,), Q=XTX) 41 | pen = rr.l1norm(p, lagrange=lam) 42 | 43 | scale = 0.5 44 | noisy_S = sampler(scale=scale) 45 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 46 | problem = rr.simple_problem(loss, pen) 47 | soln = problem.solve(max_its=50, tol=1.e-6) 48 | success += soln != 0 49 | return set(np.nonzero(success)[0]) 50 | 51 | XTX = X.T.dot(X) 52 | XTXi = np.linalg.inv(XTX) 53 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 54 | dispersion = np.linalg.norm(resid)**2 / (n-p) 55 | 56 | lam = 4. 
* np.sqrt(n) 57 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 58 | 59 | # run selection algorithm 60 | 61 | return full_model_inference(X, 62 | y, 63 | truth, 64 | selection_algorithm, 65 | splitting_sampler, 66 | success_params=(1, 1), 67 | B=B, 68 | fit_probability=logit_fit, 69 | fit_args={'df':20}) 70 | 71 | if __name__ == "__main__": 72 | import statsmodels.api as sm 73 | import matplotlib.pyplot as plt 74 | import pandas as pd 75 | 76 | U = np.linspace(0, 1, 101) 77 | plt.clf() 78 | 79 | for i in range(500): 80 | for B in [5000]: 81 | print(B) 82 | df = simulate(B=B) 83 | csvfile = 'additive_targets.csv' 84 | outbase = csvfile[:-4] 85 | 86 | if i % 2 == 1 and i > 0: 87 | 88 | try: 89 | df = pd.concat([df, pd.read_csv(csvfile)]) 90 | except FileNotFoundError: 91 | pass 92 | df.to_csv(csvfile, index=False) 93 | 94 | if len(df['pivot']) > 0: 95 | pivot_ax, length_ax = pivot_plot(df, outbase) 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/additive_targets_small.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | 11 | from selection.learning.utils import full_model_inference, pivot_plot 12 | from selection.learning.core import normal_sampler, split_sampler, logit_fit 13 | 14 | def simulate(n=100, p=30, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): 15 | 16 | # description of statistical problem 17 | 18 | X, y, truth = gaussian_instance(n=n, 19 | p=p, 20 | s=s, 21 | equicorrelated=False, 22 | rho=0.5, 23 | sigma=sigma, 24 | signal=signal, 25 | random_signs=True, 26 | scale=False)[:3] 27 | 28 | dispersion = sigma**2 29 | 30 | S = X.T.dot(y) 31 | covS = dispersion * X.T.dot(X) 32 | smooth_sampler = normal_sampler(S, covS) 33 | splitting_sampler = split_sampler(X * y[:, None], covS) 34 | 35 | def meta_algorithm(XTX, XTXi, lam, sampler): 36 | 37 | p = XTX.shape[0] 38 | success = np.zeros(p) 39 | 40 | loss = rr.quadratic_loss((p,), Q=XTX) 41 | pen = rr.l1norm(p, lagrange=lam) 42 | 43 | scale = 0.5 44 | noisy_S = sampler(scale=scale) 45 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 46 | problem = rr.simple_problem(loss, pen) 47 | soln = problem.solve(max_its=50, tol=1.e-6) 48 | success += soln != 0 49 | return set(np.nonzero(success)[0]) 50 | 51 | XTX = X.T.dot(X) 52 | XTXi = np.linalg.inv(XTX) 53 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 54 | dispersion = np.linalg.norm(resid)**2 / (n-p) 55 | 56 | lam = 4. 
* np.sqrt(n) 57 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 58 | 59 | # run selection algorithm 60 | 61 | return full_model_inference(X, 62 | y, 63 | truth, 64 | selection_algorithm, 65 | splitting_sampler, 66 | success_params=(1, 1), 67 | B=B, 68 | fit_probability=logit_fit, 69 | fit_args={'df':20}) 70 | 71 | if __name__ == "__main__": 72 | import statsmodels.api as sm 73 | import matplotlib.pyplot as plt 74 | import pandas as pd 75 | 76 | U = np.linspace(0, 1, 101) 77 | plt.clf() 78 | 79 | for i in range(500): 80 | for B in [5000]: 81 | print(B) 82 | df = simulate(B=B) 83 | csvfile = 'additive_targets_small.csv' 84 | outbase = csvfile[:-4] 85 | 86 | if i % 2 == 1 and i > 0: 87 | 88 | try: 89 | df = pd.concat([df, pd.read_csv(csvfile)]) 90 | except FileNotFoundError: 91 | pass 92 | df.to_csv(csvfile, index=False) 93 | 94 | if len(df['pivot']) > 0: 95 | pivot_ax, length_ax = pivot_plot(df, outbase) 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/gbm2.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | from selection.algorithms.lasso import ROSI 10 | 11 | from selection.learning.Rutils import lasso_glmnet 12 | from selection.learning.utils import full_model_inference, pivot_plot 13 | from selection.learning.core import normal_sampler, gbm_fit_sk 14 | 15 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): 16 | 17 | # description of statistical problem 18 | 19 | X, y, truth = gaussian_instance(n=n, 20 | p=p, 21 | s=s, 22 | equicorrelated=False, 23 | rho=0.5, 24 | sigma=sigma, 25 | signal=signal, 26 | random_signs=True, 27 | scale=False)[:3] 28 | 29 | dispersion = sigma**2 30 | 31 | S = X.T.dot(y) 32 | covS = dispersion * X.T.dot(X) 33 | smooth_sampler = normal_sampler(S, covS) 34 | 35 | def meta_algorithm(X, XTXi, resid, sampler): 36 | 37 | S = sampler(scale=0.5) # noisy draw of the sufficient statistic (scale=0 would be deterministic) 38 | ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X 39 | G = lasso_glmnet(X, ynew, *[None]*4) 40 | select = G.select() 41 | return set(list(select[0])) 42 | 43 | XTX = X.T.dot(X) 44 | XTXi = np.linalg.inv(XTX) 45 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 46 | dispersion = np.linalg.norm(resid)**2 / (n-p) 47 | 48 | selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) 49 | 50 | # run selection algorithm 51 | 52 | return full_model_inference(X, 53 | y, 54 | truth, 55 | selection_algorithm, 56 | smooth_sampler, 57 | success_params=(1, 1), 58 | B=B, 59 | fit_probability=gbm_fit_sk, 60 | fit_args={'n_estimators':2000}) 61 | 62 | if __name__ == "__main__": 63 | import statsmodels.api as sm 64 | import matplotlib.pyplot as plt 65 | import pandas as pd 66 | 67 | U = np.linspace(0, 1, 101) 68 | plt.clf() 69 | 70 | for i in range(500): 71 | df = simulate() 72 | csvfile = 'lasso_multi_CV_random_gbm.csv' 73 | outbase = csvfile[:-4] 74 | 75 | if df is not None and i > 0: 76 | 77 | try: 78 | df = pd.concat([df, pd.read_csv(csvfile)]) 79 | except FileNotFoundError: 80 | pass 81 | df.to_csv(csvfile, index=False) 82 | 83 | if len(df['pivot']) > 0: 84 | pivot_plot(df, outbase) 85 | 86 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/gbm_targets.py: 
-------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | from selection.algorithms.lasso import ROSI 10 | 11 | from selection.learning.Rutils import lasso_glmnet 12 | from selection.learning.utils import full_model_inference, pivot_plot 13 | from selection.learning.core import normal_sampler, gbm_fit 14 | 15 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): 16 | 17 | # description of statistical problem 18 | 19 | X, y, truth = gaussian_instance(n=n, 20 | p=p, 21 | s=s, 22 | equicorrelated=False, 23 | rho=0.5, 24 | sigma=sigma, 25 | signal=signal, 26 | random_signs=True, 27 | scale=False)[:3] 28 | 29 | dispersion = sigma**2 30 | 31 | S = X.T.dot(y) 32 | covS = dispersion * X.T.dot(X) 33 | smooth_sampler = normal_sampler(S, covS) 34 | 35 | def meta_algorithm(XTX, XTXi, lam, sampler): 36 | 37 | p = XTX.shape[0] 38 | success = np.zeros(p) 39 | 40 | loss = rr.quadratic_loss((p,), Q=XTX) 41 | pen = rr.l1norm(p, lagrange=lam) 42 | 43 | scale = 0.5 44 | noisy_S = sampler(scale=scale) 45 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 46 | problem = rr.simple_problem(loss, pen) 47 | soln = problem.solve(max_its=50, tol=1.e-6) 48 | success += soln != 0 49 | return set(np.nonzero(success)[0]) 50 | 51 | XTX = X.T.dot(X) 52 | XTXi = np.linalg.inv(XTX) 53 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 54 | dispersion = np.linalg.norm(resid)**2 / (n-p) 55 | 56 | lam = 4. * np.sqrt(n) 57 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 58 | 59 | # run selection algorithm 60 | 61 | return full_model_inference(X, 62 | y, 63 | truth, 64 | selection_algorithm, 65 | smooth_sampler, 66 | success_params=(1, 1), 67 | B=B, 68 | fit_probability=gbm_fit, 69 | fit_args={}) 70 | 71 | if __name__ == "__main__": 72 | import statsmodels.api as sm 73 | import matplotlib.pyplot as plt 74 | import pandas as pd 75 | 76 | U = np.linspace(0, 1, 101) 77 | plt.clf() 78 | 79 | for i in range(500): 80 | for B in [5000]: 81 | print(B) 82 | df = simulate(B=B) 83 | csvfile = 'gbm_targets.csv' 84 | outbase = csvfile[:-4] 85 | 86 | if df is not None and i > 0: 87 | 88 | try: 89 | df = pd.concat([df, pd.read_csv(csvfile)]) 90 | except FileNotFoundError: 91 | pass 92 | df.to_csv(csvfile, index=False) 93 | 94 | if len(df['pivot']) > 0: 95 | pivot_plot(df, outbase) 96 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/gbm_targets_small.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | from selection.algorithms.lasso import ROSI 10 | 11 | from selection.learning.Rutils import lasso_glmnet 12 | from selection.learning.utils import full_model_inference, pivot_plot 13 | from selection.learning.core import normal_sampler, gbm_fit 14 | 15 | def simulate(n=100, p=30, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000): 16 | 17 | # description of statistical problem 18 | 19 | X, y, truth = gaussian_instance(n=n, 20 | p=p, 21 | s=s, 22 | equicorrelated=False, 23 | rho=0.5, 24 | sigma=sigma, 25 | signal=signal, 26 | random_signs=True, 27 | scale=False)[:3] 28 | 29 | dispersion = sigma**2 30 | 
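# Gaussian model for the sufficient statistic X^T y: mean S, covariance dispersion * X^T X (what normal_sampler below presumably draws from)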
31 | S = X.T.dot(y) 32 | covS = dispersion * X.T.dot(X) 33 | smooth_sampler = normal_sampler(S, covS) 34 | 35 | def meta_algorithm(XTX, XTXi, lam, sampler): 36 | 37 | p = XTX.shape[0] 38 | success = np.zeros(p) 39 | 40 | loss = rr.quadratic_loss((p,), Q=XTX) 41 | pen = rr.l1norm(p, lagrange=lam) 42 | 43 | scale = 0.5 44 | noisy_S = sampler(scale=scale) 45 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 46 | problem = rr.simple_problem(loss, pen) 47 | soln = problem.solve(max_its=50, tol=1.e-6) 48 | success += soln != 0 49 | return set(np.nonzero(success)[0]) 50 | 51 | XTX = X.T.dot(X) 52 | XTXi = np.linalg.inv(XTX) 53 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 54 | dispersion = np.linalg.norm(resid)**2 / (n-p) 55 | 56 | lam = 4. * np.sqrt(n) 57 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 58 | 59 | # run selection algorithm 60 | 61 | return full_model_inference(X, 62 | y, 63 | truth, 64 | selection_algorithm, 65 | smooth_sampler, 66 | success_params=(1, 1), 67 | B=B, 68 | fit_probability=gbm_fit, 69 | fit_args={}) 70 | 71 | if __name__ == "__main__": 72 | import statsmodels.api as sm 73 | import matplotlib.pyplot as plt 74 | import pandas as pd 75 | 76 | U = np.linspace(0, 1, 101) 77 | plt.clf() 78 | 79 | for i in range(500): 80 | for B in [5000]: 81 | print(B) 82 | df = simulate(B=B) 83 | csvfile = 'gbm_targets_small.csv' 84 | outbase = csvfile[:-4] 85 | 86 | if df is not None and i > 0: 87 | 88 | try: 89 | df = pd.concat([df, pd.read_csv(csvfile)]) 90 | except FileNotFoundError: 91 | pass 92 | df.to_csv(csvfile, index=False) 93 | 94 | if len(df['pivot']) > 0: 95 | pivot_plot(df, outbase) 96 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/lasso_example_multi.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import split_sampler, keras_fit 12 | 13 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000): 14 | 15 | # description of statistical problem 16 | 17 | X, y, truth = gaussian_instance(n=n, 18 | p=p, 19 | s=s, 20 | equicorrelated=False, 21 | rho=0.5, 22 | sigma=sigma, 23 | signal=signal, 24 | random_signs=True, 25 | scale=False)[:3] 26 | 27 | 28 | dispersion = sigma**2 29 | 30 | S = X.T.dot(y) 31 | covS = dispersion * X.T.dot(X) 32 | splitting_sampler = split_sampler(X * y[:, None], covS) 33 | 34 | def meta_algorithm(XTX, XTXi, lam, sampler): 35 | 36 | p = XTX.shape[0] 37 | success = np.zeros(p) 38 | 39 | loss = rr.quadratic_loss((p,), Q=XTX) 40 | pen = rr.l1norm(p, lagrange=lam) 41 | 42 | scale = 0. 43 | noisy_S = sampler(scale=scale) 44 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 45 | problem = rr.simple_problem(loss, pen) 46 | soln = problem.solve(max_its=100, tol=1.e-10) 47 | success += soln != 0 48 | return set(np.nonzero(success)[0]) 49 | 50 | XTX = X.T.dot(X) 51 | XTXi = np.linalg.inv(XTX) 52 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 53 | dispersion = np.linalg.norm(resid)**2 / (n-p) 54 | 55 | lam = 4. 
* np.sqrt(n) 56 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 57 | 58 | # run selection algorithm 59 | 60 | return full_model_inference(X, 61 | y, 62 | truth, 63 | selection_algorithm, 64 | splitting_sampler, 65 | success_params=(1, 1), 66 | B=B, 67 | fit_probability=keras_fit, 68 | fit_args={'epochs':10, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) 69 | 70 | if __name__ == "__main__": 71 | import statsmodels.api as sm 72 | import matplotlib.pyplot as plt 73 | import pandas as pd 74 | 75 | for i in range(2000): 76 | df = simulate(B=2000) 77 | csvfile = 'lasso_multi.csv' 78 | outbase = csvfile[:-4] 79 | 80 | if df is not None and i > 0: 81 | 82 | try: # concatenate to disk 83 | df = pd.concat([df, pd.read_csv(csvfile)]) 84 | except FileNotFoundError: 85 | pass 86 | df.to_csv(csvfile, index=False) 87 | 88 | if len(df['pivot']) > 0: 89 | pivot_ax, length_ax = pivot_plot(df, outbase) 90 | 91 | 92 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/lasso_example_multi_CV.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import split_sampler, keras_fit 12 | from selection.learning.Rutils import lasso_glmnet 13 | 14 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): 15 | 16 | # description of statistical problem 17 | 18 | X, y, truth = gaussian_instance(n=n, 19 | p=p, 20 | s=s, 21 | equicorrelated=False, 22 | rho=0.5, 23 | sigma=sigma, 24 | signal=signal, 25 | random_signs=True, 26 | scale=False)[:3] 27 | 28 | dispersion = sigma**2 29 | 30 | S = X.T.dot(y) 31 | covS = dispersion * X.T.dot(X) 32 | splitting_sampler = split_sampler(X * y[:, None], covS) 33 | 34 | def meta_algorithm(X, XTXi, resid, sampler): 35 | 36 | S = sampler(scale=0.) 
# deterministic with scale=0 37 | ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X 38 | G = lasso_glmnet(X, ynew, *[None]*4) 39 | select = G.select() 40 | return set(list(select[0])) 41 | 42 | XTX = X.T.dot(X) 43 | XTXi = np.linalg.inv(XTX) 44 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 45 | dispersion = np.linalg.norm(resid)**2 / (n-p) 46 | 47 | selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) 48 | 49 | # run selection algorithm 50 | 51 | return full_model_inference(X, 52 | y, 53 | truth, 54 | selection_algorithm, 55 | splitting_sampler, 56 | success_params=(1, 1), 57 | B=B, 58 | fit_probability=keras_fit, 59 | fit_args={'epochs':10, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) 60 | 61 | if __name__ == "__main__": 62 | import statsmodels.api as sm 63 | import matplotlib.pyplot as plt 64 | import pandas as pd 65 | 66 | U = np.linspace(0, 1, 101) 67 | plt.clf() 68 | 69 | for i in range(500): 70 | df = simulate() 71 | csvfile = 'lasso_multi_CV.csv' 72 | outbase = csvfile[:-4] 73 | 74 | if df is not None: 75 | 76 | try: 77 | df = pd.concat([df, pd.read_csv(csvfile)]) 78 | except FileNotFoundError: 79 | pass 80 | df.to_csv(csvfile, index=False) 81 | 82 | if len(df['pivot']) > 0: 83 | pivot_plot(df, outbase) 84 | 85 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/lasso_example_multi_CV_stronger.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import normal_sampler, split_sampler, keras_fit 12 | from selection.learning.Rutils import cv_glmnet_lam, lasso_glmnet 13 | 14 | def simulate(n=200, p=100, s=10, signal=(1.5, 2), sigma=2, alpha=0.1, B=3000): 15 | 16 | # description of statistical problem 17 | 18 | X, y, truth = gaussian_instance(n=n, 19 | p=p, 20 | s=s, 21 | equicorrelated=False, 22 | rho=0.5, 23 | sigma=sigma, 24 | signal=signal, 25 | random_signs=True, 26 | scale=False)[:3] 27 | 28 | dispersion = sigma**2 29 | 30 | S = X.T.dot(y) 31 | covS = dispersion * X.T.dot(X) 32 | smooth_sampler = normal_sampler(S, covS) 33 | splitting_sampler = split_sampler(X * y[:, None], covS) 34 | 35 | def meta_algorithm(X, XTXi, resid, sampler): 36 | 37 | S = sampler(scale=0.) # deterministic with scale=0
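# with scale=0. the sampler just returns the observed sufficient statistic S = X'y,
# so X.dot(XTXi).dot(S) below is the least-squares fit and adding the observed
# residuals reconstructs y itself; with scale > 0 the same line would build a
# randomized response for glmnet to run its cross-validated lasso on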
38 | ynew = X.dot(XTXi).dot(S) + resid # will be ok for n>p and non-degen X 39 | G = lasso_glmnet(X, ynew, *[None]*4) 40 | select = G.select() 41 | return set(list(select[0])) 42 | 43 | XTX = X.T.dot(X) 44 | XTXi = np.linalg.inv(XTX) 45 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 46 | dispersion = np.linalg.norm(resid)**2 / (n-p) 47 | 48 | selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid) 49 | 50 | # run selection algorithm 51 | 52 | return full_model_inference(X, 53 | y, 54 | truth, 55 | selection_algorithm, 56 | splitting_sampler, 57 | success_params=(1, 1), 58 | B=B, 59 | fit_probability=keras_fit, 60 | fit_args={'epochs':10, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) 61 | 62 | if __name__ == "__main__": 63 | import statsmodels.api as sm 64 | import matplotlib.pyplot as plt 65 | import pandas as pd 66 | 67 | U = np.linspace(0, 1, 101) 68 | plt.clf() 69 | 70 | for i in range(500): 71 | df = simulate() 72 | csvfile = 'lasso_multi_CV_stronger.csv' 73 | outbase = csvfile[:-4] 74 | 75 | if df is not None and i > 0: 76 | 77 | try: 78 | df = pd.concat([df, pd.read_csv(csvfile)]) 79 | except FileNotFoundError: 80 | pass 81 | df.to_csv(csvfile, index=False) 82 | 83 | if len(df['pivot']) > 0: 84 | pivot_plot(df, outbase) 85 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/lasso_example_multi_bigger.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import normal_sampler, split_sampler, logit_fit 12 | 13 | def simulate(n=2000, p=1000, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=4000): 14 | 15 | # description of statistical problem 16 | 17 | X, y, truth = gaussian_instance(n=n, 18 | p=p, 19 | s=s, 20 | equicorrelated=False, 21 | rho=0.5, 22 | sigma=sigma, 23 | signal=signal, 24 | random_signs=True, 25 | scale=False)[:3] 26 | 27 | dispersion = sigma**2 28 | 29 | S = X.T.dot(y) 30 | covS = dispersion * X.T.dot(X) 31 | smooth_sampler = normal_sampler(S, covS) 32 | splitting_sampler = split_sampler(X * y[:, None], covS) 33 | 34 | def meta_algorithm(XTX, XTXi, lam, sampler): 35 | 36 | p = XTX.shape[0] 37 | success = np.zeros(p) 38 | 39 | loss = rr.quadratic_loss((p,), Q=XTX) 40 | pen = rr.l1norm(p, lagrange=lam) 41 | 42 | scale = 0. 43 | noisy_S = sampler(scale=scale) 44 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 45 | problem = rr.simple_problem(loss, pen) 46 | soln = problem.solve(max_its=100, tol=1.e-10) 47 | success += soln != 0 48 | return set(np.nonzero(success)[0]) 49 | 50 | XTX = X.T.dot(X) 51 | XTXi = np.linalg.inv(XTX) 52 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 53 | dispersion = np.linalg.norm(resid)**2 / (n-p) 54 | 55 | lam = 5.
* np.sqrt(n) 56 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 57 | 58 | # run selection algorithm 59 | 60 | return full_model_inference(X, 61 | y, 62 | truth, 63 | selection_algorithm, 64 | splitting_sampler, 65 | success_params=(1, 1), 66 | B=B, 67 | fit_probability=logit_fit, 68 | fit_args={'df':20}) 69 | 70 | 71 | if __name__ == "__main__": 72 | import statsmodels.api as sm 73 | import matplotlib.pyplot as plt 74 | import pandas as pd 75 | 76 | U = np.linspace(0, 1, 101) 77 | plt.clf() 78 | 79 | for i in range(500): 80 | df = simulate(B=4000) 81 | csvfile = 'lasso_multi_bigger.csv' 82 | outbase = csvfile[:-4] 83 | 84 | if df is not None and i > 0: 85 | 86 | try: # concatenate to disk 87 | df = pd.concat([df, pd.read_csv(csvfile)]) 88 | except FileNotFoundError: 89 | pass 90 | df.to_csv(csvfile, index=False) 91 | 92 | if len(df['pivot']) > 0: 93 | pivot_ax, length_ax = pivot_plot(df, outbase) 94 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/lasso_example_multi_gbm.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import split_sampler, gbm_fit 12 | 13 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): 14 | 15 | # description of statistical problem 16 | 17 | X, y, truth = gaussian_instance(n=n, 18 | p=p, 19 | s=s, 20 | equicorrelated=False, 21 | rho=0.5, 22 | sigma=sigma, 23 | signal=signal, 24 | random_signs=True, 25 | scale=False)[:3] 26 | 27 | dispersion = sigma**2 28 | 29 | S = X.T.dot(y) 30 | covS = dispersion * X.T.dot(X) 31 | splitting_sampler = split_sampler(X * y[:, None], covS) 32 | 33 | def meta_algorithm(XTX, XTXi, lam, sampler): 34 | 35 | p = XTX.shape[0] 36 | success = np.zeros(p) 37 | 38 | loss = rr.quadratic_loss((p,), Q=XTX) 39 | pen = rr.l1norm(p, lagrange=lam) 40 | 41 | scale = 0. 42 | noisy_S = sampler(scale=scale) 43 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 44 | problem = rr.simple_problem(loss, pen) 45 | soln = problem.solve(max_its=100, tol=1.e-10) 46 | success += soln != 0 47 | return set(np.nonzero(success)[0]) 48 | 49 | XTX = X.T.dot(X) 50 | XTXi = np.linalg.inv(XTX) 51 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 52 | dispersion = np.linalg.norm(resid)**2 / (n-p) 53 | 54 | lam = 4. 
* np.sqrt(n) 55 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 56 | 57 | # run selection algorithm 58 | 59 | return full_model_inference(X, 60 | y, 61 | truth, 62 | selection_algorithm, 63 | splitting_sampler, 64 | success_params=(1, 1), 65 | B=B, 66 | fit_probability=gbm_fit, 67 | fit_args={'ntrees':5000}) 68 | 69 | 70 | if __name__ == "__main__": 71 | import statsmodels.api as sm 72 | import matplotlib.pyplot as plt 73 | import pandas as pd 74 | 75 | U = np.linspace(0, 1, 101) 76 | plt.clf() 77 | 78 | for i in range(500): 79 | df = simulate() 80 | csvfile = 'lasso_multi_gbm.csv' 81 | outbase = csvfile[:-4] 82 | 83 | if df is not None and i > 0: 84 | 85 | try: # concatenate to disk 86 | df = pd.concat([df, pd.read_csv(csvfile)]) 87 | except FileNotFoundError: 88 | pass 89 | df.to_csv(csvfile, index=False) 90 | 91 | if len(df['pivot']) > 0: 92 | pivot_ax, length_ax = pivot_plot(df, outbase) 93 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/lasso_example_multi_gbm_sk.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import split_sampler, gbm_fit_sk 12 | 13 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000): 14 | 15 | # description of statistical problem 16 | 17 | X, y, truth = gaussian_instance(n=n, 18 | p=p, 19 | s=s, 20 | equicorrelated=False, 21 | rho=0.5, 22 | sigma=sigma, 23 | signal=signal, 24 | random_signs=True, 25 | scale=False)[:3] 26 | 27 | dispersion = sigma**2 28 | 29 | S = X.T.dot(y) 30 | covS = dispersion * X.T.dot(X) 31 | splitting_sampler = split_sampler(X * y[:, None], covS) 32 | 33 | def meta_algorithm(XTX, XTXi, lam, sampler): 34 | 35 | p = XTX.shape[0] 36 | success = np.zeros(p) 37 | 38 | loss = rr.quadratic_loss((p,), Q=XTX) 39 | pen = rr.l1norm(p, lagrange=lam) 40 | 41 | scale = 0. 42 | noisy_S = sampler(scale=scale) 43 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 44 | problem = rr.simple_problem(loss, pen) 45 | soln = problem.solve(max_its=100, tol=1.e-10) 46 | success += soln != 0 47 | return set(np.nonzero(success)[0]) 48 | 49 | XTX = X.T.dot(X) 50 | XTXi = np.linalg.inv(XTX) 51 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 52 | dispersion = np.linalg.norm(resid)**2 / (n-p) 53 | 54 | lam = 4.
* np.sqrt(n) 55 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 56 | 57 | # run selection algorithm 58 | 59 | return full_model_inference(X, 60 | y, 61 | truth, 62 | selection_algorithm, 63 | splitting_sampler, 64 | success_params=(1, 1), 65 | B=B, 66 | fit_probability=gbm_fit_sk, 67 | fit_args={'n_estimators':1000}) 68 | 69 | 70 | if __name__ == "__main__": 71 | import statsmodels.api as sm 72 | import matplotlib.pyplot as plt 73 | import pandas as pd 74 | 75 | U = np.linspace(0, 1, 101) 76 | plt.clf() 77 | 78 | for i in range(500): 79 | df = simulate() 80 | csvfile = 'lasso_multi_gbm_sk.csv' 81 | outbase = csvfile[:-4] 82 | 83 | if df is not None and i > 0: 84 | 85 | try: # concatenate to disk 86 | df = pd.concat([df, pd.read_csv(csvfile)]) 87 | except FileNotFoundError: 88 | pass 89 | df.to_csv(csvfile, index=False) 90 | 91 | if len(df['pivot']) > 0: 92 | pivot_ax, length_ax = pivot_plot(df, outbase) 93 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/lasso_example_multi_random.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import normal_sampler, keras_fit 12 | 13 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): 14 | 15 | # description of statistical problem 16 | 17 | X, y, truth = gaussian_instance(n=n, 18 | p=p, 19 | s=s, 20 | equicorrelated=False, 21 | rho=0.5, 22 | sigma=sigma, 23 | signal=signal, 24 | random_signs=True, 25 | scale=False)[:3] 26 | 27 | dispersion = sigma**2 28 | 29 | S = X.T.dot(y) 30 | covS = dispersion * X.T.dot(X) 31 | smooth_sampler = normal_sampler(S, covS) 32 | 33 | def meta_algorithm(XTX, XTXi, lam, sampler): 34 | 35 | p = XTX.shape[0] 36 | success = np.zeros(p) 37 | 38 | loss = rr.quadratic_loss((p,), Q=XTX) 39 | pen = rr.l1norm(p, lagrange=lam) 40 | 41 | scale = 0.5 42 | noisy_S = sampler(scale=scale) 43 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 44 | problem = rr.simple_problem(loss, pen) 45 | soln = problem.solve(max_its=100, tol=1.e-10) 46 | success += soln != 0 47 | return set(np.nonzero(success)[0]) 48 | 49 | XTX = X.T.dot(X) 50 | XTXi = np.linalg.inv(XTX) 51 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 52 | dispersion = np.linalg.norm(resid)**2 / (n-p) 53 | 54 | lam = 4. 
* np.sqrt(n) 55 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 56 | 57 | # run selection algorithm 58 | 59 | return full_model_inference(X, 60 | y, 61 | truth, 62 | selection_algorithm, 63 | smooth_sampler, 64 | success_params=(1, 1), 65 | B=B, 66 | fit_probability=keras_fit, 67 | fit_args={'epochs':20, 'sizes':[100]*5, 'dropout':0., 'activation':'relu'}) 68 | 69 | 70 | if __name__ == "__main__": 71 | import statsmodels.api as sm 72 | import matplotlib.pyplot as plt 73 | import pandas as pd 74 | 75 | U = np.linspace(0, 1, 101) 76 | plt.clf() 77 | 78 | for i in range(500): 79 | df = simulate() 80 | csvfile = 'lasso_multi_random.csv' 81 | outbase = csvfile[:-4] 82 | 83 | if df is not None and i > 0: 84 | 85 | try: # concatenate to disk 86 | df = pd.concat([df, pd.read_csv(csvfile)]) 87 | except FileNotFoundError: 88 | pass 89 | df.to_csv(csvfile, index=False) 90 | 91 | if len(df['pivot']) > 0: 92 | pivot_ax, length_ax = pivot_plot(df, outbase) 93 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/lasso_example_multi_random_gbm.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import normal_sampler, split_sampler, gbm_fit 12 | 13 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): 14 | 15 | # description of statistical problem 16 | 17 | X, y, truth = gaussian_instance(n=n, 18 | p=p, 19 | s=s, 20 | equicorrelated=False, 21 | rho=0.5, 22 | sigma=sigma, 23 | signal=signal, 24 | random_signs=True, 25 | scale=False)[:3] 26 | 27 | dispersion = sigma**2 28 | 29 | S = X.T.dot(y) 30 | covS = dispersion * X.T.dot(X) 31 | smooth_sampler = normal_sampler(S, covS) 32 | splitting_sampler = split_sampler(X * y[:, None], covS) 33 | 34 | def meta_algorithm(XTX, XTXi, lam, sampler): 35 | 36 | p = XTX.shape[0] 37 | success = np.zeros(p) 38 | 39 | loss = rr.quadratic_loss((p,), Q=XTX) 40 | pen = rr.l1norm(p, lagrange=lam) 41 | 42 | scale = 0.5 43 | noisy_S = sampler(scale=scale) 44 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 45 | problem = rr.simple_problem(loss, pen) 46 | soln = problem.solve(max_its=100, tol=1.e-10) 47 | success += soln != 0 48 | return set(np.nonzero(success)[0]) 49 | 50 | XTX = X.T.dot(X) 51 | XTXi = np.linalg.inv(XTX) 52 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 53 | dispersion = np.linalg.norm(resid)**2 / (n-p) 54 | 55 | lam = 4. * np.sqrt(n)
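# note the contrast with the scale=0. examples above: here meta_algorithm uses
# scale=0.5, so the lasso is solved on a perturbed sufficient statistic and the
# selection event itself is randomized, while lam stays a fixed multiple of sqrt(n)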
56 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 57 | 58 | # run selection algorithm 59 | 60 | return full_model_inference(X, 61 | y, 62 | truth, 63 | selection_algorithm, 64 | smooth_sampler, 65 | success_params=(1, 1), 66 | B=B, 67 | fit_probability=gbm_fit, 68 | fit_args={'ntrees':5000}) 69 | 70 | if __name__ == "__main__": 71 | import statsmodels.api as sm 72 | import matplotlib.pyplot as plt 73 | import pandas as pd 74 | 75 | U = np.linspace(0, 1, 101) 76 | plt.clf() 77 | 78 | for i in range(500): 79 | df = simulate() 80 | csvfile = 'lasso_multi_random_gbm.csv' 81 | outbase = csvfile[:-4] 82 | 83 | if df is not None and i > 0: 84 | 85 | try: # concatenate to disk 86 | df = pd.concat([df, pd.read_csv(csvfile)]) 87 | except FileNotFoundError: 88 | pass 89 | df.to_csv(csvfile, index=False) 90 | 91 | if len(df['pivot']) > 0: 92 | pivot_ax, length_ax = pivot_plot(df, outbase) 93 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/lasso_example_multi_random_rf.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import normal_sampler, split_sampler, random_forest_fit 12 | 13 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000): 14 | 15 | # description of statistical problem 16 | 17 | X, y, truth = gaussian_instance(n=n, 18 | p=p, 19 | s=s, 20 | equicorrelated=False, 21 | rho=0.5, 22 | sigma=sigma, 23 | signal=signal, 24 | random_signs=True, 25 | scale=False)[:3] 26 | 27 | dispersion = sigma**2 28 | 29 | S = X.T.dot(y) 30 | covS = dispersion * X.T.dot(X) 31 | smooth_sampler = normal_sampler(S, covS) 32 | splitting_sampler = split_sampler(X * y[:, None], covS) 33 | 34 | def meta_algorithm(XTX, XTXi, lam, sampler): 35 | 36 | p = XTX.shape[0] 37 | success = np.zeros(p) 38 | 39 | loss = rr.quadratic_loss((p,), Q=XTX) 40 | pen = rr.l1norm(p, lagrange=lam) 41 | 42 | scale = 0.5 43 | noisy_S = sampler(scale=scale) 44 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 45 | problem = rr.simple_problem(loss, pen) 46 | soln = problem.solve(max_its=100, tol=1.e-10) 47 | success += soln != 0 48 | return set(np.nonzero(success)[0]) 49 | 50 | XTX = X.T.dot(X) 51 | XTXi = np.linalg.inv(XTX) 52 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 53 | dispersion = np.linalg.norm(resid)**2 / (n-p) 54 | 55 | lam = 4. * np.sqrt(n)
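# meta_algorithm returns the active set of the noisy lasso solution as a Python
# set; full_model_inference re-runs it on repeated draws from the sampler and
# fits a classifier (a random forest here) to learn the selection probability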
56 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 57 | 58 | # run selection algorithm 59 | 60 | return full_model_inference(X, 61 | y, 62 | truth, 63 | selection_algorithm, 64 | smooth_sampler, 65 | success_params=(1, 1), 66 | B=B, 67 | fit_probability=random_forest_fit, 68 | fit_args={'ntrees':5000}) 69 | 70 | if __name__ == "__main__": 71 | import statsmodels.api as sm 72 | import matplotlib.pyplot as plt 73 | import pandas as pd 74 | 75 | U = np.linspace(0, 1, 101) 76 | plt.clf() 77 | 78 | for i in range(500): 79 | df = simulate() 80 | csvfile = 'lasso_multi_random_rf.csv' 81 | outbase = csvfile[:-4] 82 | 83 | if df is not None and i > 0: 84 | 85 | try: # concatenate to disk 86 | df = pd.concat([df, pd.read_csv(csvfile)]) 87 | except FileNotFoundError: 88 | pass 89 | df.to_csv(csvfile, index=False) 90 | 91 | if len(df['pivot']) > 0: 92 | pivot_ax, length_ax = pivot_plot(df, outbase) 93 | -------------------------------------------------------------------------------- /doc/learning_examples/multi_target/lasso_example_multi_rf_sk.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | 6 | import regreg.api as rr 7 | 8 | from selection.tests.instance import gaussian_instance 9 | 10 | from selection.learning.utils import full_model_inference, pivot_plot 11 | from selection.learning.core import normal_sampler, split_sampler, random_forest_fit_sk 12 | 13 | def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000): 14 | 15 | # description of statistical problem 16 | 17 | X, y, truth = gaussian_instance(n=n, 18 | p=p, 19 | s=s, 20 | equicorrelated=False, 21 | rho=0.5, 22 | sigma=sigma, 23 | signal=signal, 24 | random_signs=True, 25 | scale=False)[:3] 26 | 27 | dispersion = sigma**2 28 | 29 | S = X.T.dot(y) 30 | covS = dispersion * X.T.dot(X) 31 | smooth_sampler = normal_sampler(S, covS) 32 | splitting_sampler = split_sampler(X * y[:, None], covS) 33 | 34 | def meta_algorithm(XTX, XTXi, lam, sampler): 35 | 36 | p = XTX.shape[0] 37 | success = np.zeros(p) 38 | 39 | loss = rr.quadratic_loss((p,), Q=XTX) 40 | pen = rr.l1norm(p, lagrange=lam) 41 | 42 | scale = 0. 43 | noisy_S = sampler(scale=scale) 44 | loss.quadratic = rr.identity_quadratic(0, 0, -noisy_S, 0) 45 | problem = rr.simple_problem(loss, pen) 46 | soln = problem.solve(max_its=100, tol=1.e-10) 47 | success += soln != 0 48 | return set(np.nonzero(success)[0]) 49 | 50 | XTX = X.T.dot(X) 51 | XTXi = np.linalg.inv(XTX) 52 | resid = y - X.dot(XTXi.dot(X.T.dot(y))) 53 | dispersion = np.linalg.norm(resid)**2 / (n-p) 54 | 55 | lam = 4. * np.sqrt(n)
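# a rough scaling note: the instance is drawn with scale=False, so the columns
# of X have length of order sqrt(n) and lam = 4 * sqrt(n) is on the scale of the
# entries of X'(noise), a fixed Lagrange parameter rather than a tuned one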
56 | selection_algorithm = functools.partial(meta_algorithm, XTX, XTXi, lam) 57 | 58 | # run selection algorithm 59 | 60 | 61 | 62 | return full_model_inference(X, 63 | y, 64 | truth, 65 | selection_algorithm, 66 | splitting_sampler, 67 | success_params=(1, 1), 68 | B=B, 69 | fit_probability=random_forest_fit_sk, 70 | fit_args={'n_estimators':5000}) 71 | 72 | 73 | if __name__ == "__main__": 74 | import statsmodels.api as sm 75 | import matplotlib.pyplot as plt 76 | import pandas as pd 77 | 78 | U = np.linspace(0, 1, 101) 79 | plt.clf() 80 | 81 | for i in range(500): 82 | df = simulate() 83 | csvfile = 'lasso_multi_rf_sk.csv' 84 | outbase = csvfile[:-4] 85 | 86 | if df is not None and i > 0: 87 | 88 | try: 89 | df = pd.concat([df, pd.read_csv(csvfile)]) 90 | except FileNotFoundError: 91 | pass 92 | df.to_csv(csvfile, index=False) 93 | 94 | if len(df['pivot']) > 0: 95 | pivot_plot(df, outbase) 96 | 97 | -------------------------------------------------------------------------------- /doc/learning_examples/standalone/cleaner_basic_example.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from selection.learning.core import (infer_general_target, 4 | normal_sampler, 5 | logit_fit, 6 | probit_fit) 7 | 8 | def simulate(n=100): 9 | 10 | # description of statistical problem 11 | 12 | truth = np.array([2. , -2.]) / np.sqrt(n) 13 | 14 | data = np.random.standard_normal((n, 2)) + np.multiply.outer(np.ones(n), truth) 15 | S = np.mean(data, 0) 16 | observed_sampler = normal_sampler(S, 1/n * np.identity(2)) 17 | 18 | def selection_algorithm(sampler): 19 | min_success = 1 20 | ntries = 3 21 | success = 0 22 | for _ in range(ntries): 23 | noisyS = sampler(scale=0.5) 24 | success += noisyS.sum() > 0.2 / np.sqrt(n) 25 | return success >= min_success 26 | 27 | # run selection algorithm 28 | 29 | observed_outcome = selection_algorithm(observed_sampler) 30 | 31 | # find the target, based on the observed outcome 32 | 33 | if observed_outcome: # target is truth[0] 34 | (true_target, 35 | observed_target, 36 | target_cov, 37 | cross_cov) = (truth[0], 38 | S[0], 39 | 1./n * np.identity(1), 40 | np.array([1., 0.]).reshape((2,1)) / n) 41 | else: 42 | (true_target, 43 | observed_target, 44 | target_cov, 45 | cross_cov) = (truth[1], 46 | S[1], 47 | 1./n * np.identity(1), 48 | np.array([0., 1.]).reshape((2,1)) / n) 49 | 50 | pivot, interval = infer_general_target(selection_algorithm, 51 | observed_outcome, 52 | observed_sampler, 53 | observed_target, 54 | cross_cov, 55 | target_cov, 56 | hypothesis=true_target, 57 | fit_probability=probit_fit)[:2] 58 | 59 | return pivot, (interval[0] < true_target) * (interval[1] > true_target), interval[1] - interval[0] 60 | 61 | if __name__ == "__main__": 62 | import statsmodels.api as sm 63 | import matplotlib.pyplot as plt 64 | 65 | n = 100 66 | U = np.linspace(0, 1, 101) 67 | P, L = [], [] 68 | plt.clf() 69 | coverage = 0 70 | for i in range(300): 71 | p, cover, l = simulate(n=n) 72 | coverage += cover 73 | P.append(p) 74 | L.append(l) 75 | print(np.mean(P), np.std(P), np.mean(L) / (2 * 1.65 / np.sqrt(n)), coverage / (i+1)) 76 | 77 | plt.clf() 78 | plt.plot(U, sm.distributions.ECDF(P)(U), 'r', linewidth=3) 79 | plt.plot([0,1], [0,1], 'k--', linewidth=2) 80 | plt.show() 81 | -------------------------------------------------------------------------------- /doc/learning_examples/standalone/full_model_example.py:
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from selection.learning.core import (infer_full_target, 3 | normal_sampler, 4 | logit_fit, 5 | probit_fit) 6 | 7 | def simulate(n=100): 8 | 9 | # description of statistical problem 10 | 11 | truth = np.array([2. , -2.]) / np.sqrt(n) 12 | 13 | dispersion = 2 14 | data = np.sqrt(dispersion) * np.random.standard_normal((n, 2)) + np.multiply.outer(np.ones(n), truth) 15 | S = np.sum(data, 0) 16 | observed_sampler = normal_sampler(S, dispersion * n * np.identity(2)) 17 | 18 | def selection_algorithm(sampler): 19 | min_success = 1 20 | ntries = 3 21 | success = 0 22 | for _ in range(ntries): 23 | noisyS = sampler(scale=0.5) 24 | success += noisyS.sum() > 0.2 * np.sqrt(n) * np.sqrt(dispersion) 25 | if success >= min_success: 26 | return set([1, 0]) 27 | return set([1]) 28 | 29 | # run selection algorithm 30 | 31 | observed_set = selection_algorithm(observed_sampler) 32 | 33 | # find the target, based on the observed outcome 34 | 35 | # we just take the first target 36 | 37 | pivots, covered, lengths = [], [], [] 38 | for idx in observed_set: 39 | true_target = truth[idx] 40 | 41 | pivot, interval = infer_full_target(selection_algorithm, 42 | observed_set, 43 | [idx], 44 | observed_sampler, 45 | dispersion, 46 | hypothesis=[true_target], 47 | fit_probability=probit_fit)[0][:2] 48 | 49 | pivots.append(pivot) 50 | covered.append((interval[0] < true_target) * (interval[1] > true_target)) 51 | lengths.append(interval[1] - interval[0]) 52 | 53 | return pivots, covered, lengths 54 | 55 | if __name__ == "__main__": 56 | import statsmodels.api as sm 57 | import matplotlib.pyplot as plt 58 | 59 | n = 100 60 | U = np.linspace(0, 1, 101) 61 | P, L, coverage = [], [], [] 62 | plt.clf() 63 | for i in range(300): 64 | p, cover, l = simulate(n=n) 65 | coverage.extend(cover) 66 | P.extend(p) 67 | L.extend(l) 68 | print(np.mean(P), np.std(P), np.mean(L) / (2 * 1.65 / np.sqrt(n)), np.mean(coverage)) 69 | 70 | plt.clf() 71 | plt.plot(U, sm.distributions.ECDF(P)(U), 'r', linewidth=3) 72 | plt.plot([0,1], [0,1], 'k--', linewidth=2) 73 | plt.show() 74 | -------------------------------------------------------------------------------- /doc/license.rst: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Selective Inference development team 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * The names of any contributors to this software 17 | may not be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /doc/notebooks/learning/simple_example_pivots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/notebooks/learning/simple_example_pivots.pdf -------------------------------------------------------------------------------- /doc/notebooks/learning/simple_example_sel_prob.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/notebooks/learning/simple_example_sel_prob.pdf -------------------------------------------------------------------------------- /doc/source/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/_static/logo.png -------------------------------------------------------------------------------- /doc/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% set title = 'Selection' %} 3 | 4 | {% block rootrellink %} 5 |
  • Selection home
  • 6 | {% endblock %} 7 | 8 | 9 | {% block extrahead %} 10 | 11 | {% endblock %} 12 | 13 | {% block header %} 14 |
    15 | 16 | Selection logo

    Post-selection inference

    17 |
    18 | {% endblock %} 19 | 20 | {# This block gets put at the top of the sidebar #} 21 | {% block sidebarlogo %} 22 | {% endblock %} 23 | 24 |

    Site Navigation

    25 |
    29 | 30 | {# I had to copy the whole search block just to change the rendered text, 31 | so it doesn't mention modules or classes #} 32 | {%- block sidebarsearch %} 33 | {%- if pagename != "search" %} 34 | 35 | 46 | 47 | 48 | {%- endif %} 49 | 50 | {# The sidebarsearch block is the last one available in the default sidebar() 51 | macro, so the only way to add something to the bottom of the sidebar is to 52 | put it here, at the end of the sidebarsearch block (before it closes). 53 | #} 54 | 55 | {%- endblock %} 56 | -------------------------------------------------------------------------------- /doc/source/algorithms/index.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | Non-randomized algorithms 3 | ========================= 4 | 5 | This is a project that collects various tools for 6 | post-selection inference. 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | 11 | covtest.ipynb 12 | spacings 13 | -------------------------------------------------------------------------------- /doc/source/algorithms/spacings_files/spacings_23_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/algorithms/spacings_files/spacings_23_0.png -------------------------------------------------------------------------------- /doc/source/algorithms/spacings_files/spacings_25_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/algorithms/spacings_files/spacings_25_0.png -------------------------------------------------------------------------------- /doc/source/algorithms/spacings_files/spacings_27_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/algorithms/spacings_files/spacings_27_0.png -------------------------------------------------------------------------------- /doc/source/algorithms/spacings_files/spacings_29_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/algorithms/spacings_files/spacings_29_0.png -------------------------------------------------------------------------------- /doc/source/algorithms/spacings_files/spacings_31_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/algorithms/spacings_files/spacings_31_0.png -------------------------------------------------------------------------------- /doc/source/algorithms/spacings_files/spacings_3_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/algorithms/spacings_files/spacings_3_0.png -------------------------------------------------------------------------------- /doc/source/algorithms/spacings_files/spacings_4_0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/algorithms/spacings_files/spacings_4_0.png -------------------------------------------------------------------------------- /doc/source/algorithms/spacings_files/spacings_5_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/algorithms/spacings_files/spacings_5_0.png -------------------------------------------------------------------------------- /doc/source/algorithms/spacings_files/spacings_6_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/algorithms/spacings_files/spacings_6_0.png -------------------------------------------------------------------------------- /doc/source/algorithms/spacings_files/spacings_7_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/algorithms/spacings_files/spacings_7_0.png -------------------------------------------------------------------------------- /doc/source/algorithms/spacings_files/spacings_9_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/doc/source/algorithms/spacings_files/spacings_9_0.png -------------------------------------------------------------------------------- /doc/source/docattribute.rst: -------------------------------------------------------------------------------- 1 | .. _doc-attribute: 2 | 3 | Selection documentation attribution 4 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 5 | 6 | This website is based on the `NIPY project website `_, which is licensed under a `Creative Commons Attribution 3.0 License `_. 7 | 8 | We have licensed our own documentation using the same license, see :ref:`selectinf-license`. -------------------------------------------------------------------------------- /doc/source/documentation.rst: -------------------------------------------------------------------------------- 1 | .. _documentation-main: 2 | 3 | ============= 4 | Documentation 5 | ============= 6 | 7 | .. only:: html 8 | 9 | :Release: |version| 10 | :Date: |today| 11 | 12 | Download `PDF `_ 13 | 14 | Contents: 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :glob: 19 | 20 | download.rst 21 | license.rst 22 | api/index.rst 23 | docattribute.rst 24 | 25 | 26 | -------------------------------------------------------------------------------- /doc/source/download.rst: -------------------------------------------------------------------------------- 1 | .. 
_download: 2 | 3 | Downloading and installing the code 4 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 5 | 6 | The post-selection inference source code is hosted at 7 | 8 | http://github.com/selective-inference/Python-software 9 | 10 | Selection depends on the following Python tools 11 | 12 | * `NumPy `_ 13 | 14 | * `SciPy `_ 15 | 16 | * `Cython `_ 17 | 18 | * `Pandas `_ 19 | 20 | You can clone the selection repo using:: 21 | 22 | git clone https://github.com/selective-inference/Python-software.git 23 | 24 | Then installation is a simple call to python:: 25 | 26 | cd Python-software 27 | git submodule update --init 28 | pip install -r requirements.txt 29 | python setup.py install --prefix=MYDIR 30 | 31 | where MYDIR is a site-packages directory you can write to. This 32 | directory will need to be on your PYTHONPATH for you to import 33 | `selectinf`. That's it! 34 | 35 | Testing your installation 36 | ------------------------- 37 | 38 | There is a small but growing suite of tests that can be easily checked using `nose `_:: 39 | 40 | mkdir tmp 41 | cd tmp 42 | nosetests -v selectinf 43 | 44 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. _about_selection: 2 | 3 | ===================== 4 | The Selection project 5 | ===================== 6 | 7 | .. include:: ./links_names.txt 8 | 9 | This is a project that collects various tools for 10 | post-selection inference. 11 | 12 | 13 | .. toctree:: 14 | :maxdepth: 2 15 | 16 | documentation 17 | algorithms/index 18 | randomized/index 19 | learning/index 20 | 21 | 22 | Jonathan Taylor was funded by NSF in writing his portion of the 23 | software. As such, this material is based upon work supported by the 24 | National Science Foundation under Grant DMS 1208857, and by the AFOSR 25 | grant 113039. 26 | 27 | Any opinions, findings, and conclusions or recommendations expressed 28 | in this material are those of the author(s) and do not necessarily 29 | reflect the views of the National Science Foundation. 30 | 31 | .. 
include:: ../links_names.txt -------------------------------------------------------------------------------- /doc/source/learning/Learning1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | jupyter: 3 | jupytext: 4 | cell_metadata_filter: all,-slideshow 5 | formats: ipynb,Rmd 6 | text_representation: 7 | extension: .Rmd 8 | format_name: rmarkdown 9 | format_version: '1.1' 10 | jupytext_version: 1.1.1 11 | kernelspec: 12 | display_name: Python 3 13 | language: python 14 | name: python3 15 | --- 16 | 17 | # Learning 1 18 | 19 | ```{python} 20 | import numpy as np 21 | print('notebook 1') 22 | ``` 23 | 24 | ```{python collapsed=TRUE} 25 | 26 | ``` 27 | -------------------------------------------------------------------------------- /doc/source/learning/Learning1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Learning 1" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "notebook 1\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "import numpy as np\n", 25 | "print('notebook 1')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [] 36 | } 37 | ], 38 | "metadata": { 39 | "jupytext": { 40 | "cell_metadata_filter": "all,-slideshow", 41 | "formats": "ipynb,Rmd" 42 | }, 43 | "kernelspec": { 44 | "display_name": "Python 3", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.6.2" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 2 63 | } 64 | -------------------------------------------------------------------------------- /doc/source/learning/Learning2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | jupyter: 3 | jupytext: 4 | cell_metadata_filter: all,-slideshow 5 | formats: ipynb,Rmd 6 | text_representation: 7 | extension: .Rmd 8 | format_name: rmarkdown 9 | format_version: '1.1' 10 | jupytext_version: 1.1.1 11 | kernelspec: 12 | display_name: Python 3 13 | language: python 14 | name: python3 15 | --- 16 | 17 | # Learning 2 18 | 19 | ```{python} 20 | import numpy as np 21 | print('notebook 2') 22 | ``` 23 | 24 | ```{python collapsed=TRUE} 25 | 26 | ``` 27 | -------------------------------------------------------------------------------- /doc/source/learning/Learning2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Learning 2" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "notebook 2\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "import numpy as np\n", 25 | "print('notebook 2')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [] 36 | } 
37 | ], 38 | "metadata": { 39 | "jupytext": { 40 | "cell_metadata_filter": "all,-slideshow", 41 | "formats": "ipynb,Rmd" 42 | }, 43 | "kernelspec": { 44 | "display_name": "Python 3", 45 | "language": "python", 46 | "name": "python3" 47 | }, 48 | "language_info": { 49 | "codemirror_mode": { 50 | "name": "ipython", 51 | "version": 3 52 | }, 53 | "file_extension": ".py", 54 | "mimetype": "text/x-python", 55 | "name": "python", 56 | "nbconvert_exporter": "python", 57 | "pygments_lexer": "ipython3", 58 | "version": "3.6.2" 59 | } 60 | }, 61 | "nbformat": 4, 62 | "nbformat_minor": 2 63 | } 64 | -------------------------------------------------------------------------------- /doc/source/learning/index.rst: -------------------------------------------------------------------------------- 1 | Learning selection 2 | ------------------ 3 | 4 | This package illustrates examples in `Inference after selection through a black box `_ 5 | as well as generalizations based on learning multiparameter functions rather than the simple univariate 6 | case considered above. 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | 11 | Learning1.ipynb 12 | Learning2.ipynb -------------------------------------------------------------------------------- /doc/source/license.rst: -------------------------------------------------------------------------------- 1 | .. _selectinf-license: 2 | 3 | ======================================= 4 | Selective Inference License Information 5 | ======================================= 6 | 7 | .. _selectinf-software-license: 8 | 9 | Software License 10 | ----------------- 11 | 12 | Except where otherwise noted, all `selective-inference `_ software is licensed under a 13 | `revised BSD license `_. 14 | 15 | .. _selectinf-documentation-license: 16 | 17 | Documentation License 18 | --------------------- 19 | 20 | Except where otherwise noted, all `selective-inference `_ documentation is licensed under a 21 | `Creative Commons Attribution 3.0 License `_. 22 | 23 | All code fragments in the documentation are licensed under our 24 | software license. 25 | -------------------------------------------------------------------------------- /doc/source/links_names.txt: -------------------------------------------------------------------------------- 1 | .. This (-*- rst -*-) format file contains commonly used link targets 2 | and name substitutions. It may be included in many files, 3 | therefore it should only contain link targets and name 4 | substitutions. Try grepping for "^\.\. _" to find plausible 5 | candidates for this list. 6 | 7 | .. NOTE: reST targets are 8 | __not_case_sensitive__, so only one target definition is needed for 9 | nipy, NIPY, Nipy, etc... 10 | 11 | .. Post selection papers 12 | .. _covtest: http://arxiv.org/abs/1301.7161 13 | 14 | .. Documentation tools 15 | .. _graphviz: http://www.graphviz.org/ 16 | .. _Sphinx: http://sphinx.pocoo.org/ 17 | .. _`Sphinx reST`: http://sphinx.pocoo.org/rest.html 18 | .. _reST: http://docutils.sourceforge.net/rst.html 19 | .. _docutils: http://docutils.sourceforge.net 20 | 21 | .. Licenses 22 | .. _GPL: http://www.gnu.org/licenses/gpl.html 23 | .. _BSD: http://www.opensource.org/licenses/bsd-license.php 24 | .. _LGPL: http://www.gnu.org/copyleft/lesser.html 25 | .. _MIT License: http://www.opensource.org/licenses/mit-license.php 26 | 27 | .. Working process 28 | .. _sourceforge: http://nipy.sourceforge.net/ 29 | .. _github: http://github.com 30 | 31 | .. Code support stuff 32 | .. _pychecker: http://pychecker.sourceforge.net/ 33 | .. 
_pylint: http://www.logilab.org/project/pylint 34 | .. _pyflakes: http://divmod.org/trac/wiki/DivmodPyflakes 35 | .. _virtualenv: http://pypi.python.org/pypi/virtualenv 36 | .. _git: http://git.or.cz/ 37 | .. _flymake: http://flymake.sourceforge.net/ 38 | .. _rope: http://rope.sourceforge.net/ 39 | .. _pymacs: http://pymacs.progiciels-bpi.ca/pymacs.html 40 | .. _ropemacs: http://rope.sourceforge.net/ropemacs.html 41 | .. _ECB: http://ecb.sourceforge.net/ 42 | .. _emacs_python_mode: http://www.emacswiki.org/cgi-bin/wiki/PythonMode 43 | .. _doctest-mode: http://www.cis.upenn.edu/~edloper/projects/doctestmode/ 44 | .. _nose: http://somethingaboutorange.com/mrl/projects/nose 45 | .. _`python coverage tester`: http://nedbatchelder.com/code/modules/coverage.html 46 | 47 | .. Other python projects 48 | .. _numpy: http://www.scipy.org/NumPy 49 | .. _scipy: http://www.scipy.org 50 | .. _cython: http://www.cython.org/ 51 | .. _ipython: http://ipython.scipy.org 52 | .. _`ipython manual`: http://ipython.scipy.org/doc/manual/html 53 | .. _matplotlib: http://matplotlib.sourceforge.net 54 | .. _python: http://www.python.org 55 | .. _networkx: http://networkx.lanl.gov/ 56 | 57 | .. General software 58 | .. _gcc: http://gcc.gnu.org 59 | .. _xcode: http://developer.apple.com/TOOLS/xcode 60 | .. _mingw: http://www.mingw.org 61 | .. _macports: http://www.macports.org/ 62 | 63 | -------------------------------------------------------------------------------- /doc/source/randomized/index.rst: -------------------------------------------------------------------------------- 1 | ===================== 2 | Randomized algorithms 3 | ===================== 4 | 5 | This module implements several methods for inference after a randomized 6 | selection as described in this paper on `proximal change of variables `_ 7 | 8 | .. toctree:: 9 | :maxdepth: 2 10 | 11 | lasso.ipynb 12 | -------------------------------------------------------------------------------- /doc/source/sphinxext/math_dollar.py: -------------------------------------------------------------------------------- 1 | # emacs: -*- mode: python-mode; py-indent-offset: 4; indent-tabs-mode: nil -*- 2 | # vi: set ft=python sts=4 ts=4 sw=4 et: 3 | ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 4 | # 5 | # See COPYING file distributed along with the NiBabel package for the 6 | # copyright and license terms. 7 | # 8 | ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 9 | import re 10 | 11 | def dollars_to_math(source): 12 | r""" 13 | Replace dollar signs with backticks. 14 | 15 | More precisely, do a regular expression search. Replace a plain 16 | dollar sign ($) by a backtick (`). Replace an escaped dollar sign 17 | (\$) by a dollar sign ($). Don't change a dollar sign preceded or 18 | followed by a backtick (`$ or $`), because of strings like 19 | "``$HOME``". Don't make any changes on lines starting with 20 | spaces, because those are indented and hence part of a block of 21 | code or examples. 22 | 23 | This also doesn't replace dollar signs enclosed in curly braces, 24 | to avoid nested math environments, such as :: 25 | 26 | $f(n) = 0 \text{ if $n$ is prime}$ 27 | 28 | Thus the above line would get changed to 29 | 30 | `f(n) = 0 \text{ if $n$ is prime}` 31 | """ 32 | s = "\n".join(source) 33 | if s.find("$") == -1: 34 | return 35 | # This searches for "$blah$" inside a pair of curly braces -- 36 | # don't change these, since they're probably coming from a nested 37 | # math environment. 
So for each match, we replace it with a temporary 38 | # string, and later on we substitute the original back. 39 | global _data 40 | _data = {} 41 | def repl(matchobj): 42 | global _data 43 | s = matchobj.group(0) 44 | t = "___XXX_REPL_%d___" % len(_data) 45 | _data[t] = s 46 | return t 47 | s = re.sub(r"({[^{}$]*\$[^{}$]*\$[^{}]*})", repl, s) 48 | # matches $...$ 49 | dollars = re.compile(r"(? b1*xy[,1]) & (xy[,2] < b2 * xy[,1])) 18 | z <- rep(NA,nrow(xy)) 19 | z[good] <- ci.len(xy[good,1],xy[good,2]) 20 | z 21 | } 22 | 23 | ci.len <- function(x,y) { 24 | cutoff.x <- ifelse(x>y,(y-x)/(b1-1),(y-x)/(b2-1)) 25 | cutoff.y <- ifelse(x>y,b1*(y-x)/(b1-1),b2*(y-x)/(b2-1)) 26 | cutoff <- (cutoff.x + cutoff.y)/sqrt(2) 27 | observed <- (x+y)/sqrt(2) 28 | apply(cbind(observed,cutoff),1,function(x) { 29 | ci <- try(ShortestCI(x[1],1,x[2],.05),silent=TRUE) 30 | if(is.list(ci)) { 31 | return(ci$upper - ci$lower) 32 | } else { 33 | return(NA) 34 | } 35 | }) 36 | } 37 | 38 | xy <- expand.grid(c(-1,seq(0,4,.02)),c(-1,seq(0,4,.02))) 39 | z <- ci.len.wrapper(xy) 40 | ## This is a hack because of a bug in the package 41 | z[xy[,1] > 0 & abs(xy[,1]-xy[,2])<.023] <- 2*1.96 42 | 43 | 44 | rast <- rasterFromXYZ(cbind(xy,z)) 45 | 46 | pdf("CILengthCorr.pdf") 47 | plot(rast,xlim=c(-2.5,4),ylim=c(-2.5,4),xlab=expression(y[1]),ylab=expression(y[2]),col=rev(heat.colors(20)), 48 | main="CI Length for Univariate Model") 49 | abline(h=0,lty=3,col="gray") 50 | abline(v=0,lty=3,col="gray") 51 | arrows(x0=c(0,0,0),y0=c(0,0,0),x1=c(x[1,]),y1=c(x[2,]),length=.15) 52 | abline(0,b1,lty=2) 53 | abline(0,b2,lty=2) 54 | dev.off() 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | numpy 3 | scipy 4 | pandas 5 | mpmath 6 | pyinter 7 | sklearn 8 | regreg 9 | # keras 10 | # tensorflow 11 | traitlets 12 | -------------------------------------------------------------------------------- /sandbox/absurd.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | import kmeans 4 | import numpy as np 5 | 6 | kmeans = importlib.reload(kmeans) 7 | 8 | n = 20 9 | p = 5 10 | n_sample = 50 11 | p_array = [] 12 | 13 | t_distance = [0] 14 | #distance = 5 15 | 16 | import matplotlib.pyplot as plt 17 | x = np.arange(0, 1, 1./n_sample) 18 | plt.plot(x, x, 'g') 19 | 20 | for distance in t_distance: 21 | i = 0 22 | while i < n_sample: 23 | compteur_bug = 0 24 | if True: #i%1 == 0: 25 | print(i, " / ", n_sample, distance) 26 | try: 27 | #kmeans = importlib.reload(kmeans) 28 | p_value = kmeans.f(n, p, distance)[0] 29 | if p_value > 0 and p_value < 1: 30 | p_array.append(p_value) 31 | i += 1 32 | except: 33 | raise 34 | 35 | 36 | 37 | 38 | p_array = sorted(p_array) 39 | print(p_array) 40 | 41 | plt.plot(x, p_array, 'b') 42 | 43 | 44 | 45 | plt.show() 46 | -------------------------------------------------------------------------------- /sandbox/bayesian/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/sandbox/bayesian/__init__.py -------------------------------------------------------------------------------- /sandbox/bayesian/crime_data_attempt.py: -------------------------------------------------------------------------------- 1 | 2 | import os, numpy as np, pandas, statsmodels.api as sm 3 | import time 4 | import matplotlib.pyplot
as plt 5 | import regreg.api as rr 6 | from selection.reduced_optimization.initial_soln import selection 7 | from selection.randomized.api import randomization 8 | from selection.reduced_optimization.lasso_reduced import nonnegative_softmax_scaled, neg_log_cube_probability, selection_probability_lasso, \ 9 | sel_prob_gradient_map_lasso, selective_inf_lasso 10 | 11 | crime = pandas.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data', header=None, na_values=['?']) 12 | crime = crime.iloc[:, 5:] 13 | crime.dropna(inplace=True) 14 | crime.head() 15 | 16 | # define X and y 17 | X = crime.iloc[:, :-1] 18 | n, p = X.shape 19 | X -= X.mean(0)[None, :] 20 | X /= (X.std(0)[None, :] * np.sqrt(n)) 21 | 22 | Y = crime.iloc[:, -1] 23 | print("shape", X.shape, Y.shape) 24 | 25 | ols_fit = sm.OLS(Y, X).fit() 26 | print("residual", np.linalg.norm(ols_fit.resid)) 27 | sigma_3TC = np.linalg.norm(ols_fit.resid) / np.sqrt(n-p-1) 28 | OLS_3TC = ols_fit.params 29 | print("sigma", sigma_3TC) 30 | -------------------------------------------------------------------------------- /sandbox/bayesian/mixed_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class instance_mixed(object): 4 | 5 | def __init__(self, n, p, s, sigma=1., rho=0, random_signs=False, scale =True, center=True): 6 | (self.n, self.p, self.s, 7 | self.sigma, 8 | self.rho) = (n, p, s, 9 | sigma, 10 | rho) 11 | 12 | self.X = (np.sqrt(1 - self.rho) * np.random.standard_normal((self.n, self.p)) + 13 | np.sqrt(self.rho) * np.random.standard_normal(self.n)[:, None]) 14 | if center: 15 | self.X -= self.X.mean(0)[None, :] 16 | if scale: 17 | self.X /= (self.X.std(0)[None, :] * np.sqrt(self.n)) 18 | 19 | self.beta = np.zeros(p) 20 | self.beta[:self.s] = np.linspace(0.5, 5.0, num=s) 21 | if random_signs: 22 | self.beta[:self.s] *= (2 * np.random.binomial(1, 0.5, size=(s,)) - 1.) 23 | self.active = np.zeros(p, np.bool) 24 | self.active[:self.s] = True 25 | 26 | def _noise(self): 27 | return np.random.standard_normal(self.n) 28 | 29 | def generate_response(self): 30 | 31 | Y = (self.X.dot(self.beta) + self._noise()) * self.sigma 32 | return self.X, Y, self.beta * self.sigma, np.nonzero(self.active)[0], self.sigma 33 | -------------------------------------------------------------------------------- /sandbox/bayesian/read_file.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os, numpy as np, pandas, statsmodels.api as sm 3 | 4 | #path =r'/Users/snigdhapanigrahi/Results_freq_EQTL/sparsity_5/dim_1/dim_1' 5 | #path =r'/Users/snigdhapanigrahi/Results_reduced_optimization/fixed_lasso/fixed_lasso' 6 | 7 | path =r'/Users/snigdhapanigrahi/Results_reduced_optimization/experiment_dual_0' 8 | #path =r'/Users/snigdhapanigrahi/Results_reduced_optimization/bayesian_dual' 9 | allFiles = glob.glob(path + "/*.txt") 10 | 11 | list_ = [] 12 | for file_ in allFiles: 13 | df = np.loadtxt(file_) 14 | list_.append(df) 15 | 16 | def summary_files(list_): 17 | 18 | coverage_ad = 0. 19 | coverage_unad = 0. 20 | length_ad = 0. 21 | length_unad = 0. 22 | loss_ad = 0. 23 | loss_unad = 0. 
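# Each result file stores six numbers in a fixed order -- adjusted/unadjusted
# coverage, adjusted/unadjusted interval length and adjusted/unadjusted loss --
# which the loop below accumulates and averages over the simulation files.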
24 | 25 | length = len(list_) 26 | print("number of simulations", length) 27 | 28 | for i in range(length): 29 | print("iteration", i) 30 | lasso = list_[i].reshape((6, 1)) 31 | coverage_ad += lasso[0,0] 32 | coverage_unad += lasso[1,0] 33 | length_ad += lasso[2,0] 34 | length_unad += lasso[3,0] 35 | loss_ad += lasso[4,0] 36 | loss_unad += lasso[5, 0] 37 | 38 | return coverage_ad / length, coverage_unad / length, length_ad / length, length_unad / length,\ 39 | loss_ad/length, loss_unad/length 40 | 41 | print(summary_files(list_)) 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /sandbox/randomized_tests/test_reconstruction.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | 4 | import regreg.api as rr 5 | 6 | from selection.tests.decorators import wait_for_return_value, register_report 7 | import selection.tests.reports as reports 8 | 9 | from selection.api import multiple_queries 10 | from selection.randomized.glm import split_glm_group_lasso, target as glm_target 11 | from selection.tests.instance import logistic_instance 12 | 13 | @wait_for_return_value() 14 | def test_reconstruction(s=3, 15 | n=200, 16 | p=50, 17 | signal=7, 18 | rho=0.1, 19 | split_frac=0.8, 20 | lam_frac=0.7, 21 | ndraw=100, 22 | burnin=200, 23 | bootstrap=True, 24 | solve_args={'min_its':50, 'tol':1.e-10}, 25 | reference_known=False): 26 | 27 | X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal) 28 | 29 | m = int(split_frac * n) 30 | nonzero = np.where(beta)[0] 31 | 32 | loss = rr.glm.logistic(X, y) 33 | epsilon = 1. / np.sqrt(n) 34 | 35 | lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 2000)))).max(0)) 36 | W = np.ones(p)*lam 37 | W[0] = 0 # use at least some unpenalized 38 | penalty = rr.group_lasso(np.arange(p), 39 | weights=dict(zip(np.arange(p), W)), lagrange=1.) 
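# `lam` above is the usual theoretical tuning parameter for the logistic case:
# lam_frac times the average maximum of |X^T eps| over simulated Bernoulli(1/2)
# noise vectors eps. With singleton groups the group lasso below is an ordinary
# weighted lasso, with coordinate 0 left unpenalized (W[0] = 0).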
40 | 41 | M_est = split_glm_group_lasso(loss, epsilon, m, penalty) 42 | mv = multiple_queries([M_est]) 43 | mv.solve() 44 | 45 | # selection_variable['variables'] is a boolean mask over all p coefficients 46 | nactive = np.sum(M_est.selection_variable['variables']) 47 | 48 | if nactive==0: 49 | return None 50 | 51 | if set(nonzero).issubset(np.nonzero(M_est.selection_variable['variables'])[0]): 52 | 53 | active_set = np.nonzero(M_est.selection_variable['variables'])[0] 54 | 55 | target_sampler, target_observed = glm_target(loss, 56 | M_est.selection_variable['variables'], 57 | mv) 58 | 59 | target_sample = target_sampler.sample(ndraw=ndraw, 60 | burnin=burnin, 61 | keep_opt=True) 62 | 63 | reconstruction = target_sampler.reconstruct(target_sample) 64 | logdens = target_sampler.log_density(target_sample) 65 | return logdens.shape 66 | -------------------------------------------------------------------------------- /sandbox/tensorflow_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow_fit 2 | import tensorflow as tf 3 | import numpy as np 4 | ntries, sigma, q = 21, 1, 0.3 5 | Z = np.linspace(-8, 8, 1001) 6 | 7 | def algorithm(Z, ntries=ntries, q=q): 8 | proportion = 0 9 | for _ in range(ntries): 10 | proportion += ((Z + sigma * np.random.standard_normal() > 0) * 11 | (Z + 1 + sigma * np.random.standard_normal() > 0) * 12 | (Z - 0.5 + sigma * np.random.standard_normal() > 0)) 13 | proportion = proportion / ntries  # in-place /= would fail on the integer array 14 | return proportion > q 15 | 16 | import scipy.stats 17 | import matplotlib.pyplot as plt 18 | from selection.distributions.discrete_family import discrete_family  # all three used at the bottom of this script 19 | # a function that is parameterized by hyperparameters 20 | def create_network(num_hidden,num_outputs): 21 | def create(features): 22 | N = features.shape[0] 23 | X = features # np.reshape(features,(None,1)) 24 | hidA = tf.layers.Dense(activation=tf.nn.relu,units=num_hidden, name='hidA') 25 | outlayer = tf.layers.Dense(activation=tf.nn.relu,units=num_outputs, name='hid') 26 | #outlayer = tf.layers.Dense(activation=tf.nn.relu, name='hid') 27 | output = outlayer(hidA(X)) 28 | return output 29 | return create 30 | 31 | def fit_algorithm(algorithm, B=500, ntries=ntries, q=q, Zval=Z, link='probit'): 32 | 33 | Z = np.random.standard_normal(B) * 2 34 | Z = np.hstack([Z, 35 | np.random.standard_normal(B), 36 | np.random.standard_normal(B) * 3, 37 | np.random.standard_normal(B) * 0.5]) 38 | print('ZS=',Z.shape) 39 | 40 | # is there no "active part" that updates the Z proposals somewhere?
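# As written, no: the Z proposals are drawn once from the normal mixture above
# and never updated. `algorithm` is evaluated at each fixed Z, and the network
# below is fit to the resulting 0/1 labels, estimating the selection
# probability as a function of Z.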
41 | Y = np.array([algorithm(z, ntries=ntries, q=q) for z in Z]) 42 | optimize = tensorflow_fit.create_optimizer() # a default optimizer 43 | predictor_f = tensorflow_fit.fit(np.reshape(Z, (Z.shape[0], 1)), 44 | np.reshape(Y, (Y.shape[0], 1)), 45 | create_network(10, 1), 46 | tensorflow_fit.create_l2_loss, 47 | optimize) 48 | print('ZS2=',Zval.shape) 49 | return predictor_f(np.reshape(Zval,(Zval.shape[0],1))) 50 | 51 | def simulate(ntries=ntries, sigma=sigma, truth=0): 52 | 53 | while True: 54 | Z = np.random.standard_normal() + truth 55 | if algorithm(Z, ntries, q=q): 56 | return Z 57 | 58 | Z = np.linspace(-8, 8, 1001) 59 | W1 = fit_algorithm(algorithm, ntries=ntries, q=q, Zval=Z) 60 | print('done') 61 | plt.plot(Z, np.log(W1)) 62 | selective_law1 = discrete_family(Z, W1 * scipy.stats.norm.pdf(Z)) 63 | 64 | 65 | def pivot1(z, truth=0): 66 | return 1 - selective_law1.cdf(truth, z) 67 | 68 | -------------------------------------------------------------------------------- /sandbox/test_cover.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from selection.algorithms.tests.test_lasso import test_data_carving 4 | 5 | P = [] 6 | covered = [] 7 | 8 | num_except = 0 9 | for _ in range(500): 10 | try: 11 | results = test_data_carving(compute_intervals=True, 12 | burnin=5000, 13 | ndraw=10000)[0] 14 | covered.extend(results[-4]) 15 | P.extend(results[0]) 16 | print(np.mean(P), np.std(P), 'null') 17 | print(np.mean(covered), 'covered') 18 | 19 | except KeyboardInterrupt: 20 | raise KeyboardInterrupt 21 | except: 22 | num_except += 1; print('num except: %d' % num_except) 23 | pass 24 | 25 | 26 | -------------------------------------------------------------------------------- /sandbox/test_isotonic.py: -------------------------------------------------------------------------------- 1 | from isotonic import isotonic  # isotonic.py sits alongside this script in sandbox/ 2 | import numpy as np 3 | 4 | def test_isotonic(): 5 | y = np.random.standard_normal(50) 6 | I = isotonic(y) 7 | print(I.first_jump) 8 | print(I.largest_jump) 9 | print(I.combine_jumps(2)) 10 | -------------------------------------------------------------------------------- /selectinf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /selectinf/algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/algorithms/__init__.py -------------------------------------------------------------------------------- /selectinf/algorithms/api.py: -------------------------------------------------------------------------------- 1 | from .lasso import (lasso, 2 | data_carving as data_carving_lasso, 3 | additive_noise as additive_noise_lasso) 4 | 5 | from .sqrt_lasso import (choose_lambda as choose_lambda_sqrt_lasso, 6 | solve_sqrt_lasso) 7 | 8 | from .forward_step import (forward_step, 9 | info_crit_stop) 10 | 11 | from .covtest import (covtest, 12 | selected_covtest) 13 | -------------------------------------------------------------------------------- /selectinf/algorithms/pca.py: -------------------------------------------------------------------------------- 1 | """ 2 | Step 1 test based on largest singular vector. 3 | 4 | This is the test described in `Kac Rice`_ for $X=I$ and the penalty being the nuclear norm 5 | 6 | ..
math:: 7 | 8 | {\cal P}(\beta) = \sum_{i=1}^{\min(n,p)} \sigma_i(\beta) 9 | 10 | for $\beta \in \mathbb{R}^{n \times p}$. 11 | 12 | .. _Kac Rice: http://arxiv.org/abs/1308.3020 13 | """ 14 | 15 | import numpy as np 16 | from ..distributions.pvalue import general_pvalue 17 | 18 | def pvalue(X, sigma=1, nsim=5000): 19 | n, p = X.shape 20 | D = np.linalg.svd(X)[1] / sigma 21 | m = n+p-2 22 | H = np.zeros(m) 23 | 24 | nonzero = np.hstack([D[1:],-D[1:]]) 25 | H[:nonzero.shape[0]] = nonzero 26 | 27 | return max(0, min(general_pvalue(D[0], D[1], np.inf, H, nsim=nsim), 1)) 28 | -------------------------------------------------------------------------------- /selectinf/algorithms/screening.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import eye as sparse_eye 3 | 4 | from ..constraints.affine import constraints 5 | 6 | def _basis_vector(j,n): 7 | """ 8 | j-th elementary basis vector in R^n 9 | """ 10 | e = np.zeros(n) 11 | e[j] = 1. 12 | return e 13 | 14 | class topK(object): 15 | 16 | alpha = 0.1 17 | 18 | def __init__(self, X, Y, K, sigma, covariance=None): 19 | n, p = X.shape 20 | self.Z = np.dot(X.T, Y) 21 | self.X, self.Y = X, Y 22 | self.sign = np.sign(self.Z) 23 | self.covariance = covariance 24 | self.K = K 25 | order = np.argsort(np.fabs(self.Z)) 26 | self.selected = order[-K:] 27 | self.selected_sign = self.sign[order[-K:]] 28 | 29 | partial = np.identity(p)[order[:-K]] 30 | partial = np.vstack([partial, -partial]) 31 | 32 | full_matrix = [] 33 | for k in range(1, K+1): 34 | partial_cp = partial.copy() 35 | partial_cp[:,order[-k]] = -self.sign[order[-k]] 36 | full_matrix.append(np.dot(partial_cp, X.T)) 37 | linear_part = np.vstack(full_matrix) 38 | self.constraints = constraints(linear_part, 39 | np.zeros(linear_part.shape[0]), 40 | covariance=covariance) 41 | self.constraints.covariance *= sigma**2 42 | self.sigma = sigma 43 | 44 | @property 45 | def intervals(self):  # OLS intervals for active variables, adjusted for selection 46 | if not hasattr(self, "_intervals"): 47 | p = self.Z.shape[0] 48 | self._intervals = [] 49 | C = self.constraints 50 | for j in self.selected: 51 | s = self.sign[j] 52 | eta = self.X[:,j] * s 53 | _interval = C.interval(eta, 54 | self.Y, 55 | self.alpha) 56 | self._intervals.append((j, (eta*self.Y).sum(), 57 | _interval)) 58 | return self._intervals 59 | 60 | def test(): 61 | n, p, sigma = 40, 100, 1.4 62 | X = np.random.standard_normal((n,p)) 63 | Y = np.random.standard_normal(n) * sigma 64 | 65 | top10 = topK(X, Y, 10, sigma) 66 | return top10, top10.intervals 67 | -------------------------------------------------------------------------------- /selectinf/algorithms/stopping_rules.py: -------------------------------------------------------------------------------- 1 | """ 2 | Stopping rules used in sequential FDR control. 3 | 4 | See `http://arxiv.org/abs/1309.5352`_ 5 | 6 | """ 7 | 8 | import numpy as np 9 | 10 | def simple_stop(pvalues, alpha): 11 | """ 12 | Compute the number of rejections using 13 | simple stop: reject until the first p-value that exceeds 14 | alpha.
15 | 16 | Parameters 17 | ---------- 18 | 19 | pvalues : np.float 20 | 21 | alpha : float 22 | 23 | Returns 24 | ------- 25 | 26 | num_rejections : int 27 | 28 | """ 29 | if not np.all(pvalues <= alpha): 30 | return np.min(np.nonzero(pvalues > alpha)[0]) 31 | else: 32 | return pvalues.shape[0] 33 | 34 | def strong_stop(pvalues, alpha): 35 | """ 36 | 37 | Compute the number of rejections using 38 | strong stop of `http://arxiv.org/abs/1309.5352`_ 39 | 40 | >>> strong_stop(np.array([0.5,0.6,0.7,0.8,0.9]), 0.05) 41 | 0 42 | >>> strong_stop(np.array([0.001, 0.002, 0.0015, 0.0013, 0.05, 0.6]), 0.05) 43 | 3 44 | 45 | In R: 46 | 47 | > strongstop(c(0.001, 0.002, 0.0015, 0.0013, 0.05, 0.6), 0.05) 48 | [1] 3 49 | > strongstop(c(0.5,0.6,0.7,0.8,0.9), 0.05) 50 | [1] 0 51 | 52 | Parameters 53 | ---------- 54 | 55 | pvalues : np.float 56 | 57 | alpha : float 58 | 59 | Returns 60 | ------- 61 | 62 | num_rejections : int 63 | 64 | Based on R code: 65 | ---------------- 66 | 67 | strongstop <- function(p.values,alpha) { 68 | d <- length(p.values) 69 | lhs <- exp(rev(cumsum(rev(log(p.values)/(1:d))))) # LHS from G'Sell et al. 70 | rhs <- alpha * (1:d) / d # RHS from G'Sell et al. 71 | return(max(c(0,which(lhs <= rhs)))) 72 | } 73 | 74 | """ 75 | n = pvalues.shape[0] 76 | LHS = np.exp(np.cumsum((np.log(pvalues) / np.linspace(1., n, n))[::-1])[::-1]) 77 | RHS = alpha * np.linspace(1., n, n) / n 78 | if np.any(LHS <= RHS): 79 | return max(np.nonzero(LHS <= RHS)[0])+1 80 | return 0 81 | 82 | 83 | def forward_stop(pvalues, alpha): 84 | """ 85 | 86 | Compute the number of rejections using 87 | forward stop of `http://arxiv.org/abs/1309.5352`_ 88 | 89 | >>> forward_stop(np.array([0.5,0.6,0.7,0.8,0.9]), 0.05) 90 | 0 91 | >>> forward_stop(np.array([0.001, 0.002, 0.0015, 0.0013, 0.05, 0.6]), 0.05) 92 | 5 93 | 94 | In R: 95 | 96 | > forwardstop(c(0.5,0.6,0.7,0.8,0.9), 0.05) 97 | [1] 0 98 | > forwardstop(c(0.001, 0.002, 0.0015, 0.0013, 0.05, 0.6), 0.05) 99 | [1] 5 100 | > 101 | 102 | Parameters 103 | ---------- 104 | 105 | pvalues : np.float 106 | 107 | alpha : float 108 | 109 | Returns 110 | ------- 111 | 112 | num_rejections : int 113 | 114 | Based on R code: 115 | ---------------- 116 | 117 | forwardstop <- function(p, alpha) { 118 | m <- length(p) 119 | sums <- -(1/(1:m))*cumsum(log(1-p)) 120 | return(max(c(0, which(sums < alpha)))) 121 | } 122 | 123 | """ 124 | 125 | n = pvalues.shape[0] 126 | sums = (-1. 
/ np.linspace(1, n, n)) * np.cumsum(np.log(1 - pvalues)) 127 | if np.any(sums < alpha): 128 | return max(np.nonzero(sums < alpha)[0])+1 129 | return 0 130 | 131 | 132 | -------------------------------------------------------------------------------- /selectinf/algorithms/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/algorithms/tests/__init__.py -------------------------------------------------------------------------------- /selectinf/algorithms/tests/test_IC.py: -------------------------------------------------------------------------------- 1 | from copy import copy 2 | 3 | import numpy as np 4 | from ...tests.instance import gaussian_instance 5 | from ...constraints.affine import sample_from_constraints 6 | from ...distributions.discrete_family import discrete_family 7 | 8 | from ..forward_step import info_crit_stop 9 | 10 | def test_data_carving_IC(n=600, 11 | p=100, 12 | s=10, 13 | sigma=5, 14 | rho=0.25, 15 | signal=(3.5,5.), 16 | split_frac=0.9, 17 | ndraw=25000, 18 | burnin=5000, 19 | df=np.inf, 20 | coverage=0.90, 21 | compute_intervals=False): 22 | 23 | X, y, beta, active, sigma, _ = gaussian_instance(n=n, 24 | p=p, 25 | s=s, 26 | sigma=sigma, 27 | rho=rho, 28 | signal=signal, 29 | df=df, 30 | equicorrelated=False) 31 | mu = np.dot(X, beta) 32 | splitn = int(n*split_frac) 33 | indices = np.arange(n) 34 | np.random.shuffle(indices) 35 | stage_one = indices[:splitn] 36 | 37 | FS = info_crit_stop(y, X, sigma, cost=np.log(n), subset=stage_one) 38 | 39 | con = FS.constraints() 40 | 41 | X_E = X[:,FS.active] 42 | X_Ei = np.linalg.pinv(X_E) 43 | beta_bar = X_Ei.dot(y) 44 | mu_E = X_E.dot(beta_bar) 45 | sigma_E = np.linalg.norm(y-mu_E) / np.sqrt(n - len(FS.active)) 46 | 47 | con.mean[:] = mu_E 48 | con.covariance = sigma_E**2 * np.identity(n) 49 | 50 | print(sigma_E, sigma) 51 | Z = sample_from_constraints(con, 52 | y, 53 | ndraw=ndraw, 54 | burnin=burnin) 55 | 56 | pvalues = [] 57 | for idx, var in enumerate(FS.active): 58 | active = copy(FS.active) 59 | active.remove(var) 60 | X_r = X[:,active] # restricted design 61 | mu_r = X_r.dot(np.linalg.pinv(X_r).dot(y)) 62 | delta_mu = (mu_r - mu_E) / sigma_E**2 63 | 64 | W = np.exp(Z.dot(delta_mu)) 65 | fam = discrete_family(Z.dot(X_Ei[idx].T), W) 66 | pval = fam.cdf(0, x=beta_bar[idx]) 67 | pval = 2 * min(pval, 1 - pval) 68 | pvalues.append((pval, beta[var])) 69 | 70 | return pvalues 71 | -------------------------------------------------------------------------------- /selectinf/algorithms/tests/test_change_point.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..change_point import one_jump_instance, change_point 3 | 4 | def test_change_point(delta=0.1, p=60, sigma=1, plot=False): 5 | 6 | y, signal = one_jump_instance(delta, p, sigma) 7 | CP = change_point(y) 8 | fit, relaxed_fit, summary, segments = CP.fit() 9 | if plot: 10 | import matplotlib.pyplot as plt 11 | plt.figure(figsize=(8,6)) 12 | plt.scatter(np.arange(y.shape[0]), y) 13 | plt.plot(fit, 'r', label='Penalized', linewidth=3) 14 | plt.plot(relaxed_fit, 'k', label='Relaxed', linewidth=3) 15 | plt.plot(signal, 'g', label='Truth', linewidth=3) 16 | plt.legend(loc='upper left') 17 | return segments 18 | -------------------------------------------------------------------------------- /selectinf/algorithms/tests/test_data_carving.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ...tests.instance import gaussian_instance 3 | from ..lasso import data_carving, data_splitting 4 | 5 | def sim(): 6 | X, Y, _, active, sigma = gaussian_instance() 7 | print(sigma) 8 | G = data_carving.gaussian(X, Y, 1., split_frac=0.9, sigma=sigma) 9 | G.fit() 10 | if set(active).issubset(G.active) and G.active.shape[0] > len(active): 11 | return [G.hypothesis_test(G.active[len(active)], burnin=5000, ndraw=10000)] 12 | return [] 13 | 14 | def sim2(): 15 | X, Y, _, active, sigma = gaussian_instance(n=150, s=3) 16 | G = data_splitting.gaussian(X, Y, 5., split_frac=0.5, sigma=sigma) 17 | G.fit(use_full=True) 18 | if set(active).issubset(G.active) and G.active.shape[0] > len(active): 19 | return [G.hypothesis_test(G.active[len(active)])] 20 | return [] 21 | 22 | -------------------------------------------------------------------------------- /selectinf/algorithms/tests/test_screening.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..screening import topK 3 | import nose.tools as nt 4 | 5 | def test_class(threshold=1): 6 | 7 | Z = np.random.standard_normal(10) 8 | C = np.eye(10) 9 | M = topK(C, Z, 1, 1) 10 | M.constraints 11 | 12 | M.intervals 13 | return M 14 | 15 | -------------------------------------------------------------------------------- /selectinf/algorithms/tests/test_softmax.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.testing.decorators as dec 3 | 4 | from itertools import product 5 | from ..softmax import softmax_objective 6 | 7 | @dec.skipif(True, "need some tests for softmax objective") 8 | def test_softmax(): 9 | raise ValueError('need some tests for softmax objective') 10 | -------------------------------------------------------------------------------- /selectinf/algorithms/tests/test_sqrt_lasso.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | import numpy.testing.decorators as dec 5 | import nose.tools as nt 6 | 7 | import regreg.api as rr 8 | 9 | from ...tests.instance import gaussian_instance as instance 10 | from ...tests.decorators import (set_sampling_params_iftrue, 11 | set_seed_iftrue, 12 | wait_for_return_value) 13 | 14 | from ...tests.flags import SET_SEED, SMALL_SAMPLES 15 | from ..sqrt_lasso import (solve_sqrt_lasso, 16 | choose_lambda, 17 | goodness_of_fit, 18 | sqlasso_objective, 19 | sqlasso_objective_skinny, 20 | solve_sqrt_lasso_fat, 21 | solve_sqrt_lasso_skinny) 22 | from ..lasso import lasso 23 | 24 | @wait_for_return_value() 25 | @set_sampling_params_iftrue(SMALL_SAMPLES, nsim=10, burnin=10, ndraw=10) 26 | @dec.slow 27 | def test_goodness_of_fit(n=20, p=25, s=10, sigma=20., 28 | nsim=10, burnin=2000, ndraw=8000): 29 | P = [] 30 | while True: 31 | y = np.random.standard_normal(n) * sigma 32 | beta = np.zeros(p) 33 | X = np.random.standard_normal((n,p)) + 0.3 * np.random.standard_normal(n)[:,None] 34 | X /= (X.std(0)[None,:] * np.sqrt(n)) 35 | y += np.dot(X, beta) * sigma 36 | lam_theor = .7 * choose_lambda(X, quantile=0.9) 37 | L = lasso.sqrt_lasso(X, y, lam_theor) 38 | L.fit() 39 | pval = goodness_of_fit(L, 40 | lambda x: np.max(np.fabs(x)), 41 | burnin=burnin, 42 | ndraw=ndraw) 43 | P.append(pval) 44 | Pa = np.array(P) 45 | Pa = Pa[~np.isnan(Pa)] 46 | if (~np.isnan(np.array(Pa))).sum() >= nsim: 
47 | break 48 | 49 | return Pa, np.zeros_like(Pa, np.bool) 50 | 51 | @set_seed_iftrue(SET_SEED) 52 | def test_skinny_fat(): 53 | 54 | X, Y = instance()[:2] 55 | n, p = X.shape 56 | lam = choose_lambda(X) 57 | obj1 = sqlasso_objective(X, Y) 58 | obj2 = sqlasso_objective_skinny(X, Y) 59 | soln1 = solve_sqrt_lasso_fat(X, Y, weights=np.ones(p) * lam, solve_args={'min_its':500})[0] 60 | soln2 = solve_sqrt_lasso_skinny(X, Y, weights=np.ones(p) * lam, solve_args={'min_its':500})[0] 61 | 62 | np.testing.assert_allclose(soln1, soln2, rtol=1.e-3) 63 | 64 | X, Y = instance(p=50)[:2] 65 | n, p = X.shape 66 | lam = choose_lambda(X) 67 | obj1 = sqlasso_objective(X, Y) 68 | obj2 = sqlasso_objective_skinny(X, Y) 69 | soln1 = solve_sqrt_lasso_fat(X, Y, weights=np.ones(p) * lam, solve_args={'min_its':500})[0] 70 | soln2 = solve_sqrt_lasso_skinny(X, Y, weights=np.ones(p) * lam, solve_args={'min_its':500})[0] 71 | 72 | np.testing.assert_allclose(soln1, soln2, rtol=1.e-3) 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /selectinf/api.py: -------------------------------------------------------------------------------- 1 | from .constraints.api import * 2 | from .algorithms.api import * 3 | from .distributions.api import * 4 | from .randomized.api import * 5 | from .truncated.api import * 6 | from .sampling.api import * 7 | -------------------------------------------------------------------------------- /selectinf/base.py: -------------------------------------------------------------------------------- 1 | import regreg.api as rr 2 | import regreg.affine as ra 3 | 4 | def restricted_estimator(loss, active, solve_args={'min_its':50, 'tol':1.e-10}): 5 | """ 6 | Fit a restricted model using only columns `active`. 7 | 8 | Parameters 9 | ---------- 10 | 11 | loss : objective function 12 | A GLM loss. 13 | 14 | active : ndarray 15 | Which columns to use. 16 | 17 | solve_args : dict 18 | Passed to `solve`. 19 | 20 | Returns 21 | ------- 22 | 23 | soln : ndarray 24 | Solution to restricted problem.
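Notes
-----

A sketch of typical use (assuming a regreg GLM loss, as elsewhere in this
package): ``restricted_estimator(rr.glm.gaussian(X, y), active)`` refits the
Gaussian model using only ``X[:, active]``, i.e. the least squares fit of
``y`` on the active columns.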
25 | 26 | """ 27 | X, Y = loss.data 28 | 29 | if not loss._is_transform and hasattr(loss, 'saturated_loss'): # M_est is a glm 30 | X_restricted = X[:,active] 31 | loss_restricted = rr.affine_smooth(loss.saturated_loss, X_restricted) 32 | else: 33 | I_restricted = ra.selector(active, ra.astransform(X).input_shape[0], ra.identity((active.sum(),))) 34 | loss_restricted = rr.affine_smooth(loss, I_restricted.T) 35 | beta_E = loss_restricted.solve(**solve_args) 36 | 37 | return beta_E 38 | -------------------------------------------------------------------------------- /selectinf/constraints/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /selectinf/constraints/api.py: -------------------------------------------------------------------------------- 1 | from .affine import constraints as affine_constraints 2 | -------------------------------------------------------------------------------- /selectinf/constraints/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/constraints/tests/__init__.py -------------------------------------------------------------------------------- /selectinf/constraints/tests/test_quasi.py: -------------------------------------------------------------------------------- 1 | """ 2 | test_quasi.py 3 | Date: 2014-10-17 4 | Author: Xiaoying Tian 5 | """ 6 | 7 | from __future__ import division, print_function 8 | import nose.tools as nt 9 | import numpy as np 10 | 11 | from ..quasi_affine import (quadratic_inequality_solver, 12 | intersection, 13 | sqrt_inequality_solver) 14 | from ...tests.flags import SET_SEED 15 | from ...tests.decorators import set_seed_iftrue 16 | 17 | def test_quadratic_solver(): 18 | yield np.testing.assert_almost_equal, quadratic_inequality_solver(7,0.,-28),[[-2.0,2.0]] 19 | yield (np.testing.assert_almost_equal, quadratic_inequality_solver(1,-1,-5.), 20 | [[-1.7912878474779199, 2.7912878474779199]]) 21 | yield (np.testing.assert_almost_equal, quadratic_inequality_solver(1,-1,5.), [[]]) 22 | yield (np.testing.assert_almost_equal, quadratic_inequality_solver(-1,-1,-5.), 23 | [[float("-inf"), float("inf")]]) 24 | yield (np.testing.assert_almost_equal, 25 | quadratic_inequality_solver(-1,6,-5.), [[float("-inf"), 1.0], [5.0, float("inf")]]) 26 | yield (np.testing.assert_almost_equal, quadratic_inequality_solver(0.,6,-5.), 27 | [[float("-inf"), 0.8333333333333334]]) 28 | yield (np.testing.assert_almost_equal, 29 | quadratic_inequality_solver(0.,6,5.),[[float("-inf"), -0.8333333333333334]]) 30 | yield nt.assert_raises, ValueError, quadratic_inequality_solver, 0., 0., 5. 
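# Each case above checks the solution set of a*x^2 + b*x + c <= 0, returned as
# a list of closed intervals: e.g. 7x^2 - 28 <= 0 gives [-2, 2]. Passing
# "greater than" (next case) flips the inequality, so x^2 + 3x + 2 >= 0 gives
# (-inf, -2] together with [-1, inf).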
31 | yield (np.testing.assert_almost_equal, 32 | quadratic_inequality_solver(1,3,2,"greater than"), [[float("-inf"), -2.], [-1., float("inf")]]) 33 | 34 | def test_intersection(): 35 | yield np.testing.assert_almost_equal, intersection([], []), [] 36 | yield np.testing.assert_almost_equal, intersection([], [1,2]), [] 37 | yield np.testing.assert_almost_equal, intersection([2,3], []), [] 38 | yield np.testing.assert_almost_equal, intersection([2,3], [1,2]), [] 39 | yield np.testing.assert_almost_equal, intersection([3,4], [1,2]), [] 40 | yield np.testing.assert_almost_equal, intersection([-1,4], [1,2]), [1,2] 41 | yield np.testing.assert_almost_equal, intersection([1,4], [-1,2]), [1,2] 42 | yield np.testing.assert_almost_equal, intersection([1,4], [-1,12]), [1,4] 43 | 44 | @set_seed_iftrue(SET_SEED) 45 | def test_sqrt_solver(): 46 | a, b, c = np.random.randint(-50, 51, 3)  # random_integers is deprecated; randint's upper bound is exclusive 47 | n = 100 48 | intervals = sqrt_inequality_solver(a, b, c, n) 49 | print(a, b, c, intervals) 50 | for x in np.linspace(-20, 20): 51 | hold = (func(x, a, b, c, n) <= 0) 52 | in_interval = any([contains(x, I) for I in intervals]) 53 | yield (np.testing.assert_almost_equal, np.array(hold, np.float), 54 | np.array(in_interval, np.float)) 55 | 56 | 57 | def contains(x, I): 58 | if I: 59 | return (x >= I[0] and x <= I[1]) 60 | else: 61 | return False 62 | 63 | 64 | def func(x, a, b, c, n): 65 | return a*x + b * np.sqrt(n + x**2) - c 66 | 67 | -------------------------------------------------------------------------------- /selectinf/constraints/tests/test_unknown_sigma.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | from .. import affine 4 | from ..quasi_affine import constraints_unknown_sigma 5 | 6 | def simulate(A=None, theta=0, R=None, eta=None): 7 | 8 | n = 22 9 | p = 4 10 | k = 18 11 | if R is None: 12 | R = np.linalg.svd(np.random.standard_normal((n,n-k)), full_matrices=0)[0] 13 | R = np.dot(R, R.T) 14 | R = 0.1 * R + np.diag([0]*p + [1.]
* (n-p)) 15 | R = np.linalg.svd(R, full_matrices=0)[0] 16 | R = R[:,:(n-p)] 17 | R = np.dot(R, R.T) 18 | if A is None: 19 | A = np.diag([1.]*p) + 0.05 * np.random.standard_normal((p,p)) 20 | sel = np.identity(n)[:p] 21 | A = np.dot(A, sel) 22 | b = -np.ones(p) 23 | n = R.shape[0] 24 | df = np.diag(R).sum() 25 | 26 | if eta is None: 27 | eta = np.random.standard_normal(n) * 3 28 | eta = eta - np.dot(R, eta) 29 | 30 | counter = 0 31 | while True: 32 | counter += 1 33 | Z = np.random.standard_normal(n) * 1.5 + eta * theta / np.linalg.norm(eta)**2 34 | sigma_hat = np.linalg.norm(np.dot(R, Z)) / np.sqrt(df) 35 | if np.all(np.dot(A, Z) <= b * sigma_hat): 36 | return A, b, R, Z, eta, counter 37 | if counter >= 1000: 38 | break 39 | return None 40 | 41 | 42 | def instance(theta=0, A=None, R=None, eta=None): 43 | 44 | result = None 45 | while not result: 46 | result = simulate(theta=theta, A=A, R=R, eta=eta) 47 | 48 | A, b, R, Z, eta, counter = result 49 | from ..truncated_T import truncated_T 50 | 51 | intervals, obs = constraints_unknown_sigma(A, b, Z, eta, R, 52 | value_under_null=theta) 53 | df = np.diag(R).sum() 54 | truncT = truncated_T(np.array([(interval.lower_value, 55 | interval.upper_value) for interval in intervals]), df) 56 | sf = truncT.sf(obs) 57 | pval = 2 * min(sf, 1.-sf) 58 | if pval < 1.e-6: 59 | print(sf, obs, intervals) 60 | return float(pval) 61 | 62 | if __name__ == "__main__": 63 | 64 | P = [] 65 | 66 | n = 22 67 | p = 4 68 | k = 18 69 | 70 | A = np.diag([1.]*p) + 0.05 * np.random.standard_normal((p,p)) 71 | sel = np.identity(n)[:p] 72 | A = np.dot(A, sel) 73 | 74 | R = np.linalg.svd(np.random.standard_normal((n,n-k)), full_matrices=0)[0] 75 | R = np.dot(R, R.T) 76 | R = 0.1 * R + np.diag([0]*p + [1.] * (n-p)) 77 | R = np.linalg.svd(R, full_matrices=0)[0] 78 | R = R[:,:(n-p)] 79 | R = np.dot(R, R.T) 80 | 81 | eta = np.random.standard_normal(n) * 3 82 | eta = eta - np.dot(R, eta) 83 | 84 | for i in range(1000): 85 | P.append(instance(theta=3.,R=R, A=A, eta=eta)) 86 | print(i, np.mean(P), np.std(P)) 87 | U = np.linspace(0,1,51) 88 | 89 | # make any plots not use display 90 | 91 | from matplotlib import use 92 | use('Agg') 93 | import matplotlib.pyplot as plt 94 | 95 | # used for ECDF 96 | 97 | import statsmodels.api as sm 98 | plt.plot(U, sm.distributions.ECDF(P)(U)) 99 | plt.plot([0,1],[0,1]) 100 | plt.show() 101 | -------------------------------------------------------------------------------- /selectinf/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/distributions/__init__.py -------------------------------------------------------------------------------- /selectinf/distributions/api.py: -------------------------------------------------------------------------------- 1 | from .discrete_family import discrete_family 2 | from .intervals import intervals_from_sample 3 | -------------------------------------------------------------------------------- /selectinf/distributions/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/distributions/tests/__init__.py -------------------------------------------------------------------------------- /selectinf/distributions/tests/test_chains.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..chain import parallel_test, serial_test 4 | from ...constraints.affine import constraints, gaussian_hit_and_run 5 | 6 | def test_gaussian_chain(): 7 | 8 | n = 30 9 | 10 | A = np.eye(n)[:3] 11 | b = np.ones(A.shape[0]) 12 | 13 | con = constraints(A, b) 14 | state = np.random.standard_normal(n) 15 | state[:3] = 0 16 | 17 | gaussian_chain = gaussian_hit_and_run(con, state, nstep=100) 18 | 19 | counter = 0 20 | for step in gaussian_chain: 21 | counter += 1 22 | 23 | if counter >= 100: 24 | break 25 | 26 | test_statistic = lambda z: np.sum(z) 27 | 28 | parallel = parallel_test(gaussian_chain, 29 | gaussian_chain.state, 30 | test_statistic, 31 | ndraw=20) 32 | 33 | serial = serial_test(gaussian_chain, 34 | gaussian_chain.state, 35 | test_statistic, 36 | ndraw=20) 37 | 38 | return parallel, serial 39 | -------------------------------------------------------------------------------- /selectinf/distributions/tests/test_discreteExFam.py: -------------------------------------------------------------------------------- 1 | # Testing 2 | from __future__ import print_function 3 | import numpy as np 4 | import nose.tools as nt 5 | from scipy.stats import poisson 6 | from ..discrete_family import discrete_family 7 | 8 | def test_MLE(): 9 | 10 | X = np.arange(100) 11 | observed = 4 12 | pois = discrete_family(X, poisson.pmf(X, 4.5)) 13 | 14 | MLE, var = pois.MLE(observed, tol=1.e-7, max_iter=30)[:2] 15 | mean_param = pois.E(MLE, lambda x: x) 16 | nt.assert_true(np.fabs(mean_param - observed) / observed < 1.e-4) 17 | nt.assert_true(np.fabs(mean_param - var*mean_param**2) < 1.e-3) 18 | 19 | def test_discreteExFam(): 20 | 21 | X = np.arange(100) 22 | pois = discrete_family(X, poisson.pmf(X, 1)) 23 | tol = 1e-5 24 | 25 | print(pois._leftCutFromRight(theta=0.4618311,rightCut=(5,.5)), pois._test2RejectsLeft(theta=2.39,observed=5,auxVar=.5)) 26 | print (pois.interval(observed=5,alpha=.05,randomize=True,auxVar=.5)) 27 | 28 | print (abs(1-sum(pois.pdf(0)))) 29 | pois.ccdf(0, 3, .4) 30 | 31 | print(pois.MLE(1.3)) 32 | 33 | print (pois.Var(np.log(2), lambda x: x)) 34 | print (pois.Cov(np.log(2), lambda x: x, lambda x: x)) 35 | 36 | lc = pois._rightCutFromLeft(0, (0,.01)) 37 | print ((0,0.01), pois._leftCutFromRight(0, lc)) 38 | 39 | pois._rightCutFromLeft(-10, (0,.01)) 40 | #[pois.test2Cutoffs(t)[1] for t in range(-10,3)] 41 | pois._critCovFromLeft(-10, (0,.01)) 42 | 43 | pois._critCovFromLeft(0, (0,.01)) 44 | pois._critCovFromRight(0, lc) 45 | 46 | pois._critCovFromLeft(5, (5, 1)) 47 | 48 | pois._test2RejectsLeft(np.log(5),5) 49 | pois._test2RejectsRight(np.log(5),5) 50 | 51 | pois._test2RejectsLeft(np.log(20),5) 52 | pois._test2RejectsRight(np.log(.1),5) 53 | 54 | print (pois._inter2Upper(5,auxVar=.5)) 55 | print (pois.interval(5,auxVar=.5)) 56 | 57 | -------------------------------------------------------------------------------- /selectinf/distributions/tests/test_multiparameter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..discrete_multiparameter import multiparameter_family 3 | 4 | def test_multiparameter(): 5 | 6 | X = [[3,4],[4,5],[5,8.]] 7 | w = [0.3, 0.5, 0.4] 8 | theta = [0.1,0.3] 9 | 10 | family = multiparameter_family(X, w) 11 | mu1 = family.mean(theta) 12 | 13 | X_arr = np.array(X) 14 | exponent = np.dot(X_arr, theta) 15 | 16 | w_arr = np.array(w) * np.exp(exponent) 17 | w_arr /= w_arr.sum() 18 | 19 | mu2 = (X_arr 
* w_arr[:,None]).sum(0) 20 | 21 | np.testing.assert_allclose(mu1, mu2) 22 | 23 | info1 = family.information(theta)[1] 24 | 25 | T = np.zeros((3,2,2)) 26 | for i in range(2): 27 | for j in range(2): 28 | T[:,i,j] = X_arr[:,i] * X_arr[:,j] 29 | 30 | second_moment = (T * w_arr[:,None,None]).sum(0) 31 | info2 = second_moment - np.outer(mu1, mu1) 32 | 33 | np.testing.assert_allclose(info1, info2) 34 | 35 | mu3 = np.array([family.E(theta, lambda x: x[:,i]) for i in range(2)]) 36 | np.testing.assert_allclose(mu1, mu3) 37 | 38 | cov01 = np.array(family.Cov(theta, lambda x: x[:,0], lambda x: x[:,1])) 39 | np.testing.assert_allclose(cov01, info1[0,1]) 40 | 41 | var0 = np.array(family.Var(theta, lambda x: x[:,0])) 42 | np.testing.assert_allclose(var0, info1[0,0]) 43 | 44 | observed = np.array([4.2,6.3]) 45 | theta_hat = family.MLE(observed, tol=1.e-12, max_iters=50) 46 | 47 | np.testing.assert_allclose(observed, family.mean(theta_hat)) 48 | -------------------------------------------------------------------------------- /selectinf/info.py: -------------------------------------------------------------------------------- 1 | """ This file defines parameters for selectinf that we use to fill 2 | settings in setup.py, the selectinf top-level docstring, and for building the docs. 3 | In setup.py in particular, we exec this file, so it cannot import selectinf 4 | """ 5 | 6 | # selectinf version information. An empty _version_extra corresponds to a 7 | # full release. '.dev' as a _version_extra string means this is a development 8 | # version 9 | _version_major = 0 10 | _version_minor = 1 11 | _version_micro = 0 12 | _version_extra = '' 13 | 14 | # Format expected by setup.py and doc/source/conf.py: string of form "X.Y.Z" 15 | __version__ = "%s.%s.%s%s" % (_version_major, 16 | _version_minor, 17 | _version_micro, 18 | _version_extra) 19 | 20 | CLASSIFIERS = ["Development Status :: 3 - Alpha", 21 | "Environment :: Console", 22 | "Intended Audience :: Science/Research", 23 | "License :: OSI Approved :: BSD License", 24 | "Operating System :: OS Independent", 25 | "Programming Language :: Python", 26 | "Topic :: Scientific/Engineering"] 27 | 28 | description = 'Testing a fixed value of lambda' 29 | 30 | # Note: this long_description is actually a copy/paste from the top-level 31 | # README.txt, so that it shows up nicely on PyPI. So please remember to edit 32 | # it only in one place and sync it correctly. 33 | long_description = \ 34 | """ 35 | ============ 36 | Fixed lambda 37 | ============ 38 | 39 | This mini-package contains a module to perform 40 | a fixed lambda test for the LASSO.
41 | """ 42 | 43 | # versions 44 | NUMPY_MIN_VERSION='1.7.1' 45 | SCIPY_MIN_VERSION = '0.9' 46 | CYTHON_MIN_VERSION = '0.21' 47 | MPMATH_MIN_VERSION = "0.18" 48 | PYINTER_MIN_VERSION = "0.1.6" 49 | SKLEARN_MIN_VERSION = '0.19' 50 | 51 | NAME = 'selectinf' 52 | MAINTAINER = "Jonathan Taylor" 53 | MAINTAINER_EMAIL = "" 54 | DESCRIPTION = description 55 | LONG_DESCRIPTION = long_description 56 | URL = "http://github.org/jonathan.taylor/selective-inference" 57 | DOWNLOAD_URL = "" 58 | LICENSE = "BSD license" 59 | CLASSIFIERS = CLASSIFIERS 60 | AUTHOR = "fixed_lambda developers" 61 | AUTHOR_EMAIL = "" 62 | PLATFORMS = "OS Independent" 63 | MAJOR = _version_major 64 | MINOR = _version_minor 65 | MICRO = _version_micro 66 | ISRELEASE = _version_extra == '' 67 | VERSION = __version__ 68 | STATUS = 'alpha' 69 | PROVIDES = ["fixed_lambda"] 70 | REQUIRES = ["numpy (>=%s)" % NUMPY_MIN_VERSION, 71 | "scipy (>=%s)" % SCIPY_MIN_VERSION, 72 | "mpmath (>=%s)" % MPMATH_MIN_VERSION, 73 | "pyinter"] 74 | -------------------------------------------------------------------------------- /selectinf/learning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/learning/__init__.py -------------------------------------------------------------------------------- /selectinf/learning/fitters.py: -------------------------------------------------------------------------------- 1 | import uuid, functools 2 | 3 | import numpy as np 4 | from scipy.stats import norm as ndist 5 | from sklearn import ensemble 6 | 7 | def gbm_fit_sk(T, Y, **params): 8 | 9 | fitfns = [] 10 | for j in range(Y.shape[1]): 11 | print('variable %d' % (j+1,)) 12 | y = Y[:,j].astype(np.int) 13 | clf = ensemble.GradientBoostingClassifier(**params) 14 | clf.fit(T, y) 15 | 16 | def fit_fn(clf, t): 17 | return clf.predict_proba(t)[:,1] 18 | 19 | fitfns.append(functools.partial(fit_fn, clf)) 20 | 21 | return fitfns 22 | 23 | def random_forest_fit_sk(T, Y, **params): 24 | 25 | fitfns = [] 26 | for j in range(Y.shape[1]): 27 | print('variable %d' % (j+1,)) 28 | y = Y[:,j].astype(np.int) 29 | clf = ensemble.RandomForestClassifier(**params) 30 | clf.fit(T, y) 31 | 32 | def fit_fn(clf, t): 33 | return clf.predict_proba(t)[:,1] 34 | 35 | fitfns.append(functools.partial(fit_fn, clf)) 36 | 37 | return fitfns 38 | 39 | -------------------------------------------------------------------------------- /selectinf/learning/keras_fit.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Based on https://stackoverflow.com/questions/44164749/how-does-keras-handle-multilabel-classification 3 | ''' 4 | 5 | import warnings 6 | 7 | try: 8 | 9 | from keras.models import Sequential 10 | from keras.layers import Dense, Dropout 11 | from keras.optimizers import SGD 12 | 13 | def keras_fit(T, Y, **kwargs): 14 | 15 | if Y.ndim == 1: 16 | Y.shape = (-1, 1) 17 | 18 | fitfns = [] 19 | 20 | for j in range(Y.shape[1]): 21 | y = Y[:,j] 22 | 23 | fit_fn = keras_fit_multilabel(T, y, **kwargs)[0] 24 | fitfns.append(fit_fn) 25 | return fitfns 26 | 27 | def keras_fit_multilabel(T, Y, sizes=[500, 500], epochs=50, activation='relu', dropout=0, **ignored): 28 | 29 | if Y.ndim == 1: 30 | Y.shape = (-1, 1) 31 | 32 | model = Sequential() 33 | for s in sizes: 34 | model.add(Dense(s, activation=activation, input_dim=T.shape[1])) 35 | if dropout > 0: 36 | model.add(Dropout(dropout)) 37 | 38 | # the 
final layer 39 | model.add(Dense(Y.shape[1], activation='sigmoid')) 40 | 41 | sgd = SGD(lr=0.03, decay=1e-3, momentum=0.6, nesterov=True) 42 | model.compile(loss='binary_crossentropy', 43 | optimizer=sgd) 44 | 45 | model.fit(T, Y, epochs=epochs) 46 | fitfns = [lambda T_test, j=j: model.predict(T_test)[:,j] for j in range(Y.shape[1])]  # bind j now: a plain closure would capture only the last column 47 | return fitfns 48 | 49 | except ImportError: 50 | warnings.warn('module `keras` not importable, `keras_fit` and `keras_fit_multilabel` will not be importable') 51 | -------------------------------------------------------------------------------- /selectinf/randomized/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/randomized/__init__.py -------------------------------------------------------------------------------- /selectinf/randomized/api.py: -------------------------------------------------------------------------------- 1 | from .query import multiple_queries, query 2 | from .randomization import randomization 3 | from .lasso import lasso, split_lasso 4 | from .screening import marginal_screening, stepup, topK 5 | from .slope import slope 6 | from .group_lasso import group_lasso 7 | -------------------------------------------------------------------------------- /selectinf/randomized/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ...tests.decorators import wait_for_return_value, set_sampling_params_iftrue 4 | from ...tests.instance import logistic_instance, gaussian_instance 5 | -------------------------------------------------------------------------------- /selectinf/randomized/tests/sandbox/test_cv_glmnet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import regreg.api as rr 3 | 4 | from ...algorithms.cv_glmnet import CV_glmnet 5 | from ...tests.instance import gaussian_instance 6 | 7 | def test_cv_glmnet(): 8 | np.random.seed(2) 9 | n, p = 3000, 1000 10 | X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=30, rho=0., sigma=1) 11 | loss = rr.glm.gaussian(X,y) 12 | CV_glmnet_gaussian = CV_glmnet(loss, 'gaussian') 13 | lam_CV, lam_1SD, lam_seq, CV_err, SD = CV_glmnet_gaussian.using_glmnet() 14 | print("CV error curve (nonrandomized):", CV_err) 15 | lam_grid_size = CV_glmnet_gaussian.lam_seq.shape[0] 16 | lam_CVR, SD, CVR, CV1, lam_seq = CV_glmnet_gaussian.choose_lambda_CVR(scale1=0.1, scale2=0.1) 17 | print("nonrandomized index:", list(lam_seq).index(lam_CV)) # index of the minimizer 18 | print("lam for nonrandomized CV plus sigma rule:",lam_CV,lam_1SD) 19 | print("lam_CVR:",lam_CVR) 20 | print("randomized index:", list(lam_seq).index(lam_CVR)) 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /selectinf/randomized/tests/sandbox/test_fixedX.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import norm as ndist 3 | 4 | import regreg.api as rr 5 | 6 | from ...tests.flags import SMALL_SAMPLES, SET_SEED 7 | from ...tests.instance import gaussian_instance 8 | from ...tests.decorators import wait_for_return_value, set_seed_iftrue, set_sampling_params_iftrue 9 | 10 | from ..api import randomization 11 | from ..glm import (resid_bootstrap, 12 | glm_nonparametric_bootstrap, 13 | fixedX_group_lasso) 14 | 15 | 16 |
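# Smoke test for the Laplace-randomized fixed-X group lasso: solve the
# randomized problem and, on the event that the true support was screened,
# sample the optimization variables and return selective p-values, coverage
# indicators and an active-variable mask. The test returns None when nothing
# is selected; judging by its use here, @wait_for_return_value reruns it until
# a value is returned.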
@set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=10, burnin=10) 17 | @set_seed_iftrue(SET_SEED) 18 | @wait_for_return_value() 19 | def test_fixedX(ndraw=10000, burnin=2000): # nsim needed for decorator 20 | s, n, p = 5, 200, 20 21 | 22 | randomizer = randomization.laplace((p,), scale=1.) 23 | X, Y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=0.1, signal=7) 24 | 25 | lam_frac = 1. 26 | lam = lam_frac * np.mean(np.fabs(X.T.dot(np.random.standard_normal((n, 50000)))).max(0)) * sigma 27 | W = np.ones(p) * lam 28 | epsilon = 1. / np.sqrt(n) 29 | 30 | penalty = rr.group_lasso(np.arange(p), 31 | weights=dict(zip(np.arange(p), W)), lagrange=1.) 32 | 33 | M_est = fixedX_group_lasso(X, Y, epsilon, penalty, randomizer) 34 | M_est.solve() 35 | 36 | active_set = M_est.selection_variable['variables'] 37 | nactive = active_set.sum() 38 | 39 | if set(nonzero).issubset(np.nonzero(active_set)[0]) and active_set.sum() > len(nonzero): 40 | 41 | selected_features = np.zeros(p, np.bool) 42 | selected_features[active_set] = True 43 | 44 | Xactive = X[:,active_set] 45 | unpenalized_mle = np.linalg.pinv(Xactive).dot(Y) 46 | 47 | form_covariances = glm_nonparametric_bootstrap(n, n) 48 | target_info, target_observed = resid_bootstrap(M_est.loss, active_set) 49 | 50 | cov_info = M_est.setup_sampler() 51 | target_cov, score_cov = form_covariances(target_info, 52 | cross_terms=[cov_info], 53 | nsample=M_est.nboot) 54 | 55 | opt_sample = M_est.sampler.sample(ndraw, 56 | burnin) 57 | 58 | pvalues = M_est.sampler.coefficient_pvalues(unpenalized_mle, 59 | target_cov, 60 | score_cov, 61 | parameter=np.zeros(selected_features.sum()), 62 | sample=opt_sample) 63 | intervals = M_est.sampler.confidence_intervals(unpenalized_mle, target_cov, score_cov, sample=opt_sample) 64 | 65 | true_vec = beta[M_est.selection_variable['variables']] 66 | 67 | L, U = intervals.T 68 | 69 | covered = np.zeros(nactive, np.bool) 70 | active_var = np.zeros(nactive, np.bool) 71 | active_set = np.nonzero(active_set)[0] 72 | 73 | for j in range(nactive): 74 | if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]): 75 | covered[j] = 1 76 | active_var[j] = active_set[j] in nonzero 77 | 78 | return pvalues, covered, active_var 79 | 80 | -------------------------------------------------------------------------------- /selectinf/randomized/tests/sandbox/test_full_lasso.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import nose.tools as nt 3 | 4 | from importlib import reload; import selection.randomized.lasso as L; reload(L)  # reload is not a builtin in Python 3 5 | from selection.randomized.lasso import lasso 6 | from selection.tests.instance import gaussian_instance 7 | import matplotlib.pyplot as plt 8 | 9 | def test_full_lasso(n=200, p=30, signal_fac=1.5, s=5, ndraw=5000, burnin=1000, sigma=3, full=False, rho=0.4, randomizer_scale=1): 10 | """ 11 | General LASSO -- conditioning on the full subgradient should reproduce the default fit's constraints. 12 | """ 13 | 14 | inst, const = gaussian_instance, lasso.gaussian 15 | signal = np.sqrt(signal_fac * np.log(p)) 16 | X, Y, beta = inst(n=n, 17 | p=p, 18 | signal=signal, 19 | s=s, 20 | equicorrelated=False, 21 | rho=rho, 22 | sigma=sigma, 23 | random_signs=True)[:3] 24 | 25 | n, p = X.shape 26 | 27 | W = np.ones(X.shape[1]) * np.sqrt(1.5 * np.log(p)) * sigma 28 | 29 | conv = const(X, 30 | Y, 31 | W, 32 | randomizer_scale=randomizer_scale * sigma) 33 | 34 | signs = conv.fit(solve_args={'min_its':500, 'tol':1.e-13}) 35 | nonzero = signs != 0 36 | 37 | conv2 = lasso.gaussian(X, 38 | Y, 39 | W, 40 | randomizer_scale=randomizer_scale * sigma) 41 | conv2.fit(perturb=conv._initial_omega,
solve_args={'min_its':500, 'tol':1.e-13}) 42 | conv2.decompose_subgradient(condition=np.ones(p, np.bool)) 43 | 44 | np.testing.assert_allclose(conv2._view.sampler.affine_con.covariance, 45 | conv.sampler.affine_con.covariance) 46 | 47 | np.testing.assert_allclose(conv2._view.sampler.affine_con.mean, 48 | conv.sampler.affine_con.mean) 49 | 50 | np.testing.assert_allclose(conv2._view.sampler.affine_con.linear_part, 51 | conv.sampler.affine_con.linear_part) 52 | 53 | np.testing.assert_allclose(conv2._view.sampler.affine_con.offset, 54 | conv.sampler.affine_con.offset) 55 | 56 | np.testing.assert_allclose(conv2._view.initial_soln, 57 | conv.initial_soln) 58 | 59 | np.testing.assert_allclose(conv2._view.initial_subgrad, 60 | conv.initial_subgrad) 61 | -------------------------------------------------------------------------------- /selectinf/randomized/tests/sandbox/test_general_lasso.py: -------------------------------------------------------------------------------- 1 | from itertools import product 2 | import numpy as np 3 | import nose.tools as nt 4 | 5 | from ..lasso import lasso 6 | from ...tests.instance import (gaussian_instance, 7 | logistic_instance, 8 | poisson_instance) 9 | from ...tests.flags import SMALL_SAMPLES 10 | from ...tests.decorators import set_sampling_params_iftrue 11 | 12 | @set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=50, burnin=20) 13 | def test_lasso_constructors(ndraw=1000, burnin=200): 14 | """ 15 | Smoke tests for lasso convenience constructors 16 | """ 17 | cls = lasso 18 | for const_info, rand, marginalize, condition in product(zip([gaussian_instance, 19 | logistic_instance, 20 | poisson_instance], 21 | [cls.gaussian, 22 | cls.logistic, 23 | cls.poisson]), 24 | ['gaussian', 'logistic', 'laplace'], 25 | [False, True], 26 | [False, True]): 27 | 28 | print(rand) 29 | inst, const = const_info 30 | X, Y = inst(n=100, p=20, signal=5, s=10)[:2] 31 | n, p = X.shape 32 | 33 | W = np.ones(X.shape[1]) * 0.2 34 | W[0] = 0 35 | W[3:] = 50. 
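# Deliberately heterogeneous weights: one unpenalized coordinate (W[0] = 0),
# two lightly penalized ones, the rest heavily penalized -- then shuffled so
# the constructors are exercised with an arbitrary weight pattern.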
36 | np.random.shuffle(W) 37 | conv = const(X, Y, W, randomizer=rand) 38 | nboot = 1000 39 | if SMALL_SAMPLES: 40 | nboot = 20 41 | signs = conv.fit(nboot=nboot) 42 | 43 | marginalize = None 44 | if marginalize: 45 | marginalize = np.zeros(p, np.bool) 46 | marginalize[:int(p/2)] = True 47 | 48 | condition = None 49 | if condition: 50 | if marginalize: 51 | condition = ~marginalize 52 | else: 53 | condition = np.ones(p, np.bool) 54 | condition[-int(p/4):] = False 55 | 56 | selected_features = np.zeros(p, np.bool) 57 | selected_features[:3] = True 58 | 59 | conv.summary(selected_features, 60 | ndraw=ndraw, 61 | burnin=burnin, 62 | compute_intervals=True) 63 | 64 | conv.decompose_subgradient(marginalize=marginalize, 65 | condition=condition) 66 | 67 | conv.summary(selected_features, 68 | ndraw=ndraw, 69 | burnin=burnin) 70 | 71 | conv.decompose_subgradient(condition=np.ones(p, np.bool)) 72 | 73 | conv.summary(selected_features, 74 | ndraw=ndraw, 75 | burnin=burnin) 76 | -------------------------------------------------------------------------------- /selectinf/randomized/tests/sandbox/test_opt_weighted_intervals.py: -------------------------------------------------------------------------------- 1 | from itertools import product 2 | import numpy as np 3 | import nose.tools as nt 4 | 5 | from ..convenience import lasso, step, threshold 6 | from ..query import optimization_sampler 7 | from ...tests.instance import (gaussian_instance, 8 | logistic_instance, 9 | poisson_instance) 10 | from ...tests.flags import SMALL_SAMPLES 11 | from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue 12 | import matplotlib.pyplot as plt 13 | 14 | from scipy.stats import t as tdist 15 | from ..glm import glm_nonparametric_bootstrap, pairs_bootstrap_glm 16 | from ..M_estimator import restricted_Mest 17 | 18 | @set_seed_iftrue(False, 200) 19 | @set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=1000, burnin=100) 20 | def test_opt_weighted_intervals(ndraw=20000, burnin=2000): 21 | 22 | results = [] 23 | cls = lasso 24 | for const_info, rand in product(zip([gaussian_instance], [cls.gaussian]), ['laplace', 'gaussian']): 25 | 26 | inst, const = const_info 27 | 28 | X, Y, beta = inst(n=100, p=20, s=0, signal=5., sigma=5.)[:3] 29 | n, p = X.shape 30 | 31 | W = np.ones(X.shape[1]) * 8 32 | conv = const(X, Y, W, randomizer=rand, parametric_cov_estimator=True) 33 | signs = conv.fit() 34 | print("signs", signs) 35 | 36 | marginalizing_groups = np.ones(p, np.bool) 37 | #marginalizing_groups[:int(p/2)] = True 38 | conditioning_groups = ~marginalizing_groups 39 | #conditioning_groups[-int(p/4):] = False 40 | conv.decompose_subgradient(marginalizing_groups=marginalizing_groups, 41 | conditioning_groups=conditioning_groups) 42 | 43 | selected_features = conv._view.selection_variable['variables'] 44 | nactive=selected_features.sum() 45 | print("nactive", nactive) 46 | if nactive==0: 47 | results.append(None) 48 | else: 49 | sel_pivots, sel_pval, sel_ci = conv.summary(selected_features, 50 | parameter=beta[selected_features], 51 | ndraw=ndraw, 52 | burnin=burnin, 53 | compute_intervals=True) 54 | print(sel_pivots) 55 | results.append((rand, sel_pivots, sel_ci, beta[selected_features])) 56 | 57 | return results 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /selectinf/randomized/tests/test_modelQ.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | 3 | import numpy as np 4 | 
import nose.tools as nt 5 | 6 | import regreg.api as rr 7 | 8 | from ..modelQ import modelQ 9 | from ..lasso import lasso 10 | from ...tests.instance import gaussian_instance 11 | 12 | def test_modelQ(): 13 | 14 | n, p, s = 200, 50, 4 15 | X, y, beta = gaussian_instance(n=n, 16 | p=p, 17 | s=s, 18 | sigma=1)[:3] 19 | 20 | lagrange = 5. * np.ones(p) * np.sqrt(n) 21 | perturb = np.random.standard_normal(p) * n 22 | LH = lasso.gaussian(X, y, lagrange) 23 | LH.fit(perturb=perturb, solve_args={'min_its':1000}) 24 | 25 | LQ = modelQ(X.T.dot(X), X, y, lagrange) 26 | LQ.fit(perturb=perturb, solve_args={'min_its':1000}) 27 | LQ.summary() # smoke test 28 | 29 | conH = LH.sampler.affine_con 30 | conQ = LQ.sampler.affine_con 31 | 32 | np.testing.assert_allclose(LH.initial_soln, LQ.initial_soln) 33 | np.testing.assert_allclose(LH.initial_subgrad, LQ.initial_subgrad) 34 | 35 | np.testing.assert_allclose(conH.linear_part, conQ.linear_part) 36 | np.testing.assert_allclose(conH.offset, conQ.offset) 37 | 38 | np.testing.assert_allclose(LH._beta_full, LQ._beta_full) 39 | 40 | -------------------------------------------------------------------------------- /selectinf/randomized/tests/test_randomization.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import nose.tools as nt 5 | 6 | from ..randomization import randomization 7 | 8 | def test_noise_dbns(): 9 | 10 | X = np.random.standard_normal((10, 5)) 11 | Q = X.T.dot(X) 12 | noises = [randomization.isotropic_gaussian((5,), 1.), 13 | randomization.laplace((5,), 1.), 14 | randomization.logistic((5,), 1.), 15 | randomization.gaussian(Q)] 16 | 17 | v1, v2 = [], [] 18 | 19 | for i, noise in enumerate(noises): 20 | 21 | x = np.random.standard_normal(5) 22 | u = np.random.standard_normal(5) 23 | v1.append(np.exp(noise.log_density(x))) 24 | v2.append(noise._density(x)) 25 | 26 | noise.smooth_objective(x, 'func') 27 | noise.smooth_objective(x, 'grad') 28 | noise.smooth_objective(x, 'both') 29 | noise.gradient(x) 30 | 31 | nt.assert_equal(noise.sample().shape, (5,)) 32 | nt.assert_equal(noise.sample().shape, (5,)) 33 | 34 | if noise.CGF is not None: 35 | u = np.zeros(5) 36 | u[:2] = 0.1 37 | noise.CGF.smooth_objective(u, 'both') 38 | 39 | if noise.CGF_conjugate is not None: 40 | noise.CGF_conjugate.smooth_objective(x, 'both') 41 | 42 | 43 | -------------------------------------------------------------------------------- /selectinf/randomized/tests/test_slope_subgrad.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..slope import _projection_onto_selected_subgradients 4 | 5 | def test_projection(): 6 | 7 | prox_arg = np.random.normal(0,1,10) 8 | weights = np.linspace(3, 5, 10)[::-1] 9 | ordering = np.random.choice(10, 10, replace=False) 10 | cluster_sizes = [2,3,1,1,3] 11 | active_signs = np.ones(10) 12 | 13 | proj = _projection_onto_selected_subgradients(prox_arg, 14 | weights, 15 | ordering, 16 | cluster_sizes, 17 | active_signs) 18 | 19 | print("projection", proj) 20 | 21 | 22 | -------------------------------------------------------------------------------- /selectinf/reduced_optimization/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/reduced_optimization/tests/__init__.py 
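As a quick orientation to the randomization objects exercised in test_randomization.py above, here is a minimal sketch; it assumes nothing beyond the calls that test already makes (randomization.isotropic_gaussian, sample, log_density and the regreg-style smooth_objective modes):

import numpy as np
from selectinf.randomized.randomization import randomization

# an isotropic Gaussian randomizer on R^5 with scale 1, as in test_noise_dbns
noise = randomization.isotropic_gaussian((5,), 1.)

omega = noise.sample()                            # one draw of the randomization, shape (5,)
density_value = np.exp(noise.log_density(omega))  # density of that draw
noise.smooth_objective(omega, 'both')             # value/gradient pair (regreg smooth-objective convention)

Such an object supplies the randomization used by the queries in this package, e.g. the `randomizer` passed to fixedX_group_lasso in test_fixedX.py above.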
-------------------------------------------------------------------------------- /selectinf/sampling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/sampling/__init__.py -------------------------------------------------------------------------------- /selectinf/sampling/api.py: -------------------------------------------------------------------------------- 1 | from .langevin import projected_langevin 2 | from .truncnorm import (sample_truncnorm_white, 3 | sample_truncnorm_white_sphere, 4 | sample_truncnorm_white_ball) 5 | -------------------------------------------------------------------------------- /selectinf/sampling/langevin.py: -------------------------------------------------------------------------------- 1 | """ 2 | Projected Langevin sampler of `http://arxiv.org/abs/1507.02564`_ 3 | """ 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | from scipy.stats import norm as ndist 8 | 9 | class projected_langevin(object): 10 | 11 | def __init__(self, 12 | initial_condition, 13 | gradient_map, 14 | projection_map, 15 | stepsize): 16 | 17 | (self.state, 18 | self.gradient_map, 19 | self.projection_map, 20 | self.stepsize) = (np.copy(initial_condition), 21 | gradient_map, 22 | projection_map, 23 | stepsize) 24 | self._shape = self.state.shape[0] 25 | self._sqrt_step = np.sqrt(self.stepsize) 26 | self._noise = ndist(loc=0, scale=1) 27 | 28 | def __iter__(self): 29 | return self 30 | 31 | def next(self): 32 | nattempt = 0 33 | while True: 34 | 35 | proj_arg = (self.state 36 | + 0.5 * self.stepsize * self.gradient_map(self.state) 37 | + self._noise.rvs(self._shape) * self._sqrt_step) 38 | candidate = self.projection_map(proj_arg) 39 | if not np.all(np.isfinite(self.gradient_map(candidate))): 40 | nattempt += 1 41 | self._sqrt_step *= 0.8 42 | self.stepsize = self._sqrt_step**2 43 | if nattempt >= 30: 44 | raise ValueError('unable to find feasible step') 45 | else: 46 | self.state[:] = candidate 47 | break 48 | -------------------------------------------------------------------------------- /selectinf/sampling/sequential.py: -------------------------------------------------------------------------------- 1 | """ 2 | Sequential Monte Carlo for approximately constrained Gaussians. 3 | 4 | http://arxiv.org/abs/1410.8209 5 | 6 | """ 7 | 8 | import numpy as np 9 | 10 | def sample(white_constraint, 11 | nsample, 12 | proposal_sigma=0.2, 13 | temps=np.linspace(0, 50, 51)): 14 | """ 15 | Build up an approximately constrained Gaussian 16 | based on relaxations of the constraint. 17 | 18 | Parameters 19 | ---------- 20 | 21 | white_constraint : `selection.constraints.affine` 22 | Affine constraint with identity covariance 23 | 24 | nsample : int 25 | How many samples to draw?
26 | 27 | proposal_sigma : float 28 | Scale of the Gaussian random-walk proposal used in the Metropolis-Hastings steps. 29 | """ 30 | 31 | n = white_constraint.dim 32 | sample_z = np.random.standard_normal((n, nsample)) 33 | 34 | def constraint_function(z, con): 35 | value = (np.dot(con.linear_part, z) - con.offset[:,None]) 36 | return value.max(0) 37 | 38 | def constraint_logit(temp, z, con): 39 | tmp_z = constraint_function(z, con) 40 | tmp_v = np.exp(-temp * tmp_z) 41 | return tmp_v / (1 + tmp_v) 42 | 43 | def MH_sample(temp, z_cur, con): 44 | step = np.random.standard_normal(z_cur.shape) * proposal_sigma 45 | z_new = z_cur + step 46 | 47 | W_new = constraint_logit(temp, z_new, con) 48 | W_cur = constraint_logit(temp, z_cur, con) 49 | W_new *= np.exp(-(z_new**2).sum(0)/2) 50 | W_cur *= np.exp(-(z_cur**2).sum(0)/2) 51 | 52 | coin_flip = np.less_equal(np.random.sample(z_cur.shape[1]), W_new / W_cur) 53 | final_sample = coin_flip * z_new + (1 - coin_flip) * z_cur 54 | return final_sample 55 | 56 | weights = np.ones(nsample, float) / nsample 57 | 58 | num = np.ones(nsample) / 2 59 | for i in range(temps.shape[0]-1): 60 | 61 | num, den = constraint_logit(temps[i+1], sample_z, white_constraint), num 62 | 63 | weights *= np.exp(np.log(num) - np.log(den)) 64 | weights /= weights.sum() 65 | 66 | ESS = 1. / (weights**2).sum() 67 | if ESS < nsample / 2.: 68 | idx_z = np.random.choice(np.arange(nsample), size=(nsample,), replace=True, p=weights) 69 | sample_z = sample_z[:, idx_z] 70 | weights = np.ones(nsample, float) / nsample 71 | sample_z = MH_sample(temps[i+1], sample_z, white_constraint) 72 | 73 | return sample_z 74 | 75 | 76 | -------------------------------------------------------------------------------- /selectinf/sampling/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/sampling/tests/__init__.py -------------------------------------------------------------------------------- /selectinf/sampling/tests/plots_fs.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | from .test_fstep_langevin import test_fstep 4 | from .test_kfstep import test_kfstep 5 | import random 6 | 7 | def main(): 8 | 9 | import statsmodels.api as sm 10 | from scipy.stats import probplot, uniform 11 | from matplotlib import pyplot as plt 12 | random.seed(4) 13 | 14 | fig = plt.figure() 15 | plot_1step = fig.add_subplot(121) 16 | plot_kstep = fig.add_subplot(122) 17 | 18 | 19 | P0 = [] 20 | for i in range(300): 21 | 22 | print("iteration", i) 23 | p0 = test_fstep(Langevin_steps=10000, burning=2000) 24 | P0.append(p0) 25 | 26 | print("one step FS done! mean: ", np.mean(P0), "std: ", np.std(P0)) 27 | #probplot(P0, dist=uniform, sparams=(0,1), plot=plot_1step, fit=False) 28 | #plot_1step.plot([0, 1], color='k', linestyle='-', linewidth=2) 29 | 30 | ecdf = sm.distributions.ECDF(P0) 31 | x = np.linspace(min(P0), max(P0)) 32 | y = ecdf(x) 33 | plot_1step.plot(x, y, '-o', lw=2) 34 | plot_1step.plot([0, 1], [0, 1], 'k-', lw=2) 35 | 36 | plot_1step.set_title("One step FS") 37 | plot_1step.set_xlim([0,1]) 38 | plot_1step.set_ylim([0,1]) 39 | 40 | 41 | P0 = [] 42 | for i in range(300): 43 | print("iteration", i) 44 | p0 = test_kfstep(Langevin_steps=10000, burning=2000) 45 | P0.append(p0) 46 | 47 | print("k steps FS done!
mean: ", np.mean(P0), "std: ", np.std(P0)) 48 | #probplot(P0, dist=uniform, sparams=(0,1), plot=plot_kstep, fit=False) 49 | #plot_kstep.plot([0, 1], color='k', linestyle='-', linewidth=2) 50 | 51 | 52 | ecdf = sm.distributions.ECDF(P0) 53 | x = np.linspace(min(P0), max(P0)) 54 | y = ecdf(x) 55 | plot_kstep.plot(x, y, '-o', lw=2) 56 | plot_kstep.plot([0, 1], [0, 1], 'k-', lw=2) 57 | 58 | plot_kstep.set_title("Four steps FS") 59 | plot_kstep.set_xlim([0,1]) 60 | plot_kstep.set_ylim([0,1]) 61 | 62 | 63 | 64 | plt.savefig('FS_Langevin.pdf') 65 | plt.show() 66 | 67 | 68 | -------------------------------------------------------------------------------- /selectinf/sampling/tests/test_pca_langevin.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..langevin import projected_langevin 4 | 5 | ### Some examples: PCA from https://arxiv.org/abs/1410.8260 6 | 7 | def _log_vandermonde(eigenvals, power=1): 8 | """ 9 | Log of the Vandermonde determinant. 10 | """ 11 | eigenvals = np.asarray(eigenvals) 12 | p = eigenvals.shape[0] 13 | idx = np.arange(p) 14 | logdiff = np.log(np.fabs(np.subtract.outer(eigenvals, eigenvals))) 15 | mask = np.greater.outer(idx, idx) 16 | 17 | return power * (logdiff * mask).sum() 18 | 19 | def _grad_log_vandermonde(eigenvals, power=1): 20 | """ 21 | Gradient of the log of the Vandermonde determinant. 22 | """ 23 | eigenvals = np.asarray(eigenvals) 24 | p = eigenvals.shape[0] 25 | idx = np.arange(p) 26 | diff = np.subtract.outer(eigenvals, eigenvals) 27 | diff_sign = -np.sign(diff) 28 | mask = (diff > 0) 29 | return (1. / (np.fabs(diff) + np.identity(p)) * mask * diff_sign).sum(1) 30 | 31 | def _log_wishart_white(eigenvals, n): 32 | """ 33 | Log-eigenvalue density of Wishart($I_{p \times p}$, n) assuming n>p, 34 | up to normalizing constant. 35 | """ 36 | eigenvals = np.asarray(eigenvals) 37 | p = eigenvals.shape[0] 38 | 39 | return ((n - p - 1) * 0.5 * np.log(eigenvals).sum() 40 | + _log_vandermonde(eigenvals, power=1) 41 | - eigenvals.sum() * 0.5) 42 | 43 | def _grad_log_wishart_white(eigenvals, n): 44 | """ 45 | Gradient of log-eigenvalue density of Wishart($I_{p \times p}$, n) 46 | assuming n>p.
47 | """ 48 | eigenvals = np.asarray(eigenvals) 49 | p = eigenvals.shape[0] 50 | return ((n - p - 1) * 0.5 / (eigenvals + 1.e-7) 51 | + _grad_log_vandermonde(eigenvals, power=1) - 0.5) 52 | 53 | def main(n=50): 54 | 55 | from regreg.atoms._isotonic import _isotonic_regression 56 | import matplotlib.pyplot as plt 57 | initial = np.ones(n) + 0.01 * np.random.standard_normal(n) 58 | grad_map = lambda val: _grad_log_wishart_white(val, n) 59 | 60 | def projection_map(vals): 61 | iso = np.zeros_like(vals) 62 | _isotonic_regression(vals, np.ones_like(vals), iso) 63 | vals = np.asarray(iso) 64 | return np.maximum(vals, 1.e-6) 65 | 66 | sampler = projected_langevin(initial, 67 | grad_map, 68 | projection_map, 69 | 0.01) 70 | sampler = iter(sampler) 71 | 72 | path = [initial.copy()] 73 | for _ in range(200): 74 | print(sampler.state) 75 | sampler.next() 76 | path.append(sampler.state.copy()) 77 | path = np.array(path) 78 | 79 | [plt.plot(path[:,i]) for i in range(5)] 80 | plt.show() 81 | 82 | -------------------------------------------------------------------------------- /selectinf/sampling/tests/test_sequential.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.testing.decorators as dec 3 | from scipy.stats import norm as ndist 4 | 5 | from ...constraints.affine import constraints 6 | from ..sequential import sample 7 | from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue 8 | from ...tests.flags import SMALL_SAMPLES, SET_SEED 9 | 10 | @dec.slow 11 | @set_seed_iftrue(SET_SEED) 12 | @set_sampling_params_iftrue(SMALL_SAMPLES, ndraw=10, nsim=10) 13 | def test_sequentially_constrained(ndraw=100, nsim=50): 14 | S = -np.identity(10)[:3] 15 | b = -6 * np.ones(3) 16 | C = constraints(S, b) 17 | W = sample(C, nsim, temps=np.linspace(0, 200, 1001)) 18 | U = np.linspace(0, 1, 101) 19 | 20 | -------------------------------------------------------------------------------- /selectinf/sandbox/approx_ci/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/sandbox/approx_ci/__init__.py -------------------------------------------------------------------------------- /selectinf/sandbox/approx_ci/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/sandbox/approx_ci/tests/__init__.py -------------------------------------------------------------------------------- /selectinf/sandbox/bayesian/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/sandbox/bayesian/__init__.py -------------------------------------------------------------------------------- /selectinf/sandbox/bayesian/credible_intervals.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import norm as ndist 3 | 4 | class projected_langevin(object): 5 | 6 | def __init__(self, 7 | initial_condition, 8 | gradient_map, 9 | projection_map, 10 | stepsize): 11 | 12 | (self.state, 13 | self.gradient_map, 14 | self.projection_map, 15 | self.stepsize) = (np.copy(initial_condition), 16 | gradient_map, 17 | 
projection_map, 18 | stepsize) 19 | self._shape = self.state.shape[0] 20 | self._sqrt_step = np.sqrt(self.stepsize) 21 | self._noise = ndist(loc=0,scale=1) 22 | 23 | def __iter__(self): 24 | return self 25 | 26 | def next(self): 27 | while True: 28 | proj_arg = (self.state + 0.5 * self.stepsize * self.gradient_map(self.state) 29 | + self._noise.rvs(self._shape) * self._sqrt_step) 30 | candidate = self.projection_map(proj_arg) 31 | if not np.all(np.isfinite(self.gradient_map(candidate))): 32 | print(candidate, self._sqrt_step) 33 | self._sqrt_step *= 0.8 34 | else: 35 | self.state[:] = candidate 36 | break 37 | -------------------------------------------------------------------------------- /selectinf/sandbox/bayesian/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/sandbox/bayesian/tests/__init__.py -------------------------------------------------------------------------------- /selectinf/src_C/#sample_preparation.pyx#: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | cimport numpy as np 4 | 5 | #from cython.view cimport array as cvarray 6 | from libc.stdlib cimport malloc, free 7 | 8 | 9 | cdef extern from "preparation_Eig_Vect.h": 10 | void samples(int n, 11 | int dim, 12 | int seed, 13 | double* initial, 14 | int numlin, 15 | int numquad, 16 | double* lin, 17 | double* quad, 18 | double* quad_lin, 19 | double* offset_lin, 20 | double* offset_quad, 21 | double* samples_Carray) 22 | 23 | 24 | 25 | def quad_sampler(int n_sample, 26 | initial, 27 | quad,# = np.array([]).reshape((0, 0, 0)), 28 | quad_lin,# = np.array([]).reshape((0, 0)), 29 | lin,# = np.array([]).reshape((0,0)), 30 | offset_quad,# = np.array([]), 31 | offset_lin # = np.array([]) 32 | ): 33 | 34 | 35 | 36 | cdef int numquad = quad.shape[0] 37 | cdef int p = quad.shape[1] 38 | cdef int numlin = lin.shape[0] 39 | 40 | cdef np.ndarray[np.double_t, ndim=3] quad2 = np.ascontiguousarray(-quad) 41 | cdef np.ndarray[np.double_t, ndim=2] quad_lin2 = np.ascontiguousarray(-quad_lin) 42 | cdef np.ndarray[np.double_t, ndim=1] offset_quad2 = np.ascontiguousarray(offset_quad) 43 | 44 | cdef double *pt_quad 45 | cdef double *pt_quad_lin 46 | cdef double *pt_quad_offset 47 | if numquad > 0: 48 | pt_quad = &quad2[0, 0, 0] 49 | pt_quad_lin = &quad_lin2[0, 0] 50 | pt_quad_offset = &offset_quad2[0] 51 | 52 | 53 | cdef np.ndarray[np.double_t, ndim=2] lin2 = np.ascontiguousarray(-lin ) 54 | cdef np.ndarray[np.double_t, ndim=1] offset_lin2 = np.ascontiguousarray(offset_lin ) 55 | 56 | cdef double *pt_lin 57 | cdef double *pt_lin_offset 58 | if numlin > 0: 59 | pt_lin_offset = &offset_lin2[0] 60 | pt_lin = &lin2[0, 0] 61 | 62 | cdef np.ndarray[np.double_t, ndim=1] initial2 = np.ascontiguousarray(initial) 63 | 64 | cdef int seed = np.random.randint(1, 100000) 65 | 66 | cdef double *samples_Carray = <double *> malloc(n_sample*p * sizeof(double)) 67 | 68 | samples(n_sample, 69 | p, 70 | seed, 71 | &initial2[0], 72 | numlin, 73 | numquad, 74 | pt_lin, 75 | pt_quad, 76 | pt_quad_lin, 77 | pt_lin_offset, 78 | pt_quad_offset, 79 | samples_Carray) 80 | 81 | 82 | cdef np.ndarray[np.double_t, ndim=2] samples_array = np.zeros((n_sample, p)) 83 | for i in range(n_sample): 84 | for j in range(p): 85 | samples_array[i, j] = samples_Carray[i*p + j] 86 | 87 | free(samples_Carray) 88 | 89 | return samples_array 90 | 91 | 92 |
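Illustrative usage (not a file in this repository): a hypothetical call of the quad_sampler entry point defined above. The argument shapes are inferred from the Cython signature (quad stacks numquad p-by-p quadratic forms, quad_lin and lin carry their linear parts, and the offset_* arrays the constants); the module name sampler is taken from src_C/setup.py further below and assumes the extension has been built in place. This sketches shapes only -- a real call needs constraints with a strictly feasible initial point.

import numpy as np
from sampler import quad_sampler  # extension name from src_C/setup.py (assumes an in-place build)

p, numlin, numquad, n_sample = 5, 2, 1, 100
draws = quad_sampler(n_sample,
                     np.zeros(p),                # initial point, length p
                     np.zeros((numquad, p, p)),  # stacked quadratic forms
                     np.zeros((numquad, p)),     # linear parts of the quadratic constraints
                     np.zeros((numlin, p)),      # linear constraint matrix
                     np.zeros(numquad),          # quadratic-constraint offsets
                     np.zeros(numlin))           # linear-constraint offsets
# draws is an (n_sample, p) ndarray of HMC samples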
-------------------------------------------------------------------------------- /selectinf/src_C/HmcSampler.h: -------------------------------------------------------------------------------- 1 | /* 2 | * File: HmcSampler.h 3 | * Author: aripakman 4 | * 5 | * Created on July 4, 2012, 10:44 AM 6 | */ 7 | 8 | #ifndef HMCSAMPLER_H 9 | #define HMCSAMPLER_H 10 | 11 | #define _USE_MATH_DEFINES 12 | 13 | #include <cmath> /* NB: the four bracketed include targets were lost in extraction; reconstructed */ 14 | #include <Eigen/Dense> 15 | #include <vector> 16 | #include <tr1/random> 17 | 18 | using namespace Eigen; 19 | using namespace std; 20 | using namespace std::tr1; 21 | 22 | struct LinearConstraint{ 23 | VectorXd f; 24 | double g; 25 | }; 26 | 27 | struct QuadraticConstraint{ 28 | MatrixXd A; 29 | VectorXd B; 30 | double C; 31 | }; 32 | 33 | 34 | class HmcSampler { 35 | public: 36 | 37 | HmcSampler(const int & d, const int & seed); 38 | 39 | void setInitialValue(const VectorXd & initial); 40 | void addLinearConstraint(const VectorXd & f, const double & g); 41 | void addQuadraticConstraint(const MatrixXd & A, const VectorXd & B, const double & C); 42 | MatrixXd sampleNext(bool returnTrace = false); 43 | 44 | private: 45 | int dim; 46 | VectorXd lastSample; 47 | static const double min_t; 48 | vector<LinearConstraint> linearConstraints; 49 | vector<QuadraticConstraint> quadraticConstraints; 50 | 51 | ranlux64_base_01 eng1; 52 | // mt19937 eng1; //to sample time and momenta 53 | uniform_real<> ud; 54 | normal_distribution<> nd; 55 | 56 | void _getNextLinearHitTime(const VectorXd & a, const VectorXd & b, double & t, int & cn ); 57 | void _getNextQuadraticHitTime(const VectorXd & a, const VectorXd & b, double & t, int & cn, const bool ); 58 | double _verifyConstraints(const VectorXd &); 59 | void _updateTrace( VectorXd const & a, VectorXd const & b, double const & tt, MatrixXd & tracePoints); 60 | }; 61 | 62 | #endif /* HMCSAMPLER_H */ 63 | 64 | -------------------------------------------------------------------------------- /selectinf/src_C/logfile.txt: -------------------------------------------------------------------------------- 1 | -1-0 2 | -0-1 3 | 4 | 0.846196-0.9041 5 | 0.7401690.590085 6 | -0.18959-0.17084 7 | -0.4238650.0333025 8 | -0.592693-0.266382 9 | 0.0690678-0.00674659 10 | -0.174223-0.431466 11 | 0.6978830.440892 12 | 0.144409-0.675854 13 | -0.3425970.0214389 14 | 15 | 0.846196-0.9041 16 | 0.7401690.590085 17 | -0.18959-0.17084 18 | -0.4238650.0333025 19 | -0.592693-0.266382 20 | 0.0690678-0.00674659 21 | -0.174223-0.431466 22 | 0.6978830.440892 23 | 0.144409-0.675854 24 | -0.3425970.0214389 25 | -------------------------------------------------------------------------------- /selectinf/src_C/preparation_Eig_Vect.cpp: -------------------------------------------------------------------------------- 1 | #include <fstream> /* NB: bracketed include targets in this file were lost in extraction; reconstructed */ 2 | #include <iostream> 3 | #include "HmcSampler.h" 4 | 5 | #include "preparation_Eig_Vect.h" 6 | 7 | 8 | #include <Eigen/Dense> 9 | 10 | using namespace std; 11 | using namespace Eigen; 12 | 13 | void samples( 14 | int n, 15 | int dim, 16 | int seed, 17 | double *initial, 18 | int numlin, 19 | int numquad, 20 | double *lin, 21 | double *quad, 22 | double *quad_lin, 23 | double *offset_lin, 24 | double *offset_quad, 25 | double *samples_Carray 26 | ){ 27 | 28 | 29 | const Map<VectorXd> initial_value(initial, dim); 30 | 31 | 32 | 33 | ofstream logfile; 34 | logfile.open ("logfile.txt"); 35 | 36 | 37 | HmcSampler hmc1(dim, seed); 38 | if (numlin >0){ 39 | const Map<MatrixXd> F(lin, numlin, dim); 40 | const Map<VectorXd> g(offset_lin, numlin); 41 | 42 | for(int i=0; i<numlin; i++){ /* reconstructed: this loop body was lost in extraction */ 43 | hmc1.addLinearConstraint(F.row(i).transpose(), g(i)); 44 | } 45 | } 46 | 47 | if (numquad >0){ 48 | 49 | for(int i=0; i<numquad; i++){ /* reconstructed */ 50 | double *indice = &quad[i*dim*dim]; 51 | const Map<MatrixXd> A_Map(indice, dim, dim); 52 | 53 | MatrixXd A(dim, dim); /* reconstructed: original lines 53-62 were lost in extraction */ 54 | for(int k=0; k<dim; k++){ 55 | for(int l=0; l<dim; l++){ 56 | A(k,l) = A_Map(k,l); 57 | } 58 | } 59 | 60 | 61 | 62 | 63 | const Map<VectorXd> B_Map(&quad_lin[i*dim], dim); 64 | VectorXd
B(B_Map); 65 | double C = offset_quad[i]; 66 | hmc1.addQuadraticConstraint(A,B,C); 67 | } 68 | 69 | } 70 | 71 | hmc1.setInitialValue(initial_value); 72 | 73 | MatrixXd samples(n,dim); 74 | 75 | for (int i=0; i<n; i++){ /* reconstructed: the remainder of this file was lost in extraction */ 76 | samples.row(i) = hmc1.sampleNext(); 77 | for (int j=0; j<dim; j++){ 78 | samples_Carray[i*dim + j] = samples(i,j); 79 | } 80 | } 81 | 82 | logfile.close(); 83 | } 84 | -------------------------------------------------------------------------------- /selectinf/src_C/sample_preparation.pyx: -------------------------------------------------------------------------------- 1 | # NB: lines 1-47 were lost in extraction; reconstructed from the #sample_preparation.pyx# autosave copy above 2 | import numpy as np 3 | cimport numpy as np 4 | 5 | #from cython.view cimport array as cvarray 6 | from libc.stdlib cimport malloc, free 7 | 8 | 9 | cdef extern from "preparation_Eig_Vect.h": 10 | void samples(int n, 11 | int dim, 12 | int seed, 13 | double* initial, 14 | int numlin, 15 | int numquad, 16 | double* lin, 17 | double* quad, 18 | double* quad_lin, 19 | double* offset_lin, 20 | double* offset_quad, 21 | double* samples_Carray) 22 | 23 | 24 | 25 | def quad_sampler(int n_sample, 26 | initial, 27 | quad,# = np.array([]).reshape((0, 0, 0)), 28 | quad_lin,# = np.array([]).reshape((0, 0)), 29 | lin,# = np.array([]).reshape((0,0)), 30 | offset_quad,# = np.array([]), 31 | offset_lin # = np.array([]) 32 | ): 33 | 34 | 35 | 36 | cdef int numquad = quad.shape[0] 37 | cdef int p = quad.shape[1] 38 | cdef int numlin = lin.shape[0] 39 | 40 | cdef np.ndarray[np.double_t, ndim=3] quad2 = np.ascontiguousarray(-quad) 41 | cdef np.ndarray[np.double_t, ndim=2] quad_lin2 = np.ascontiguousarray(-quad_lin) 42 | cdef np.ndarray[np.double_t, ndim=1] offset_quad2 = np.ascontiguousarray(offset_quad) 43 | 44 | cdef double *pt_quad 45 | cdef double *pt_quad_lin 46 | cdef double *pt_quad_offset 47 | if numquad > 0: 48 | pt_quad = &quad2[0, 0, 0] 49 | pt_quad_lin = &quad_lin2[0, 0] 50 | pt_quad_offset = &offset_quad2[0] 51 | 52 | 53 | 54 | print("quad inequalities generated") 55 | 56 | 57 | cdef np.ndarray[np.double_t, ndim=2] lin2 = np.ascontiguousarray(-lin ) 58 | cdef np.ndarray[np.double_t, ndim=1] offset_lin2 = np.ascontiguousarray(offset_lin ) 59 | 60 | cdef double *pt_lin 61 | cdef double *pt_lin_offset 62 | if numlin > 0: 63 | pt_lin_offset = &offset_lin2[0] 64 | pt_lin = &lin2[0, 0] 65 | 66 | cdef np.ndarray[np.double_t, ndim=1] initial2 = np.ascontiguousarray(initial) 67 | 68 | cdef int seed = np.random.randint(1, 100000) 69 | 70 | cdef double *samples_Carray = <double *> malloc(n_sample*p * sizeof(double)) 71 | 72 | samples(n_sample, 73 | p, 74 | seed, 75 | &initial2[0], 76 | numlin, 77 | numquad, 78 | pt_lin, 79 | pt_quad, 80 | pt_quad_lin, 81 | pt_lin_offset, 82 | pt_quad_offset, 83 | samples_Carray) 84 | 85 | 86 | cdef np.ndarray[np.double_t, ndim=2] samples_array = np.zeros((n_sample, p)) 87 | for i in range(n_sample): 88 | for j in range(p): 89 | samples_array[i, j] = samples_Carray[i*p + j] 90 | 91 | free(samples_Carray) 92 | 93 | return samples_array 94 | 95 | 96 | -------------------------------------------------------------------------------- /selectinf/src_C/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup, Extension 2 | from Cython.Build import cythonize 3 | 4 | from Cython.Distutils import build_ext 5 | import numpy as np 6 | 7 | 8 | 9 | setup( 10 | #name = 'kmean', 11 | cmdclass = {'build_ext': build_ext}, 12 | include_dirs = [np.get_include()], 13 | ## ext_modules = cythonize("sample_preparation.pyx", 14 | ## language="c++") 15 | ext_modules = [Extension('sampler', 16 | ["sample_preparation.pyx" , 17 | 'preparation_Eig_Vect.cpp' , 18 | 'HmcSampler.cpp'], 19 | language="c++", 20 | extra_compile_args = ["-W", 21 | "-Wall", 22 | "-ansi", 23 | "-pedantic", 24 | "-stdlib=libstdc++"#, 25 | #"-fPIC" 26 | ], 27 | extra_link_args = ["-stdlib=libstdc++"] 28 | )] 29 | 30 | ) 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /selectinf/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/tests/__init__.py -------------------------------------------------------------------------------- /selectinf/tests/flags.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | SMALL_SAMPLES = False 4 | SET_SEED = False 5 | 6 | if "USE_SMALL_SAMPLES" in os.environ: 7 | SMALL_SAMPLES = True 8 | 9 | if "USE_TEST_SEED" in os.environ: 10 | SET_SEED = True 11 | -------------------------------------------------------------------------------- /selectinf/tests/test_instance.py: -------------------------------------------------------------------------------- 1 | from numpy import inf 2 | from itertools import product 3 | from .instance import gaussian_instance, logistic_instance, HIV_NRTI 4 | 5 | def test_gaussian_instance(): 6 | 7 | for scale, center, random_signs, df in product( 8 | [True, False], 9 | [True, False], 10 | [True, False], 11 | [40, inf]): 12 |
gaussian_instance(n=10, 13 | p=20, 14 | s=4, 15 | random_signs=random_signs, 16 | scale=scale, 17 | center=center, 18 | df=df) 19 | 20 | def test_logistic_instance(): 21 | 22 | for scale, center, random_signs in product( 23 | [True, False], 24 | [True, False], 25 | [True, False]): 26 | logistic_instance(n=10, 27 | p=20, 28 | s=4, 29 | random_signs=random_signs, 30 | scale=scale, 31 | center=center) 32 | 33 | def test_HIV_instance(): 34 | 35 | HIV_NRTI() 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /selectinf/tests/tests.py: -------------------------------------------------------------------------------- 1 | from ..algorithms import tests as algorithms 2 | from ..distributions import tests as distributions 3 | from ..truncated import tests as truncated 4 | from ..constraints import tests as constraints 5 | from ..sampling import tests as sampling 6 | -------------------------------------------------------------------------------- /selectinf/truncated/F.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import mpmath as mp 3 | 4 | from .base import truncated 5 | 6 | 7 | def sf_F(d1, d2, scale): 8 | 9 | def sf(a, b=np.inf, dps=15): 10 | dps_temp = mp.mp.dps 11 | mp.mp.dps = dps 12 | 13 | tmp_a = d1*a/d2 14 | tmp_b = d1*b/d2 15 | beta_a = tmp_a / (1. + tmp_a) 16 | beta_b = tmp_b / (1. + tmp_b) 17 | if b == np.inf: 18 | beta_b = 1. 19 | sf = mp.betainc(d1/2., d2/2., 20 | x1=beta_a, x2=beta_b, 21 | regularized=True) 22 | mp.mp.dps = dps_temp 23 | return sf 24 | 25 | return sf 26 | 27 | def null_f(x): 28 | raise ValueError("Shouldn't be called") 29 | return 0 30 | 31 | 32 | class truncated_F(truncated): 33 | def __init__(self, intervals, d1, d2, scale=1): 34 | self._d1 = d1 35 | self._d2 = d2 36 | self._scale = scale 37 | 38 | truncated.__init__(self, 39 | intervals, 40 | null_f, 41 | null_f, 42 | sf_F(d1, d2, scale), 43 | null_f) 44 | -------------------------------------------------------------------------------- /selectinf/truncated/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/truncated/__init__.py -------------------------------------------------------------------------------- /selectinf/truncated/api.py: -------------------------------------------------------------------------------- 1 | from .base import find_root 2 | 3 | from .gaussian import truncated_gaussian 4 | from .chi import truncated_chi, truncated_chi2 5 | from .T import truncated_T 6 | from .F import truncated_F 7 | -------------------------------------------------------------------------------- /selectinf/truncated/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/truncated/tests/__init__.py -------------------------------------------------------------------------------- /selectinf/truncated/tests/test_truncated.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import nose.tools as nt 3 | import numpy as np 4 | import numpy.testing.decorators as dec 5 | 6 | from ..gaussian import truncated_gaussian, truncated_gaussian_old 7 | from ...tests.decorators import set_sampling_params_iftrue, set_seed_iftrue
8 | from ...tests.flags import SMALL_SAMPLES, SET_SEED 9 | 10 | intervals = [(-np.inf,-4.),(3.,np.inf)] 11 | 12 | tg = truncated_gaussian(intervals) 13 | 14 | X = np.linspace(-5,5,101) 15 | F = [tg.cdf(x) for x in X] 16 | 17 | def test_sigma(): 18 | tg2 = truncated_gaussian_old(intervals, scale=2.) 19 | tg1 = truncated_gaussian_old(np.array(intervals)/2., scale=1.) 20 | 21 | Z = 3.5 22 | nt.assert_equal(np.around(float(tg1.cdf(Z/2.)), 3), 23 | np.around(float(tg2.cdf(Z)), 3)) 24 | np.testing.assert_equal(np.around(np.array(2 * tg1.equal_tailed_interval(Z/2,0.05)), 4), 25 | np.around(np.array(tg2.equal_tailed_interval(Z,0.05)), 4)) 26 | 27 | @set_seed_iftrue(SET_SEED) 28 | @dec.skipif(True, 'checking coverage: this is random with highish failure rate') 29 | @set_sampling_params_iftrue(SMALL_SAMPLES, nsim=100) 30 | def test_equal_tailed_coverage(nsim=1000): 31 | 32 | alpha = 0.25 33 | tg = truncated_gaussian_old([(2.3,np.inf)], scale=2) 34 | coverage = 0 35 | for i in range(nsim): 36 | while True: 37 | Z = np.random.standard_normal() * 2 38 | if Z > 2.3: 39 | break 40 | L, U = tg.equal_tailed_interval(Z, alpha) 41 | coverage += (U > 0) * (L < 0) 42 | SE = np.sqrt(alpha*(1-alpha)*nsim) 43 | print(coverage) 44 | nt.assert_true(np.fabs(coverage - (1-alpha)*nsim) < 2*SE) 45 | 46 | @set_seed_iftrue(SET_SEED) 47 | @dec.skipif(True, 'really slow') 48 | @set_sampling_params_iftrue(SMALL_SAMPLES, nsim=100) 49 | def test_UMAU_coverage(nsim=1000): 50 | 51 | alpha = 0.25 52 | tg = truncated_gaussian_old([(2.3,np.inf)], scale=2) 53 | coverage = 0 54 | for i in range(nsim): 55 | while True: 56 | Z = np.random.standard_normal()*2 57 | if Z > 2.3: 58 | break 59 | L, U = tg.UMAU_interval(Z, alpha) 60 | coverage += (U > 0) * (L < 0) 61 | SE = np.sqrt(alpha*(1-alpha)*nsim) 62 | print(coverage) 63 | nt.assert_true(np.fabs(coverage - (1-alpha)*nsim) < 2.1*SE) 64 | -------------------------------------------------------------------------------- /selectinf/truncated/tests/test_truncatedFT.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import f as fdist, t as tdist 3 | 4 | from ..F import sf_F 5 | from ..T import sf_T 6 | 7 | def test_F(): 8 | 9 | f1 = sf_F(3.,20.,1) 10 | f2 = fdist(3.,20.) 11 | 12 | V = np.linspace(1,7,201) 13 | V1 = [float(f1(v)) for v in V] 14 | V2 = f2.sf(V) 15 | np.testing.assert_allclose(V1, V2) 16 | 17 | V = np.linspace(1,7,11) 18 | V1 = [float(f1(u,v)) for u,v in zip(V[:-1],V[1:])] 19 | V2 = [f2.sf(u)-f2.sf(v) for u,v in zip(V[:-1],V[1:])] 20 | np.testing.assert_allclose(V1, V2) 21 | 22 | def test_T(): 23 | 24 | f1 = sf_T(20.) 25 | f2 = tdist(20.) 
26 | 27 | V = np.linspace(-2,3,201) 28 | V1 = [float(f1(v)) for v in V] 29 | V2 = f2.sf(V) 30 | np.testing.assert_allclose(V1, V2) 31 | 32 | V = np.linspace(-2,3,11) 33 | V1 = [float(f1(u,v)) for u,v in zip(V[:-1],V[1:])] 34 | V2 = [f2.sf(u)-f2.sf(v) for u,v in zip(V[:-1],V[1:])] 35 | np.testing.assert_allclose(V1, V2) 36 | -------------------------------------------------------------------------------- /selectinf/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/selectinf/utils/__init__.py -------------------------------------------------------------------------------- /selectinf/utils/tools.py: -------------------------------------------------------------------------------- 1 | import time 2 | from functools import wraps 3 | 4 | 5 | dict_time = dict() 6 | 7 | 8 | def timethis(func): 9 | ''' 10 | Decorator that reports the execution time. 11 | ''' 12 | dict_time[func.__name__] = (0, 0) 13 | 14 | @wraps(func) 15 | def wrapper(*args, **kwargs): 16 | start = time.time() 17 | result = func(*args, **kwargs) 18 | end = time.time() 19 | #print(func.__name__, end-start) 20 | 21 | k, t = dict_time[func.__name__] 22 | dict_time[func.__name__] = k+1, t + end-start 23 | 24 | return result 25 | return wrapper 26 | 27 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [versioneer] 2 | VCS = git 3 | style = pep440 4 | versionfile_source = selectinf/_version.py 5 | tag_prefix = 6 | parentdir_prefix = selectinf- 7 | -------------------------------------------------------------------------------- /tools/build_modref_templates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- 3 | # vi: set ft=python sts=4 ts=4 sw=4 et: 4 | """Script to auto-generate our API docs. 5 | 6 | This script should run in Python 2 and Python 3. 7 | """ 8 | # stdlib imports 9 | import os 10 | 11 | # local imports 12 | from apigen import ApiDocWriter 13 | 14 | #***************************************************************************** 15 | if __name__ == '__main__': 16 | package = 'selectinf' 17 | outdir = os.path.join('source', 'api', 'generated') 18 | docwriter = ApiDocWriter(package) 19 | docwriter.package_skip_patterns += [r'\.fixes$', 20 | r'\.externals$', 21 | #r'\.labs\.viz', 22 | ] 23 | docwriter.write_api_docs(outdir) 24 | docwriter.write_index(outdir, 'gen', relative_to=os.path.join('source', 'api')) 25 | print('%d files written' % len(docwriter.written_modules)) 26 | -------------------------------------------------------------------------------- /tools/noseall_with_coverage: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | 3 | import os 4 | 5 | os.system(""" 6 | env USE_TEST_SEED=1 USE_SMALL_SAMPLES=1 nosetests --with-coverage --cover-package=selectinf --verbose selectinf 7 | """) 8 | -------------------------------------------------------------------------------- /tools/strip_notebook.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple example script for running notebooks and stripping their outputs.
3 | 4 | Usage: `strip_notebook.py foo.ipynb [bar.ipynb [...]]` 5 | 6 | Each notebook is stripped of its outputs after checking that it executes. 7 | Used to clean notebooks before committing to git. 8 | """ 9 | 10 | from selection.utils.nbtools import strip_outputs, reads, writes 11 | from argparse import ArgumentParser 12 | 13 | def main(): 14 | parser = ArgumentParser( 15 | description='Run cells in notebook and strip outputs.') 16 | parser.add_argument('--clobber', action='store_true', 17 | help='if set, overwrite existing notebook files with stripped version') 18 | parser.add_argument('--norun', action='store_true', 19 | help='if set, do not run cells before stripping') 20 | parser.add_argument('notebooks', 21 | metavar='NB', 22 | help='Notebooks to strip outputs from.', 23 | nargs='+', 24 | type=str) 25 | 26 | args = parser.parse_args() 27 | 28 | for ipynb in args.notebooks: 29 | print("running and stripping %s" % ipynb) 30 | with open(ipynb) as f: 31 | stripped_nb = strip_outputs(reads(f.read(), 'json'), 32 | run_cells=not args.norun) 33 | if args.clobber: 34 | print('clobbering %s' % ipynb) 35 | with open(ipynb, 'w') as f: 36 | f.write(writes(stripped_nb, 'json')) 37 | else: 38 | print('not clobbering %s' % ipynb) 39 | 40 | if __name__ == '__main__': 41 | main() 42 | -------------------------------------------------------------------------------- /umpu/UMAU.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selective-inference/Python-software/e906fbb98946b129eb6713e8956bde7a080181f4/umpu/UMAU.pdf --------------------------------------------------------------------------------