├── requirements.txt ├── run.sh ├── CONTRIBUTING.md ├── asymp_system_solve_test.py ├── setup.py ├── sloe_experiments ├── experiment_helpers_test.py ├── p_values.py ├── runtime.py ├── est_gamma.py ├── sweep_coverage.py └── experiment_helpers.py ├── probe_frontier_test.py ├── third_party └── py │ └── scipy │ ├── optimize │ └── Zeros │ │ ├── zeros.h │ │ └── brentq.c │ └── LICENSE.txt ├── README.md ├── mle_param_integrands.h ├── mle_param_integrands.cc ├── asymp_system_solve.py ├── unbiased_logistic_regression_test.py ├── probe_frontier.py ├── LICENSE └── unbiased_logistic_regression.py /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.16.5 2 | scipy==1.5.4 3 | apache-beam 4 | absl-py 5 | scikit-learn 6 | statsmodels 7 | pybind11 8 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The SLOE Logistic Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | set -e 17 | set -x 18 | 19 | virtualenv -p python3 . 20 | source ./bin/activate 21 | 22 | pip install -r requirements.txt 23 | python setup.py build 24 | python setup.py install 25 | python -m sloe_logistic.asymp_system_solve_test 26 | python -m sloe_logistic.unbiased_logistic_regression_test 27 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | # Issues 4 | 5 | * Please tag your issue with `bug`, `feature request`, or `question` to help us 6 | effectively respond. 7 | * Please include the version of Uncertainty Metrics you are running. 8 | * Please provide the command line you ran as well as the log output. 9 | 10 | # Pull Requests 11 | 12 | Please send in fixes and feature additions through Pull Requests. 13 | 14 | ## Contributor License Agreement 15 | 16 | Contributions to this project must be accompanied by a Contributor License 17 | Agreement. You (or your employer) retain the copyright to your contribution, 18 | this simply gives us permission to use and redistribute your contributions as 19 | part of the project. Head over to to see 20 | your current agreements on file or to sign a new one. 21 | 22 | You generally only need to submit a CLA once, so if you've already submitted one 23 | (even if it was for a different project), you probably don't need to do it 24 | again. 25 | 26 | ## Code reviews 27 | 28 | All submissions, including submissions by project members, require review. We 29 | use GitHub pull requests for this purpose. Consult 30 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 31 | information on using pull requests. 
32 | -------------------------------------------------------------------------------- /asymp_system_solve_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tests for sloe_logistic.asymp_system_solve.""" 17 | 18 | from absl.testing import absltest 19 | import numpy as np 20 | from sloe_logistic import asymp_system_solve 21 | 22 | 23 | class AsympSystemSolveTest(absltest.TestCase): 24 | 25 | def test_correction_factors_solve(self): 26 | sol = asymp_system_solve.correction_factors( 27 | 0.2, 1, np.sqrt(5), 0, use_eta=False) 28 | target = [1.499, 3.027, 2.1214, 0.0] 29 | for i in range(4): 30 | self.assertAlmostEqual(sol[i], target[i], places=3) 31 | 32 | sol = asymp_system_solve.correction_factors( 33 | 0.1, 8.881028475794636, np.sqrt(5), 0, use_eta=True) 34 | target = [1.174, 1.007, 1.086, 0.0] 35 | for i in range(4): 36 | self.assertAlmostEqual(sol[i], target[i], places=3) 37 | 38 | def test_frontier(self): 39 | sol = asymp_system_solve.frontier(0.1) 40 | self.assertAlmostEqual(sol, 9.890, places=3) 41 | 42 | sol = asymp_system_solve.frontier(0.2) 43 | self.assertAlmostEqual(sol, 4.550, places=3) 44 | 45 | if __name__ == '__main__': 46 | absltest.main() 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The SLOE Logistic Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Builds sloe_logistic package.""" 16 | 17 | from distutils import core 18 | from distutils.command import build_clib 19 | 20 | from pybind11.setup_helpers import build_ext 21 | from pybind11.setup_helpers import Pybind11Extension 22 | 23 | libraries = [ 24 | ("scipy_brentq", { 25 | "sources": ["third_party/py/scipy/optimize/Zeros/brentq.c",], 26 | }), 27 | ] 28 | 29 | ext_modules = [ 30 | Pybind11Extension("sloe_logistic.mle_param_integrands", [ 31 | "mle_param_integrands.cc", 32 | ]), 33 | ] 34 | 35 | core.setup( 36 | name="sloe_logistic", 37 | version="0.0.1", 38 | description="Implements SLOE method and Logistic Regression Inference", 39 | long_description="Code to supplement the ICML submission SLOE: A Faster " 40 | "Method for Statistical Inference in High-Dimensional Logistic Regression.", 41 | packages=["sloe_logistic", "sloe_logistic.sloe_experiments"], 42 | package_dir={ 43 | "sloe_logistic": ".", 44 | "sloe_logistic.sloe_experiments": "sloe_experiments/" 45 | }, 46 | libraries=libraries, 47 | ext_modules=ext_modules, 48 | cmdclass={ 49 | "build_ext": build_ext, 50 | "build_clib": build_clib.build_clib, 51 | }, 52 | zip_safe=False, 53 | ) 54 | -------------------------------------------------------------------------------- /sloe_experiments/experiment_helpers_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tests for experiment_helpers.""" 17 | 18 | from absl.testing import absltest 19 | from sloe_logistic.sloe_experiments import experiment_helpers 20 | 21 | 22 | class ExperimentHelpersTest(absltest.TestCase): 23 | 24 | def test_simulation(self): 25 | params = experiment_helpers.SimulationParams(4000, 400, 1, seed=202103) 26 | sim = experiment_helpers.Simulation(params) 27 | features, outputs = sim.sample() 28 | 29 | self.assertAlmostEqual(features.mean(), 0, places=3) 30 | self.assertAlmostEqual(outputs.mean(), 0.5, places=2) 31 | 32 | def test_gwas_simulation(self): 33 | params = experiment_helpers.SimulationParams(4000, 400, 1, seed=202103) 34 | params.covariates = 'gwas' 35 | sim = experiment_helpers.GWASSimulation(params) 36 | features, outputs = sim.sample() 37 | 38 | self.assertAlmostEqual(features.mean(), 0, places=3) 39 | self.assertAlmostEqual(outputs.mean(), 0.5, places=2) 40 | 41 | def test_gwas_simulation_checks_covariates(self): 42 | params = experiment_helpers.SimulationParams(4000, 400, 1, seed=202103) 43 | params.covariates = 'not_gwas' 44 | with self.assertRaises(ValueError): 45 | _ = experiment_helpers.GWASSimulation(params) 46 | 47 | if __name__ == '__main__': 48 | absltest.main() 49 | -------------------------------------------------------------------------------- /probe_frontier_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tests for sloe_logistic.asymp_system_solve.""" 17 | 18 | from absl.testing import absltest 19 | import numpy as np 20 | from sloe_logistic import probe_frontier 21 | 22 | 23 | class ProbeFrontierTest(absltest.TestCase): 24 | 25 | def get_simulated_data(self, n, d): 26 | np.random.seed(1) 27 | features = np.random.randn(n, d) 28 | beta = np.sqrt(5 * 2.0 / d) * np.ones(d) 29 | beta[(d // 2):] = 0 30 | 31 | outcome = (np.random.rand(n) <= 1.0 / 32 | (1.0 + np.exp(-features.dot(beta)))).astype(float) 33 | 34 | return features, outcome 35 | 36 | def test_probe_frontier_model(self): 37 | n, d = 1000, 100 38 | features, outcome = self.get_simulated_data(n, d) 39 | model = probe_frontier.ProbeFrontierLogisticRegression(num_subsamples=4) 40 | model.fit(features, outcome) 41 | 42 | self.assertLen(model.coef_.reshape(-1), features.shape[1]) 43 | 44 | def test_corrected_p_values(self): 45 | """Check null P value CDF is within 95% CI of uniform CDF.""" 46 | n, d = 4000, 400 47 | features, outcome = self.get_simulated_data(n, d) 48 | model = probe_frontier.ProbeFrontierLogisticRegression(num_subsamples=4) 49 | model.fit(features, outcome) 50 | 51 | thresh = 0.1 52 | emp_p_cdf = model.p_values().reshape(-1)[(d // 2):] <= thresh 53 | self.assertAlmostEqual( 54 | emp_p_cdf.mean(), 55 | thresh, 56 | delta=1.96 * emp_p_cdf.std() / np.sqrt(d // 2)) 57 | 58 | 59 | if __name__ == '__main__': 60 | absltest.main() 61 | -------------------------------------------------------------------------------- /third_party/py/scipy/optimize/Zeros/zeros.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The SLOE Logistic Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /* Written by Charles Harris charles.harris@sdl.usu.edu */ 16 | 17 | /* Modified to not depend on Python everywhere by Travis Oliphant. 
18 | */ 19 | 20 | #ifndef ZEROS_H 21 | #define ZEROS_H 22 | 23 | typedef struct { 24 | int funcalls; 25 | int iterations; 26 | int error_num; 27 | } scipy_zeros_info; 28 | 29 | 30 | /* Must agree with _ECONVERGED, _ESIGNERR, _ECONVERR in zeros.py */ 31 | #define CONVERGED 0 32 | #define SIGNERR -1 33 | #define CONVERR -2 34 | #define EVALUEERR -3 35 | #define INPROGRESS 1 36 | 37 | typedef double (*callback_type)(double, void*); 38 | typedef double (*solver_type)(callback_type, double, double, double, double, 39 | int, void *, scipy_zeros_info*); 40 | 41 | extern double bisect(callback_type f, double xa, double xb, double xtol, 42 | double rtol, int iter, void *func_data, 43 | scipy_zeros_info *solver_stats); 44 | extern double ridder(callback_type f, double xa, double xb, double xtol, 45 | double rtol, int iter, void *func_data, 46 | scipy_zeros_info *solver_stats); 47 | extern double brenth(callback_type f, double xa, double xb, double xtol, 48 | double rtol, int iter, void *func_data, 49 | scipy_zeros_info *solver_stats); 50 | extern double brentq(callback_type f, double xa, double xb, double xtol, 51 | double rtol, int iter, void *func_data, 52 | scipy_zeros_info *solver_stats); 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Code to run experiments in *SLOE: A Faster Method for Statistical Inference in High-Dimensional Logistic Regression*. 2 | 3 | Not an official Google product. 4 | 5 | ## Method Introduction 6 | This library provides statistical inference for high dimensional logistic 7 | regression maximum likelihood, based largely on the breakthrough results from 8 | Sur and Candès (PNAS, 2019). The challenge with applying their results is that 9 | they depend on an unobserved signal strength quantity. Our method estimates this 10 | quantity via a leave-one-out approach, which we outline in our paper [1]. 11 | 12 | By high-dimensions, we mean that the ratio of the number of covariates `p` to 13 | the sample size `n` is strictly between 0 and 0.5. When the number of covariates 14 | is too large, the data is separable, and our method will not help to recover 15 | from such a case. When the number of covariates is small (say, `p <= 0.05 * n`), 16 | the high dimensional adjustment is a bit numerically unstable, and adds little 17 | value over the standard large-sample theory. 18 | 19 | The setting studied is complementary to sparse high dimensional regimes. We 20 | assume that there are a relatively large number of covariates that are weakly 21 | correlated with the binary outcome. If one expects only a very small number of 22 | the many candidate covariates to have a nonzero coefficient in the model, 23 | sparse model selection and post-selective inference is probably a better 24 | approach than the one taken here. 25 | 26 | ## Installation and tests 27 | Run `run.sh` to install requirements and package, and run tests. 28 | 29 | ## Usage 30 | The main approach proposed in our work is implemented in the 31 | `UnbiasedLogisticRegression` class in `unbiased_logistic_regression.py`. This 32 | has an `sklearn`-like interface, with a `fit`, `decision_function` and 33 | `predict_proba` API. Additionally, for inference, we've added a 34 | `prediction_intervals` method. See the inline documentation for more details 35 | of usage. 36 | 37 | # Citation 38 | [1] S. Yadlowsky, T. Yun, C. McLean, A. D'Amour (2021). 
"SLOE: A Faster 39 | Method for Statistical Inference in High-Dimensional Logistic Regression". 40 | [arXiv:2103.12725](http://arxiv.org/abs/2103.12725) [stat.ML]. 41 | -------------------------------------------------------------------------------- /mle_param_integrands.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The SLOE Logistic Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef MLE_PARAM_INTEGRANDS_H_ 16 | #define MLE_PARAM_INTEGRANDS_H_ 17 | 18 | #include 19 | 20 | extern "C" { 21 | #include "third_party/py/scipy/optimize/Zeros/zeros.h" 22 | } 23 | 24 | namespace logistic_hd { 25 | 26 | // Integrands for the equations defined in Eq. 5 from Sur and Candès 27 | // (PNAS, 2019). These are called by the bivariate integration over Z1 and Z2 28 | // in `asymp_system_solve.py`. 29 | double integrand(double Z1, double Z2, double kappa, double gamma, double b0, 30 | double alpha, double lambda, double sigma, double beta0, 31 | int eq_num); 32 | 33 | // Computes the derivative of the objective that defines the proximal operator. 34 | // The prox operator is the value of z that makes this zero. 35 | double prox_deriv(double z, void *args); 36 | 37 | double sigmoid(double z); 38 | 39 | // Computes the derivative of the prox operator for the logistic regression 40 | // log likelihood. 41 | double prox_impl(double lambda, double x, double xtol = 1e-8, 42 | double rtol = 1e-8, int maxiters = 1000); 43 | 44 | // Computes the pdf of the bivariate normal without any input validation 45 | // because this is called many times during optimization. 46 | double pdf(double x1, double x2); 47 | 48 | // Helper function to pass values between our code and the scipy.optimize API. 49 | double scipy_zeros_functions_func(double x, void *params); 50 | 51 | typedef struct prox_params { 52 | double lambda; 53 | double x; 54 | } prox_params; 55 | 56 | } // namespace logistic_hd 57 | 58 | #endif // MLE_PARAM_INTEGRANDS_H_ 59 | -------------------------------------------------------------------------------- /sloe_experiments/p_values.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Run experiment to understand uniformity of p-values generated by SLOE. 
17 | 18 | Tests the SLOE estimator empirically by computing it 19 | over a bunch of different seeds, and storing in csv files to be analyzed in a 20 | colab. 21 | """ 22 | 23 | 24 | from absl import app 25 | from absl import flags 26 | import apache_beam as beam 27 | from apache_beam.options import pipeline_options 28 | import numpy as np 29 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper 30 | 31 | FLAGS = flags.FLAGS 32 | 33 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run') 34 | flags.DEFINE_string('output_path', '/tmp/counts.txt', 'The output file path') 35 | flags.DEFINE_string( 36 | 'coverage_target', 'true_preds', 37 | 'Which value to check coverage in prediction intervals? Options ' 38 | '`true_preds` or `calib_ests`' 39 | ) 40 | 41 | 42 | def run_sim(seed): 43 | """Runs simulation and computes estimated p-values to compare to uniform.""" 44 | # Model parameters 45 | 46 | sim_params = exp_helper.SimulationParams.create_from_flags() 47 | sim_params.seed = 201216 + seed 48 | sim = exp_helper.Simulation(sim_params) 49 | 50 | x1, y1 = sim.sample() 51 | 52 | logit_model = exp_helper.create_inference_model() 53 | logit_model_fit = logit_model.fit(x1, y1) 54 | 55 | p_values = logit_model_fit.p_values() 56 | return np.sort(p_values[sim.null_indices()]) 57 | 58 | 59 | def main(unused_argv): 60 | # If you have custom beam options add them here. 61 | beam_options = pipeline_options.PipelineOptions() 62 | 63 | with beam.Pipeline(options=beam_options) as pipe: 64 | _ = ( 65 | pipe 66 | | beam.Create(range(FLAGS.num_sims)) 67 | | beam.Map(run_sim) 68 | | beam.Map(exp_helper.numpy_array_to_csv) 69 | | beam.Reshuffle() 70 | | 71 | 'WriteToText' >> beam.io.WriteToText(FLAGS.output_path, num_shards=5)) 72 | 73 | 74 | if __name__ == '__main__': 75 | app.run(main) 76 | -------------------------------------------------------------------------------- /sloe_experiments/runtime.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Run experiment to understand runtime of SLOE relative to ProbeFrontier. 17 | 18 | Tests the runtime of the SLOE estimator in compared to 19 | ProbeFrontier over many seeds, storing in csv files to be analyzed in a colab. 
20 | """ 21 | import time 22 | 23 | 24 | 25 | from absl import app 26 | from absl import flags 27 | import apache_beam as beam 28 | from apache_beam.options import pipeline_options 29 | import numpy as np 30 | 31 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper 32 | 33 | 34 | FLAGS = flags.FLAGS 35 | 36 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run') 37 | flags.DEFINE_string('output_path', '/tmp/counts.txt', 'The output file path') 38 | 39 | N_RANGE = [500, 1000, 2000, 3000, 4000, 6000, 8000, 16000] 40 | 41 | 42 | def run_sim(val): 43 | """Runs simulation and compare runtime of SLOE and ProbeFrontier.""" 44 | n = val[0] 45 | seed = 201216 + val[1] 46 | # Model parameters 47 | 48 | sim_params = exp_helper.SimulationParams.create_from_flags() 49 | sim_params.seed = seed 50 | sim_params.training_n = n 51 | sim_params.p = int(n * FLAGS.features_per_sample) 52 | sim = exp_helper.Simulation(sim_params) 53 | 54 | new_method_model = exp_helper.create_inference_model('newmethod') 55 | pf_model = exp_helper.create_inference_model('probe_frontier') 56 | 57 | x1, y1 = sim.sample() 58 | if pf_model.is_separable(x1, y1): 59 | return 60 | 61 | tic = time.perf_counter() 62 | m = new_method_model.fit(x1, y1) 63 | toc = time.perf_counter() 64 | new_method_time = toc - tic 65 | # Deleting model here to keep memory clean for probe frontier model. 66 | del new_method_model, m 67 | 68 | tic = time.perf_counter() 69 | m, v = pf_model.fit(x1, y1) 70 | toc = time.perf_counter() 71 | probe_frontier_time = toc - tic 72 | 73 | return [np.array([n, seed, new_method_time, probe_frontier_time, v])] 74 | 75 | 76 | def main(unused_argv): 77 | # If you have custom beam options add them here. 78 | beam_options = pipeline_options.PipelineOptions() 79 | 80 | with beam.Pipeline(options=beam_options) as pipe: 81 | _ = ( 82 | pipe 83 | | beam.Create(range(FLAGS.num_sims)) 84 | | beam.FlatMap(exp_helper.multiple_sample_sizes, N_RANGE) 85 | | 'PrepShuffle' >> beam.Reshuffle() 86 | | beam.FlatMap(run_sim) 87 | | beam.Map(exp_helper.numpy_array_to_csv) 88 | | beam.Reshuffle() 89 | | 90 | 'WriteToText' >> beam.io.WriteToText(FLAGS.output_path, num_shards=5)) 91 | 92 | 93 | if __name__ == '__main__': 94 | app.run(main) 95 | -------------------------------------------------------------------------------- /sloe_experiments/est_gamma.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Run experiment to understand convergence of SLOE estimator of eta. 17 | 18 | Tests the SLOE estimator empirically by computing it 19 | over a range of sample sizes for a bunch of different seeds, and storing in 20 | csv files to be analyzed in a colab. 
21 | """ 22 | 23 | 24 | from absl import app 25 | from absl import flags 26 | import apache_beam as beam 27 | from apache_beam.options import pipeline_options 28 | import numpy as np 29 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper 30 | import statsmodels.api as sm 31 | 32 | FLAGS = flags.FLAGS 33 | 34 | flags.DEFINE_string('output_path', '/tmp/counts.txt', 'The output file path') 35 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run') 36 | flags.DEFINE_string('img_path', '/tmp/counts.png', 'Path to save plots') 37 | 38 | N_RANGE = [250, 500, 1000, 2000, 4000] 39 | 40 | 41 | def multiple_sample_sizes(seed): 42 | """Run same seed over multiple sample sizes.""" 43 | for n in N_RANGE: 44 | yield [n, seed] 45 | 46 | 47 | def run_sim(params): 48 | """Runs simulation and computes estimated eta_hat to compare to truth.""" 49 | n = params[0] 50 | seed = params[1] 51 | kappa = FLAGS.features_per_sample 52 | p = int(n * kappa) 53 | 54 | gamma = np.sqrt(FLAGS.signal_strength) 55 | rand_state = np.random.RandomState(201216 + seed) 56 | 57 | p_positive = int(p / 8) 58 | p_negative = p_positive 59 | p_zero = p - p_positive - p_negative 60 | beta = 2 * np.concatenate( 61 | (np.ones(p_positive), -np.ones(p_negative), np.zeros(p_zero))) 62 | beta *= gamma 63 | 64 | features = rand_state.randn(n, p) / np.sqrt(p) 65 | labels = (rand_state.rand(n) <= 1.0 / 66 | (1.0 + np.exp(-features.dot(beta)))).astype(float) 67 | 68 | logit_model = sm.Logit(labels, features) 69 | logit_model_fit = logit_model.fit(disp=False) 70 | beta_hat = logit_model_fit.params 71 | 72 | hessian = logit_model.hessian(beta_hat) 73 | # Computes X_i^T H^{-1} X_i for all examples. Used in Sherman-Morrison formula 74 | # below. 75 | xi_hessian_inv_xi = np.diag( 76 | features.dot(np.linalg.solve(hessian, features.T))) 77 | pred = logit_model_fit.predict(features) 78 | # Sherman-Morrison formula for X_i^T H_{-i}^{-1} X_i, where H_{-i} is Hessian 79 | # without i-th example. 80 | mod = xi_hessian_inv_xi / (1.0 + xi_hessian_inv_xi * pred * (1 - pred)) 81 | infl = mod * (labels - pred) + features.dot(beta_hat) 82 | 83 | eta_hat = np.var(infl) 84 | 85 | eta_hat_simp = np.linalg.norm(beta_hat)**2 86 | 87 | return np.array([n, seed, eta_hat, eta_hat_simp]) 88 | 89 | 90 | def main(unused_argv): 91 | # If you have custom beam options add them here. 92 | beam_options = pipeline_options.PipelineOptions() 93 | 94 | with beam.Pipeline(options=beam_options) as pipe: 95 | _ = ( 96 | pipe 97 | | beam.Create(range(FLAGS.num_sims)) 98 | | beam.FlatMap(multiple_sample_sizes) 99 | | 'PrepShuffle' >> beam.Reshuffle() 100 | | beam.Map(run_sim) 101 | | beam.Map(exp_helper.numpy_array_to_csv) 102 | | beam.Reshuffle() 103 | | 104 | 'WriteToText' >> beam.io.WriteToText(FLAGS.output_path, num_shards=5)) 105 | 106 | 107 | if __name__ == '__main__': 108 | app.run(main) 109 | -------------------------------------------------------------------------------- /mle_param_integrands.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The SLOE Logistic Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "mle_param_integrands.h" 16 | 17 | #include 18 | 19 | #include "pybind11/pybind11.h" 20 | 21 | namespace logistic_hd { 22 | 23 | double sigmoid(double z) { 24 | const double v = 1.0 / (1 + exp(-z)); 25 | return (v); 26 | } 27 | 28 | double prox_deriv(double z, void *args) { 29 | prox_params *myargs = reinterpret_cast(args); 30 | return (myargs->lambda * sigmoid(z) + z - myargs->x); 31 | } 32 | 33 | double prox_impl(double lambda, double x, double xtol, double rtol, 34 | int maxiters) { 35 | prox_params params; 36 | scipy_zeros_info solver_stats; 37 | double lower; 38 | double upper; 39 | 40 | params.lambda = lambda; 41 | params.x = x; 42 | 43 | if (lambda * x > 0) { 44 | lower = x - lambda - 1e-4; 45 | upper = x + 1e-4; 46 | } else { 47 | lower = x - lambda / 2.0 - 1e-4; 48 | upper = x + 1e-4; 49 | } 50 | lower = -abs(x) - 8; 51 | upper = abs(x) + 8; 52 | 53 | if (abs(prox_deriv(lower, ¶ms)) < 1e-8) { 54 | return (lower); 55 | } 56 | if (abs(prox_deriv(upper, ¶ms)) < 1e-8) { 57 | return (upper); 58 | } 59 | 60 | const double x0 = brentq(&prox_deriv, lower, upper, xtol, rtol, maxiters, 61 | reinterpret_cast(¶ms), &solver_stats); 62 | 63 | return (x0); 64 | } 65 | 66 | double integrand(double Z1, double Z2, double kappa, double gamma, double b0, 67 | double alpha, double lambda, double sigma, double beta0, 68 | int eq_num) { 69 | double eq; 70 | 71 | const double S1 = gamma * Z1 / alpha + beta0; 72 | const double S2 = gamma * Z1 + sigma * Z2 + b0; 73 | 74 | const double prox_S2 = prox_impl(lambda, S2); 75 | const double prox_lambda_S2 = prox_impl(lambda, lambda + S2); 76 | 77 | const double sig_S1 = sigmoid(S1); 78 | const double sig_neg_S1 = 1 - sig_S1; 79 | 80 | if (eq_num == 1) { 81 | eq = sig_S1 * pow(S2 - prox_lambda_S2, 2); 82 | eq += sig_neg_S1 * pow(S2 - prox_S2, 2); 83 | } else if (eq_num == 2) { 84 | eq = sig_S1 * Z2 * prox_lambda_S2; 85 | eq += sig_neg_S1 * Z2 * prox_S2; 86 | } else if (eq_num == 3) { 87 | eq = sig_S1 * Z1 * prox_lambda_S2; 88 | eq += sig_neg_S1 * Z1 * prox_S2; 89 | } else { 90 | const double prox_neg_S2 = prox_impl(lambda, -S2); 91 | eq = -sig_S1 * sigmoid(prox_neg_S2); 92 | eq += sig_neg_S1 * sigmoid(prox_S2); 93 | } 94 | 95 | return (eq * pdf(Z1, Z2)); 96 | } 97 | 98 | double pdf(double x1, double x2) { 99 | return (exp(-(pow(x1, 2) + pow(x2, 2)) / 2.0) / (2 * M_PI)); 100 | } 101 | 102 | } // namespace logistic_hd 103 | 104 | PYBIND11_MODULE(mle_param_integrands, m) { 105 | m.doc() = "Logistic Regression MLE High Dimensional Integrands"; 106 | 107 | m.def("sigmoid", &logistic_hd::sigmoid, 108 | "Sigmoid for a float (unvectorized, no error checking)"); 109 | m.def("integrand", &logistic_hd::integrand, 110 | "Integrand for equation to get high dimensional adjustment"); 111 | m.def("prox_deriv", &logistic_hd::prox_deriv, 112 | "Derivative prox objective for logistic link"); 113 | m.def("prox_impl", &logistic_hd::prox_impl, 114 | "Computes prox for logistic link times lambda"); 115 | m.def("pdf", &logistic_hd::pdf, 116 | "Computes pdf of bivariate normal distribution"); 117 | } 118 | 
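Note on how this extension is consumed from Python: once the package is built and installed (e.g. via run.sh), the pybind11 module is importable as sloe_logistic.mle_param_integrands, and its prox and integrand helpers are passed to scipy.integrate.dblquad exactly as asymp_system_solve.py does. The sketch below is a minimal illustration under that assumption; the correction-factor values are the approximate solution reported in asymp_system_solve_test.py for kappa=0.2, gamma=sqrt(5), b0=0, and the printed residuals are only expected to be close to zero.

import numpy as np
import scipy.integrate
import sloe_logistic.mle_param_integrands as mle_helper

# Solve the prox equation lambda * sigmoid(z) + z - x = 0 with Brent's method.
# The binding does not re-declare the C++ default arguments, so pass xtol,
# rtol, and maxiters explicitly.
z_star = mle_helper.prox_impl(3.0, 1.5, 1e-8, 1e-8, 1000)
print(3.0 * mle_helper.sigmoid(z_star) + z_star - 1.5)  # residual, close to 0

# Evaluate the first equation of the Sur and Candes system by bivariate
# integration over (Z1, Z2) ~ N(0, I), mirroring equations() in
# asymp_system_solve.py. alpha, lambda_, sigma are the approximate correction
# factors from asymp_system_solve_test.py for kappa=0.2, gamma=sqrt(5), b0=0.
kappa, gamma, b0, beta0 = 0.2, np.sqrt(5), 0.0, 0.0
alpha, lambda_, sigma = 1.499, 3.027, 2.1214
gamma_times_alpha = gamma * alpha  # equations() rescales gamma when use_eta=False
eq1, _ = scipy.integrate.dblquad(
    mle_helper.integrand, -8, 8, -8, 8,
    (kappa, gamma_times_alpha, b0, alpha, lambda_, sigma, beta0, 1), 1e-4, 1e-4)
print(eq1 - sigma**2 * kappa)  # close to 0 at the solution of the system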
-------------------------------------------------------------------------------- /third_party/py/scipy/optimize/Zeros/brentq.c: -------------------------------------------------------------------------------- 1 | /* Written by Charles Harris charles.harris@sdl.usu.edu */ 2 | 3 | #include 4 | #include "zeros.h" 5 | 6 | #define MIN(a, b) ((a) < (b) ? (a) : (b)) 7 | 8 | /* 9 | At the top of the loop the situation is the following: 10 | 11 | 1. the root is bracketed between xa and xb 12 | 2. xa is the most recent estimate 13 | 3. xp is the previous estimate 14 | 4. |fp| < |fb| 15 | 16 | The order of xa and xp doesn't matter, but assume xp < xb. Then xa lies to 17 | the right of xp and the assumption is that xa is increasing towards the root. 18 | In this situation we will attempt quadratic extrapolation as long as the 19 | condition 20 | 21 | * |fa| < |fp| < |fb| 22 | 23 | is satisfied. That is, the function value is decreasing as we go along. 24 | Note the 4 above implies that the right inequlity already holds. 25 | 26 | The first check is that xa is still to the left of the root. If not, xb is 27 | replaced by xp and the interval reverses, with xb < xa. In this situation 28 | we will try linear interpolation. That this has happened is signaled by the 29 | equality xb == xp; 30 | 31 | The second check is that |fa| < |fb|. If this is not the case, we swap 32 | xa and xb and resort to bisection. 33 | 34 | */ 35 | 36 | double 37 | brentq(callback_type f, double xa, double xb, double xtol, double rtol, 38 | int iter, void *func_data, scipy_zeros_info *solver_stats) 39 | { 40 | double xpre = xa, xcur = xb; 41 | double xblk = 0., fpre, fcur, fblk = 0., spre = 0., scur = 0., sbis; 42 | /* the tolerance is 2*delta */ 43 | double delta; 44 | double stry, dpre, dblk; 45 | int i; 46 | solver_stats->error_num = INPROGRESS; 47 | 48 | fpre = (*f)(xpre, func_data); 49 | fcur = (*f)(xcur, func_data); 50 | solver_stats->funcalls = 2; 51 | if (fpre*fcur > 0) { 52 | solver_stats->error_num = SIGNERR; 53 | return 0.; 54 | } 55 | if (fpre == 0) { 56 | solver_stats->error_num = CONVERGED; 57 | return xpre; 58 | } 59 | if (fcur == 0) { 60 | solver_stats->error_num = CONVERGED; 61 | return xcur; 62 | } 63 | 64 | solver_stats->iterations = 0; 65 | for (i = 0; i < iter; i++) { 66 | solver_stats->iterations++; 67 | if (fpre*fcur < 0) { 68 | xblk = xpre; 69 | fblk = fpre; 70 | spre = scur = xcur - xpre; 71 | } 72 | if (fabs(fblk) < fabs(fcur)) { 73 | xpre = xcur; 74 | xcur = xblk; 75 | xblk = xpre; 76 | 77 | fpre = fcur; 78 | fcur = fblk; 79 | fblk = fpre; 80 | } 81 | 82 | delta = (xtol + rtol*fabs(xcur))/2; 83 | sbis = (xblk - xcur)/2; 84 | if (fcur == 0 || fabs(sbis) < delta) { 85 | solver_stats->error_num = CONVERGED; 86 | return xcur; 87 | } 88 | 89 | if (fabs(spre) > delta && fabs(fcur) < fabs(fpre)) { 90 | if (xpre == xblk) { 91 | /* interpolate */ 92 | stry = -fcur*(xcur - xpre)/(fcur - fpre); 93 | } 94 | else { 95 | /* extrapolate */ 96 | dpre = (fpre - fcur)/(xpre - xcur); 97 | dblk = (fblk - fcur)/(xblk - xcur); 98 | stry = -fcur*(fblk*dblk - fpre*dpre) 99 | /(dblk*dpre*(fblk - fpre)); 100 | } 101 | if (2*fabs(stry) < MIN(fabs(spre), 3*fabs(sbis) - delta)) { 102 | /* good short step */ 103 | spre = scur; 104 | scur = stry; 105 | } else { 106 | /* bisect */ 107 | spre = sbis; 108 | scur = sbis; 109 | } 110 | } 111 | else { 112 | /* bisect */ 113 | spre = sbis; 114 | scur = sbis; 115 | } 116 | 117 | xpre = xcur; fpre = fcur; 118 | if (fabs(scur) > delta) { 119 | xcur += scur; 120 | } 121 | else { 122 | xcur += 
(sbis > 0 ? delta : -delta); 123 | } 124 | 125 | fcur = (*f)(xcur, func_data); 126 | solver_stats->funcalls++; 127 | } 128 | solver_stats->error_num = CONVERR; 129 | return xcur; 130 | } 131 | -------------------------------------------------------------------------------- /asymp_system_solve.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Solves nonlinear equations for high dim correction factors for MLE. 17 | 18 | Solves the nonlinear equations in Sur and Candès (PNAS., 2019) to find 19 | the adjustment factors for bias and variance of logistic regression MLE. 20 | """ 21 | 22 | 23 | import functools 24 | 25 | from absl import app 26 | import numpy as np 27 | import scipy 28 | import scipy.integrate 29 | import scipy.optimize 30 | 31 | import sloe_logistic.mle_param_integrands as mle_helper 32 | 33 | 34 | def _t_integrand(z, v, t, gamma): 35 | """Integrand used to calculate when the logistic MLE exists.""" 36 | return 2 * mle_helper.sigmoid(gamma * v) * mle_helper.pdf(z, v) * ( 37 | max(z - t * v, 0)**2) 38 | 39 | 40 | def _t_problem(t, gamma): 41 | """Minimizer of this integrand in t is the frontier where the MLE exists.""" 42 | loss, _ = scipy.integrate.dblquad(_t_integrand, -8, 8, -8, 8, ( 43 | t, 44 | gamma, 45 | ), 1e-6, 1e-6) 46 | return loss 47 | 48 | 49 | def _g_mle_inv(gamma): 50 | """Frontier where data separable in limit. Gives kappa in terms of gamma.""" 51 | res = scipy.optimize.minimize_scalar( 52 | _t_problem, bounds=(-10, 10), args=(gamma,), method='Bounded') 53 | return _t_problem(res.x, gamma) 54 | 55 | 56 | def frontier(kappa): 57 | """Frontier where data separable in limit. 
Gives gamma in terms of kappa.""" 58 | gamma_star = scipy.optimize.brentq(lambda gamma: _g_mle_inv(gamma) - kappa, 0, 59 | 25) 60 | return gamma_star 61 | 62 | 63 | def equations(kappa, eta, gamma, beta0, use_eta, alpha, lambda_, sigma, b0): 64 | """The solution to these equations gives the high dimensional adjustment.""" 65 | if use_eta: 66 | gamma = np.sqrt(max(eta - sigma**2, 0.0001)) 67 | else: 68 | gamma *= alpha 69 | 70 | eq1, _ = scipy.integrate.dblquad( 71 | mle_helper.integrand, -8, 8, -8, 8, 72 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 1), 1e-4, 1e-4) 73 | eq2, _ = scipy.integrate.dblquad( 74 | mle_helper.integrand, -8, 8, -8, 8, 75 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 2), 1e-4, 1e-4) 76 | eq3, _ = scipy.integrate.dblquad( 77 | mle_helper.integrand, -8, 8, -8, 8, 78 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 3), 1e-4, 1e-4) 79 | eq4, _ = scipy.integrate.dblquad( 80 | mle_helper.integrand, -8, 8, -8, 8, 81 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 4), 1, 1) 82 | eq1 -= sigma**2 * kappa 83 | eq2 -= abs(sigma) * (1 - kappa) 84 | eq3 -= gamma 85 | 86 | return -np.array([eq1, eq2, eq3, eq4]) 87 | 88 | 89 | def get_system(kappa, eta, gamma, b0, use_eta=True): 90 | system_ = functools.partial(equations, kappa, eta, gamma, b0, use_eta) 91 | return system_ 92 | 93 | 94 | def correction_factors(kappa, eta, gamma, b0, use_eta=True): 95 | """Computes correction factors for MLE of high dimensional logistic reg.""" 96 | system_ = get_system(kappa, eta, gamma, b0, use_eta) 97 | if use_eta: 98 | init = np.array([2, 2, np.sqrt(eta / 2), b0 / 2]) 99 | else: 100 | init = np.array([2, 2, np.sqrt(gamma**2 + 1), b0]) 101 | soln = scipy.optimize.root( 102 | lambda x: system_(*x), 103 | init, 104 | method='lm', 105 | options={ 106 | 'xtol': 1e-4, 107 | 'eps': 1e-8 108 | }) 109 | x0 = soln.x 110 | if kappa >= 0.03 and (x0[0] < 1 or x0[2] < 0.1): 111 | print('Rerunning due to convergence issue') 112 | init += 0.1 * np.random.randn(4) 113 | init = np.maximum(init, np.array([1, 0.5, 0.1, b0 / 2.0])) 114 | soln = scipy.optimize.root( 115 | lambda x: system_(*x), 116 | init, 117 | method='lm', 118 | options={ 119 | 'xtol': 1e-4, 120 | 'eps': 1e-8 121 | }) 122 | x0 = soln.x 123 | return x0 124 | 125 | 126 | def main(argv): 127 | if len(argv) > 1: 128 | raise app.UsageError('Too many command-line arguments.') 129 | 130 | sol = correction_factors(0.2, 1, np.sqrt(5), 0, use_eta=False) 131 | print(sol) 132 | sol = correction_factors(0.1, 8.881028475794636, np.sqrt(5), 0, use_eta=True) 133 | print(sol) 134 | 135 | 136 | if __name__ == '__main__': 137 | app.run(main) 138 | -------------------------------------------------------------------------------- /unbiased_logistic_regression_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Tests for sloe_logistic.asymp_system_solve.""" 17 | 18 | from absl.testing import absltest 19 | import numpy as np 20 | from sloe_logistic import unbiased_logistic_regression 21 | 22 | 23 | class UnbiasedLogisticRegressionTest(absltest.TestCase): 24 | 25 | def get_simulated_data(self, n, d): 26 | np.random.seed(1) 27 | features = np.random.randn(n, d) 28 | beta = np.sqrt(5 * 2.0 / d) * np.ones(d) 29 | beta[(d // 2):] = 0 30 | 31 | outcome = (np.random.rand(n) <= 1.0 / 32 | (1.0 + np.exp(-features.dot(beta)))).astype(float) 33 | 34 | return features, outcome 35 | 36 | def test_unbiased_model(self): 37 | """Tests that UnbiasedLogisticRegression.fit runs without errors.""" 38 | n, d = 1000, 100 39 | features, outcome = self.get_simulated_data(n, d) 40 | model = unbiased_logistic_regression.UnbiasedLogisticRegression( 41 | fit_intercept=False) 42 | model.fit(features, outcome) 43 | 44 | self.assertLen(model.coef_.reshape(-1), features.shape[1]) 45 | 46 | def test_cant_fit_intercept(self): 47 | """Tests that UnbiasedLogisticRegression doesn't allow fit_intercept. 48 | 49 | Currently, there's no support for fitting the intercept. This checks that 50 | trying to fit an intercept raises an error instead of silently ignoring 51 | the intercept. 52 | """ 53 | with self.assertRaises(ValueError): 54 | _ = unbiased_logistic_regression.UnbiasedLogisticRegression( 55 | fit_intercept=True) 56 | 57 | def test_platt_model(self): 58 | """Tests that PlattScaledLogisticRegression.fit runs without errors.""" 59 | n, d = 1000, 100 60 | features, outcome = self.get_simulated_data(n, d) 61 | model = unbiased_logistic_regression.PlattScaledLogisticRegression( 62 | fit_intercept=False) 63 | model.fit(features, outcome) 64 | 65 | def test_standard_mle_model(self): 66 | """Tests that LogisticRegressionMLE.fit runs without errors.""" 67 | n, d = 1000, 100 68 | features, outcome = self.get_simulated_data(n, d) 69 | model = unbiased_logistic_regression.LogisticRegressionMLE( 70 | fit_intercept=False) 71 | model.fit(features, outcome) 72 | 73 | def test_bootstrap_model(self): 74 | """Tests that LogisticRegressionPercBoot.fit runs without errors.""" 75 | n, d = 1000, 100 76 | features, outcome = self.get_simulated_data(n, d) 77 | model = unbiased_logistic_regression.LogisticRegressionPercBoot( 78 | fit_intercept=False) 79 | model.fit(features, outcome) 80 | 81 | def test_bootstrap_prediction_intervals(self): 82 | """Tests that LogisticRegressionPercBoot.prediction_intervals runs.""" 83 | n, d = 1000, 100 84 | features, outcome = self.get_simulated_data(n, d) 85 | model = unbiased_logistic_regression.LogisticRegressionPercBoot( 86 | fit_intercept=False) 87 | model.fit(features, outcome) 88 | model.prediction_intervals(features) 89 | 90 | def test_regularized_model(self): 91 | """Tests that CVRegLogisticRegression.fit runs without errors.""" 92 | n, d = 1000, 100 93 | features, outcome = self.get_simulated_data(n, d) 94 | model = unbiased_logistic_regression.CVRegLogisticRegression( 95 | fit_intercept=False) 96 | model.fit(features, outcome) 97 | 98 | self.assertLen(model.coef_.reshape(-1), features.shape[1]) 99 | 100 | def test_prediction_intervals(self): 101 | n, d = 1000, 100 102 | features, outcome = self.get_simulated_data(n, d) 103 | model = unbiased_logistic_regression.UnbiasedLogisticRegression( 104 | fit_intercept=False) 105 | model.fit(features, outcome) 106 | 107 | test_features, _ = self.get_simulated_data(100, d) 108 | intervals = model.prediction_intervals(test_features) 109 | estimated_probs = 
model.predict_proba(test_features)[:, 1] 110 | 111 | np.testing.assert_array_less(intervals[:, 0], estimated_probs) 112 | np.testing.assert_array_less(estimated_probs, intervals[:, 2]) 113 | 114 | def test_corrected_p_values(self): 115 | """Check null P value CDF is within 95% CI of uniform CDF.""" 116 | n, d = 4000, 400 117 | features, outcome = self.get_simulated_data(n, d) 118 | model = unbiased_logistic_regression.UnbiasedLogisticRegression( 119 | fit_intercept=False) 120 | model.fit(features, outcome) 121 | 122 | thresh = 0.1 123 | emp_p_cdf = model.p_values().reshape(-1)[(d // 2):] <= thresh 124 | self.assertAlmostEqual( 125 | emp_p_cdf.mean(), 126 | thresh, 127 | delta=1.96 * emp_p_cdf.std() / np.sqrt(d // 2)) 128 | 129 | 130 | if __name__ == '__main__': 131 | absltest.main() 132 | -------------------------------------------------------------------------------- /probe_frontier.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Implements logistic regression w/ ProbeFrontier estimator of bias correction. 17 | 18 | Implements the bias correction and inference for the MLE using the ProbeFrontier 19 | estimator of the signal strength as in [1]. Theory for arbitrary covariance with 20 | Gaussian features from [2], and empirical evidence suggesting good performance 21 | for non-Gaussian designs. 22 | 23 | [1] Sur, Pragya, and Emmanuel J. Candès. "A modern maximum-likelihood theory 24 | for high-dimensional logistic regression." Proceedings of the National Academy 25 | of Sciences 116.29 (2019): 14516-14525. 26 | [2] Zhao, Qian, Pragya Sur, and Emmanuel J. Candes. "The asymptotic distribution 27 | of the mle in high-dimensional logistic models: Arbitrary covariance." arXiv 28 | preprint arXiv:2001.09351 (2020). 
29 | """ 30 | from absl import app 31 | import numpy as np 32 | import scipy 33 | from sloe_logistic import asymp_system_solve 34 | from sloe_logistic import unbiased_logistic_regression 35 | import statsmodels.api as sm 36 | import statsmodels.tools 37 | 38 | 39 | class ProbeFrontierLogisticRegression( 40 | unbiased_logistic_regression.UnbiasedLogisticRegression): 41 | """Implements ProbeFrontier and statistical inference with it.""" 42 | 43 | def __init__(self, num_subsamples=10): 44 | super().__init__(fit_intercept=False) 45 | self.num_subsamples = num_subsamples 46 | self.sep_calls = 0 47 | 48 | def fit(self, features, outcome, weights=None, verbose=False): 49 | """Fit ProbeFrontier model.""" 50 | if self.fit_intercept: 51 | raise NotImplementedError("ProbeFrontier doesn't work with intercept") 52 | self.sep_calls = 0 53 | 54 | self.sm.fit(features, outcome, weights) 55 | 56 | if weights is None: 57 | weights = 1 58 | 59 | kappa = float(features.shape[1]) / features.shape[0] 60 | gamma_hat = self.estimate_gamma(features, outcome) 61 | 62 | self.alpha, _, sigma, _ = asymp_system_solve.correction_factors( 63 | kappa, None, gamma_hat, 0, use_eta=False) 64 | 65 | self.coef_ = self.sm.coef_ / self.alpha 66 | self.intercept_ = 0 67 | 68 | self._set_coef_cov(features, sigma / np.sqrt(kappa), self.alpha) 69 | 70 | return self, self.sep_calls 71 | 72 | def estimate_gamma(self, features, outcome): 73 | """Estimate gamma.""" 74 | estimated_kappa_threshold = self.probe_frontier(features, outcome) 75 | if estimated_kappa_threshold < 0: 76 | print(features, outcome) 77 | if estimated_kappa_threshold >= 0.499: 78 | return 0.0 79 | return asymp_system_solve.frontier(estimated_kappa_threshold) 80 | 81 | def probe_frontier(self, features, outcome): 82 | """Probe for frontier.""" 83 | n, p = features.shape 84 | upper_frac = n 85 | lower_frac = min(n, 1.99 * p) 86 | obs = [] 87 | while abs(upper_frac - lower_frac) > (0.05 * p): 88 | frac = int((upper_frac + lower_frac) / 2) 89 | p_sep = 0 90 | for _ in range(self.num_subsamples): 91 | indices = np.random.choice(n, frac, replace=False) 92 | feature_sub = features[indices, :] 93 | outcome_sub = outcome[indices] 94 | p_sep += self.is_separable(feature_sub, outcome_sub) 95 | p_sep /= float(self.num_subsamples) 96 | obs.append([frac, p_sep]) 97 | if p_sep >= 0.8: 98 | lower_frac = frac 99 | elif p_sep <= 0.2: 100 | upper_frac = frac 101 | elif p_sep > 0.5: 102 | lower_frac = 0.5 * lower_frac + 0.5 * frac 103 | else: 104 | upper_frac = 0.5 * upper_frac + 0.5 * frac 105 | 106 | if len(obs) <= 2: 107 | frac = int(0.5 * (upper_frac + lower_frac)) 108 | else: 109 | obs = np.array(obs) 110 | 111 | if (obs[0, 1] > (1 - 1.5 / self.num_subsamples)): 112 | frac = obs[0, 0] 113 | elif (obs[-1, 1] < (1.5 / self.num_subsamples)): 114 | frac = obs[-1, 0] 115 | else: 116 | try: 117 | interp = sm.GLM( 118 | obs[:, 1], 119 | sm.add_constant(obs[:, 0].reshape(-1, 1)), 120 | family=sm.families.Binomial()) 121 | res = interp.fit() 122 | frac = -res.params[0] / res.params[1] 123 | 124 | except statsmodels.tools.sm_exceptions.PerfectSeparationError: 125 | threshold = np.argmax(np.diff(obs[:, 1], prepend=0)) 126 | frac = obs[threshold, 0] 127 | 128 | return min(float(p) / frac, 0.5) 129 | 130 | def is_separable(self, features, outcome): 131 | """Check whether data are linearly separable.""" 132 | self.sep_calls += 1 133 | n, p = features.shape 134 | features_aug = np.ones((n, p + 1)) 135 | features_aug[:, :-1] = features 136 | features_aug *= (2 * outcome - 1).reshape(-1, 1) 137 | b 
= -np.ones(n) 138 | res = scipy.optimize.linprog( 139 | b, A_eq=features_aug.T, b_eq=np.zeros(p + 1), method='interior-point') 140 | if res.status == 0: 141 | return res.fun > -1e-6 142 | elif res.status == 2: 143 | return False 144 | elif res.status == 3: 145 | return False 146 | else: 147 | print(res) 148 | raise Exception('Error finding separability') 149 | 150 | 151 | def main(argv): 152 | if len(argv) > 1: 153 | raise app.UsageError('Too many command-line arguments.') 154 | 155 | p = ProbeFrontierLogisticRegression() 156 | 157 | features = np.random.randn(600, 300) / np.sqrt(300) 158 | outcome = (np.random.rand(600) <= 1 / 159 | (1.0 + np.exp(-1 * features.sum(axis=1)))).astype(float) 160 | primal = p.is_separable(features, outcome) 161 | print(primal) 162 | 163 | features = np.array([[1, 1], [0, 0]]) 164 | outcome = np.array([1, 0]) 165 | print(p.is_separable(features, outcome)) 166 | features = np.array([[1, 1], [0, 0], [-1, -1]]) 167 | outcome = np.array([1, 0, 1]) 168 | print(p.is_separable(features, outcome)) 169 | 170 | features = np.random.randn(100, 100) 171 | outcome = (np.random.rand(100) <= 0.5).astype(float) 172 | print(p.is_separable(features, outcome)) 173 | 174 | features = np.random.randn(100, 10) 175 | outcome = (np.random.rand(100) <= 0.5).astype(float) 176 | print(p.is_separable(features, outcome)) 177 | 178 | if __name__ == '__main__': 179 | app.run(main) 180 | -------------------------------------------------------------------------------- /sloe_experiments/sweep_coverage.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Run experiment to understand coverage of CIs generated by SLOE. 17 | 18 | Tests the SLOE estimator empirically by computing 19 | confidence intervals (CIs) using it over a bunch of different seeds and aspect 20 | ratios, calculating properties such as coverage and size, and storing in csv 21 | files to be analyzed in a colab. 22 | """ 23 | 24 | 25 | from absl import app 26 | from absl import flags 27 | import apache_beam as beam 28 | from apache_beam.options import pipeline_options 29 | import numpy as np 30 | import sklearn.linear_model 31 | from sklearn.model_selection import LeaveOneOut 32 | 33 | from sloe_logistic import probe_frontier 34 | from sloe_logistic import unbiased_logistic_regression 35 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper 36 | 37 | 38 | GAMMA_RANGE = [0.1, 1, 5] 39 | FLAGS = flags.FLAGS 40 | 41 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run') 42 | flags.DEFINE_string('output_path', '/tmp/counts', 'The output file path') 43 | flags.DEFINE_enum( 44 | 'coverage_target', 'true_preds', ['true_preds', 'calib_ests', 'reg_ests'], 45 | 'Which value to check coverage in prediction intervals?') 46 | flags.DEFINE_boolean('include_bootstrap', False, 47 | 'Include bootstrap CIs as well? 
These are slow.') 48 | flags.DEFINE_float( 49 | 'kappa_spacing', 0.05, 50 | 'Resolution of graph in terms of spacing between kappa evaluated.') 51 | flags.DEFINE_float( 52 | 'coverage_rate', 95, 'What level confidence intervals' 53 | 'should be tested (0-100)?') 54 | 55 | 56 | def run_sim(params): 57 | """Runs simulation and computes properties of the estimated CIs.""" 58 | kappa = params[0] 59 | gamma = params[1] 60 | seed = 201216 + params[2] 61 | 62 | sim_params = exp_helper.SimulationParams.create_from_flags() 63 | sim_params.seed = seed 64 | sim_params.gamma = np.sqrt(gamma) 65 | sim_params.p = int(sim_params.training_n * kappa) 66 | sim = exp_helper.create_sim(sim_params) 67 | 68 | x1, y1 = sim.sample() 69 | 70 | pfr = probe_frontier.ProbeFrontierLogisticRegression() 71 | if pfr.is_separable(x1, y1): 72 | return 73 | 74 | # Draw test data 75 | x2, _ = sim.sample(int(sim_params.training_n / 4)) 76 | true_logits = x2.dot(sim.beta) 77 | bias_selector = np.abs(true_logits) > 1e-2 78 | 79 | # Calculate coverage 80 | if FLAGS.coverage_target == 'true_preds': 81 | target = 1.0 / (1.0 + np.exp(-true_logits)).reshape(-1) 82 | elif FLAGS.coverage_target == 'calib_ests': 83 | ps_logit_model = unbiased_logistic_regression.PlattScaledLogisticRegression( 84 | fit_intercept=sim_params.intercept or sim_params.uncentered) 85 | ps_logit_model.fit(x1, y1) 86 | target = ps_logit_model.predict_proba(x2)[:, 1] 87 | elif FLAGS.coverage_target == 'reg_ests': 88 | ps_logit_model = sklearn.linear_model.LogisticRegressionCV( 89 | cv=LeaveOneOut(), 90 | fit_intercept=False, 91 | Cs=20, 92 | penalty='l2', 93 | solver='newton-cg') 94 | ps_logit_model.fit(x1, y1) 95 | target = ps_logit_model.predict_proba(x2)[:, 1] 96 | else: 97 | raise ValueError("Invalid choice of coverage target '{}'.".format( 98 | FLAGS.coverage_target)) 99 | 100 | try: 101 | new_method_model = exp_helper.create_inference_model('newmethod') 102 | new_method_model.set_coverage(FLAGS.coverage_rate) 103 | _ = new_method_model.fit(x1, y1) 104 | new_pred_int = new_method_model.prediction_intervals(x2) 105 | new_logit_int = new_method_model.prediction_intervals(x2, logit=True) 106 | except ValueError as e: 107 | print(e) 108 | return 109 | 110 | std_method_model = exp_helper.create_inference_model('mle') 111 | std_method_model.set_coverage(FLAGS.coverage_rate) 112 | _ = std_method_model.fit(x1, y1) 113 | std_pred_int = std_method_model.prediction_intervals(x2) 114 | std_logit_int = std_method_model.prediction_intervals(x2, logit=True) 115 | 116 | new_coverage = np.logical_and( 117 | new_pred_int[:, 0].reshape(-1) <= target, 118 | target <= new_pred_int[:, 2].reshape(-1)).astype(float) 119 | std_coverage = np.logical_and( 120 | std_pred_int[:, 0].reshape(-1) <= target, 121 | target <= std_pred_int[:, 2].reshape(-1)).astype(float) 122 | 123 | new_width = np.abs(new_logit_int[:, 2] - new_logit_int[:, 0]) 124 | std_width = np.abs(std_logit_int[:, 2] - std_logit_int[:, 0]) 125 | 126 | new_bias = new_logit_int[bias_selector, 1] / true_logits[bias_selector] 127 | std_bias = std_logit_int[bias_selector, 1] / true_logits[bias_selector] 128 | 129 | results = [ 130 | gamma, kappa, seed, 131 | np.mean(new_coverage), 132 | np.mean(new_width), 133 | np.mean(new_bias), 134 | np.mean(std_coverage), 135 | np.mean(std_width), 136 | np.mean(std_bias) 137 | ] 138 | 139 | if FLAGS.include_bootstrap: 140 | boot_method_model = exp_helper.create_inference_model('bootstrap') 141 | boot_method_model.set_coverage(FLAGS.coverage_rate) 142 | _ = boot_method_model.fit(x1, y1) 
143 | boot_pred_int = boot_method_model.prediction_intervals(x2) 144 | boot_logit_int = boot_method_model.prediction_intervals(x2, logit=True) 145 | 146 | boot_coverage = np.logical_and( 147 | boot_pred_int[:, 0].reshape(-1) <= target, 148 | target <= boot_pred_int[:, 2].reshape(-1)).astype(float) 149 | boot_width = np.abs(boot_logit_int[:, 2] - boot_logit_int[:, 0]) 150 | boot_bias = boot_logit_int[bias_selector, 1] / true_logits[bias_selector] 151 | 152 | results.append(np.mean(boot_coverage)) 153 | results.append(np.mean(boot_width)) 154 | results.append(np.mean(boot_bias)) 155 | 156 | return [np.array(results)] 157 | 158 | 159 | def main(unused_argv): 160 | kappa_range = np.arange(0.05, 0.5 + 0.5 * FLAGS.kappa_spacing, 161 | FLAGS.kappa_spacing) 162 | 163 | # If you have custom beam options add them here. 164 | beam_options = pipeline_options.PipelineOptions() 165 | 166 | with beam.Pipeline(options=beam_options) as pipe: 167 | _ = ( 168 | pipe 169 | | beam.Create(range(FLAGS.num_sims)) 170 | | beam.FlatMap(exp_helper.multiple_sim_params, kappa_range, 171 | GAMMA_RANGE) 172 | | 'PrepShuffle' >> beam.Reshuffle() 173 | | beam.FlatMap(run_sim) 174 | | beam.Map(exp_helper.numpy_array_to_csv) 175 | | beam.Reshuffle() 176 | | 177 | 'WriteToText' >> beam.io.WriteToText(FLAGS.output_path, num_shards=5)) 178 | 179 | 180 | if __name__ == '__main__': 181 | app.run(main) 182 | -------------------------------------------------------------------------------- /third_party/py/scipy/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2001, 2002 Enthought, Inc. 2 | All rights reserved. 3 | 4 | Copyright (c) 2003-2017 SciPy Developers. 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the name of Enthought nor the names of the SciPy Developers 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS 24 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 25 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 30 | THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | 33 | 34 | SciPy bundles a number of libraries that are compatibly licensed. We list 35 | these here. 
36 | 37 | Name: Numpydoc 38 | Files: doc/sphinxext/numpydoc/* 39 | License: 2-clause BSD 40 | For details, see doc/sphinxext/LICENSE.txt 41 | 42 | Name: scipy-sphinx-theme 43 | Files: doc/scipy-sphinx-theme/* 44 | License: 3-clause BSD, PSF and Apache 2.0 45 | For details, see doc/sphinxext/LICENSE.txt 46 | 47 | Name: Six 48 | Files: scipy/_lib/six.py 49 | License: MIT 50 | For details, see the header inside scipy/_lib/six.py 51 | 52 | Name: Decorator 53 | Files: scipy/_lib/decorator.py 54 | License: 2-clause BSD 55 | For details, see the header inside scipy/_lib/decorator.py 56 | 57 | Name: ID 58 | Files: scipy/linalg/src/id_dist/* 59 | License: 3-clause BSD 60 | For details, see scipy/linalg/src/id_dist/doc/doc.tex 61 | 62 | Name: L-BFGS-B 63 | Files: scipy/optimize/lbfgsb/* 64 | License: BSD license 65 | For details, see scipy/optimize/lbfgsb/README 66 | 67 | Name: SuperLU 68 | Files: scipy/sparse/linalg/dsolve/SuperLU/* 69 | License: 3-clause BSD 70 | For details, see scipy/sparse/linalg/dsolve/SuperLU/License.txt 71 | 72 | Name: ARPACK 73 | Files: scipy/sparse/linalg/eigen/arpack/ARPACK/* 74 | License: 3-clause BSD 75 | For details, see scipy/sparse/linalg/eigen/arpack/ARPACK/COPYING 76 | 77 | Name: Qhull 78 | Files: scipy/spatial/qhull/* 79 | License: Qhull license (BSD-like) 80 | For details, see scipy/spatial/qhull/COPYING.txt 81 | 82 | Name: Cephes 83 | Files: scipy/special/cephes/* 84 | License: 3-clause BSD 85 | Distributed under 3-clause BSD license with permission from the author, 86 | see https://lists.debian.org/debian-legal/2004/12/msg00295.html 87 | 88 | Cephes Math Library Release 2.8: June, 2000 89 | Copyright 1984, 1995, 2000 by Stephen L. Moshier 90 | 91 | This software is derived from the Cephes Math Library and is 92 | incorporated herein by permission of the author. 93 | 94 | All rights reserved. 95 | 96 | Redistribution and use in source and binary forms, with or without 97 | modification, are permitted provided that the following conditions are met: 98 | * Redistributions of source code must retain the above copyright 99 | notice, this list of conditions and the following disclaimer. 100 | * Redistributions in binary form must reproduce the above copyright 101 | notice, this list of conditions and the following disclaimer in the 102 | documentation and/or other materials provided with the distribution. 103 | * Neither the name of the nor the 104 | names of its contributors may be used to endorse or promote products 105 | derived from this software without specific prior written permission. 106 | 107 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 108 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 109 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 110 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 111 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 112 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 113 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 114 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 115 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 116 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
117 | 118 | Name: Faddeeva 119 | Files: scipy/special/Faddeeva.* 120 | License: MIT 121 | Copyright (c) 2012 Massachusetts Institute of Technology 122 | 123 | Permission is hereby granted, free of charge, to any person obtaining 124 | a copy of this software and associated documentation files (the 125 | "Software"), to deal in the Software without restriction, including 126 | without limitation the rights to use, copy, modify, merge, publish, 127 | distribute, sublicense, and/or sell copies of the Software, and to 128 | permit persons to whom the Software is furnished to do so, subject to 129 | the following conditions: 130 | 131 | The above copyright notice and this permission notice shall be 132 | included in all copies or substantial portions of the Software. 133 | 134 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 135 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 136 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 137 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 138 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 139 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 140 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 141 | 142 | Name: qd 143 | Files: scipy/special/cephes/dd_*.[ch] 144 | License: modified BSD license ("BSD-LBNL-License.doc") 145 | This work was supported by the Director, Office of Science, Division 146 | of Mathematical, Information, and Computational Sciences of the 147 | U.S. Department of Energy under contract numbers DE-AC03-76SF00098 and 148 | DE-AC02-05CH11231. 149 | 150 | Copyright (c) 2003-2009, The Regents of the University of California, 151 | through Lawrence Berkeley National Laboratory (subject to receipt of 152 | any required approvals from U.S. Dept. of Energy) All rights reserved. 153 | 154 | 1. Redistribution and use in source and binary forms, with or 155 | without modification, are permitted provided that the following 156 | conditions are met: 157 | 158 | (1) Redistributions of source code must retain the copyright 159 | notice, this list of conditions and the following disclaimer. 160 | 161 | (2) Redistributions in binary form must reproduce the copyright 162 | notice, this list of conditions and the following disclaimer in 163 | the documentation and/or other materials provided with the 164 | distribution. 165 | 166 | (3) Neither the name of the University of California, Lawrence 167 | Berkeley National Laboratory, U.S. Dept. of Energy nor the names 168 | of its contributors may be used to endorse or promote products 169 | derived from this software without specific prior written 170 | permission. 171 | 172 | 2. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 173 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 174 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 175 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
176 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
177 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
178 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
179 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
180 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
181 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
182 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
183 |
184 | 3. You are under no obligation whatsoever to provide any bug fixes,
185 | patches, or upgrades to the features, functionality or performance of
186 | the source code ("Enhancements") to anyone; however, if you choose to
187 | make your Enhancements available either publicly, or directly to
188 | Lawrence Berkeley National Laboratory, without imposing a separate
189 | written license agreement for such Enhancements, then you hereby grant
190 | the following license: a non-exclusive, royalty-free perpetual license
191 | to install, use, modify, prepare derivative works, incorporate into
192 | other computer software, distribute, and sublicense such enhancements
193 | or derivative works thereof, in binary and source code form.
194 | -------------------------------------------------------------------------------- /sloe_experiments/experiment_helpers.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Helpers used across many experiments to understand the SLOE estimator.
17 |
18 | Implements the simulation settings studied in the paper
19 | and provides a number of helper functions used throughout to create and analyze
20 | simulations.
21 | """
22 |
23 |
24 | from absl import flags
25 | import numpy as np
26 |
27 | from sloe_logistic import probe_frontier
28 | from sloe_logistic import unbiased_logistic_regression
29 |
30 | FLAGS = flags.FLAGS
31 |
32 | flags.DEFINE_enum(
33 | "covariates", "gaussian", ["gaussian", "gwas"],
34 | "Covariate generating distribution for the simulation. If gaussian, see "
35 | "--covariance for more details about the distribution.")
36 | flags.DEFINE_enum(
37 | "covariance", "isotropic", ["isotropic", "elliptical"],
38 | "Covariance of covariates.")
39 | flags.DEFINE_float("features_per_sample", 0.2,
40 | "number of features per sample (kappa)")
41 | flags.DEFINE_float("intercept", 0, "intercept of logits")
42 | flags.DEFINE_enum(
43 | "method", "newmethod", ["newmethod", "mle", "probe_frontier"],
44 | "Which method for estimation and inference?")
45 | flags.DEFINE_boolean("one_and_none", False,
46 | "Put all of the signal in one (the first) covariate. "
47 | "This does not meet the assumptions of the method, but provides "
48 | "a nice robustness check to see how inaccurate results "
49 | "will be.")
50 | flags.DEFINE_integer("sample_size", 1000, "number of samples per simulation")
51 | flags.DEFINE_float("signal_strength", 5, "variance of logits (gamma^2)")
52 | flags.DEFINE_boolean(
53 | "uncentered", False,
54 | "By default, covariates are centered. If true, covariates are uncentered (without affecting the intercept)."
55 | )
56 |
57 |
58 | class SimulationParams(object):
59 | """Simulation parameters shared across SLOE estimator experiments."""
60 |
61 | def __init__(self,
62 | training_n,
63 | p,
64 | gamma,
65 | covariates="gaussian",
66 | covariance="isotropic",
67 | one_and_none=False,
68 | uncentered=False,
69 | intercept=0,
70 | seed=None):
71 | self.training_n = training_n
72 | self.p = p
73 | self.gamma = gamma
74 | self.covariates = covariates
75 | self.covariance = covariance
76 | self.one_and_none = one_and_none
77 | self.uncentered = uncentered
78 | self.intercept = intercept
79 | self.seed = seed
80 |
81 | @classmethod
82 | def create_from_flags(cls):
83 | """Create a SimulationParams object from FLAGS."""
84 | n = FLAGS.sample_size
85 | kappa = FLAGS.features_per_sample
86 | gamma = np.sqrt(FLAGS.signal_strength)
87 | covariates = FLAGS.covariates
88 | covariance = FLAGS.covariance
89 | one_and_none = FLAGS.one_and_none
90 | uncentered = FLAGS.uncentered
91 | intercept = FLAGS.intercept
92 |
93 | p = int(n * kappa)
94 | return SimulationParams(n, p, gamma, covariates, covariance, one_and_none,
95 | uncentered, intercept)
96 |
97 |
98 | class Simulation(object):
99 | """Standard simulation model used in most experiments in the SLOE paper."""
100 |
101 | def __init__(self, simulation_params):
102 | self.simulation_params = simulation_params
103 |
104 | self._check_sim_params()
105 | self._reset_random_state()
106 | self._initialize_params()
107 |
108 | def _initialize_params(self):
109 | """Initializes statistical params of model from simulation parameters."""
110 | p = self.simulation_params.p
111 |
112 | self.intercept_ = self.simulation_params.intercept
113 |
114 | if self.simulation_params.one_and_none:
115 | self.beta = np.zeros(p)
116 | self.beta[0] = self.simulation_params.gamma * np.sqrt(p)
117 | else:
118 | self.p_positive = int(p / 8)
119 | self.p_negative = self.p_positive
120 | self.p_zero = p - self.p_positive - self.p_negative
121 | self.beta = 2 * np.concatenate((np.ones(
122 | self.p_positive), -np.ones(self.p_negative), np.zeros(self.p_zero)))
123 | self.beta *= self.simulation_params.gamma
124 |
125 | if self.simulation_params.covariance == "isotropic":
126 | self.diag = np.ones(p)
127 | elif self.simulation_params.covariance == "elliptical":
128 | self.diag = self.random_state.rand(p) + 0.5
129 | self.diag /= self.diag[:(self.p_positive + self.p_negative)].mean()
130 | self.diag[0] = 1
131 | else:
132 | raise NotImplementedError("No covariance {}".format(
133 | self.simulation_params.covariance))
134 |
135 | if self.simulation_params.uncentered:
136 | self.centering = np.ones(p)
137 | self.intercept_ -= self.beta.dot(self.centering)
138 | else:
139 | self.centering = 0
140 |
141 | def null_indices(self):
142 | """Get null indices."""
143 | return slice(-self.p_zero, None, None)
144 |
145 | def _check_sim_params(self):
146 | if self.simulation_params.covariates != "gaussian":
147 | raise ValueError(
148 | "Simulation parameters call for a {} covariate distribution, "
149 | "but this class generates Gaussian covariates.".format(
150 | self.simulation_params.covariates)) 151 | 152 | def _reset_random_state(self): 153 | self.random_state = np.random.RandomState(seed=self.simulation_params.seed) 154 | 155 | def _sample_x(self, n): 156 | return self.diag * self.random_state.randn( 157 | n, self.simulation_params.p) / np.sqrt( 158 | self.simulation_params.p) + self.centering 159 | 160 | def sample(self, n=None): 161 | """Sample data from simulation.""" 162 | if n is None: 163 | n = self.simulation_params.training_n 164 | 165 | x1 = self._sample_x(n) 166 | y1 = (self.random_state.rand(n) <= 1.0 / 167 | (1.0 + np.exp(-x1.dot(self.beta) - self.intercept_))).astype(float) 168 | return (x1, y1) 169 | 170 | 171 | class GWASSimulation(Simulation): 172 | """From Sur and Candes, 2019. PNAS. Section 4(g).""" 173 | 174 | def __init__(self, simulation_params): 175 | super().__init__(simulation_params) 176 | 177 | self._initialize_cov_params() 178 | 179 | def _initialize_cov_params(self): 180 | self.equil = 0.5 * self.random_state.rand(self.simulation_params.p) + 0.25 181 | 182 | def _check_sim_params(self): 183 | if self.simulation_params.covariates != "gwas": 184 | raise ValueError( 185 | "Simulation parameters calls for {} covariate distribution, " 186 | "but this class generates GWAS-like covariates.".format( 187 | self.simulation_params.covariates)) 188 | 189 | def covariate_mean(self): 190 | return 2 * (1 - self.equil) 191 | 192 | def covariate_std(self): 193 | return 2 * (1 - self.equil) * self.equil 194 | 195 | def _sample_x(self, n): 196 | p = self.simulation_params.p 197 | x1 = np.zeros((n, p)) 198 | equil = self.equil 199 | for j in range(p): 200 | pj = equil[j] 201 | probs = np.array([pj**2, 2 * pj * (1 - pj), (1 - pj)**2]) 202 | x1[:, j] = self.random_state.choice(3, size=(n,), p=probs) 203 | x1 -= self.covariate_mean().reshape(1, -1) 204 | x1 /= self.covariate_std().reshape(1, -1) * np.sqrt(p) 205 | return x1 206 | 207 | 208 | def multiple_sim_params(seed, kappa_range, gamma_range): 209 | """For each seed, map to a variety of simulation parameters.""" 210 | for kappa in kappa_range: 211 | for gamma in gamma_range: 212 | yield [kappa, gamma, seed] 213 | 214 | 215 | def multiple_sample_sizes(seed, n_range): 216 | """For each seed, map to a variety of sample sizes.""" 217 | for n in n_range: 218 | yield [n, seed] 219 | 220 | 221 | def create_sim(sim_params): 222 | """Create a simulation according to passed params.""" 223 | if sim_params.covariates == "gaussian": 224 | return Simulation(sim_params) 225 | elif sim_params.covariates == "gwas": 226 | return GWASSimulation(sim_params) 227 | else: 228 | raise NotImplementedError("No simulation with covariates {}".format( 229 | FLAGS.covariates)) 230 | 231 | 232 | def create_inference_model(method=None, fit_intercept=False): 233 | """Create a model to use for inference, getting default from FLAGS.""" 234 | if method is None: 235 | method = FLAGS.method 236 | 237 | if method == "probe_frontier": 238 | if fit_intercept: 239 | raise NotImplementedError( 240 | "ProbeFrontier can't fit an intercept right now") 241 | logit_model = probe_frontier.ProbeFrontierLogisticRegression( 242 | num_subsamples=8) 243 | elif method == "mle": 244 | logit_model = unbiased_logistic_regression.LogisticRegressionMLE( 245 | fit_intercept=fit_intercept) 246 | elif method == "bootstrap": 247 | logit_model = unbiased_logistic_regression.LogisticRegressionPercBoot( 248 | fit_intercept=fit_intercept) 249 | elif method == "newmethod": 250 | logit_model = 
unbiased_logistic_regression.UnbiasedLogisticRegression( 251 | fit_intercept=fit_intercept) 252 | else: 253 | raise NotImplementedError("No method {}".format(FLAGS.method)) 254 | return logit_model 255 | 256 | 257 | def numpy_array_to_csv(arr): 258 | return ",".join(["%.5f" % num for num in arr]) 259 | 260 | 261 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /unbiased_logistic_regression.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Implements methods for inference for logistic regression based on the MLE. 17 | 18 | Implements SLOE and other methods for inference for logistic regression 19 | based on the MLE. 20 | """ 21 | 22 | import numpy as np 23 | import scipy 24 | import scipy.stats 25 | import sklearn.linear_model 26 | 27 | from sloe_logistic import asymp_system_solve 28 | 29 | 30 | class ScaledLogisticRegression(object): 31 | """Generic class for methods rescaling the logistic regression MLE.""" 32 | 33 | def __init__(self): 34 | pass 35 | 36 | def predict_proba(self, features, *args, **kwargs): 37 | del args 38 | del kwargs 39 | results = np.zeros((features.shape[0], 2)) 40 | log_odds_ratio = features.dot(self.coef_.T).reshape(-1) + self.intercept_ 41 | results[:, 1] = self._expit(log_odds_ratio) 42 | results[:, 0] = 1 - results[:, 1] 43 | return results 44 | 45 | def predict_inv_proba(self, features, *args, **kwargs): 46 | """Provides reciprocal of probability given features.""" 47 | return 1 / self.predict_proba(features, *args, **kwargs) 48 | 49 | def _expit(self, logit, trimmed=False): 50 | if trimmed: 51 | logit = np.minimum(logit, 5) 52 | logit = np.maximum(logit, -5) 53 | return 1.0 / (1.0 + np.exp(-logit)) 54 | 55 | 56 | class PlattScaledLogisticRegression(ScaledLogisticRegression): 57 | """Rescales the logit reg MLE to make it calibrated using approximation.""" 58 | 59 | def __init__(self, fit_intercept=True, **kwargs): 60 | del kwargs 61 | super().__init__() 62 | self.fit_intercept = fit_intercept 63 | self.sm = sklearn.linear_model.LogisticRegression( 64 | fit_intercept=fit_intercept, 65 | penalty="none", 66 | solver="newton-cg", 67 | warm_start=False) 68 | 69 | def fit(self, features, outcome, weights=None, verbose=False): 70 | """Compute MLE and then use Taylor approximation rescale for calibration.""" 71 | del verbose 72 | self.sm.fit(features, outcome, weights) 73 | 74 | refit_weights = None 75 | if refit_weights is None: 76 | refit_weights = 1 77 | 78 | # Get leave-one-out logits to pass in to Platt scaling 79 | pred = self.sm.predict_proba(features)[:, 1] 80 | hessian = -features.T.dot( 81 | (refit_weights * pred * (1 - pred)).reshape(-1, 1) * features) 82 | xihinvxi = np.diag(features.dot(np.linalg.solve(hessian, features.T))) 83 | mod = xihinvxi / (1.0 + xihinvxi * refit_weights * pred * (1 - pred)) 84 | features = mod * refit_weights * ( 85 | outcome - pred) + self.sm.decision_function(features) 86 | 87 | # Fit model for outcome using LOO logit estimates as feature. Coefficient on 88 | # feature is scaling to recalibrate model. 
89 | cm = sklearn.linear_model.LogisticRegression( 90 | penalty="none", fit_intercept=self.fit_intercept) 91 | cm.fit(features.reshape(-1, 1), outcome.reshape(-1), weights) 92 | self.coef_ = self.sm.coef_ * cm.coef_ 93 | if self.fit_intercept: 94 | self.intercept_ = cm.coef_ * self.sm.intercept_ + cm.intercept_ 95 | else: 96 | self.intercept_ = 0 97 | return self 98 | 99 | 100 | class CVRegLogisticRegression(ScaledLogisticRegression): 101 | """Cross-validated regularized logistic regression MLE.""" 102 | 103 | def __init__(self, fit_intercept=True, Cs=10, **kwargs): 104 | super().__init__(**kwargs) 105 | self.fit_intercept = fit_intercept 106 | self.sm = sklearn.linear_model.LogisticRegressionCV( 107 | fit_intercept=fit_intercept, 108 | Cs=Cs, 109 | penalty="l2", 110 | solver="newton-cg") 111 | 112 | def fit(self, features, outcome, weights=None, verbose=False): 113 | """Fit cross-validated model.""" 114 | del verbose 115 | 116 | self.sm.fit(features, outcome, weights) 117 | 118 | if self.fit_intercept: 119 | self.intercept_ = self.sm.intercept_ 120 | else: 121 | self.intercept_ = 0 122 | self.coef_ = self.sm.coef_ 123 | 124 | return self 125 | 126 | 127 | class LogisticRegressionInference(ScaledLogisticRegression): 128 | """Base class inference with logit reg that computes P/CIs from covariance.""" 129 | 130 | def __init__(self, fit_intercept=True, ci=50, **kwargs): 131 | super().__init__(**kwargs) 132 | self.fit_intercept = fit_intercept 133 | self.coef_cov = None 134 | self.hessian = None 135 | self.chi_sq_rescale = 1 136 | self.set_coverage(ci) 137 | 138 | def set_coverage(self, ci): 139 | """Sets expected coverage level.""" 140 | self.ci_coverage = ci / 100.0 141 | self.z = scipy.stats.norm.ppf(0.5 + self.ci_coverage / 2.0) 142 | 143 | def _set_coef_cov(self, *args): 144 | pass 145 | 146 | def _get_prediction_variances(self, features): 147 | if self.fit_intercept: 148 | features_aug = np.ones((features.shape[0], features.shape[1] + 1)) 149 | features_aug[:, :-1] = features 150 | else: 151 | features_aug = features 152 | return (features_aug.dot(self.coef_cov) * 153 | features_aug).sum(axis=-1).reshape(-1) 154 | 155 | def p_values(self): 156 | """Get p-values for a fitted model using Wald test.""" 157 | scale = np.sqrt(np.diag(self.coef_cov)) 158 | if self.fit_intercept: 159 | scale = scale[:-1] 160 | t = np.abs(self.coef_) / scale 161 | t = t.reshape(-1) 162 | p = 2 * scipy.stats.norm.sf(t) 163 | return p 164 | 165 | def decision_function(self, features): 166 | """Compute logits (ie decision function in sklearn parlance.""" 167 | return features.dot(self.coef_.T).reshape(-1) + self.intercept_ 168 | 169 | def prediction_intervals(self, features, logit=False): 170 | """Computes prediction CI for each row of features using coef covariance.""" 171 | if self.coef_cov is None: 172 | raise Exception( 173 | "No covariance matrix defined yet, so can't do inference.") 174 | 175 | logits = self.decision_function(features) 176 | variances = self._get_prediction_variances(features) 177 | 178 | lower_ci = logits - self.z * np.sqrt(variances) 179 | upper_ci = logits + self.z * np.sqrt(variances) 180 | 181 | results = np.zeros((features.shape[0], 3)) 182 | results[:, 0] = lower_ci 183 | results[:, 1] = logits 184 | results[:, 2] = upper_ci 185 | 186 | if not logit: 187 | results = self._expit(results) 188 | 189 | return results 190 | 191 | def predict_proba(self, X): 192 | logits = self.decision_function(X) 193 | 194 | preds = self._expit(logits) 195 | 196 | results = np.zeros((X.shape[0], 2)) 197 | 
results[:, 1] = preds 198 | results[:, 0] = 1 - preds 199 | return results 200 | 201 | def predict_inv_proba(self, X): 202 | logits = self.decision_function(X) 203 | 204 | pos_exps = np.exp(logits) 205 | neg_exps = np.exp(-logits) 206 | 207 | results = np.zeros((X.shape[0], 2)) 208 | results[:, 1] = 1 + neg_exps 209 | results[:, 0] = 1 + pos_exps 210 | 211 | return results 212 | 213 | 214 | class LogisticRegressionMLE(LogisticRegressionInference): 215 | """Computes the un-rescaled MLE and standard large-sample stats inference.""" 216 | 217 | def __init__(self, fit_intercept=True, **kwargs): 218 | super().__init__(fit_intercept=fit_intercept, **kwargs) 219 | self.fit_intercept = fit_intercept 220 | self.sm = sklearn.linear_model.LogisticRegression( 221 | fit_intercept=fit_intercept, 222 | penalty="none", 223 | solver="newton-cg", 224 | warm_start=False) 225 | 226 | def fit(self, features, outcomes, weights=None, verbose=False): 227 | """Fit standard MLE model and compute coefficient covariance matrix.""" 228 | del verbose 229 | 230 | self.sm.fit(features, outcomes, weights) 231 | 232 | self.coef_ = self.sm.coef_ 233 | if self.fit_intercept: 234 | self.intercept_ = self.sm.intercept_ 235 | else: 236 | self.intercept_ = 0 237 | 238 | self._set_coef_cov(features, weights) 239 | 240 | return self 241 | 242 | def _set_coef_cov(self, features, weights): 243 | """Use large-sample asymp. to compute coefficient covariance matrix.""" 244 | if weights is None: 245 | weights = 1 246 | pred = self.sm.predict_proba(features)[:, 1] 247 | _, p = features.shape 248 | if self.fit_intercept: 249 | features_aug = np.ones((features.shape[0], features.shape[1] + 1)) 250 | features_aug[:, :-1] = features 251 | dim = p + 1 252 | else: 253 | features_aug = features 254 | dim = p 255 | hessian = features_aug.T.dot( 256 | (weights * pred * 257 | (1 - pred)).reshape(-1, 1) * features_aug) / np.mean(weights) 258 | self.hessian = -hessian 259 | self.coef_cov = scipy.linalg.solve(hessian, np.eye(dim), assume_a="pos") 260 | 261 | 262 | class LogisticRegressionPercBoot(LogisticRegressionInference): 263 | """Fit standard MLE using multiplier bootstrap and compute percentile CIs. 264 | 265 | It is not recommended to use this method in practice if d / n ~> 0.05. The 266 | results from our paper suggest that it is very biased and has poor precision. 
267 | """ 268 | 269 | def __init__(self, fit_intercept=True, num_boot=20, **kwargs): 270 | super().__init__(fit_intercept=fit_intercept, **kwargs) 271 | self.fit_intercept = fit_intercept 272 | self.sm = sklearn.linear_model.LogisticRegression( 273 | fit_intercept=fit_intercept, 274 | penalty="none", 275 | solver="newton-cg", 276 | warm_start=False) 277 | self.num_boot = num_boot 278 | 279 | def fit(self, features, outcome, weights=None, verbose=False): 280 | """Fit main model and bootstrapped models with multiplier bootstrap.""" 281 | del verbose 282 | self.sm.fit(features, outcome, weights) 283 | 284 | self.coef_ = self.sm.coef_ 285 | if self.fit_intercept: 286 | self.intercept_ = self.sm.intercept_ 287 | else: 288 | self.intercept_ = 0 289 | 290 | if weights is None: 291 | weights = 1.0 292 | 293 | n = features.shape[0] 294 | self.bootstraps = [] 295 | for _ in range(self.num_boot): 296 | self.sm.fit(features, outcome, 297 | weights * np.random.poisson(lam=1.0, size=n)) 298 | if np.linalg.norm(self.sm.coef_) >= 1e6: 299 | continue 300 | d = {"coef": self.sm.coef_.reshape(-1)} 301 | if self.fit_intercept: 302 | d["intercept"] = self.sm.intercept_ 303 | else: 304 | d["intercept"] = 0 305 | self.bootstraps.append(d) 306 | 307 | return self 308 | 309 | def p_values(self): 310 | raise NotImplementedError( 311 | "This form of bootstrap does not lend itself well to p-values") 312 | 313 | def approx_lrt_p_values(self): 314 | raise NotImplementedError( 315 | "This form of bootstrap does not lend itself well to p-values") 316 | 317 | def _predict_with_param_dict(self, params, features): 318 | return features.dot(params["coef"]).reshape(-1) + params["intercept"] 319 | 320 | def prediction_intervals(self, X, logit=False): 321 | """Computes percentile CIs for feature rows using bootstrap samples.""" 322 | all_preds = np.array( 323 | [self._predict_with_param_dict(d, X) for d in self.bootstraps]) 324 | 325 | ci_range = (1 - self.ci_coverage) / 2 326 | results = np.quantile(all_preds, q=(ci_range, 0.5, 1 - ci_range), axis=0).T 327 | 328 | if not logit: 329 | results = self._expit(results) 330 | 331 | return results 332 | 333 | 334 | class UnbiasedLogisticRegression(LogisticRegressionInference): 335 | """Corrected bias and inference with the logitistic regression MLE.""" 336 | 337 | def __init__(self, fit_intercept=False, **kwargs): 338 | super().__init__(fit_intercept, **kwargs) 339 | 340 | self.fit_intercept = fit_intercept 341 | if fit_intercept: 342 | raise ValueError("This model doesn't allow fitting an intercept.") 343 | 344 | self.sm = sklearn.linear_model.LogisticRegression( 345 | fit_intercept=fit_intercept, 346 | penalty="none", 347 | solver="newton-cg", 348 | warm_start=False) 349 | 350 | def fit(self, features, outcome, weights=None, verbose=False): 351 | """Fit MLE, estimate eta with SLOE, de-bias, and estimate covariance.""" 352 | del verbose 353 | kappa = float(features.shape[1]) / features.shape[0] 354 | 355 | self.sm.fit(features, outcome, weights) 356 | 357 | if weights is None: 358 | weights = 1 359 | 360 | pred = self.sm.predict_proba(features)[:, 1] 361 | weights /= np.mean(weights) 362 | diag = weights * pred * (1 - pred) 363 | hessian = -features.T.dot(diag.reshape(-1, 1) * features) 364 | self.hessian = hessian 365 | xihinvxi = np.einsum("ij,ji->i", features, 366 | np.linalg.solve(hessian, features.T)) 367 | mod = xihinvxi / (1.0 + xihinvxi * diag) 368 | infl = mod * weights * (outcome - 369 | pred) + self.sm.decision_function(features) 370 | 371 | eta_hat = np.var(infl) 372 | 
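# Note: infl above approximates the leave-one-out logits, and eta_hat = Var(infl)
# is the SLOE estimate of the corrupted signal strength eta^2 that is passed to
# the asymptotic correction system below via use_eta=True.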
373 | b0 = 0
374 |
375 | self.alpha, lambda_, sigma, intercept_est = asymp_system_solve.correction_factors(
376 | kappa, eta_hat, np.sqrt(eta_hat), b0, use_eta=True)
377 | if (kappa >= 0.05 and self.alpha < 0.999) or self.alpha > 5 \
378 | or lambda_ < 0.1 or sigma < 0.3 or lambda_ > 1e3 or sigma > 1e3:
379 | raise ValueError("Correction factor solve returned implausible values: alpha={}, lambda={}, sigma={}".format(self.alpha, lambda_, sigma))
380 |
381 | self.eta_hat = eta_hat
382 | self.lambda_ = lambda_
383 | self.sigma = sigma
384 | self.intercept_est = intercept_est
385 |
386 | self.chi_sq_rescale = lambda_ * self.alpha**2 / sigma**2
387 | self.coef_ = self.sm.coef_ / self.alpha
388 | self.intercept_ = 0
389 |
390 | self._set_coef_cov(features, sigma / np.sqrt(kappa), self.alpha)
391 |
392 | return self
393 |
394 | def _set_coef_cov(self, features, sigma, alpha):
395 | n, p = features.shape
396 | features_aug = features
397 | dim = p
398 | feature_cov = features_aug.T.dot(features_aug)
399 | one_on_tau_sq = scipy.linalg.solve(feature_cov, np.eye(dim), assume_a="pos")
400 | self.coef_cov = one_on_tau_sq
401 | self.coef_cov *= (1 - float(p) / n) * ((sigma / alpha)**2)
402 | --------------------------------------------------------------------------------
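A minimal usage sketch, not part of the source files above, assuming the package and its pinned dependencies are installed as in run.sh: it fits the SLOE-corrected estimator from unbiased_logistic_regression.py on simulated Gaussian data with kappa = p/n = 0.2 and signal strength gamma^2 = 1, then prints 95% prediction intervals and Wald p-values. The dimensions and seed are illustrative.

import numpy as np

from sloe_logistic import unbiased_logistic_regression

# Simulate n = 1000 observations with p = 200 Gaussian features scaled so that
# Var(x'beta) = gamma^2 = 1, mirroring the Simulation class in
# sloe_experiments/experiment_helpers.py.
rng = np.random.RandomState(0)
n, p = 1000, 200
x = rng.randn(n, p) / np.sqrt(p)
beta = np.zeros(p)
beta[:p // 8] = 2.0
beta[p // 8:p // 4] = -2.0
y = (rng.rand(n) <= 1.0 / (1.0 + np.exp(-x.dot(beta)))).astype(float)

# Fit the SLOE-corrected MLE and request 95% intervals.
model = unbiased_logistic_regression.UnbiasedLogisticRegression()
model.set_coverage(95)
model.fit(x, y)

# prediction_intervals returns columns (lower, debiased estimate, upper) on the
# probability scale; pass logit=True to stay on the logit scale.
print(model.prediction_intervals(x[:5]))
print(model.p_values()[:10])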