├── requirements.txt
├── run.sh
├── CONTRIBUTING.md
├── asymp_system_solve_test.py
├── setup.py
├── sloe_experiments
│   ├── experiment_helpers_test.py
│   ├── p_values.py
│   ├── runtime.py
│   ├── est_gamma.py
│   ├── sweep_coverage.py
│   └── experiment_helpers.py
├── probe_frontier_test.py
├── third_party
│   └── py
│       └── scipy
│           ├── optimize
│           │   └── Zeros
│           │       ├── zeros.h
│           │       └── brentq.c
│           └── LICENSE.txt
├── README.md
├── mle_param_integrands.h
├── mle_param_integrands.cc
├── asymp_system_solve.py
├── unbiased_logistic_regression_test.py
├── probe_frontier.py
├── LICENSE
└── unbiased_logistic_regression.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.16.5
2 | scipy==1.5.4
3 | apache-beam
4 | absl-py
5 | scikit-learn
6 | statsmodels
7 | pybind11
8 |
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | set -e
17 | set -x
18 |
19 | virtualenv -p python3 .
20 | source ./bin/activate
21 |
22 | pip install -r requirements.txt
23 | python setup.py build
24 | python setup.py install
25 | python -m sloe_logistic.asymp_system_solve_test
26 | python -m sloe_logistic.unbiased_logistic_regression_test
27 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | # Issues
4 |
5 | * Please tag your issue with `bug`, `feature request`, or `question` to help us
6 | effectively respond.
7 | * Please include the version of SLOE Logistic you are running.
8 | * Please provide the command line you ran as well as the log output.
9 |
10 | # Pull Requests
11 |
12 | Please send in fixes and feature additions through Pull Requests.
13 |
14 | ## Contributor License Agreement
15 |
16 | Contributions to this project must be accompanied by a Contributor License
17 | Agreement. You (or your employer) retain the copyright to your contribution;
18 | this simply gives us permission to use and redistribute your contributions as
19 | part of the project. Head over to <https://cla.developers.google.com/> to see
20 | your current agreements on file or to sign a new one.
21 |
22 | You generally only need to submit a CLA once, so if you've already submitted one
23 | (even if it was for a different project), you probably don't need to do it
24 | again.
25 |
26 | ## Code reviews
27 |
28 | All submissions, including submissions by project members, require review. We
29 | use GitHub pull requests for this purpose. Consult
30 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
31 | information on using pull requests.
32 |
--------------------------------------------------------------------------------
/asymp_system_solve_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Tests for sloe_logistic.asymp_system_solve."""
17 |
18 | from absl.testing import absltest
19 | import numpy as np
20 | from sloe_logistic import asymp_system_solve
21 |
22 |
23 | class AsympSystemSolveTest(absltest.TestCase):
24 |
25 | def test_correction_factors_solve(self):
26 | sol = asymp_system_solve.correction_factors(
27 | 0.2, 1, np.sqrt(5), 0, use_eta=False)
28 | target = [1.499, 3.027, 2.1214, 0.0]
29 | for i in range(4):
30 | self.assertAlmostEqual(sol[i], target[i], places=3)
31 |
32 | sol = asymp_system_solve.correction_factors(
33 | 0.1, 8.881028475794636, np.sqrt(5), 0, use_eta=True)
34 | target = [1.174, 1.007, 1.086, 0.0]
35 | for i in range(4):
36 | self.assertAlmostEqual(sol[i], target[i], places=3)
37 |
38 | def test_frontier(self):
39 | sol = asymp_system_solve.frontier(0.1)
40 | self.assertAlmostEqual(sol, 9.890, places=3)
41 |
42 | sol = asymp_system_solve.frontier(0.2)
43 | self.assertAlmostEqual(sol, 4.550, places=3)
44 |
45 | if __name__ == '__main__':
46 | absltest.main()
47 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The SLOE Logistic Authors.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Builds sloe_logistic package."""
16 |
17 | from distutils import core
18 | from distutils.command import build_clib
19 |
20 | from pybind11.setup_helpers import build_ext
21 | from pybind11.setup_helpers import Pybind11Extension
22 |
23 | libraries = [
24 | ("scipy_brentq", {
25 | "sources": ["third_party/py/scipy/optimize/Zeros/brentq.c",],
26 | }),
27 | ]
28 |
29 | ext_modules = [
30 | Pybind11Extension("sloe_logistic.mle_param_integrands", [
31 | "mle_param_integrands.cc",
32 | ]),
33 | ]
34 |
35 | core.setup(
36 | name="sloe_logistic",
37 | version="0.0.1",
38 | description="Implements SLOE method and Logistic Regression Inference",
39 | long_description="Code to supplement the ICML submission SLOE: A Faster "
40 | "Method for Statistical Inference in High-Dimensional Logistic Regression.",
41 | packages=["sloe_logistic", "sloe_logistic.sloe_experiments"],
42 | package_dir={
43 | "sloe_logistic": ".",
44 | "sloe_logistic.sloe_experiments": "sloe_experiments/"
45 | },
46 | libraries=libraries,
47 | ext_modules=ext_modules,
48 | cmdclass={
49 | "build_ext": build_ext,
50 | "build_clib": build_clib.build_clib,
51 | },
52 | zip_safe=False,
53 | )
54 |
--------------------------------------------------------------------------------
/sloe_experiments/experiment_helpers_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Tests for experiment_helpers."""
17 |
18 | from absl.testing import absltest
19 | from sloe_logistic.sloe_experiments import experiment_helpers
20 |
21 |
22 | class ExperimentHelpersTest(absltest.TestCase):
23 |
24 | def test_simulation(self):
25 | params = experiment_helpers.SimulationParams(4000, 400, 1, seed=202103)
26 | sim = experiment_helpers.Simulation(params)
27 | features, outputs = sim.sample()
28 |
29 | self.assertAlmostEqual(features.mean(), 0, places=3)
30 | self.assertAlmostEqual(outputs.mean(), 0.5, places=2)
31 |
32 | def test_gwas_simulation(self):
33 | params = experiment_helpers.SimulationParams(4000, 400, 1, seed=202103)
34 | params.covariates = 'gwas'
35 | sim = experiment_helpers.GWASSimulation(params)
36 | features, outputs = sim.sample()
37 |
38 | self.assertAlmostEqual(features.mean(), 0, places=3)
39 | self.assertAlmostEqual(outputs.mean(), 0.5, places=2)
40 |
41 | def test_gwas_simulation_checks_covariates(self):
42 | params = experiment_helpers.SimulationParams(4000, 400, 1, seed=202103)
43 | params.covariates = 'not_gwas'
44 | with self.assertRaises(ValueError):
45 | _ = experiment_helpers.GWASSimulation(params)
46 |
47 | if __name__ == '__main__':
48 | absltest.main()
49 |
--------------------------------------------------------------------------------
/probe_frontier_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Tests for sloe_logistic.asymp_system_solve."""
17 |
18 | from absl.testing import absltest
19 | import numpy as np
20 | from sloe_logistic import probe_frontier
21 |
22 |
23 | class ProbeFrontierTest(absltest.TestCase):
24 |
25 | def get_simulated_data(self, n, d):
26 | np.random.seed(1)
27 | features = np.random.randn(n, d)
28 | beta = np.sqrt(5 * 2.0 / d) * np.ones(d)
29 | beta[(d // 2):] = 0
30 |
31 | outcome = (np.random.rand(n) <= 1.0 /
32 | (1.0 + np.exp(-features.dot(beta)))).astype(float)
33 |
34 | return features, outcome
35 |
36 | def test_probe_frontier_model(self):
37 | n, d = 1000, 100
38 | features, outcome = self.get_simulated_data(n, d)
39 | model = probe_frontier.ProbeFrontierLogisticRegression(num_subsamples=4)
40 | model.fit(features, outcome)
41 |
42 | self.assertLen(model.coef_.reshape(-1), features.shape[1])
43 |
44 | def test_corrected_p_values(self):
45 | """Check null P value CDF is within 95% CI of uniform CDF."""
46 | n, d = 4000, 400
47 | features, outcome = self.get_simulated_data(n, d)
48 | model = probe_frontier.ProbeFrontierLogisticRegression(num_subsamples=4)
49 | model.fit(features, outcome)
50 |
51 | thresh = 0.1
52 | emp_p_cdf = model.p_values().reshape(-1)[(d // 2):] <= thresh
53 | self.assertAlmostEqual(
54 | emp_p_cdf.mean(),
55 | thresh,
56 | delta=1.96 * emp_p_cdf.std() / np.sqrt(d // 2))
57 |
58 |
59 | if __name__ == '__main__':
60 | absltest.main()
61 |
--------------------------------------------------------------------------------
/third_party/py/scipy/optimize/Zeros/zeros.h:
--------------------------------------------------------------------------------
1 | // Copyright 2021 The SLOE Logistic Authors.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | /* Written by Charles Harris charles.harris@sdl.usu.edu */
16 |
17 | /* Modified to not depend on Python everywhere by Travis Oliphant.
18 | */
19 |
20 | #ifndef ZEROS_H
21 | #define ZEROS_H
22 |
23 | typedef struct {
24 | int funcalls;
25 | int iterations;
26 | int error_num;
27 | } scipy_zeros_info;
28 |
29 |
30 | /* Must agree with _ECONVERGED, _ESIGNERR, _ECONVERR in zeros.py */
31 | #define CONVERGED 0
32 | #define SIGNERR -1
33 | #define CONVERR -2
34 | #define EVALUEERR -3
35 | #define INPROGRESS 1
36 |
37 | typedef double (*callback_type)(double, void*);
38 | typedef double (*solver_type)(callback_type, double, double, double, double,
39 | int, void *, scipy_zeros_info*);
40 |
41 | extern double bisect(callback_type f, double xa, double xb, double xtol,
42 | double rtol, int iter, void *func_data,
43 | scipy_zeros_info *solver_stats);
44 | extern double ridder(callback_type f, double xa, double xb, double xtol,
45 | double rtol, int iter, void *func_data,
46 | scipy_zeros_info *solver_stats);
47 | extern double brenth(callback_type f, double xa, double xb, double xtol,
48 | double rtol, int iter, void *func_data,
49 | scipy_zeros_info *solver_stats);
50 | extern double brentq(callback_type f, double xa, double xb, double xtol,
51 | double rtol, int iter, void *func_data,
52 | scipy_zeros_info *solver_stats);
53 |
54 | #endif
55 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Code to run experiments in *SLOE: A Faster Method for Statistical Inference in High-Dimensional Logistic Regression*.
2 |
3 | Not an official Google product.
4 |
5 | ## Method Introduction
6 | This library provides statistical inference for high dimensional logistic
7 | regression maximum likelihood, based largely on the breakthrough results from
8 | Sur and Candès (PNAS, 2019). The challenge with applying their results is that
9 | they depend on an unobserved signal strength quantity. Our method estimates this
10 | quantity via a leave-one-out approach, which we outline in our paper [1].
11 |
12 | By high dimensions, we mean that the ratio of the number of covariates `p` to
13 | the sample size `n` is strictly between 0 and 0.5. When the number of covariates
14 | is too large, the data is separable, and our method will not help to recover
15 | from such a case. When the number of covariates is small (say, `p <= 0.05 * n`),
16 | the high dimensional adjustment is a bit numerically unstable, and adds little
17 | value over the standard large-sample theory.
18 |
19 | The setting studied is complementary to sparse high dimensional regimes. We
20 | assume that there are a relatively large number of covariates that are weakly
21 | correlated with the binary outcome. If one expects only a very small number of
22 | the many candidate covariates to have a nonzero coefficient in the model,
23 | sparse model selection and post-selective inference is probably a better
24 | approach than the one taken here.
25 |
26 | ## Installation and tests
27 | Run `run.sh` to install the requirements and the package, and to run the tests.
28 |
29 | ## Usage
30 | The main approach proposed in our work is implemented in the
31 | `UnbiasedLogisticRegression` class in `unbiased_logistic_regression.py`. This
32 | has an `sklearn`-like interface, with a `fit`, `decision_function` and
33 | `predict_proba` API. Additionally, for inference, we've added a
34 | `prediction_intervals` method. See the inline documentation for more details
35 | of usage.
36 |
37 | # Citation
38 | [1] S. Yadlowsky, T. Yun, C. McLean, A. D'Amour (2021). "SLOE: A Faster
39 | Method for Statistical Inference in High-Dimensional Logistic Regression".
40 | [arXiv:2103.12725](http://arxiv.org/abs/2103.12725) [stat.ML].
41 |
--------------------------------------------------------------------------------
/mle_param_integrands.h:
--------------------------------------------------------------------------------
1 | // Copyright 2021 The SLOE Logistic Authors.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #ifndef MLE_PARAM_INTEGRANDS_H_
16 | #define MLE_PARAM_INTEGRANDS_H_
17 |
18 | #include <cmath>
19 |
20 | extern "C" {
21 | #include "third_party/py/scipy/optimize/Zeros/zeros.h"
22 | }
23 |
24 | namespace logistic_hd {
25 |
26 | // Integrands for the equations defined in Eq. 5 from Sur and Candès
27 | // (PNAS, 2019). These are called by the bivariate integration over Z1 and Z2
28 | // in `asymp_system_solve.py`.
29 | double integrand(double Z1, double Z2, double kappa, double gamma, double b0,
30 | double alpha, double lambda, double sigma, double beta0,
31 | int eq_num);
32 |
33 | // Computes the derivative of the objective that defines the proximal operator.
34 | // The prox operator is the value of z that makes this zero.
35 | double prox_deriv(double z, void *args);
36 |
37 | double sigmoid(double z);
38 |
39 | // Computes the derivative of the prox operator for the logistic regression
40 | // log likelihood.
41 | double prox_impl(double lambda, double x, double xtol = 1e-8,
42 | double rtol = 1e-8, int maxiters = 1000);
43 |
44 | // Computes the pdf of the bivariate normal without any input validation
45 | // because this is called many times during optimization.
46 | double pdf(double x1, double x2);
47 |
48 | // Helper function to pass values between our code and the scipy.optimize API.
49 | double scipy_zeros_functions_func(double x, void *params);
50 |
51 | typedef struct prox_params {
52 | double lambda;
53 | double x;
54 | } prox_params;
55 |
56 | } // namespace logistic_hd
57 |
58 | #endif // MLE_PARAM_INTEGRANDS_H_
59 |
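For intuition, the proximal operator described above (the root in z of
lambda * sigmoid(z) + z - x) can be reproduced in a few lines of Python with
scipy.optimize.brentq. This is an illustrative sketch, not part of the package,
and it assumes lambda > 0:

import numpy as np
from scipy.optimize import brentq


def sigmoid(z):
  return 1.0 / (1.0 + np.exp(-z))


def prox(lam, x, xtol=1e-8, rtol=1e-8):
  """Root of lam * sigmoid(z) + z - x, i.e. the prox of z -> lam * log(1 + exp(z))."""

  def deriv(z):
    return lam * sigmoid(z) + z - x

  # Since sigmoid takes values in (0, 1), the root satisfies z = x - lam * sigmoid(z)
  # and therefore lies strictly between x - lam and x, which gives a guaranteed
  # bracket for Brent's method when lam > 0.
  return brentq(deriv, x - lam, x, xtol=xtol, rtol=rtol)

The C++ prox_impl above solves the same root-finding problem with the vendored
SciPy brentq, which keeps this inner step of the bivariate integration in native
code.
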
--------------------------------------------------------------------------------
/sloe_experiments/p_values.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Run experiment to understand uniformity of p-values generated by SLOE.
17 |
18 | Tests the SLOE estimator empirically by computing it
19 | over many different seeds, and storing the results in csv files to be
20 | analyzed in a colab.
21 | """
22 |
23 |
24 | from absl import app
25 | from absl import flags
26 | import apache_beam as beam
27 | from apache_beam.options import pipeline_options
28 | import numpy as np
29 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper
30 |
31 | FLAGS = flags.FLAGS
32 |
33 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run')
34 | flags.DEFINE_string('output_path', '/tmp/counts.txt', 'The output file path')
35 | flags.DEFINE_string(
36 | 'coverage_target', 'true_preds',
37 |     'Which value to check coverage against in prediction intervals? Options: '
38 |     '`true_preds` or `calib_ests`.'
39 | )
40 |
41 |
42 | def run_sim(seed):
43 | """Runs simulation and computes estimated p-values to compare to uniform."""
44 | # Model parameters
45 |
46 | sim_params = exp_helper.SimulationParams.create_from_flags()
47 | sim_params.seed = 201216 + seed
48 | sim = exp_helper.Simulation(sim_params)
49 |
50 | x1, y1 = sim.sample()
51 |
52 | logit_model = exp_helper.create_inference_model()
53 | logit_model_fit = logit_model.fit(x1, y1)
54 |
55 | p_values = logit_model_fit.p_values()
56 | return np.sort(p_values[sim.null_indices()])
57 |
58 |
59 | def main(unused_argv):
60 | # If you have custom beam options add them here.
61 | beam_options = pipeline_options.PipelineOptions()
62 |
63 | with beam.Pipeline(options=beam_options) as pipe:
64 | _ = (
65 | pipe
66 | | beam.Create(range(FLAGS.num_sims))
67 | | beam.Map(run_sim)
68 | | beam.Map(exp_helper.numpy_array_to_csv)
69 | | beam.Reshuffle()
70 | |
71 | 'WriteToText' >> beam.io.WriteToText(FLAGS.output_path, num_shards=5))
72 |
73 |
74 | if __name__ == '__main__':
75 | app.run(main)
76 |
--------------------------------------------------------------------------------
/sloe_experiments/runtime.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Run experiment to understand runtime of SLOE relative to ProbeFrontier.
17 |
18 | Tests the runtime of the SLOE estimator compared to ProbeFrontier over many
19 | seeds, storing the results in csv files to be analyzed in a colab.
20 | """
21 | import time
22 |
23 |
24 |
25 | from absl import app
26 | from absl import flags
27 | import apache_beam as beam
28 | from apache_beam.options import pipeline_options
29 | import numpy as np
30 |
31 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper
32 |
33 |
34 | FLAGS = flags.FLAGS
35 |
36 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run')
37 | flags.DEFINE_string('output_path', '/tmp/counts.txt', 'The output file path')
38 |
39 | N_RANGE = [500, 1000, 2000, 3000, 4000, 6000, 8000, 16000]
40 |
41 |
42 | def run_sim(val):
43 | """Runs simulation and compare runtime of SLOE and ProbeFrontier."""
44 | n = val[0]
45 | seed = 201216 + val[1]
46 | # Model parameters
47 |
48 | sim_params = exp_helper.SimulationParams.create_from_flags()
49 | sim_params.seed = seed
50 | sim_params.training_n = n
51 | sim_params.p = int(n * FLAGS.features_per_sample)
52 | sim = exp_helper.Simulation(sim_params)
53 |
54 | new_method_model = exp_helper.create_inference_model('newmethod')
55 | pf_model = exp_helper.create_inference_model('probe_frontier')
56 |
57 | x1, y1 = sim.sample()
58 | if pf_model.is_separable(x1, y1):
59 | return
60 |
61 | tic = time.perf_counter()
62 | m = new_method_model.fit(x1, y1)
63 | toc = time.perf_counter()
64 | new_method_time = toc - tic
65 | # Deleting model here to keep memory clean for probe frontier model.
66 | del new_method_model, m
67 |
68 | tic = time.perf_counter()
69 | m, v = pf_model.fit(x1, y1)
70 | toc = time.perf_counter()
71 | probe_frontier_time = toc - tic
72 |
73 | return [np.array([n, seed, new_method_time, probe_frontier_time, v])]
74 |
75 |
76 | def main(unused_argv):
77 | # If you have custom beam options add them here.
78 | beam_options = pipeline_options.PipelineOptions()
79 |
80 | with beam.Pipeline(options=beam_options) as pipe:
81 | _ = (
82 | pipe
83 | | beam.Create(range(FLAGS.num_sims))
84 | | beam.FlatMap(exp_helper.multiple_sample_sizes, N_RANGE)
85 | | 'PrepShuffle' >> beam.Reshuffle()
86 | | beam.FlatMap(run_sim)
87 | | beam.Map(exp_helper.numpy_array_to_csv)
88 | | beam.Reshuffle()
89 | |
90 | 'WriteToText' >> beam.io.WriteToText(FLAGS.output_path, num_shards=5))
91 |
92 |
93 | if __name__ == '__main__':
94 | app.run(main)
95 |
--------------------------------------------------------------------------------
/sloe_experiments/est_gamma.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Run experiment to understand convergence of SLOE estimator of eta.
17 |
18 | Tests the SLOE estimator empirically by computing it
19 | over a range of sample sizes for many different seeds, and storing the
20 | results in csv files to be analyzed in a colab.
21 | """
22 |
23 |
24 | from absl import app
25 | from absl import flags
26 | import apache_beam as beam
27 | from apache_beam.options import pipeline_options
28 | import numpy as np
29 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper
30 | import statsmodels.api as sm
31 |
32 | FLAGS = flags.FLAGS
33 |
34 | flags.DEFINE_string('output_path', '/tmp/counts.txt', 'The output file path')
35 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run')
36 | flags.DEFINE_string('img_path', '/tmp/counts.png', 'Path to save plots')
37 |
38 | N_RANGE = [250, 500, 1000, 2000, 4000]
39 |
40 |
41 | def multiple_sample_sizes(seed):
42 | """Run same seed over multiple sample sizes."""
43 | for n in N_RANGE:
44 | yield [n, seed]
45 |
46 |
47 | def run_sim(params):
48 | """Runs simulation and computes estimated eta_hat to compare to truth."""
49 | n = params[0]
50 | seed = params[1]
51 | kappa = FLAGS.features_per_sample
52 | p = int(n * kappa)
53 |
54 | gamma = np.sqrt(FLAGS.signal_strength)
55 | rand_state = np.random.RandomState(201216 + seed)
56 |
57 | p_positive = int(p / 8)
58 | p_negative = p_positive
59 | p_zero = p - p_positive - p_negative
60 | beta = 2 * np.concatenate(
61 | (np.ones(p_positive), -np.ones(p_negative), np.zeros(p_zero)))
62 | beta *= gamma
63 |
64 | features = rand_state.randn(n, p) / np.sqrt(p)
65 | labels = (rand_state.rand(n) <= 1.0 /
66 | (1.0 + np.exp(-features.dot(beta)))).astype(float)
67 |
68 | logit_model = sm.Logit(labels, features)
69 | logit_model_fit = logit_model.fit(disp=False)
70 | beta_hat = logit_model_fit.params
71 |
72 | hessian = logit_model.hessian(beta_hat)
73 | # Computes X_i^T H^{-1} X_i for all examples. Used in Sherman-Morrison formula
74 | # below.
75 | xi_hessian_inv_xi = np.diag(
76 | features.dot(np.linalg.solve(hessian, features.T)))
77 | pred = logit_model_fit.predict(features)
78 | # Sherman-Morrison formula for X_i^T H_{-i}^{-1} X_i, where H_{-i} is Hessian
79 | # without i-th example.
80 | mod = xi_hessian_inv_xi / (1.0 + xi_hessian_inv_xi * pred * (1 - pred))
81 | infl = mod * (labels - pred) + features.dot(beta_hat)
82 |
83 | eta_hat = np.var(infl)
84 |
85 | eta_hat_simp = np.linalg.norm(beta_hat)**2
86 |
87 | return np.array([n, seed, eta_hat, eta_hat_simp])
88 |
89 |
90 | def main(unused_argv):
91 | # If you have custom beam options add them here.
92 | beam_options = pipeline_options.PipelineOptions()
93 |
94 | with beam.Pipeline(options=beam_options) as pipe:
95 | _ = (
96 | pipe
97 | | beam.Create(range(FLAGS.num_sims))
98 | | beam.FlatMap(multiple_sample_sizes)
99 | | 'PrepShuffle' >> beam.Reshuffle()
100 | | beam.Map(run_sim)
101 | | beam.Map(exp_helper.numpy_array_to_csv)
102 | | beam.Reshuffle()
103 | |
104 | 'WriteToText' >> beam.io.WriteToText(FLAGS.output_path, num_shards=5))
105 |
106 |
107 | if __name__ == '__main__':
108 | app.run(main)
109 |
--------------------------------------------------------------------------------
/mle_param_integrands.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2021 The SLOE Logistic Authors.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "mle_param_integrands.h"
16 |
17 | #include <cmath>
18 |
19 | #include "pybind11/pybind11.h"
20 |
21 | namespace logistic_hd {
22 |
23 | double sigmoid(double z) {
24 | const double v = 1.0 / (1 + exp(-z));
25 | return (v);
26 | }
27 |
28 | double prox_deriv(double z, void *args) {
29 |   prox_params *myargs = reinterpret_cast<prox_params *>(args);
30 | return (myargs->lambda * sigmoid(z) + z - myargs->x);
31 | }
32 |
33 | double prox_impl(double lambda, double x, double xtol, double rtol,
34 | int maxiters) {
35 | prox_params params;
36 | scipy_zeros_info solver_stats;
37 | double lower;
38 | double upper;
39 |
40 | params.lambda = lambda;
41 | params.x = x;
42 |
43 | if (lambda * x > 0) {
44 | lower = x - lambda - 1e-4;
45 | upper = x + 1e-4;
46 | } else {
47 | lower = x - lambda / 2.0 - 1e-4;
48 | upper = x + 1e-4;
49 | }
50 | lower = -abs(x) - 8;
51 | upper = abs(x) + 8;
52 |
53 |   if (abs(prox_deriv(lower, &params)) < 1e-8) {
54 | return (lower);
55 | }
56 |   if (abs(prox_deriv(upper, &params)) < 1e-8) {
57 | return (upper);
58 | }
59 |
60 | const double x0 = brentq(&prox_deriv, lower, upper, xtol, rtol, maxiters,
61 |                            reinterpret_cast<void *>(&params), &solver_stats);
62 |
63 | return (x0);
64 | }
65 |
66 | double integrand(double Z1, double Z2, double kappa, double gamma, double b0,
67 | double alpha, double lambda, double sigma, double beta0,
68 | int eq_num) {
69 | double eq;
70 |
71 | const double S1 = gamma * Z1 / alpha + beta0;
72 | const double S2 = gamma * Z1 + sigma * Z2 + b0;
73 |
74 | const double prox_S2 = prox_impl(lambda, S2);
75 | const double prox_lambda_S2 = prox_impl(lambda, lambda + S2);
76 |
77 | const double sig_S1 = sigmoid(S1);
78 | const double sig_neg_S1 = 1 - sig_S1;
79 |
80 | if (eq_num == 1) {
81 | eq = sig_S1 * pow(S2 - prox_lambda_S2, 2);
82 | eq += sig_neg_S1 * pow(S2 - prox_S2, 2);
83 | } else if (eq_num == 2) {
84 | eq = sig_S1 * Z2 * prox_lambda_S2;
85 | eq += sig_neg_S1 * Z2 * prox_S2;
86 | } else if (eq_num == 3) {
87 | eq = sig_S1 * Z1 * prox_lambda_S2;
88 | eq += sig_neg_S1 * Z1 * prox_S2;
89 | } else {
90 | const double prox_neg_S2 = prox_impl(lambda, -S2);
91 | eq = -sig_S1 * sigmoid(prox_neg_S2);
92 | eq += sig_neg_S1 * sigmoid(prox_S2);
93 | }
94 |
95 | return (eq * pdf(Z1, Z2));
96 | }
97 |
98 | double pdf(double x1, double x2) {
99 | return (exp(-(pow(x1, 2) + pow(x2, 2)) / 2.0) / (2 * M_PI));
100 | }
101 |
102 | } // namespace logistic_hd
103 |
104 | PYBIND11_MODULE(mle_param_integrands, m) {
105 | m.doc() = "Logistic Regression MLE High Dimensional Integrands";
106 |
107 | m.def("sigmoid", &logistic_hd::sigmoid,
108 | "Sigmoid for a float (unvectorized, no error checking)");
109 | m.def("integrand", &logistic_hd::integrand,
110 | "Integrand for equation to get high dimensional adjustment");
111 | m.def("prox_deriv", &logistic_hd::prox_deriv,
112 | "Derivative prox objective for logistic link");
113 | m.def("prox_impl", &logistic_hd::prox_impl,
114 | "Computes prox for logistic link times lambda");
115 | m.def("pdf", &logistic_hd::pdf,
116 | "Computes pdf of bivariate normal distribution");
117 | }
118 |
--------------------------------------------------------------------------------
/third_party/py/scipy/optimize/Zeros/brentq.c:
--------------------------------------------------------------------------------
1 | /* Written by Charles Harris charles.harris@sdl.usu.edu */
2 |
3 | #include <math.h>
4 | #include "zeros.h"
5 |
6 | #define MIN(a, b) ((a) < (b) ? (a) : (b))
7 |
8 | /*
9 | At the top of the loop the situation is the following:
10 |
11 | 1. the root is bracketed between xa and xb
12 | 2. xa is the most recent estimate
13 | 3. xp is the previous estimate
14 | 4. |fp| < |fb|
15 |
16 | The order of xa and xp doesn't matter, but assume xp < xb. Then xa lies to
17 | the right of xp and the assumption is that xa is increasing towards the root.
18 | In this situation we will attempt quadratic extrapolation as long as the
19 | condition
20 |
21 | * |fa| < |fp| < |fb|
22 |
23 | is satisfied. That is, the function value is decreasing as we go along.
24 | Note the 4 above implies that the right inequality already holds.
25 |
26 | The first check is that xa is still to the left of the root. If not, xb is
27 | replaced by xp and the interval reverses, with xb < xa. In this situation
28 | we will try linear interpolation. That this has happened is signaled by the
29 | equality xb == xp;
30 |
31 | The second check is that |fa| < |fb|. If this is not the case, we swap
32 | xa and xb and resort to bisection.
33 |
34 | */
35 |
36 | double
37 | brentq(callback_type f, double xa, double xb, double xtol, double rtol,
38 | int iter, void *func_data, scipy_zeros_info *solver_stats)
39 | {
40 | double xpre = xa, xcur = xb;
41 | double xblk = 0., fpre, fcur, fblk = 0., spre = 0., scur = 0., sbis;
42 | /* the tolerance is 2*delta */
43 | double delta;
44 | double stry, dpre, dblk;
45 | int i;
46 | solver_stats->error_num = INPROGRESS;
47 |
48 | fpre = (*f)(xpre, func_data);
49 | fcur = (*f)(xcur, func_data);
50 | solver_stats->funcalls = 2;
51 | if (fpre*fcur > 0) {
52 | solver_stats->error_num = SIGNERR;
53 | return 0.;
54 | }
55 | if (fpre == 0) {
56 | solver_stats->error_num = CONVERGED;
57 | return xpre;
58 | }
59 | if (fcur == 0) {
60 | solver_stats->error_num = CONVERGED;
61 | return xcur;
62 | }
63 |
64 | solver_stats->iterations = 0;
65 | for (i = 0; i < iter; i++) {
66 | solver_stats->iterations++;
67 | if (fpre*fcur < 0) {
68 | xblk = xpre;
69 | fblk = fpre;
70 | spre = scur = xcur - xpre;
71 | }
72 | if (fabs(fblk) < fabs(fcur)) {
73 | xpre = xcur;
74 | xcur = xblk;
75 | xblk = xpre;
76 |
77 | fpre = fcur;
78 | fcur = fblk;
79 | fblk = fpre;
80 | }
81 |
82 | delta = (xtol + rtol*fabs(xcur))/2;
83 | sbis = (xblk - xcur)/2;
84 | if (fcur == 0 || fabs(sbis) < delta) {
85 | solver_stats->error_num = CONVERGED;
86 | return xcur;
87 | }
88 |
89 | if (fabs(spre) > delta && fabs(fcur) < fabs(fpre)) {
90 | if (xpre == xblk) {
91 | /* interpolate */
92 | stry = -fcur*(xcur - xpre)/(fcur - fpre);
93 | }
94 | else {
95 | /* extrapolate */
96 | dpre = (fpre - fcur)/(xpre - xcur);
97 | dblk = (fblk - fcur)/(xblk - xcur);
98 | stry = -fcur*(fblk*dblk - fpre*dpre)
99 | /(dblk*dpre*(fblk - fpre));
100 | }
101 | if (2*fabs(stry) < MIN(fabs(spre), 3*fabs(sbis) - delta)) {
102 | /* good short step */
103 | spre = scur;
104 | scur = stry;
105 | } else {
106 | /* bisect */
107 | spre = sbis;
108 | scur = sbis;
109 | }
110 | }
111 | else {
112 | /* bisect */
113 | spre = sbis;
114 | scur = sbis;
115 | }
116 |
117 | xpre = xcur; fpre = fcur;
118 | if (fabs(scur) > delta) {
119 | xcur += scur;
120 | }
121 | else {
122 | xcur += (sbis > 0 ? delta : -delta);
123 | }
124 |
125 | fcur = (*f)(xcur, func_data);
126 | solver_stats->funcalls++;
127 | }
128 | solver_stats->error_num = CONVERR;
129 | return xcur;
130 | }
131 |
--------------------------------------------------------------------------------
/asymp_system_solve.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Solves nonlinear equations for high dim correction factors for MLE.
17 |
18 | Solves the nonlinear equations in Sur and Candès (PNAS, 2019) to find
19 | the adjustment factors for bias and variance of logistic regression MLE.
20 | """
21 |
22 |
23 | import functools
24 |
25 | from absl import app
26 | import numpy as np
27 | import scipy
28 | import scipy.integrate
29 | import scipy.optimize
30 |
31 | import sloe_logistic.mle_param_integrands as mle_helper
32 |
33 |
34 | def _t_integrand(z, v, t, gamma):
35 | """Integrand used to calculate when the logistic MLE exists."""
36 | return 2 * mle_helper.sigmoid(gamma * v) * mle_helper.pdf(z, v) * (
37 | max(z - t * v, 0)**2)
38 |
39 |
40 | def _t_problem(t, gamma):
41 | """Minimizer of this integrand in t is the frontier where the MLE exists."""
42 | loss, _ = scipy.integrate.dblquad(_t_integrand, -8, 8, -8, 8, (
43 | t,
44 | gamma,
45 | ), 1e-6, 1e-6)
46 | return loss
47 |
48 |
49 | def _g_mle_inv(gamma):
50 | """Frontier where data separable in limit. Gives kappa in terms of gamma."""
51 | res = scipy.optimize.minimize_scalar(
52 | _t_problem, bounds=(-10, 10), args=(gamma,), method='Bounded')
53 | return _t_problem(res.x, gamma)
54 |
55 |
56 | def frontier(kappa):
57 | """Frontier where data separable in limit. Gives gamma in terms of kappa."""
58 | gamma_star = scipy.optimize.brentq(lambda gamma: _g_mle_inv(gamma) - kappa, 0,
59 | 25)
60 | return gamma_star
61 |
62 |
63 | def equations(kappa, eta, gamma, beta0, use_eta, alpha, lambda_, sigma, b0):
64 | """The solution to these equations gives the high dimensional adjustment."""
65 | if use_eta:
66 | gamma = np.sqrt(max(eta - sigma**2, 0.0001))
67 | else:
68 | gamma *= alpha
69 |
70 | eq1, _ = scipy.integrate.dblquad(
71 | mle_helper.integrand, -8, 8, -8, 8,
72 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 1), 1e-4, 1e-4)
73 | eq2, _ = scipy.integrate.dblquad(
74 | mle_helper.integrand, -8, 8, -8, 8,
75 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 2), 1e-4, 1e-4)
76 | eq3, _ = scipy.integrate.dblquad(
77 | mle_helper.integrand, -8, 8, -8, 8,
78 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 3), 1e-4, 1e-4)
79 | eq4, _ = scipy.integrate.dblquad(
80 | mle_helper.integrand, -8, 8, -8, 8,
81 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 4), 1, 1)
82 | eq1 -= sigma**2 * kappa
83 | eq2 -= abs(sigma) * (1 - kappa)
84 | eq3 -= gamma
85 |
86 | return -np.array([eq1, eq2, eq3, eq4])
87 |
88 |
89 | def get_system(kappa, eta, gamma, b0, use_eta=True):
90 | system_ = functools.partial(equations, kappa, eta, gamma, b0, use_eta)
91 | return system_
92 |
93 |
94 | def correction_factors(kappa, eta, gamma, b0, use_eta=True):
95 | """Computes correction factors for MLE of high dimensional logistic reg."""
96 | system_ = get_system(kappa, eta, gamma, b0, use_eta)
97 | if use_eta:
98 | init = np.array([2, 2, np.sqrt(eta / 2), b0 / 2])
99 | else:
100 | init = np.array([2, 2, np.sqrt(gamma**2 + 1), b0])
101 | soln = scipy.optimize.root(
102 | lambda x: system_(*x),
103 | init,
104 | method='lm',
105 | options={
106 | 'xtol': 1e-4,
107 | 'eps': 1e-8
108 | })
109 | x0 = soln.x
110 | if kappa >= 0.03 and (x0[0] < 1 or x0[2] < 0.1):
111 | print('Rerunning due to convergence issue')
112 | init += 0.1 * np.random.randn(4)
113 | init = np.maximum(init, np.array([1, 0.5, 0.1, b0 / 2.0]))
114 | soln = scipy.optimize.root(
115 | lambda x: system_(*x),
116 | init,
117 | method='lm',
118 | options={
119 | 'xtol': 1e-4,
120 | 'eps': 1e-8
121 | })
122 | x0 = soln.x
123 | return x0
124 |
125 |
126 | def main(argv):
127 | if len(argv) > 1:
128 | raise app.UsageError('Too many command-line arguments.')
129 |
130 | sol = correction_factors(0.2, 1, np.sqrt(5), 0, use_eta=False)
131 | print(sol)
132 | sol = correction_factors(0.1, 8.881028475794636, np.sqrt(5), 0, use_eta=True)
133 | print(sol)
134 |
135 |
136 | if __name__ == '__main__':
137 | app.run(main)
138 |
--------------------------------------------------------------------------------
/unbiased_logistic_regression_test.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Tests for sloe_logistic.asymp_system_solve."""
17 |
18 | from absl.testing import absltest
19 | import numpy as np
20 | from sloe_logistic import unbiased_logistic_regression
21 |
22 |
23 | class UnbiasedLogisticRegressionTest(absltest.TestCase):
24 |
25 | def get_simulated_data(self, n, d):
26 | np.random.seed(1)
27 | features = np.random.randn(n, d)
28 | beta = np.sqrt(5 * 2.0 / d) * np.ones(d)
29 | beta[(d // 2):] = 0
30 |
31 | outcome = (np.random.rand(n) <= 1.0 /
32 | (1.0 + np.exp(-features.dot(beta)))).astype(float)
33 |
34 | return features, outcome
35 |
36 | def test_unbiased_model(self):
37 | """Tests that UnbiasedLogisticRegression.fit runs without errors."""
38 | n, d = 1000, 100
39 | features, outcome = self.get_simulated_data(n, d)
40 | model = unbiased_logistic_regression.UnbiasedLogisticRegression(
41 | fit_intercept=False)
42 | model.fit(features, outcome)
43 |
44 | self.assertLen(model.coef_.reshape(-1), features.shape[1])
45 |
46 | def test_cant_fit_intercept(self):
47 | """Tests that UnbiasedLogisticRegression doesn't allow fit_intercept.
48 |
49 | Currently, there's no support for fitting the intercept. This checks that
50 | trying to fit an intercept raises an error instead of silently ignoring
51 | the intercept.
52 | """
53 | with self.assertRaises(ValueError):
54 | _ = unbiased_logistic_regression.UnbiasedLogisticRegression(
55 | fit_intercept=True)
56 |
57 | def test_platt_model(self):
58 | """Tests that PlattScaledLogisticRegression.fit runs without errors."""
59 | n, d = 1000, 100
60 | features, outcome = self.get_simulated_data(n, d)
61 | model = unbiased_logistic_regression.PlattScaledLogisticRegression(
62 | fit_intercept=False)
63 | model.fit(features, outcome)
64 |
65 | def test_standard_mle_model(self):
66 | """Tests that LogisticRegressionMLE.fit runs without errors."""
67 | n, d = 1000, 100
68 | features, outcome = self.get_simulated_data(n, d)
69 | model = unbiased_logistic_regression.LogisticRegressionMLE(
70 | fit_intercept=False)
71 | model.fit(features, outcome)
72 |
73 | def test_bootstrap_model(self):
74 | """Tests that LogisticRegressionPercBoot.fit runs without errors."""
75 | n, d = 1000, 100
76 | features, outcome = self.get_simulated_data(n, d)
77 | model = unbiased_logistic_regression.LogisticRegressionPercBoot(
78 | fit_intercept=False)
79 | model.fit(features, outcome)
80 |
81 | def test_bootstrap_prediction_intervals(self):
82 | """Tests that LogisticRegressionPercBoot.prediction_intervals runs."""
83 | n, d = 1000, 100
84 | features, outcome = self.get_simulated_data(n, d)
85 | model = unbiased_logistic_regression.LogisticRegressionPercBoot(
86 | fit_intercept=False)
87 | model.fit(features, outcome)
88 | model.prediction_intervals(features)
89 |
90 | def test_regularized_model(self):
91 | """Tests that CVRegLogisticRegression.fit runs without errors."""
92 | n, d = 1000, 100
93 | features, outcome = self.get_simulated_data(n, d)
94 | model = unbiased_logistic_regression.CVRegLogisticRegression(
95 | fit_intercept=False)
96 | model.fit(features, outcome)
97 |
98 | self.assertLen(model.coef_.reshape(-1), features.shape[1])
99 |
100 | def test_prediction_intervals(self):
101 | n, d = 1000, 100
102 | features, outcome = self.get_simulated_data(n, d)
103 | model = unbiased_logistic_regression.UnbiasedLogisticRegression(
104 | fit_intercept=False)
105 | model.fit(features, outcome)
106 |
107 | test_features, _ = self.get_simulated_data(100, d)
108 | intervals = model.prediction_intervals(test_features)
109 | estimated_probs = model.predict_proba(test_features)[:, 1]
110 |
111 | np.testing.assert_array_less(intervals[:, 0], estimated_probs)
112 | np.testing.assert_array_less(estimated_probs, intervals[:, 2])
113 |
114 | def test_corrected_p_values(self):
115 | """Check null P value CDF is within 95% CI of uniform CDF."""
116 | n, d = 4000, 400
117 | features, outcome = self.get_simulated_data(n, d)
118 | model = unbiased_logistic_regression.UnbiasedLogisticRegression(
119 | fit_intercept=False)
120 | model.fit(features, outcome)
121 |
122 | thresh = 0.1
123 | emp_p_cdf = model.p_values().reshape(-1)[(d // 2):] <= thresh
124 | self.assertAlmostEqual(
125 | emp_p_cdf.mean(),
126 | thresh,
127 | delta=1.96 * emp_p_cdf.std() / np.sqrt(d // 2))
128 |
129 |
130 | if __name__ == '__main__':
131 | absltest.main()
132 |
--------------------------------------------------------------------------------
/probe_frontier.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Implements logistic regression w/ ProbeFrontier estimator of bias correction.
17 |
18 | Implements the bias correction and inference for the MLE using the ProbeFrontier
19 | estimator of the signal strength as in [1]. Theory for arbitrary covariance with
20 | Gaussian features from [2], and empirical evidence suggesting good performance
21 | for non-Gaussian designs.
22 |
23 | [1] Sur, Pragya, and Emmanuel J. Candès. "A modern maximum-likelihood theory
24 | for high-dimensional logistic regression." Proceedings of the National Academy
25 | of Sciences 116.29 (2019): 14516-14525.
26 | [2] Zhao, Qian, Pragya Sur, and Emmanuel J. Candès. "The asymptotic distribution
27 | of the mle in high-dimensional logistic models: Arbitrary covariance." arXiv
28 | preprint arXiv:2001.09351 (2020).
29 | """
30 | from absl import app
31 | import numpy as np
32 | import scipy
33 | from sloe_logistic import asymp_system_solve
34 | from sloe_logistic import unbiased_logistic_regression
35 | import statsmodels.api as sm
36 | import statsmodels.tools
37 |
38 |
39 | class ProbeFrontierLogisticRegression(
40 | unbiased_logistic_regression.UnbiasedLogisticRegression):
41 | """Implements ProbeFrontier and statistical inference with it."""
42 |
43 | def __init__(self, num_subsamples=10):
44 | super().__init__(fit_intercept=False)
45 | self.num_subsamples = num_subsamples
46 | self.sep_calls = 0
47 |
48 | def fit(self, features, outcome, weights=None, verbose=False):
49 | """Fit ProbeFrontier model."""
50 | if self.fit_intercept:
51 | raise NotImplementedError("ProbeFrontier doesn't work with intercept")
52 | self.sep_calls = 0
53 |
54 | self.sm.fit(features, outcome, weights)
55 |
56 | if weights is None:
57 | weights = 1
58 |
59 | kappa = float(features.shape[1]) / features.shape[0]
60 | gamma_hat = self.estimate_gamma(features, outcome)
61 |
62 | self.alpha, _, sigma, _ = asymp_system_solve.correction_factors(
63 | kappa, None, gamma_hat, 0, use_eta=False)
64 |
65 | self.coef_ = self.sm.coef_ / self.alpha
66 | self.intercept_ = 0
67 |
68 | self._set_coef_cov(features, sigma / np.sqrt(kappa), self.alpha)
69 |
70 | return self, self.sep_calls
71 |
72 | def estimate_gamma(self, features, outcome):
73 | """Estimate gamma."""
74 | estimated_kappa_threshold = self.probe_frontier(features, outcome)
75 | if estimated_kappa_threshold < 0:
76 | print(features, outcome)
77 | if estimated_kappa_threshold >= 0.499:
78 | return 0.0
79 | return asymp_system_solve.frontier(estimated_kappa_threshold)
80 |
81 | def probe_frontier(self, features, outcome):
82 | """Probe for frontier."""
83 | n, p = features.shape
84 | upper_frac = n
85 | lower_frac = min(n, 1.99 * p)
86 | obs = []
87 | while abs(upper_frac - lower_frac) > (0.05 * p):
88 | frac = int((upper_frac + lower_frac) / 2)
89 | p_sep = 0
90 | for _ in range(self.num_subsamples):
91 | indices = np.random.choice(n, frac, replace=False)
92 | feature_sub = features[indices, :]
93 | outcome_sub = outcome[indices]
94 | p_sep += self.is_separable(feature_sub, outcome_sub)
95 | p_sep /= float(self.num_subsamples)
96 | obs.append([frac, p_sep])
97 | if p_sep >= 0.8:
98 | lower_frac = frac
99 | elif p_sep <= 0.2:
100 | upper_frac = frac
101 | elif p_sep > 0.5:
102 | lower_frac = 0.5 * lower_frac + 0.5 * frac
103 | else:
104 | upper_frac = 0.5 * upper_frac + 0.5 * frac
105 |
106 | if len(obs) <= 2:
107 | frac = int(0.5 * (upper_frac + lower_frac))
108 | else:
109 | obs = np.array(obs)
110 |
111 | if (obs[0, 1] > (1 - 1.5 / self.num_subsamples)):
112 | frac = obs[0, 0]
113 | elif (obs[-1, 1] < (1.5 / self.num_subsamples)):
114 | frac = obs[-1, 0]
115 | else:
116 | try:
117 | interp = sm.GLM(
118 | obs[:, 1],
119 | sm.add_constant(obs[:, 0].reshape(-1, 1)),
120 | family=sm.families.Binomial())
121 | res = interp.fit()
122 | frac = -res.params[0] / res.params[1]
123 |
124 | except statsmodels.tools.sm_exceptions.PerfectSeparationError:
125 | threshold = np.argmax(np.diff(obs[:, 1], prepend=0))
126 | frac = obs[threshold, 0]
127 |
128 | return min(float(p) / frac, 0.5)
129 |
130 | def is_separable(self, features, outcome):
131 | """Check whether data are linearly separable."""
132 | self.sep_calls += 1
133 | n, p = features.shape
134 | features_aug = np.ones((n, p + 1))
135 | features_aug[:, :-1] = features
136 | features_aug *= (2 * outcome - 1).reshape(-1, 1)
137 | b = -np.ones(n)
138 | res = scipy.optimize.linprog(
139 | b, A_eq=features_aug.T, b_eq=np.zeros(p + 1), method='interior-point')
140 | if res.status == 0:
141 | return res.fun > -1e-6
142 | elif res.status == 2:
143 | return False
144 | elif res.status == 3:
145 | return False
146 | else:
147 | print(res)
148 | raise Exception('Error finding separability')
149 |
150 |
151 | def main(argv):
152 | if len(argv) > 1:
153 | raise app.UsageError('Too many command-line arguments.')
154 |
155 | p = ProbeFrontierLogisticRegression()
156 |
157 | features = np.random.randn(600, 300) / np.sqrt(300)
158 | outcome = (np.random.rand(600) <= 1 /
159 | (1.0 + np.exp(-1 * features.sum(axis=1)))).astype(float)
160 | primal = p.is_separable(features, outcome)
161 | print(primal)
162 |
163 | features = np.array([[1, 1], [0, 0]])
164 | outcome = np.array([1, 0])
165 | print(p.is_separable(features, outcome))
166 | features = np.array([[1, 1], [0, 0], [-1, -1]])
167 | outcome = np.array([1, 0, 1])
168 | print(p.is_separable(features, outcome))
169 |
170 | features = np.random.randn(100, 100)
171 | outcome = (np.random.rand(100) <= 0.5).astype(float)
172 | print(p.is_separable(features, outcome))
173 |
174 | features = np.random.randn(100, 10)
175 | outcome = (np.random.rand(100) <= 0.5).astype(float)
176 | print(p.is_separable(features, outcome))
177 |
178 | if __name__ == '__main__':
179 | app.run(main)
180 |
--------------------------------------------------------------------------------
/sloe_experiments/sweep_coverage.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Run experiment to understand coverage of CIs generated by SLOE.
17 |
18 | Tests the SLOE estimator empirically by computing
19 | confidence intervals (CIs) using it over many different seeds and aspect
20 | ratios, calculating properties such as coverage and size, and storing the
21 | results in csv files to be analyzed in a colab.
22 | """
23 |
24 |
25 | from absl import app
26 | from absl import flags
27 | import apache_beam as beam
28 | from apache_beam.options import pipeline_options
29 | import numpy as np
30 | import sklearn.linear_model
31 | from sklearn.model_selection import LeaveOneOut
32 |
33 | from sloe_logistic import probe_frontier
34 | from sloe_logistic import unbiased_logistic_regression
35 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper
36 |
37 |
38 | GAMMA_RANGE = [0.1, 1, 5]
39 | FLAGS = flags.FLAGS
40 |
41 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run')
42 | flags.DEFINE_string('output_path', '/tmp/counts', 'The output file path')
43 | flags.DEFINE_enum(
44 | 'coverage_target', 'true_preds', ['true_preds', 'calib_ests', 'reg_ests'],
45 |     'Which value to check coverage against in prediction intervals?')
46 | flags.DEFINE_boolean('include_bootstrap', False,
47 | 'Include bootstrap CIs as well? These are slow.')
48 | flags.DEFINE_float(
49 | 'kappa_spacing', 0.05,
50 | 'Resolution of graph in terms of spacing between kappa evaluated.')
51 | flags.DEFINE_float(
52 |     'coverage_rate', 95, 'What level of confidence intervals '
53 |     'should be tested (0-100)?')
54 |
55 |
56 | def run_sim(params):
57 | """Runs simulation and computes properties of the estimated CIs."""
58 | kappa = params[0]
59 | gamma = params[1]
60 | seed = 201216 + params[2]
61 |
62 | sim_params = exp_helper.SimulationParams.create_from_flags()
63 | sim_params.seed = seed
64 | sim_params.gamma = np.sqrt(gamma)
65 | sim_params.p = int(sim_params.training_n * kappa)
66 | sim = exp_helper.create_sim(sim_params)
67 |
68 | x1, y1 = sim.sample()
69 |
70 | pfr = probe_frontier.ProbeFrontierLogisticRegression()
71 | if pfr.is_separable(x1, y1):
72 | return
73 |
74 | # Draw test data
75 | x2, _ = sim.sample(int(sim_params.training_n / 4))
76 | true_logits = x2.dot(sim.beta)
77 | bias_selector = np.abs(true_logits) > 1e-2
78 |
79 | # Calculate coverage
80 | if FLAGS.coverage_target == 'true_preds':
81 | target = 1.0 / (1.0 + np.exp(-true_logits)).reshape(-1)
82 | elif FLAGS.coverage_target == 'calib_ests':
83 | ps_logit_model = unbiased_logistic_regression.PlattScaledLogisticRegression(
84 | fit_intercept=sim_params.intercept or sim_params.uncentered)
85 | ps_logit_model.fit(x1, y1)
86 | target = ps_logit_model.predict_proba(x2)[:, 1]
87 | elif FLAGS.coverage_target == 'reg_ests':
88 | ps_logit_model = sklearn.linear_model.LogisticRegressionCV(
89 | cv=LeaveOneOut(),
90 | fit_intercept=False,
91 | Cs=20,
92 | penalty='l2',
93 | solver='newton-cg')
94 | ps_logit_model.fit(x1, y1)
95 | target = ps_logit_model.predict_proba(x2)[:, 1]
96 | else:
97 | raise ValueError("Invalid choice of coverage target '{}'.".format(
98 | FLAGS.coverage_target))
99 |
100 | try:
101 | new_method_model = exp_helper.create_inference_model('newmethod')
102 | new_method_model.set_coverage(FLAGS.coverage_rate)
103 | _ = new_method_model.fit(x1, y1)
104 | new_pred_int = new_method_model.prediction_intervals(x2)
105 | new_logit_int = new_method_model.prediction_intervals(x2, logit=True)
106 | except ValueError as e:
107 | print(e)
108 | return
109 |
110 | std_method_model = exp_helper.create_inference_model('mle')
111 | std_method_model.set_coverage(FLAGS.coverage_rate)
112 | _ = std_method_model.fit(x1, y1)
113 | std_pred_int = std_method_model.prediction_intervals(x2)
114 | std_logit_int = std_method_model.prediction_intervals(x2, logit=True)
115 |
116 | new_coverage = np.logical_and(
117 | new_pred_int[:, 0].reshape(-1) <= target,
118 | target <= new_pred_int[:, 2].reshape(-1)).astype(float)
119 | std_coverage = np.logical_and(
120 | std_pred_int[:, 0].reshape(-1) <= target,
121 | target <= std_pred_int[:, 2].reshape(-1)).astype(float)
122 |
123 | new_width = np.abs(new_logit_int[:, 2] - new_logit_int[:, 0])
124 | std_width = np.abs(std_logit_int[:, 2] - std_logit_int[:, 0])
125 |
126 | new_bias = new_logit_int[bias_selector, 1] / true_logits[bias_selector]
127 | std_bias = std_logit_int[bias_selector, 1] / true_logits[bias_selector]
128 |
129 | results = [
130 | gamma, kappa, seed,
131 | np.mean(new_coverage),
132 | np.mean(new_width),
133 | np.mean(new_bias),
134 | np.mean(std_coverage),
135 | np.mean(std_width),
136 | np.mean(std_bias)
137 | ]
138 |
139 | if FLAGS.include_bootstrap:
140 | boot_method_model = exp_helper.create_inference_model('bootstrap')
141 | boot_method_model.set_coverage(FLAGS.coverage_rate)
142 | _ = boot_method_model.fit(x1, y1)
143 | boot_pred_int = boot_method_model.prediction_intervals(x2)
144 | boot_logit_int = boot_method_model.prediction_intervals(x2, logit=True)
145 |
146 | boot_coverage = np.logical_and(
147 | boot_pred_int[:, 0].reshape(-1) <= target,
148 | target <= boot_pred_int[:, 2].reshape(-1)).astype(float)
149 | boot_width = np.abs(boot_logit_int[:, 2] - boot_logit_int[:, 0])
150 | boot_bias = boot_logit_int[bias_selector, 1] / true_logits[bias_selector]
151 |
152 | results.append(np.mean(boot_coverage))
153 | results.append(np.mean(boot_width))
154 | results.append(np.mean(boot_bias))
155 |
156 | return [np.array(results)]
157 |
158 |
159 | def main(unused_argv):
160 | kappa_range = np.arange(0.05, 0.5 + 0.5 * FLAGS.kappa_spacing,
161 | FLAGS.kappa_spacing)
162 |
163 | # If you have custom beam options add them here.
164 | beam_options = pipeline_options.PipelineOptions()
165 |
166 | with beam.Pipeline(options=beam_options) as pipe:
167 | _ = (
168 | pipe
169 | | beam.Create(range(FLAGS.num_sims))
170 | | beam.FlatMap(exp_helper.multiple_sim_params, kappa_range,
171 | GAMMA_RANGE)
172 | | 'PrepShuffle' >> beam.Reshuffle()
173 | | beam.FlatMap(run_sim)
174 | | beam.Map(exp_helper.numpy_array_to_csv)
175 | | beam.Reshuffle()
176 |         | 'WriteToText' >> beam.io.WriteToText(
177 |             FLAGS.output_path, num_shards=5))
178 |
179 |
180 | if __name__ == '__main__':
181 | app.run(main)
182 |
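Each CSV row written by this pipeline follows the order of the results list in run_sim above: gamma, kappa, seed, then coverage/width/bias for the SLOE ("new") intervals followed by the same three quantities for the standard MLE intervals, with three bootstrap columns appended only when --include_bootstrap is set. The snippet below is a minimal post-processing sketch rather than part of the pipeline; it assumes Beam's default shard naming under --output_path and the nine-column layout without bootstrap.

import glob

import numpy as np

# Column order mirrors the `results` list in run_sim (bootstrap columns omitted).
COLUMNS = ("gamma", "kappa", "seed", "new_coverage", "new_width", "new_bias",
           "std_coverage", "std_width", "std_bias")


def load_results(output_path="/tmp/counts"):
  """Loads every CSV shard written by WriteToText into a single array."""
  rows = []
  for shard in sorted(glob.glob(output_path + "-*")):
    with open(shard) as f:
      rows.extend(
          [float(v) for v in line.split(",")] for line in f if line.strip())
  return np.array(rows)


def mean_new_coverage(results, kappa, gamma):
  """Empirical coverage of the SLOE intervals, averaged over seeds."""
  mask = np.isclose(results[:, 1], kappa) & np.isclose(results[:, 0], gamma)
  return results[mask, 3].mean()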
--------------------------------------------------------------------------------
/third_party/py/scipy/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2001, 2002 Enthought, Inc.
2 | All rights reserved.
3 |
4 | Copyright (c) 2003-2017 SciPy Developers.
5 | All rights reserved.
6 |
7 | Redistribution and use in source and binary forms, with or without
8 | modification, are permitted provided that the following conditions are met:
9 |
10 | a. Redistributions of source code must retain the above copyright notice,
11 | this list of conditions and the following disclaimer.
12 | b. Redistributions in binary form must reproduce the above copyright
13 | notice, this list of conditions and the following disclaimer in the
14 | documentation and/or other materials provided with the distribution.
15 | c. Neither the name of Enthought nor the names of the SciPy Developers
16 | may be used to endorse or promote products derived from this software
17 | without specific prior written permission.
18 |
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS
24 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
25 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
30 | THE POSSIBILITY OF SUCH DAMAGE.
31 |
32 |
33 |
34 | SciPy bundles a number of libraries that are compatibly licensed. We list
35 | these here.
36 |
37 | Name: Numpydoc
38 | Files: doc/sphinxext/numpydoc/*
39 | License: 2-clause BSD
40 | For details, see doc/sphinxext/LICENSE.txt
41 |
42 | Name: scipy-sphinx-theme
43 | Files: doc/scipy-sphinx-theme/*
44 | License: 3-clause BSD, PSF and Apache 2.0
45 | For details, see doc/sphinxext/LICENSE.txt
46 |
47 | Name: Six
48 | Files: scipy/_lib/six.py
49 | License: MIT
50 | For details, see the header inside scipy/_lib/six.py
51 |
52 | Name: Decorator
53 | Files: scipy/_lib/decorator.py
54 | License: 2-clause BSD
55 | For details, see the header inside scipy/_lib/decorator.py
56 |
57 | Name: ID
58 | Files: scipy/linalg/src/id_dist/*
59 | License: 3-clause BSD
60 | For details, see scipy/linalg/src/id_dist/doc/doc.tex
61 |
62 | Name: L-BFGS-B
63 | Files: scipy/optimize/lbfgsb/*
64 | License: BSD license
65 | For details, see scipy/optimize/lbfgsb/README
66 |
67 | Name: SuperLU
68 | Files: scipy/sparse/linalg/dsolve/SuperLU/*
69 | License: 3-clause BSD
70 | For details, see scipy/sparse/linalg/dsolve/SuperLU/License.txt
71 |
72 | Name: ARPACK
73 | Files: scipy/sparse/linalg/eigen/arpack/ARPACK/*
74 | License: 3-clause BSD
75 | For details, see scipy/sparse/linalg/eigen/arpack/ARPACK/COPYING
76 |
77 | Name: Qhull
78 | Files: scipy/spatial/qhull/*
79 | License: Qhull license (BSD-like)
80 | For details, see scipy/spatial/qhull/COPYING.txt
81 |
82 | Name: Cephes
83 | Files: scipy/special/cephes/*
84 | License: 3-clause BSD
85 | Distributed under 3-clause BSD license with permission from the author,
86 | see https://lists.debian.org/debian-legal/2004/12/msg00295.html
87 |
88 | Cephes Math Library Release 2.8: June, 2000
89 | Copyright 1984, 1995, 2000 by Stephen L. Moshier
90 |
91 | This software is derived from the Cephes Math Library and is
92 | incorporated herein by permission of the author.
93 |
94 | All rights reserved.
95 |
96 | Redistribution and use in source and binary forms, with or without
97 | modification, are permitted provided that the following conditions are met:
98 | * Redistributions of source code must retain the above copyright
99 | notice, this list of conditions and the following disclaimer.
100 | * Redistributions in binary form must reproduce the above copyright
101 | notice, this list of conditions and the following disclaimer in the
102 | documentation and/or other materials provided with the distribution.
103 | * Neither the name of the nor the
104 | names of its contributors may be used to endorse or promote products
105 | derived from this software without specific prior written permission.
106 |
107 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
108 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
109 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
110 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY
111 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
112 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
113 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
114 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
115 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
116 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
117 |
118 | Name: Faddeeva
119 | Files: scipy/special/Faddeeva.*
120 | License: MIT
121 | Copyright (c) 2012 Massachusetts Institute of Technology
122 |
123 | Permission is hereby granted, free of charge, to any person obtaining
124 | a copy of this software and associated documentation files (the
125 | "Software"), to deal in the Software without restriction, including
126 | without limitation the rights to use, copy, modify, merge, publish,
127 | distribute, sublicense, and/or sell copies of the Software, and to
128 | permit persons to whom the Software is furnished to do so, subject to
129 | the following conditions:
130 |
131 | The above copyright notice and this permission notice shall be
132 | included in all copies or substantial portions of the Software.
133 |
134 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
135 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
136 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
137 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
138 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
139 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
140 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
141 |
142 | Name: qd
143 | Files: scipy/special/cephes/dd_*.[ch]
144 | License: modified BSD license ("BSD-LBNL-License.doc")
145 | This work was supported by the Director, Office of Science, Division
146 | of Mathematical, Information, and Computational Sciences of the
147 | U.S. Department of Energy under contract numbers DE-AC03-76SF00098 and
148 | DE-AC02-05CH11231.
149 |
150 | Copyright (c) 2003-2009, The Regents of the University of California,
151 | through Lawrence Berkeley National Laboratory (subject to receipt of
152 | any required approvals from U.S. Dept. of Energy) All rights reserved.
153 |
154 | 1. Redistribution and use in source and binary forms, with or
155 | without modification, are permitted provided that the following
156 | conditions are met:
157 |
158 | (1) Redistributions of source code must retain the copyright
159 | notice, this list of conditions and the following disclaimer.
160 |
161 | (2) Redistributions in binary form must reproduce the copyright
162 | notice, this list of conditions and the following disclaimer in
163 | the documentation and/or other materials provided with the
164 | distribution.
165 |
166 | (3) Neither the name of the University of California, Lawrence
167 | Berkeley National Laboratory, U.S. Dept. of Energy nor the names
168 | of its contributors may be used to endorse or promote products
169 | derived from this software without specific prior written
170 | permission.
171 |
172 | 2. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
173 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
174 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
175 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
176 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
177 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
178 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
179 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
180 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
181 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
182 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
183 |
184 | 3. You are under no obligation whatsoever to provide any bug fixes,
185 | patches, or upgrades to the features, functionality or performance of
186 | the source code ("Enhancements") to anyone; however, if you choose to
187 | make your Enhancements available either publicly, or directly to
188 | Lawrence Berkeley National Laboratory, without imposing a separate
189 | written license agreement for such Enhancements, then you hereby grant
190 | the following license: a non-exclusive, royalty-free perpetual license
191 | to install, use, modify, prepare derivative works, incorporate into
192 | other computer software, distribute, and sublicense such enhancements
193 | or derivative works thereof, in binary and source code form.
194 |
--------------------------------------------------------------------------------
/sloe_experiments/experiment_helpers.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Helpers used across many experiments to understand SLOE estimator.
17 |
18 | Implements the simulation settings studied in the paper and provides helper
19 | functions used throughout the experiments to create and analyze simulations.
20 | A usage sketch appears at the end of this module.
21 | """
22 |
23 |
24 | from absl import flags
25 | import numpy as np
26 |
27 | from sloe_logistic import probe_frontier
28 | from sloe_logistic import unbiased_logistic_regression
29 |
30 | FLAGS = flags.FLAGS
31 |
32 | flags.DEFINE_enum(
33 | "covariates", "gaussian", ["gaussian", "gwas"],
34 |     "Covariate-generating distribution for the simulation. If gaussian, see "
35 |     "--covariance for details about the distribution.")
36 | flags.DEFINE_enum(
37 | "covariance", "isotropic", ["isotropic", "elliptical"],
38 | "Covariance of covariates.")
39 | flags.DEFINE_float("features_per_sample", 0.2,
40 | "number of features per sample (kappa)")
41 | flags.DEFINE_float("intercept", 0, "intercept of logits")
42 | flags.DEFINE_enum(
43 | "method", "newmethod", ["newmethod", "mle", "probefrontier"],
44 | "Which method for estimation and inference?")
45 | flags.DEFINE_boolean("one_and_none", False,
46 | "Put all of the signal in one (the first) covariate. "
47 |                      "This does not meet the assumptions of the method, but "
48 |                      "provides a useful robustness check of how inaccurate "
49 |                      "the results become.")
50 | flags.DEFINE_integer("sample_size", 1000, "number of samples per simulation")
51 | flags.DEFINE_float("signal_strength", 5, "variance of logits (gamma^2)")
52 | flags.DEFINE_boolean(
53 | "uncentered", False,
54 |     "By default, covariates are centered. If true, make them uncentered "
55 |     "(without affecting the intercept).")
56 |
57 |
58 | class SimulationParams(object):
59 | """Simulation parameters shared across SLOE estimator experiments."""
60 |
61 | def __init__(self,
62 | training_n,
63 | p,
64 | gamma,
65 | covariates="gaussian",
66 | covariance="isotropic",
67 | one_and_none=False,
68 | uncentered=False,
69 | intercept=0,
70 | seed=None):
71 | self.training_n = training_n
72 | self.p = p
73 | self.gamma = gamma
74 | self.covariates = covariates
75 | self.covariance = covariance
76 | self.one_and_none = one_and_none
77 | self.uncentered = uncentered
78 | self.intercept = intercept
79 | self.seed = seed
80 |
81 | @classmethod
82 | def create_from_flags(cls):
83 | """Create a SimulationParams object from FLAGS."""
84 | n = FLAGS.sample_size
85 | kappa = FLAGS.features_per_sample
86 | gamma = np.sqrt(FLAGS.signal_strength)
87 | covariates = FLAGS.covariates
88 | covariance = FLAGS.covariance
89 | one_and_none = FLAGS.one_and_none
90 | uncentered = FLAGS.uncentered
91 | intercept = FLAGS.intercept
92 |
93 | p = int(n * kappa)
94 | return SimulationParams(n, p, gamma, covariates, covariance, one_and_none,
95 | uncentered, intercept)
96 |
97 |
98 | class Simulation(object):
99 | """Standard simulation model used in most experiments in SLOE paper."""
100 |
101 | def __init__(self, simulation_params):
102 | self.simulation_params = simulation_params
103 |
104 | self._check_sim_params()
105 | self._reset_random_state()
106 | self._initialize_params()
107 |
108 | def _initialize_params(self):
109 | """Initializes statistical params of model from simulation parameters."""
110 | p = self.simulation_params.p
111 |
112 | self.intercept_ = self.simulation_params.intercept
113 |
114 | if self.simulation_params.one_and_none:
115 | self.beta = np.zeros(p)
116 | self.beta[0] = self.simulation_params.gamma * np.sqrt(p)
117 | else:
118 | self.p_positive = int(p / 8)
119 | self.p_negative = self.p_positive
120 | self.p_zero = p - self.p_positive - self.p_negative
121 | self.beta = 2 * np.concatenate((np.ones(
122 | self.p_positive), -np.ones(self.p_negative), np.zeros(self.p_zero)))
123 | self.beta *= self.simulation_params.gamma
124 |
125 | if self.simulation_params.covariance == "isotropic":
126 | self.diag = np.ones(p)
127 | elif self.simulation_params.covariance == "elliptical":
128 | self.diag = self.random_state.rand(p) + 0.5
129 | self.diag /= self.diag[:(self.p_positive + self.p_negative)].mean()
130 | self.diag[0] = 1
131 | else:
132 | raise NotImplementedError("No covariance {}".format(
133 | self.simulation_params.covariance))
134 |
135 | if self.simulation_params.uncentered:
136 | self.centering = np.ones(p)
137 | self.intercept_ -= self.beta.dot(self.centering)
138 | else:
139 | self.centering = 0
140 |
141 | def null_indices(self):
142 | """Get null indices."""
143 | return slice(-self.p_zero, None, None)
144 |
145 | def _check_sim_params(self):
146 | if self.simulation_params.covariates != "gaussian":
147 | raise ValueError(
148 |           "Simulation parameters call for {} covariate distribution, "
149 | "but this class generates Gaussian covariates.".format(
150 | self.simulation_params.covariates))
151 |
152 | def _reset_random_state(self):
153 | self.random_state = np.random.RandomState(seed=self.simulation_params.seed)
154 |
155 | def _sample_x(self, n):
156 | return self.diag * self.random_state.randn(
157 | n, self.simulation_params.p) / np.sqrt(
158 | self.simulation_params.p) + self.centering
159 |
160 | def sample(self, n=None):
161 | """Sample data from simulation."""
162 | if n is None:
163 | n = self.simulation_params.training_n
164 |
165 | x1 = self._sample_x(n)
166 | y1 = (self.random_state.rand(n) <= 1.0 /
167 | (1.0 + np.exp(-x1.dot(self.beta) - self.intercept_))).astype(float)
168 | return (x1, y1)
169 |
170 |
171 | class GWASSimulation(Simulation):
172 | """From Sur and Candes, 2019. PNAS. Section 4(g)."""
173 |
174 | def __init__(self, simulation_params):
175 | super().__init__(simulation_params)
176 |
177 | self._initialize_cov_params()
178 |
179 | def _initialize_cov_params(self):
180 | self.equil = 0.5 * self.random_state.rand(self.simulation_params.p) + 0.25
181 |
182 | def _check_sim_params(self):
183 | if self.simulation_params.covariates != "gwas":
184 | raise ValueError(
185 |           "Simulation parameters call for {} covariate distribution, "
186 | "but this class generates GWAS-like covariates.".format(
187 | self.simulation_params.covariates))
188 |
189 | def covariate_mean(self):
190 | return 2 * (1 - self.equil)
191 |
192 | def covariate_std(self):
193 |     return np.sqrt(2 * (1 - self.equil) * self.equil)
194 |
195 | def _sample_x(self, n):
196 | p = self.simulation_params.p
197 | x1 = np.zeros((n, p))
198 | equil = self.equil
199 | for j in range(p):
200 | pj = equil[j]
201 | probs = np.array([pj**2, 2 * pj * (1 - pj), (1 - pj)**2])
202 | x1[:, j] = self.random_state.choice(3, size=(n,), p=probs)
203 | x1 -= self.covariate_mean().reshape(1, -1)
204 | x1 /= self.covariate_std().reshape(1, -1) * np.sqrt(p)
205 | return x1
206 |
207 |
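As a quick illustrative check of the genotype model above (not part of the module): each covariate is drawn from {0, 1, 2} with Hardy-Weinberg proportions, so its mean is 2(1 - q) and its variance is 2q(1 - q), the moments that covariate_mean and covariate_std are built from.

import numpy as np

q = 0.3  # one entry of `equil`, an allele frequency
probs = np.array([q**2, 2 * q * (1 - q), (1 - q)**2])  # genotype probabilities
values = np.array([0.0, 1.0, 2.0])

mean = (probs * values).sum()
var = (probs * (values - mean) ** 2).sum()
print(np.isclose(mean, 2 * (1 - q)))     # True
print(np.isclose(var, 2 * q * (1 - q)))  # True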
208 | def multiple_sim_params(seed, kappa_range, gamma_range):
209 | """For each seed, map to a variety of simulation parameters."""
210 | for kappa in kappa_range:
211 | for gamma in gamma_range:
212 | yield [kappa, gamma, seed]
213 |
214 |
215 | def multiple_sample_sizes(seed, n_range):
216 | """For each seed, map to a variety of sample sizes."""
217 | for n in n_range:
218 | yield [n, seed]
219 |
220 |
221 | def create_sim(sim_params):
222 | """Create a simulation according to passed params."""
223 | if sim_params.covariates == "gaussian":
224 | return Simulation(sim_params)
225 | elif sim_params.covariates == "gwas":
226 | return GWASSimulation(sim_params)
227 | else:
228 | raise NotImplementedError("No simulation with covariates {}".format(
229 |         sim_params.covariates))
230 |
231 |
232 | def create_inference_model(method=None, fit_intercept=False):
233 | """Create a model to use for inference, getting default from FLAGS."""
234 | if method is None:
235 | method = FLAGS.method
236 |
237 |   if method in ("probe_frontier", "probefrontier"):
238 | if fit_intercept:
239 | raise NotImplementedError(
240 | "ProbeFrontier can't fit an intercept right now")
241 | logit_model = probe_frontier.ProbeFrontierLogisticRegression(
242 | num_subsamples=8)
243 | elif method == "mle":
244 | logit_model = unbiased_logistic_regression.LogisticRegressionMLE(
245 | fit_intercept=fit_intercept)
246 | elif method == "bootstrap":
247 | logit_model = unbiased_logistic_regression.LogisticRegressionPercBoot(
248 | fit_intercept=fit_intercept)
249 | elif method == "newmethod":
250 | logit_model = unbiased_logistic_regression.UnbiasedLogisticRegression(
251 | fit_intercept=fit_intercept)
252 | else:
253 |     raise NotImplementedError("No method {}".format(method))
254 | return logit_model
255 |
256 |
257 | def numpy_array_to_csv(arr):
258 | return ",".join(["%.5f" % num for num in arr])
259 |
260 |
261 |
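A minimal end-to-end sketch of how these helpers fit together, mirroring run_sim in sweep_coverage.py. The parameter values are illustrative, and the "newmethod" (SLOE) model requires the sloe_logistic package to be built and installed.

import numpy as np

from sloe_logistic.sloe_experiments import experiment_helpers as exp_helper

# n = 1000 samples with kappa = p / n = 0.1 and signal strength gamma^2 = 5.
params = exp_helper.SimulationParams(
    training_n=1000, p=100, gamma=np.sqrt(5), seed=0)
sim = exp_helper.create_sim(params)
x_train, y_train = sim.sample()
x_test, _ = sim.sample(250)

# Fit the SLOE-corrected model and form 95% prediction intervals; the result
# has one row per test point with columns (lower, point estimate, upper) on
# the probability scale.
model = exp_helper.create_inference_model(method="newmethod")
model.set_coverage(95)
model.fit(x_train, y_train)
intervals = model.prediction_intervals(x_test)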
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/unbiased_logistic_regression.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Implements methods for inference for logistic regression based on the MLE.
17 |
18 | Implements SLOE along with baseline approaches: standard Wald inference for
19 | the MLE and a percentile bootstrap. A usage sketch follows at the end of the module.
20 | """
21 |
22 | import numpy as np
23 | import scipy.linalg
24 | import scipy.stats
25 | import sklearn.linear_model
26 |
27 | from sloe_logistic import asymp_system_solve
28 |
29 |
30 | class ScaledLogisticRegression(object):
31 | """Generic class for methods rescaling the logistic regression MLE."""
32 |
33 | def __init__(self):
34 | pass
35 |
36 | def predict_proba(self, features, *args, **kwargs):
37 | del args
38 | del kwargs
39 | results = np.zeros((features.shape[0], 2))
40 | log_odds_ratio = features.dot(self.coef_.T).reshape(-1) + self.intercept_
41 | results[:, 1] = self._expit(log_odds_ratio)
42 | results[:, 0] = 1 - results[:, 1]
43 | return results
44 |
45 | def predict_inv_proba(self, features, *args, **kwargs):
46 | """Provides reciprocal of probability given features."""
47 | return 1 / self.predict_proba(features, *args, **kwargs)
48 |
49 | def _expit(self, logit, trimmed=False):
50 | if trimmed:
51 | logit = np.minimum(logit, 5)
52 | logit = np.maximum(logit, -5)
53 | return 1.0 / (1.0 + np.exp(-logit))
54 |
55 |
56 | class PlattScaledLogisticRegression(ScaledLogisticRegression):
57 |   """Rescales the logistic regression MLE to approximately calibrate it."""
58 |
59 | def __init__(self, fit_intercept=True, **kwargs):
60 | del kwargs
61 | super().__init__()
62 | self.fit_intercept = fit_intercept
63 | self.sm = sklearn.linear_model.LogisticRegression(
64 | fit_intercept=fit_intercept,
65 | penalty="none",
66 | solver="newton-cg",
67 | warm_start=False)
68 |
69 | def fit(self, features, outcome, weights=None, verbose=False):
70 |     """Computes the MLE, then rescales it for calibration using approximate LOO logits."""
71 | del verbose
72 | self.sm.fit(features, outcome, weights)
73 |
74 |     # Refit weights are not currently configurable; use uniform weights when
75 |     # approximating the leave-one-out logits below.
76 |     refit_weights = 1
77 |
78 | # Get leave-one-out logits to pass in to Platt scaling
79 | pred = self.sm.predict_proba(features)[:, 1]
80 | hessian = -features.T.dot(
81 | (refit_weights * pred * (1 - pred)).reshape(-1, 1) * features)
82 | xihinvxi = np.diag(features.dot(np.linalg.solve(hessian, features.T)))
83 | mod = xihinvxi / (1.0 + xihinvxi * refit_weights * pred * (1 - pred))
84 |     loo_logits = mod * refit_weights * (
85 |         outcome - pred) + self.sm.decision_function(features)
86 |
87 | # Fit model for outcome using LOO logit estimates as feature. Coefficient on
88 | # feature is scaling to recalibrate model.
89 | cm = sklearn.linear_model.LogisticRegression(
90 | penalty="none", fit_intercept=self.fit_intercept)
91 |     cm.fit(loo_logits.reshape(-1, 1), outcome.reshape(-1), weights)
92 | self.coef_ = self.sm.coef_ * cm.coef_
93 | if self.fit_intercept:
94 |       self.intercept_ = (cm.coef_ * self.sm.intercept_ + cm.intercept_).reshape(-1)
95 | else:
96 | self.intercept_ = 0
97 | return self
98 |
99 |
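A brief illustrative sketch (synthetic data, arbitrarily chosen values) of how the class above might be used: with many features per sample, the unpenalized MLE's probabilities are overconfident, and the Platt-scaled fit typically pulls them back toward 0.5.

import numpy as np
import sklearn.linear_model

from sloe_logistic import unbiased_logistic_regression

rng = np.random.RandomState(0)
n, p = 400, 40
x = rng.randn(n, p) / np.sqrt(p)
beta = 2.0 * np.concatenate([np.ones(p // 4), -np.ones(p // 4), np.zeros(p // 2)])
y = (rng.rand(n) < 1.0 / (1.0 + np.exp(-x.dot(beta)))).astype(float)

mle = sklearn.linear_model.LogisticRegression(
    penalty="none", solver="newton-cg", fit_intercept=False)
platt = unbiased_logistic_regression.PlattScaledLogisticRegression(
    fit_intercept=False)
mle.fit(x, y)
platt.fit(x, y)

x_new = rng.randn(5, p) / np.sqrt(p)
print(mle.predict_proba(x_new)[:, 1])
print(platt.predict_proba(x_new)[:, 1])  # typically closer to 0.5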
100 | class CVRegLogisticRegression(ScaledLogisticRegression):
101 | """Cross-validated regularized logistic regression MLE."""
102 |
103 | def __init__(self, fit_intercept=True, Cs=10, **kwargs):
104 | super().__init__(**kwargs)
105 | self.fit_intercept = fit_intercept
106 | self.sm = sklearn.linear_model.LogisticRegressionCV(
107 | fit_intercept=fit_intercept,
108 | Cs=Cs,
109 | penalty="l2",
110 | solver="newton-cg")
111 |
112 | def fit(self, features, outcome, weights=None, verbose=False):
113 | """Fit cross-validated model."""
114 | del verbose
115 |
116 | self.sm.fit(features, outcome, weights)
117 |
118 | if self.fit_intercept:
119 | self.intercept_ = self.sm.intercept_
120 | else:
121 | self.intercept_ = 0
122 | self.coef_ = self.sm.coef_
123 |
124 | return self
125 |
126 |
127 | class LogisticRegressionInference(ScaledLogisticRegression):
128 |   """Base class for logit reg inference that computes p-values/CIs from the covariance."""
129 |
130 | def __init__(self, fit_intercept=True, ci=50, **kwargs):
131 | super().__init__(**kwargs)
132 | self.fit_intercept = fit_intercept
133 | self.coef_cov = None
134 | self.hessian = None
135 | self.chi_sq_rescale = 1
136 | self.set_coverage(ci)
137 |
138 | def set_coverage(self, ci):
139 | """Sets expected coverage level."""
140 | self.ci_coverage = ci / 100.0
141 | self.z = scipy.stats.norm.ppf(0.5 + self.ci_coverage / 2.0)
142 |
143 | def _set_coef_cov(self, *args):
144 | pass
145 |
146 | def _get_prediction_variances(self, features):
147 | if self.fit_intercept:
148 | features_aug = np.ones((features.shape[0], features.shape[1] + 1))
149 | features_aug[:, :-1] = features
150 | else:
151 | features_aug = features
152 | return (features_aug.dot(self.coef_cov) *
153 | features_aug).sum(axis=-1).reshape(-1)
154 |
155 | def p_values(self):
156 | """Get p-values for a fitted model using Wald test."""
157 | scale = np.sqrt(np.diag(self.coef_cov))
158 | if self.fit_intercept:
159 | scale = scale[:-1]
160 | t = np.abs(self.coef_) / scale
161 | t = t.reshape(-1)
162 | p = 2 * scipy.stats.norm.sf(t)
163 | return p
164 |
165 | def decision_function(self, features):
166 |     """Computes logits (i.e., the decision function in sklearn parlance)."""
167 | return features.dot(self.coef_.T).reshape(-1) + self.intercept_
168 |
169 | def prediction_intervals(self, features, logit=False):
170 | """Computes prediction CI for each row of features using coef covariance."""
171 | if self.coef_cov is None:
172 | raise Exception(
173 | "No covariance matrix defined yet, so can't do inference.")
174 |
175 | logits = self.decision_function(features)
176 | variances = self._get_prediction_variances(features)
177 |
178 | lower_ci = logits - self.z * np.sqrt(variances)
179 | upper_ci = logits + self.z * np.sqrt(variances)
180 |
181 | results = np.zeros((features.shape[0], 3))
182 | results[:, 0] = lower_ci
183 | results[:, 1] = logits
184 | results[:, 2] = upper_ci
185 |
186 | if not logit:
187 | results = self._expit(results)
188 |
189 | return results
190 |
191 | def predict_proba(self, X):
192 | logits = self.decision_function(X)
193 |
194 | preds = self._expit(logits)
195 |
196 | results = np.zeros((X.shape[0], 2))
197 | results[:, 1] = preds
198 | results[:, 0] = 1 - preds
199 | return results
200 |
201 | def predict_inv_proba(self, X):
202 | logits = self.decision_function(X)
203 |
204 | pos_exps = np.exp(logits)
205 | neg_exps = np.exp(-logits)
206 |
207 | results = np.zeros((X.shape[0], 2))
208 | results[:, 1] = 1 + neg_exps
209 | results[:, 0] = 1 + pos_exps
210 |
211 | return results
212 |
213 |
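For reference, the intervals produced by the base class above are ordinary Wald intervals on the logit scale. The helper below is a sketch of the same computation for a single test point, assuming a fitted subclass with coef_cov set and fit_intercept left false (it ignores the intercept augmentation used otherwise).

import numpy as np
import scipy.stats


def wald_logit_interval(model, x_row, coverage=0.95):
  """Recomputes one logit-scale interval the same way prediction_intervals does."""
  z = scipy.stats.norm.ppf(0.5 + coverage / 2.0)
  logit = x_row.dot(model.coef_.reshape(-1)) + model.intercept_
  se = np.sqrt(x_row.dot(model.coef_cov).dot(x_row))
  return logit - z * se, logit, logit + z * se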
214 | class LogisticRegressionMLE(LogisticRegressionInference):
215 |   """Computes the un-rescaled MLE with standard large-sample Wald inference."""
216 |
217 | def __init__(self, fit_intercept=True, **kwargs):
218 | super().__init__(fit_intercept=fit_intercept, **kwargs)
219 | self.fit_intercept = fit_intercept
220 | self.sm = sklearn.linear_model.LogisticRegression(
221 | fit_intercept=fit_intercept,
222 | penalty="none",
223 | solver="newton-cg",
224 | warm_start=False)
225 |
226 | def fit(self, features, outcomes, weights=None, verbose=False):
227 | """Fit standard MLE model and compute coefficient covariance matrix."""
228 | del verbose
229 |
230 | self.sm.fit(features, outcomes, weights)
231 |
232 | self.coef_ = self.sm.coef_
233 | if self.fit_intercept:
234 | self.intercept_ = self.sm.intercept_
235 | else:
236 | self.intercept_ = 0
237 |
238 | self._set_coef_cov(features, weights)
239 |
240 | return self
241 |
242 | def _set_coef_cov(self, features, weights):
243 |     """Uses large-sample asymptotics to compute the coefficient covariance matrix."""
244 | if weights is None:
245 | weights = 1
246 | pred = self.sm.predict_proba(features)[:, 1]
247 | _, p = features.shape
248 | if self.fit_intercept:
249 | features_aug = np.ones((features.shape[0], features.shape[1] + 1))
250 | features_aug[:, :-1] = features
251 | dim = p + 1
252 | else:
253 | features_aug = features
254 | dim = p
255 | hessian = features_aug.T.dot(
256 | (weights * pred *
257 | (1 - pred)).reshape(-1, 1) * features_aug) / np.mean(weights)
258 | self.hessian = -hessian
259 | self.coef_cov = scipy.linalg.solve(hessian, np.eye(dim), assume_a="pos")
260 |
261 |
262 | class LogisticRegressionPercBoot(LogisticRegressionInference):
263 | """Fit standard MLE using multiplier bootstrap and compute percentile CIs.
264 |
265 |   Using this method in practice is not recommended when d / n is larger than
266 |   roughly 0.05: the results in our paper suggest it is badly biased and imprecise.
267 | """
268 |
269 | def __init__(self, fit_intercept=True, num_boot=20, **kwargs):
270 | super().__init__(fit_intercept=fit_intercept, **kwargs)
271 | self.fit_intercept = fit_intercept
272 | self.sm = sklearn.linear_model.LogisticRegression(
273 | fit_intercept=fit_intercept,
274 | penalty="none",
275 | solver="newton-cg",
276 | warm_start=False)
277 | self.num_boot = num_boot
278 |
279 | def fit(self, features, outcome, weights=None, verbose=False):
280 | """Fit main model and bootstrapped models with multiplier bootstrap."""
281 | del verbose
282 | self.sm.fit(features, outcome, weights)
283 |
284 | self.coef_ = self.sm.coef_
285 | if self.fit_intercept:
286 | self.intercept_ = self.sm.intercept_
287 | else:
288 | self.intercept_ = 0
289 |
290 | if weights is None:
291 | weights = 1.0
292 |
293 | n = features.shape[0]
294 | self.bootstraps = []
295 | for _ in range(self.num_boot):
296 | self.sm.fit(features, outcome,
297 | weights * np.random.poisson(lam=1.0, size=n))
298 | if np.linalg.norm(self.sm.coef_) >= 1e6:
299 | continue
300 | d = {"coef": self.sm.coef_.reshape(-1)}
301 | if self.fit_intercept:
302 | d["intercept"] = self.sm.intercept_
303 | else:
304 | d["intercept"] = 0
305 | self.bootstraps.append(d)
306 |
307 | return self
308 |
309 | def p_values(self):
310 | raise NotImplementedError(
311 | "This form of bootstrap does not lend itself well to p-values")
312 |
313 | def approx_lrt_p_values(self):
314 | raise NotImplementedError(
315 | "This form of bootstrap does not lend itself well to p-values")
316 |
317 | def _predict_with_param_dict(self, params, features):
318 | return features.dot(params["coef"]).reshape(-1) + params["intercept"]
319 |
320 | def prediction_intervals(self, X, logit=False):
321 | """Computes percentile CIs for feature rows using bootstrap samples."""
322 | all_preds = np.array(
323 | [self._predict_with_param_dict(d, X) for d in self.bootstraps])
324 |
325 | ci_range = (1 - self.ci_coverage) / 2
326 | results = np.quantile(all_preds, q=(ci_range, 0.5, 1 - ci_range), axis=0).T
327 |
328 | if not logit:
329 | results = self._expit(results)
330 |
331 | return results
332 |
333 |
334 | class UnbiasedLogisticRegression(LogisticRegressionInference):
335 |   """SLOE: bias-corrected estimation and inference with the logistic regression MLE."""
336 |
337 | def __init__(self, fit_intercept=False, **kwargs):
338 | super().__init__(fit_intercept, **kwargs)
339 |
340 | self.fit_intercept = fit_intercept
341 | if fit_intercept:
342 | raise ValueError("This model doesn't allow fitting an intercept.")
343 |
344 | self.sm = sklearn.linear_model.LogisticRegression(
345 | fit_intercept=fit_intercept,
346 | penalty="none",
347 | solver="newton-cg",
348 | warm_start=False)
349 |
350 | def fit(self, features, outcome, weights=None, verbose=False):
351 | """Fit MLE, estimate eta with SLOE, de-bias, and estimate covariance."""
352 | del verbose
353 | kappa = float(features.shape[1]) / features.shape[0]
354 |
355 | self.sm.fit(features, outcome, weights)
356 |
357 | if weights is None:
358 | weights = 1
359 |
360 | pred = self.sm.predict_proba(features)[:, 1]
361 | weights /= np.mean(weights)
362 | diag = weights * pred * (1 - pred)
363 | hessian = -features.T.dot(diag.reshape(-1, 1) * features)
364 | self.hessian = hessian
365 | xihinvxi = np.einsum("ij,ji->i", features,
366 | np.linalg.solve(hessian, features.T))
367 | mod = xihinvxi / (1.0 + xihinvxi * diag)
368 | infl = mod * weights * (outcome -
369 | pred) + self.sm.decision_function(features)
370 |
371 | eta_hat = np.var(infl)
372 |
373 | b0 = 0
374 |
375 | self.alpha, lambda_, sigma, intercept_est = asymp_system_solve.correction_factors(
376 | kappa, eta_hat, np.sqrt(eta_hat), b0, use_eta=True)
377 | if (kappa >= 0.05 and self.alpha < 0.999) or self.alpha > 5 \
378 | or lambda_ < 0.1 or sigma < 0.3 or lambda_ > 1e3 or sigma > 1e3:
379 | raise ValueError("Problem with optimization")
380 |
381 | self.eta_hat = eta_hat
382 | self.lambda_ = lambda_
383 | self.sigma = sigma
384 | self.intercept_est = intercept_est
385 |
386 | self.chi_sq_rescale = lambda_ * self.alpha**2 / sigma**2
387 | self.coef_ = self.sm.coef_ / self.alpha
388 | self.intercept_ = 0
389 |
390 | self._set_coef_cov(features, sigma / np.sqrt(kappa), self.alpha)
391 |
392 | return self
393 |
394 | def _set_coef_cov(self, features, sigma, alpha):
395 | n, p = features.shape
396 | features_aug = features
397 | dim = p
398 | feature_cov = features_aug.T.dot(features_aug)
399 | one_on_tau_sq = scipy.linalg.solve(feature_cov, np.eye(dim), assume_a="pos")
400 | self.coef_cov = one_on_tau_sq
401 | self.coef_cov *= (1 - float(p) / n) * ((sigma / alpha)**2)
402 |
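A minimal usage sketch of the SLOE-corrected model on synthetic data (illustrative values; fit raises ValueError when the internal optimization yields implausible correction factors, e.g. if the data are nearly separable):

import numpy as np

from sloe_logistic import unbiased_logistic_regression

rng = np.random.RandomState(0)
n, p = 1000, 100                       # kappa = p / n = 0.1
x = rng.randn(n, p) / np.sqrt(p)
beta = np.concatenate([np.full(p // 4, 2.0), np.zeros(p - p // 4)])
y = (rng.rand(n) < 1.0 / (1.0 + np.exp(-x.dot(beta)))).astype(float)

model = unbiased_logistic_regression.UnbiasedLogisticRegression(ci=95)
model.fit(x, y)

print(model.coef_[0, :5])                 # debiased coefficient estimates
print(model.p_values()[:5])               # Wald p-values with corrected variance
print(model.prediction_intervals(x[:5]))  # lower / point / upper probabilities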
--------------------------------------------------------------------------------