├── requirements.txt ├── run.sh ├── CONTRIBUTING.md ├── asymp_system_solve_test.py ├── setup.py ├── sloe_experiments ├── experiment_helpers_test.py ├── p_values.py ├── runtime.py ├── est_gamma.py ├── sweep_coverage.py └── experiment_helpers.py ├── probe_frontier_test.py ├── third_party └── py │ └── scipy │ ├── optimize │ └── Zeros │ │ ├── zeros.h │ │ └── brentq.c │ └── LICENSE.txt ├── README.md ├── mle_param_integrands.h ├── mle_param_integrands.cc ├── asymp_system_solve.py ├── unbiased_logistic_regression_test.py ├── probe_frontier.py ├── LICENSE └── unbiased_logistic_regression.py /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.16.5 2 | scipy==1.5.4 3 | apache-beam 4 | absl-py 5 | scikit-learn 6 | statsmodels 7 | pybind11 8 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The SLOE Logistic Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | set -e 17 | set -x 18 | 19 | virtualenv -p python3 . 20 | source ./bin/activate 21 | 22 | pip install -r requirements.txt 23 | python setup.py build 24 | python setup.py install 25 | python -m sloe_logistic.asymp_system_solve_test 26 | python -m sloe_logistic.unbiased_logistic_regression_test 27 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | # Issues 4 | 5 | * Please tag your issue with `bug`, `feature request`, or `question` to help us 6 | effectively respond. 7 | * Please include the version of Uncertainty Metrics you are running. 8 | * Please provide the command line you ran as well as the log output. 9 | 10 | # Pull Requests 11 | 12 | Please send in fixes and feature additions through Pull Requests. 13 | 14 | ## Contributor License Agreement 15 | 16 | Contributions to this project must be accompanied by a Contributor License 17 | Agreement. You (or your employer) retain the copyright to your contribution, 18 | this simply gives us permission to use and redistribute your contributions as 19 | part of the project. Head over to to see 20 | your current agreements on file or to sign a new one. 21 | 22 | You generally only need to submit a CLA once, so if you've already submitted one 23 | (even if it was for a different project), you probably don't need to do it 24 | again. 25 | 26 | ## Code reviews 27 | 28 | All submissions, including submissions by project members, require review. We 29 | use GitHub pull requests for this purpose. Consult 30 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 31 | information on using pull requests. 
32 | -------------------------------------------------------------------------------- /asymp_system_solve_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tests for sloe_logistic.asymp_system_solve.""" 17 | 18 | from absl.testing import absltest 19 | import numpy as np 20 | from sloe_logistic import asymp_system_solve 21 | 22 | 23 | class AsympSystemSolveTest(absltest.TestCase): 24 | 25 | def test_correction_factors_solve(self): 26 | sol = asymp_system_solve.correction_factors( 27 | 0.2, 1, np.sqrt(5), 0, use_eta=False) 28 | target = [1.499, 3.027, 2.1214, 0.0] 29 | for i in range(4): 30 | self.assertAlmostEqual(sol[i], target[i], places=3) 31 | 32 | sol = asymp_system_solve.correction_factors( 33 | 0.1, 8.881028475794636, np.sqrt(5), 0, use_eta=True) 34 | target = [1.174, 1.007, 1.086, 0.0] 35 | for i in range(4): 36 | self.assertAlmostEqual(sol[i], target[i], places=3) 37 | 38 | def test_frontier(self): 39 | sol = asymp_system_solve.frontier(0.1) 40 | self.assertAlmostEqual(sol, 9.890, places=3) 41 | 42 | sol = asymp_system_solve.frontier(0.2) 43 | self.assertAlmostEqual(sol, 4.550, places=3) 44 | 45 | if __name__ == '__main__': 46 | absltest.main() 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The SLOE Logistic Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Builds sloe_logistic package.""" 16 | 17 | from distutils import core 18 | from distutils.command import build_clib 19 | 20 | from pybind11.setup_helpers import build_ext 21 | from pybind11.setup_helpers import Pybind11Extension 22 | 23 | libraries = [ 24 | ("scipy_brentq", { 25 | "sources": ["third_party/py/scipy/optimize/Zeros/brentq.c",], 26 | }), 27 | ] 28 | 29 | ext_modules = [ 30 | Pybind11Extension("sloe_logistic.mle_param_integrands", [ 31 | "mle_param_integrands.cc", 32 | ]), 33 | ] 34 | 35 | core.setup( 36 | name="sloe_logistic", 37 | version="0.0.1", 38 | description="Implements SLOE method and Logistic Regression Inference", 39 | long_description="Code to supplement the ICML submission SLOE: A Faster " 40 | "Method for Statistical Inference in High-Dimensional Logistic Regression.", 41 | packages=["sloe_logistic", "sloe_logistic.sloe_experiments"], 42 | package_dir={ 43 | "sloe_logistic": ".", 44 | "sloe_logistic.sloe_experiments": "sloe_experiments/" 45 | }, 46 | libraries=libraries, 47 | ext_modules=ext_modules, 48 | cmdclass={ 49 | "build_ext": build_ext, 50 | "build_clib": build_clib.build_clib, 51 | }, 52 | zip_safe=False, 53 | ) 54 | -------------------------------------------------------------------------------- /sloe_experiments/experiment_helpers_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tests for experiment_helpers.""" 17 | 18 | from absl.testing import absltest 19 | from sloe_logistic.sloe_experiments import experiment_helpers 20 | 21 | 22 | class ExperimentHelpersTest(absltest.TestCase): 23 | 24 | def test_simulation(self): 25 | params = experiment_helpers.SimulationParams(4000, 400, 1, seed=202103) 26 | sim = experiment_helpers.Simulation(params) 27 | features, outputs = sim.sample() 28 | 29 | self.assertAlmostEqual(features.mean(), 0, places=3) 30 | self.assertAlmostEqual(outputs.mean(), 0.5, places=2) 31 | 32 | def test_gwas_simulation(self): 33 | params = experiment_helpers.SimulationParams(4000, 400, 1, seed=202103) 34 | params.covariates = 'gwas' 35 | sim = experiment_helpers.GWASSimulation(params) 36 | features, outputs = sim.sample() 37 | 38 | self.assertAlmostEqual(features.mean(), 0, places=3) 39 | self.assertAlmostEqual(outputs.mean(), 0.5, places=2) 40 | 41 | def test_gwas_simulation_checks_covariates(self): 42 | params = experiment_helpers.SimulationParams(4000, 400, 1, seed=202103) 43 | params.covariates = 'not_gwas' 44 | with self.assertRaises(ValueError): 45 | _ = experiment_helpers.GWASSimulation(params) 46 | 47 | if __name__ == '__main__': 48 | absltest.main() 49 | -------------------------------------------------------------------------------- /probe_frontier_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Tests for sloe_logistic.asymp_system_solve.""" 17 | 18 | from absl.testing import absltest 19 | import numpy as np 20 | from sloe_logistic import probe_frontier 21 | 22 | 23 | class ProbeFrontierTest(absltest.TestCase): 24 | 25 | def get_simulated_data(self, n, d): 26 | np.random.seed(1) 27 | features = np.random.randn(n, d) 28 | beta = np.sqrt(5 * 2.0 / d) * np.ones(d) 29 | beta[(d // 2):] = 0 30 | 31 | outcome = (np.random.rand(n) <= 1.0 / 32 | (1.0 + np.exp(-features.dot(beta)))).astype(float) 33 | 34 | return features, outcome 35 | 36 | def test_probe_frontier_model(self): 37 | n, d = 1000, 100 38 | features, outcome = self.get_simulated_data(n, d) 39 | model = probe_frontier.ProbeFrontierLogisticRegression(num_subsamples=4) 40 | model.fit(features, outcome) 41 | 42 | self.assertLen(model.coef_.reshape(-1), features.shape[1]) 43 | 44 | def test_corrected_p_values(self): 45 | """Check null P value CDF is within 95% CI of uniform CDF.""" 46 | n, d = 4000, 400 47 | features, outcome = self.get_simulated_data(n, d) 48 | model = probe_frontier.ProbeFrontierLogisticRegression(num_subsamples=4) 49 | model.fit(features, outcome) 50 | 51 | thresh = 0.1 52 | emp_p_cdf = model.p_values().reshape(-1)[(d // 2):] <= thresh 53 | self.assertAlmostEqual( 54 | emp_p_cdf.mean(), 55 | thresh, 56 | delta=1.96 * emp_p_cdf.std() / np.sqrt(d // 2)) 57 | 58 | 59 | if __name__ == '__main__': 60 | absltest.main() 61 | -------------------------------------------------------------------------------- /third_party/py/scipy/optimize/Zeros/zeros.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The SLOE Logistic Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /* Written by Charles Harris charles.harris@sdl.usu.edu */ 16 | 17 | /* Modified to not depend on Python everywhere by Travis Oliphant. 
18 | */ 19 | 20 | #ifndef ZEROS_H 21 | #define ZEROS_H 22 | 23 | typedef struct { 24 | int funcalls; 25 | int iterations; 26 | int error_num; 27 | } scipy_zeros_info; 28 | 29 | 30 | /* Must agree with _ECONVERGED, _ESIGNERR, _ECONVERR in zeros.py */ 31 | #define CONVERGED 0 32 | #define SIGNERR -1 33 | #define CONVERR -2 34 | #define EVALUEERR -3 35 | #define INPROGRESS 1 36 | 37 | typedef double (*callback_type)(double, void*); 38 | typedef double (*solver_type)(callback_type, double, double, double, double, 39 | int, void *, scipy_zeros_info*); 40 | 41 | extern double bisect(callback_type f, double xa, double xb, double xtol, 42 | double rtol, int iter, void *func_data, 43 | scipy_zeros_info *solver_stats); 44 | extern double ridder(callback_type f, double xa, double xb, double xtol, 45 | double rtol, int iter, void *func_data, 46 | scipy_zeros_info *solver_stats); 47 | extern double brenth(callback_type f, double xa, double xb, double xtol, 48 | double rtol, int iter, void *func_data, 49 | scipy_zeros_info *solver_stats); 50 | extern double brentq(callback_type f, double xa, double xb, double xtol, 51 | double rtol, int iter, void *func_data, 52 | scipy_zeros_info *solver_stats); 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Code to run experiments in *SLOE: A Faster Method for Statistical Inference in High-Dimensional Logistic Regression*. 2 | 3 | Not an official Google product. 4 | 5 | ## Method Introduction 6 | This library provides statistical inference for high dimensional logistic 7 | regression maximum likelihood, based largely on the breakthrough results from 8 | Sur and Candès (PNAS, 2019). The challenge with applying their results is that 9 | they depend on an unobserved signal strength quantity. Our method estimates this 10 | quantity via a leave-one-out approach, which we outline in our paper [1]. 11 | 12 | By high-dimensions, we mean that the ratio of the number of covariates `p` to 13 | the sample size `n` is strictly between 0 and 0.5. When the number of covariates 14 | is too large, the data is separable, and our method will not help to recover 15 | from such a case. When the number of covariates is small (say, `p <= 0.05 * n`), 16 | the high dimensional adjustment is a bit numerically unstable, and adds little 17 | value over the standard large-sample theory. 18 | 19 | The setting studied is complementary to sparse high dimensional regimes. We 20 | assume that there are a relatively large number of covariates that are weakly 21 | correlated with the binary outcome. If one expects only a very small number of 22 | the many candidate covariates to have a nonzero coefficient in the model, 23 | sparse model selection and post-selective inference is probably a better 24 | approach than the one taken here. 25 | 26 | ## Installation and tests 27 | Run `run.sh` to install requirements and package, and run tests. 28 | 29 | ## Usage 30 | The main approach proposed in our work is implemented in the 31 | `UnbiasedLogisticRegression` class in `unbiased_logistic_regression.py`. This 32 | has an `sklearn`-like interface, with a `fit`, `decision_function` and 33 | `predict_proba` API. Additionally, for inference, we've added a 34 | `prediction_intervals` method. See the inline documentation for more details 35 | of usage. 36 | 37 | # Citation 38 | [1] S. Yadlowsky, T. Yun, C. McLean, A. D'Amour (2021). 
"SLOE: A Faster 39 | Method for Statistical Inference in High-Dimensional Logistic Regression". 40 | [arXiv:2103.12725](http://arxiv.org/abs/2103.12725) [stat.ML]. 41 | -------------------------------------------------------------------------------- /mle_param_integrands.h: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The SLOE Logistic Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef MLE_PARAM_INTEGRANDS_H_ 16 | #define MLE_PARAM_INTEGRANDS_H_ 17 | 18 | #include 19 | 20 | extern "C" { 21 | #include "third_party/py/scipy/optimize/Zeros/zeros.h" 22 | } 23 | 24 | namespace logistic_hd { 25 | 26 | // Integrands for the equations defined in Eq. 5 from Sur and Candès 27 | // (PNAS, 2019). These are called by the bivariate integration over Z1 and Z2 28 | // in `asymp_system_solve.py`. 29 | double integrand(double Z1, double Z2, double kappa, double gamma, double b0, 30 | double alpha, double lambda, double sigma, double beta0, 31 | int eq_num); 32 | 33 | // Computes the derivative of the objective that defines the proximal operator. 34 | // The prox operator is the value of z that makes this zero. 35 | double prox_deriv(double z, void *args); 36 | 37 | double sigmoid(double z); 38 | 39 | // Computes the derivative of the prox operator for the logistic regression 40 | // log likelihood. 41 | double prox_impl(double lambda, double x, double xtol = 1e-8, 42 | double rtol = 1e-8, int maxiters = 1000); 43 | 44 | // Computes the pdf of the bivariate normal without any input validation 45 | // because this is called many times during optimization. 46 | double pdf(double x1, double x2); 47 | 48 | // Helper function to pass values between our code and the scipy.optimize API. 49 | double scipy_zeros_functions_func(double x, void *params); 50 | 51 | typedef struct prox_params { 52 | double lambda; 53 | double x; 54 | } prox_params; 55 | 56 | } // namespace logistic_hd 57 | 58 | #endif // MLE_PARAM_INTEGRANDS_H_ 59 | -------------------------------------------------------------------------------- /sloe_experiments/p_values.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Run experiment to understand uniformity of p-values generated by SLOE. 
17 | 18 | Tests the SLOE estimator empirically by computing it 19 | over a bunch of different seeds, and storing in csv files to be analyzed in a 20 | colab. 21 | """ 22 | 23 | 24 | from absl import app 25 | from absl import flags 26 | import apache_beam as beam 27 | from apache_beam.options import pipeline_options 28 | import numpy as np 29 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper 30 | 31 | FLAGS = flags.FLAGS 32 | 33 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run') 34 | flags.DEFINE_string('output_path', '/tmp/counts.txt', 'The output file path') 35 | flags.DEFINE_string( 36 | 'coverage_target', 'true_preds', 37 | 'Which value to check coverage in prediction intervals? Options ' 38 | '`true_preds` or `calib_ests`' 39 | ) 40 | 41 | 42 | def run_sim(seed): 43 | """Runs simulation and computes estimated p-values to compare to uniform.""" 44 | # Model parameters 45 | 46 | sim_params = exp_helper.SimulationParams.create_from_flags() 47 | sim_params.seed = 201216 + seed 48 | sim = exp_helper.Simulation(sim_params) 49 | 50 | x1, y1 = sim.sample() 51 | 52 | logit_model = exp_helper.create_inference_model() 53 | logit_model_fit = logit_model.fit(x1, y1) 54 | 55 | p_values = logit_model_fit.p_values() 56 | return np.sort(p_values[sim.null_indices()]) 57 | 58 | 59 | def main(unused_argv): 60 | # If you have custom beam options add them here. 61 | beam_options = pipeline_options.PipelineOptions() 62 | 63 | with beam.Pipeline(options=beam_options) as pipe: 64 | _ = ( 65 | pipe 66 | | beam.Create(range(FLAGS.num_sims)) 67 | | beam.Map(run_sim) 68 | | beam.Map(exp_helper.numpy_array_to_csv) 69 | | beam.Reshuffle() 70 | | 71 | 'WriteToText' >> beam.io.WriteToText(FLAGS.output_path, num_shards=5)) 72 | 73 | 74 | if __name__ == '__main__': 75 | app.run(main) 76 | -------------------------------------------------------------------------------- /sloe_experiments/runtime.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Run experiment to understand runtime of SLOE relative to ProbeFrontier. 17 | 18 | Tests the runtime of the SLOE estimator in compared to 19 | ProbeFrontier over many seeds, storing in csv files to be analyzed in a colab. 
20 | """ 21 | import time 22 | 23 | 24 | 25 | from absl import app 26 | from absl import flags 27 | import apache_beam as beam 28 | from apache_beam.options import pipeline_options 29 | import numpy as np 30 | 31 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper 32 | 33 | 34 | FLAGS = flags.FLAGS 35 | 36 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run') 37 | flags.DEFINE_string('output_path', '/tmp/counts.txt', 'The output file path') 38 | 39 | N_RANGE = [500, 1000, 2000, 3000, 4000, 6000, 8000, 16000] 40 | 41 | 42 | def run_sim(val): 43 | """Runs simulation and compare runtime of SLOE and ProbeFrontier.""" 44 | n = val[0] 45 | seed = 201216 + val[1] 46 | # Model parameters 47 | 48 | sim_params = exp_helper.SimulationParams.create_from_flags() 49 | sim_params.seed = seed 50 | sim_params.training_n = n 51 | sim_params.p = int(n * FLAGS.features_per_sample) 52 | sim = exp_helper.Simulation(sim_params) 53 | 54 | new_method_model = exp_helper.create_inference_model('newmethod') 55 | pf_model = exp_helper.create_inference_model('probe_frontier') 56 | 57 | x1, y1 = sim.sample() 58 | if pf_model.is_separable(x1, y1): 59 | return 60 | 61 | tic = time.perf_counter() 62 | m = new_method_model.fit(x1, y1) 63 | toc = time.perf_counter() 64 | new_method_time = toc - tic 65 | # Deleting model here to keep memory clean for probe frontier model. 66 | del new_method_model, m 67 | 68 | tic = time.perf_counter() 69 | m, v = pf_model.fit(x1, y1) 70 | toc = time.perf_counter() 71 | probe_frontier_time = toc - tic 72 | 73 | return [np.array([n, seed, new_method_time, probe_frontier_time, v])] 74 | 75 | 76 | def main(unused_argv): 77 | # If you have custom beam options add them here. 78 | beam_options = pipeline_options.PipelineOptions() 79 | 80 | with beam.Pipeline(options=beam_options) as pipe: 81 | _ = ( 82 | pipe 83 | | beam.Create(range(FLAGS.num_sims)) 84 | | beam.FlatMap(exp_helper.multiple_sample_sizes, N_RANGE) 85 | | 'PrepShuffle' >> beam.Reshuffle() 86 | | beam.FlatMap(run_sim) 87 | | beam.Map(exp_helper.numpy_array_to_csv) 88 | | beam.Reshuffle() 89 | | 90 | 'WriteToText' >> beam.io.WriteToText(FLAGS.output_path, num_shards=5)) 91 | 92 | 93 | if __name__ == '__main__': 94 | app.run(main) 95 | -------------------------------------------------------------------------------- /sloe_experiments/est_gamma.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Run experiment to understand convergence of SLOE estimator of eta. 17 | 18 | Tests the SLOE estimator empirically by computing it 19 | over a range of sample sizes for a bunch of different seeds, and storing in 20 | csv files to be analyzed in a colab. 
21 | """ 22 | 23 | 24 | from absl import app 25 | from absl import flags 26 | import apache_beam as beam 27 | from apache_beam.options import pipeline_options 28 | import numpy as np 29 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper 30 | import statsmodels.api as sm 31 | 32 | FLAGS = flags.FLAGS 33 | 34 | flags.DEFINE_string('output_path', '/tmp/counts.txt', 'The output file path') 35 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run') 36 | flags.DEFINE_string('img_path', '/tmp/counts.png', 'Path to save plots') 37 | 38 | N_RANGE = [250, 500, 1000, 2000, 4000] 39 | 40 | 41 | def multiple_sample_sizes(seed): 42 | """Run same seed over multiple sample sizes.""" 43 | for n in N_RANGE: 44 | yield [n, seed] 45 | 46 | 47 | def run_sim(params): 48 | """Runs simulation and computes estimated eta_hat to compare to truth.""" 49 | n = params[0] 50 | seed = params[1] 51 | kappa = FLAGS.features_per_sample 52 | p = int(n * kappa) 53 | 54 | gamma = np.sqrt(FLAGS.signal_strength) 55 | rand_state = np.random.RandomState(201216 + seed) 56 | 57 | p_positive = int(p / 8) 58 | p_negative = p_positive 59 | p_zero = p - p_positive - p_negative 60 | beta = 2 * np.concatenate( 61 | (np.ones(p_positive), -np.ones(p_negative), np.zeros(p_zero))) 62 | beta *= gamma 63 | 64 | features = rand_state.randn(n, p) / np.sqrt(p) 65 | labels = (rand_state.rand(n) <= 1.0 / 66 | (1.0 + np.exp(-features.dot(beta)))).astype(float) 67 | 68 | logit_model = sm.Logit(labels, features) 69 | logit_model_fit = logit_model.fit(disp=False) 70 | beta_hat = logit_model_fit.params 71 | 72 | hessian = logit_model.hessian(beta_hat) 73 | # Computes X_i^T H^{-1} X_i for all examples. Used in Sherman-Morrison formula 74 | # below. 75 | xi_hessian_inv_xi = np.diag( 76 | features.dot(np.linalg.solve(hessian, features.T))) 77 | pred = logit_model_fit.predict(features) 78 | # Sherman-Morrison formula for X_i^T H_{-i}^{-1} X_i, where H_{-i} is Hessian 79 | # without i-th example. 80 | mod = xi_hessian_inv_xi / (1.0 + xi_hessian_inv_xi * pred * (1 - pred)) 81 | infl = mod * (labels - pred) + features.dot(beta_hat) 82 | 83 | eta_hat = np.var(infl) 84 | 85 | eta_hat_simp = np.linalg.norm(beta_hat)**2 86 | 87 | return np.array([n, seed, eta_hat, eta_hat_simp]) 88 | 89 | 90 | def main(unused_argv): 91 | # If you have custom beam options add them here. 92 | beam_options = pipeline_options.PipelineOptions() 93 | 94 | with beam.Pipeline(options=beam_options) as pipe: 95 | _ = ( 96 | pipe 97 | | beam.Create(range(FLAGS.num_sims)) 98 | | beam.FlatMap(multiple_sample_sizes) 99 | | 'PrepShuffle' >> beam.Reshuffle() 100 | | beam.Map(run_sim) 101 | | beam.Map(exp_helper.numpy_array_to_csv) 102 | | beam.Reshuffle() 103 | | 104 | 'WriteToText' >> beam.io.WriteToText(FLAGS.output_path, num_shards=5)) 105 | 106 | 107 | if __name__ == '__main__': 108 | app.run(main) 109 | -------------------------------------------------------------------------------- /mle_param_integrands.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2021 The SLOE Logistic Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "mle_param_integrands.h" 16 | 17 | #include 18 | 19 | #include "pybind11/pybind11.h" 20 | 21 | namespace logistic_hd { 22 | 23 | double sigmoid(double z) { 24 | const double v = 1.0 / (1 + exp(-z)); 25 | return (v); 26 | } 27 | 28 | double prox_deriv(double z, void *args) { 29 | prox_params *myargs = reinterpret_cast(args); 30 | return (myargs->lambda * sigmoid(z) + z - myargs->x); 31 | } 32 | 33 | double prox_impl(double lambda, double x, double xtol, double rtol, 34 | int maxiters) { 35 | prox_params params; 36 | scipy_zeros_info solver_stats; 37 | double lower; 38 | double upper; 39 | 40 | params.lambda = lambda; 41 | params.x = x; 42 | 43 | if (lambda * x > 0) { 44 | lower = x - lambda - 1e-4; 45 | upper = x + 1e-4; 46 | } else { 47 | lower = x - lambda / 2.0 - 1e-4; 48 | upper = x + 1e-4; 49 | } 50 | lower = -abs(x) - 8; 51 | upper = abs(x) + 8; 52 | 53 | if (abs(prox_deriv(lower, ¶ms)) < 1e-8) { 54 | return (lower); 55 | } 56 | if (abs(prox_deriv(upper, ¶ms)) < 1e-8) { 57 | return (upper); 58 | } 59 | 60 | const double x0 = brentq(&prox_deriv, lower, upper, xtol, rtol, maxiters, 61 | reinterpret_cast(¶ms), &solver_stats); 62 | 63 | return (x0); 64 | } 65 | 66 | double integrand(double Z1, double Z2, double kappa, double gamma, double b0, 67 | double alpha, double lambda, double sigma, double beta0, 68 | int eq_num) { 69 | double eq; 70 | 71 | const double S1 = gamma * Z1 / alpha + beta0; 72 | const double S2 = gamma * Z1 + sigma * Z2 + b0; 73 | 74 | const double prox_S2 = prox_impl(lambda, S2); 75 | const double prox_lambda_S2 = prox_impl(lambda, lambda + S2); 76 | 77 | const double sig_S1 = sigmoid(S1); 78 | const double sig_neg_S1 = 1 - sig_S1; 79 | 80 | if (eq_num == 1) { 81 | eq = sig_S1 * pow(S2 - prox_lambda_S2, 2); 82 | eq += sig_neg_S1 * pow(S2 - prox_S2, 2); 83 | } else if (eq_num == 2) { 84 | eq = sig_S1 * Z2 * prox_lambda_S2; 85 | eq += sig_neg_S1 * Z2 * prox_S2; 86 | } else if (eq_num == 3) { 87 | eq = sig_S1 * Z1 * prox_lambda_S2; 88 | eq += sig_neg_S1 * Z1 * prox_S2; 89 | } else { 90 | const double prox_neg_S2 = prox_impl(lambda, -S2); 91 | eq = -sig_S1 * sigmoid(prox_neg_S2); 92 | eq += sig_neg_S1 * sigmoid(prox_S2); 93 | } 94 | 95 | return (eq * pdf(Z1, Z2)); 96 | } 97 | 98 | double pdf(double x1, double x2) { 99 | return (exp(-(pow(x1, 2) + pow(x2, 2)) / 2.0) / (2 * M_PI)); 100 | } 101 | 102 | } // namespace logistic_hd 103 | 104 | PYBIND11_MODULE(mle_param_integrands, m) { 105 | m.doc() = "Logistic Regression MLE High Dimensional Integrands"; 106 | 107 | m.def("sigmoid", &logistic_hd::sigmoid, 108 | "Sigmoid for a float (unvectorized, no error checking)"); 109 | m.def("integrand", &logistic_hd::integrand, 110 | "Integrand for equation to get high dimensional adjustment"); 111 | m.def("prox_deriv", &logistic_hd::prox_deriv, 112 | "Derivative prox objective for logistic link"); 113 | m.def("prox_impl", &logistic_hd::prox_impl, 114 | "Computes prox for logistic link times lambda"); 115 | m.def("pdf", &logistic_hd::pdf, 116 | "Computes pdf of bivariate normal distribution"); 117 | } 118 | 
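Note on how this extension is consumed from Python: once the package is built and installed (e.g. via run.sh), the pybind11 module is importable as sloe_logistic.mle_param_integrands, and its prox and integrand helpers are passed to scipy.integrate.dblquad exactly as asymp_system_solve.py does. The sketch below is a minimal illustration under that assumption; the correction-factor values are the approximate solution reported in asymp_system_solve_test.py for kappa=0.2, gamma=sqrt(5), b0=0, and the printed residuals are only expected to be close to zero.

import numpy as np
import scipy.integrate
import sloe_logistic.mle_param_integrands as mle_helper

# Solve the prox equation lambda * sigmoid(z) + z - x = 0 with Brent's method.
# The binding does not re-declare the C++ default arguments, so pass xtol,
# rtol, and maxiters explicitly.
z_star = mle_helper.prox_impl(3.0, 1.5, 1e-8, 1e-8, 1000)
print(3.0 * mle_helper.sigmoid(z_star) + z_star - 1.5)  # residual, close to 0

# Evaluate the first equation of the Sur and Candes system by bivariate
# integration over (Z1, Z2) ~ N(0, I), mirroring equations() in
# asymp_system_solve.py. alpha, lambda_, sigma are the approximate correction
# factors from asymp_system_solve_test.py for kappa=0.2, gamma=sqrt(5), b0=0.
kappa, gamma, b0, beta0 = 0.2, np.sqrt(5), 0.0, 0.0
alpha, lambda_, sigma = 1.499, 3.027, 2.1214
gamma_times_alpha = gamma * alpha  # equations() rescales gamma when use_eta=False
eq1, _ = scipy.integrate.dblquad(
    mle_helper.integrand, -8, 8, -8, 8,
    (kappa, gamma_times_alpha, b0, alpha, lambda_, sigma, beta0, 1), 1e-4, 1e-4)
print(eq1 - sigma**2 * kappa)  # close to 0 at the solution of the system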
-------------------------------------------------------------------------------- /third_party/py/scipy/optimize/Zeros/brentq.c: -------------------------------------------------------------------------------- 1 | /* Written by Charles Harris charles.harris@sdl.usu.edu */ 2 | 3 | #include 4 | #include "zeros.h" 5 | 6 | #define MIN(a, b) ((a) < (b) ? (a) : (b)) 7 | 8 | /* 9 | At the top of the loop the situation is the following: 10 | 11 | 1. the root is bracketed between xa and xb 12 | 2. xa is the most recent estimate 13 | 3. xp is the previous estimate 14 | 4. |fp| < |fb| 15 | 16 | The order of xa and xp doesn't matter, but assume xp < xb. Then xa lies to 17 | the right of xp and the assumption is that xa is increasing towards the root. 18 | In this situation we will attempt quadratic extrapolation as long as the 19 | condition 20 | 21 | * |fa| < |fp| < |fb| 22 | 23 | is satisfied. That is, the function value is decreasing as we go along. 24 | Note the 4 above implies that the right inequlity already holds. 25 | 26 | The first check is that xa is still to the left of the root. If not, xb is 27 | replaced by xp and the interval reverses, with xb < xa. In this situation 28 | we will try linear interpolation. That this has happened is signaled by the 29 | equality xb == xp; 30 | 31 | The second check is that |fa| < |fb|. If this is not the case, we swap 32 | xa and xb and resort to bisection. 33 | 34 | */ 35 | 36 | double 37 | brentq(callback_type f, double xa, double xb, double xtol, double rtol, 38 | int iter, void *func_data, scipy_zeros_info *solver_stats) 39 | { 40 | double xpre = xa, xcur = xb; 41 | double xblk = 0., fpre, fcur, fblk = 0., spre = 0., scur = 0., sbis; 42 | /* the tolerance is 2*delta */ 43 | double delta; 44 | double stry, dpre, dblk; 45 | int i; 46 | solver_stats->error_num = INPROGRESS; 47 | 48 | fpre = (*f)(xpre, func_data); 49 | fcur = (*f)(xcur, func_data); 50 | solver_stats->funcalls = 2; 51 | if (fpre*fcur > 0) { 52 | solver_stats->error_num = SIGNERR; 53 | return 0.; 54 | } 55 | if (fpre == 0) { 56 | solver_stats->error_num = CONVERGED; 57 | return xpre; 58 | } 59 | if (fcur == 0) { 60 | solver_stats->error_num = CONVERGED; 61 | return xcur; 62 | } 63 | 64 | solver_stats->iterations = 0; 65 | for (i = 0; i < iter; i++) { 66 | solver_stats->iterations++; 67 | if (fpre*fcur < 0) { 68 | xblk = xpre; 69 | fblk = fpre; 70 | spre = scur = xcur - xpre; 71 | } 72 | if (fabs(fblk) < fabs(fcur)) { 73 | xpre = xcur; 74 | xcur = xblk; 75 | xblk = xpre; 76 | 77 | fpre = fcur; 78 | fcur = fblk; 79 | fblk = fpre; 80 | } 81 | 82 | delta = (xtol + rtol*fabs(xcur))/2; 83 | sbis = (xblk - xcur)/2; 84 | if (fcur == 0 || fabs(sbis) < delta) { 85 | solver_stats->error_num = CONVERGED; 86 | return xcur; 87 | } 88 | 89 | if (fabs(spre) > delta && fabs(fcur) < fabs(fpre)) { 90 | if (xpre == xblk) { 91 | /* interpolate */ 92 | stry = -fcur*(xcur - xpre)/(fcur - fpre); 93 | } 94 | else { 95 | /* extrapolate */ 96 | dpre = (fpre - fcur)/(xpre - xcur); 97 | dblk = (fblk - fcur)/(xblk - xcur); 98 | stry = -fcur*(fblk*dblk - fpre*dpre) 99 | /(dblk*dpre*(fblk - fpre)); 100 | } 101 | if (2*fabs(stry) < MIN(fabs(spre), 3*fabs(sbis) - delta)) { 102 | /* good short step */ 103 | spre = scur; 104 | scur = stry; 105 | } else { 106 | /* bisect */ 107 | spre = sbis; 108 | scur = sbis; 109 | } 110 | } 111 | else { 112 | /* bisect */ 113 | spre = sbis; 114 | scur = sbis; 115 | } 116 | 117 | xpre = xcur; fpre = fcur; 118 | if (fabs(scur) > delta) { 119 | xcur += scur; 120 | } 121 | else { 122 | xcur += 
(sbis > 0 ? delta : -delta); 123 | } 124 | 125 | fcur = (*f)(xcur, func_data); 126 | solver_stats->funcalls++; 127 | } 128 | solver_stats->error_num = CONVERR; 129 | return xcur; 130 | } 131 | -------------------------------------------------------------------------------- /asymp_system_solve.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Solves nonlinear equations for high dim correction factors for MLE. 17 | 18 | Solves the nonlinear equations in Sur and Candès (PNAS., 2019) to find 19 | the adjustment factors for bias and variance of logistic regression MLE. 20 | """ 21 | 22 | 23 | import functools 24 | 25 | from absl import app 26 | import numpy as np 27 | import scipy 28 | import scipy.integrate 29 | import scipy.optimize 30 | 31 | import sloe_logistic.mle_param_integrands as mle_helper 32 | 33 | 34 | def _t_integrand(z, v, t, gamma): 35 | """Integrand used to calculate when the logistic MLE exists.""" 36 | return 2 * mle_helper.sigmoid(gamma * v) * mle_helper.pdf(z, v) * ( 37 | max(z - t * v, 0)**2) 38 | 39 | 40 | def _t_problem(t, gamma): 41 | """Minimizer of this integrand in t is the frontier where the MLE exists.""" 42 | loss, _ = scipy.integrate.dblquad(_t_integrand, -8, 8, -8, 8, ( 43 | t, 44 | gamma, 45 | ), 1e-6, 1e-6) 46 | return loss 47 | 48 | 49 | def _g_mle_inv(gamma): 50 | """Frontier where data separable in limit. Gives kappa in terms of gamma.""" 51 | res = scipy.optimize.minimize_scalar( 52 | _t_problem, bounds=(-10, 10), args=(gamma,), method='Bounded') 53 | return _t_problem(res.x, gamma) 54 | 55 | 56 | def frontier(kappa): 57 | """Frontier where data separable in limit. 
Gives gamma in terms of kappa.""" 58 | gamma_star = scipy.optimize.brentq(lambda gamma: _g_mle_inv(gamma) - kappa, 0, 59 | 25) 60 | return gamma_star 61 | 62 | 63 | def equations(kappa, eta, gamma, beta0, use_eta, alpha, lambda_, sigma, b0): 64 | """The solution to these equations gives the high dimensional adjustment.""" 65 | if use_eta: 66 | gamma = np.sqrt(max(eta - sigma**2, 0.0001)) 67 | else: 68 | gamma *= alpha 69 | 70 | eq1, _ = scipy.integrate.dblquad( 71 | mle_helper.integrand, -8, 8, -8, 8, 72 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 1), 1e-4, 1e-4) 73 | eq2, _ = scipy.integrate.dblquad( 74 | mle_helper.integrand, -8, 8, -8, 8, 75 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 2), 1e-4, 1e-4) 76 | eq3, _ = scipy.integrate.dblquad( 77 | mle_helper.integrand, -8, 8, -8, 8, 78 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 3), 1e-4, 1e-4) 79 | eq4, _ = scipy.integrate.dblquad( 80 | mle_helper.integrand, -8, 8, -8, 8, 81 | (kappa, gamma, b0, alpha, lambda_, sigma, beta0, 4), 1, 1) 82 | eq1 -= sigma**2 * kappa 83 | eq2 -= abs(sigma) * (1 - kappa) 84 | eq3 -= gamma 85 | 86 | return -np.array([eq1, eq2, eq3, eq4]) 87 | 88 | 89 | def get_system(kappa, eta, gamma, b0, use_eta=True): 90 | system_ = functools.partial(equations, kappa, eta, gamma, b0, use_eta) 91 | return system_ 92 | 93 | 94 | def correction_factors(kappa, eta, gamma, b0, use_eta=True): 95 | """Computes correction factors for MLE of high dimensional logistic reg.""" 96 | system_ = get_system(kappa, eta, gamma, b0, use_eta) 97 | if use_eta: 98 | init = np.array([2, 2, np.sqrt(eta / 2), b0 / 2]) 99 | else: 100 | init = np.array([2, 2, np.sqrt(gamma**2 + 1), b0]) 101 | soln = scipy.optimize.root( 102 | lambda x: system_(*x), 103 | init, 104 | method='lm', 105 | options={ 106 | 'xtol': 1e-4, 107 | 'eps': 1e-8 108 | }) 109 | x0 = soln.x 110 | if kappa >= 0.03 and (x0[0] < 1 or x0[2] < 0.1): 111 | print('Rerunning due to convergence issue') 112 | init += 0.1 * np.random.randn(4) 113 | init = np.maximum(init, np.array([1, 0.5, 0.1, b0 / 2.0])) 114 | soln = scipy.optimize.root( 115 | lambda x: system_(*x), 116 | init, 117 | method='lm', 118 | options={ 119 | 'xtol': 1e-4, 120 | 'eps': 1e-8 121 | }) 122 | x0 = soln.x 123 | return x0 124 | 125 | 126 | def main(argv): 127 | if len(argv) > 1: 128 | raise app.UsageError('Too many command-line arguments.') 129 | 130 | sol = correction_factors(0.2, 1, np.sqrt(5), 0, use_eta=False) 131 | print(sol) 132 | sol = correction_factors(0.1, 8.881028475794636, np.sqrt(5), 0, use_eta=True) 133 | print(sol) 134 | 135 | 136 | if __name__ == '__main__': 137 | app.run(main) 138 | -------------------------------------------------------------------------------- /unbiased_logistic_regression_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Tests for sloe_logistic.asymp_system_solve.""" 17 | 18 | from absl.testing import absltest 19 | import numpy as np 20 | from sloe_logistic import unbiased_logistic_regression 21 | 22 | 23 | class UnbiasedLogisticRegressionTest(absltest.TestCase): 24 | 25 | def get_simulated_data(self, n, d): 26 | np.random.seed(1) 27 | features = np.random.randn(n, d) 28 | beta = np.sqrt(5 * 2.0 / d) * np.ones(d) 29 | beta[(d // 2):] = 0 30 | 31 | outcome = (np.random.rand(n) <= 1.0 / 32 | (1.0 + np.exp(-features.dot(beta)))).astype(float) 33 | 34 | return features, outcome 35 | 36 | def test_unbiased_model(self): 37 | """Tests that UnbiasedLogisticRegression.fit runs without errors.""" 38 | n, d = 1000, 100 39 | features, outcome = self.get_simulated_data(n, d) 40 | model = unbiased_logistic_regression.UnbiasedLogisticRegression( 41 | fit_intercept=False) 42 | model.fit(features, outcome) 43 | 44 | self.assertLen(model.coef_.reshape(-1), features.shape[1]) 45 | 46 | def test_cant_fit_intercept(self): 47 | """Tests that UnbiasedLogisticRegression doesn't allow fit_intercept. 48 | 49 | Currently, there's no support for fitting the intercept. This checks that 50 | trying to fit an intercept raises an error instead of silently ignoring 51 | the intercept. 52 | """ 53 | with self.assertRaises(ValueError): 54 | _ = unbiased_logistic_regression.UnbiasedLogisticRegression( 55 | fit_intercept=True) 56 | 57 | def test_platt_model(self): 58 | """Tests that PlattScaledLogisticRegression.fit runs without errors.""" 59 | n, d = 1000, 100 60 | features, outcome = self.get_simulated_data(n, d) 61 | model = unbiased_logistic_regression.PlattScaledLogisticRegression( 62 | fit_intercept=False) 63 | model.fit(features, outcome) 64 | 65 | def test_standard_mle_model(self): 66 | """Tests that LogisticRegressionMLE.fit runs without errors.""" 67 | n, d = 1000, 100 68 | features, outcome = self.get_simulated_data(n, d) 69 | model = unbiased_logistic_regression.LogisticRegressionMLE( 70 | fit_intercept=False) 71 | model.fit(features, outcome) 72 | 73 | def test_bootstrap_model(self): 74 | """Tests that LogisticRegressionPercBoot.fit runs without errors.""" 75 | n, d = 1000, 100 76 | features, outcome = self.get_simulated_data(n, d) 77 | model = unbiased_logistic_regression.LogisticRegressionPercBoot( 78 | fit_intercept=False) 79 | model.fit(features, outcome) 80 | 81 | def test_bootstrap_prediction_intervals(self): 82 | """Tests that LogisticRegressionPercBoot.prediction_intervals runs.""" 83 | n, d = 1000, 100 84 | features, outcome = self.get_simulated_data(n, d) 85 | model = unbiased_logistic_regression.LogisticRegressionPercBoot( 86 | fit_intercept=False) 87 | model.fit(features, outcome) 88 | model.prediction_intervals(features) 89 | 90 | def test_regularized_model(self): 91 | """Tests that CVRegLogisticRegression.fit runs without errors.""" 92 | n, d = 1000, 100 93 | features, outcome = self.get_simulated_data(n, d) 94 | model = unbiased_logistic_regression.CVRegLogisticRegression( 95 | fit_intercept=False) 96 | model.fit(features, outcome) 97 | 98 | self.assertLen(model.coef_.reshape(-1), features.shape[1]) 99 | 100 | def test_prediction_intervals(self): 101 | n, d = 1000, 100 102 | features, outcome = self.get_simulated_data(n, d) 103 | model = unbiased_logistic_regression.UnbiasedLogisticRegression( 104 | fit_intercept=False) 105 | model.fit(features, outcome) 106 | 107 | test_features, _ = self.get_simulated_data(100, d) 108 | intervals = model.prediction_intervals(test_features) 109 | estimated_probs = 
model.predict_proba(test_features)[:, 1] 110 | 111 | np.testing.assert_array_less(intervals[:, 0], estimated_probs) 112 | np.testing.assert_array_less(estimated_probs, intervals[:, 2]) 113 | 114 | def test_corrected_p_values(self): 115 | """Check null P value CDF is within 95% CI of uniform CDF.""" 116 | n, d = 4000, 400 117 | features, outcome = self.get_simulated_data(n, d) 118 | model = unbiased_logistic_regression.UnbiasedLogisticRegression( 119 | fit_intercept=False) 120 | model.fit(features, outcome) 121 | 122 | thresh = 0.1 123 | emp_p_cdf = model.p_values().reshape(-1)[(d // 2):] <= thresh 124 | self.assertAlmostEqual( 125 | emp_p_cdf.mean(), 126 | thresh, 127 | delta=1.96 * emp_p_cdf.std() / np.sqrt(d // 2)) 128 | 129 | 130 | if __name__ == '__main__': 131 | absltest.main() 132 | -------------------------------------------------------------------------------- /probe_frontier.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Implements logistic regression w/ ProbeFrontier estimator of bias correction. 17 | 18 | Implements the bias correction and inference for the MLE using the ProbeFrontier 19 | estimator of the signal strength as in [1]. Theory for arbitrary covariance with 20 | Gaussian features from [2], and empirical evidence suggesting good performance 21 | for non-Gaussian designs. 22 | 23 | [1] Sur, Pragya, and Emmanuel J. Candès. "A modern maximum-likelihood theory 24 | for high-dimensional logistic regression." Proceedings of the National Academy 25 | of Sciences 116.29 (2019): 14516-14525. 26 | [2] Zhao, Qian, Pragya Sur, and Emmanuel J. Candes. "The asymptotic distribution 27 | of the mle in high-dimensional logistic models: Arbitrary covariance." arXiv 28 | preprint arXiv:2001.09351 (2020). 
29 | """ 30 | from absl import app 31 | import numpy as np 32 | import scipy 33 | from sloe_logistic import asymp_system_solve 34 | from sloe_logistic import unbiased_logistic_regression 35 | import statsmodels.api as sm 36 | import statsmodels.tools 37 | 38 | 39 | class ProbeFrontierLogisticRegression( 40 | unbiased_logistic_regression.UnbiasedLogisticRegression): 41 | """Implements ProbeFrontier and statistical inference with it.""" 42 | 43 | def __init__(self, num_subsamples=10): 44 | super().__init__(fit_intercept=False) 45 | self.num_subsamples = num_subsamples 46 | self.sep_calls = 0 47 | 48 | def fit(self, features, outcome, weights=None, verbose=False): 49 | """Fit ProbeFrontier model.""" 50 | if self.fit_intercept: 51 | raise NotImplementedError("ProbeFrontier doesn't work with intercept") 52 | self.sep_calls = 0 53 | 54 | self.sm.fit(features, outcome, weights) 55 | 56 | if weights is None: 57 | weights = 1 58 | 59 | kappa = float(features.shape[1]) / features.shape[0] 60 | gamma_hat = self.estimate_gamma(features, outcome) 61 | 62 | self.alpha, _, sigma, _ = asymp_system_solve.correction_factors( 63 | kappa, None, gamma_hat, 0, use_eta=False) 64 | 65 | self.coef_ = self.sm.coef_ / self.alpha 66 | self.intercept_ = 0 67 | 68 | self._set_coef_cov(features, sigma / np.sqrt(kappa), self.alpha) 69 | 70 | return self, self.sep_calls 71 | 72 | def estimate_gamma(self, features, outcome): 73 | """Estimate gamma.""" 74 | estimated_kappa_threshold = self.probe_frontier(features, outcome) 75 | if estimated_kappa_threshold < 0: 76 | print(features, outcome) 77 | if estimated_kappa_threshold >= 0.499: 78 | return 0.0 79 | return asymp_system_solve.frontier(estimated_kappa_threshold) 80 | 81 | def probe_frontier(self, features, outcome): 82 | """Probe for frontier.""" 83 | n, p = features.shape 84 | upper_frac = n 85 | lower_frac = min(n, 1.99 * p) 86 | obs = [] 87 | while abs(upper_frac - lower_frac) > (0.05 * p): 88 | frac = int((upper_frac + lower_frac) / 2) 89 | p_sep = 0 90 | for _ in range(self.num_subsamples): 91 | indices = np.random.choice(n, frac, replace=False) 92 | feature_sub = features[indices, :] 93 | outcome_sub = outcome[indices] 94 | p_sep += self.is_separable(feature_sub, outcome_sub) 95 | p_sep /= float(self.num_subsamples) 96 | obs.append([frac, p_sep]) 97 | if p_sep >= 0.8: 98 | lower_frac = frac 99 | elif p_sep <= 0.2: 100 | upper_frac = frac 101 | elif p_sep > 0.5: 102 | lower_frac = 0.5 * lower_frac + 0.5 * frac 103 | else: 104 | upper_frac = 0.5 * upper_frac + 0.5 * frac 105 | 106 | if len(obs) <= 2: 107 | frac = int(0.5 * (upper_frac + lower_frac)) 108 | else: 109 | obs = np.array(obs) 110 | 111 | if (obs[0, 1] > (1 - 1.5 / self.num_subsamples)): 112 | frac = obs[0, 0] 113 | elif (obs[-1, 1] < (1.5 / self.num_subsamples)): 114 | frac = obs[-1, 0] 115 | else: 116 | try: 117 | interp = sm.GLM( 118 | obs[:, 1], 119 | sm.add_constant(obs[:, 0].reshape(-1, 1)), 120 | family=sm.families.Binomial()) 121 | res = interp.fit() 122 | frac = -res.params[0] / res.params[1] 123 | 124 | except statsmodels.tools.sm_exceptions.PerfectSeparationError: 125 | threshold = np.argmax(np.diff(obs[:, 1], prepend=0)) 126 | frac = obs[threshold, 0] 127 | 128 | return min(float(p) / frac, 0.5) 129 | 130 | def is_separable(self, features, outcome): 131 | """Check whether data are linearly separable.""" 132 | self.sep_calls += 1 133 | n, p = features.shape 134 | features_aug = np.ones((n, p + 1)) 135 | features_aug[:, :-1] = features 136 | features_aug *= (2 * outcome - 1).reshape(-1, 1) 137 | b 
= -np.ones(n) 138 | res = scipy.optimize.linprog( 139 | b, A_eq=features_aug.T, b_eq=np.zeros(p + 1), method='interior-point') 140 | if res.status == 0: 141 | return res.fun > -1e-6 142 | elif res.status == 2: 143 | return False 144 | elif res.status == 3: 145 | return False 146 | else: 147 | print(res) 148 | raise Exception('Error finding separability') 149 | 150 | 151 | def main(argv): 152 | if len(argv) > 1: 153 | raise app.UsageError('Too many command-line arguments.') 154 | 155 | p = ProbeFrontierLogisticRegression() 156 | 157 | features = np.random.randn(600, 300) / np.sqrt(300) 158 | outcome = (np.random.rand(600) <= 1 / 159 | (1.0 + np.exp(-1 * features.sum(axis=1)))).astype(float) 160 | primal = p.is_separable(features, outcome) 161 | print(primal) 162 | 163 | features = np.array([[1, 1], [0, 0]]) 164 | outcome = np.array([1, 0]) 165 | print(p.is_separable(features, outcome)) 166 | features = np.array([[1, 1], [0, 0], [-1, -1]]) 167 | outcome = np.array([1, 0, 1]) 168 | print(p.is_separable(features, outcome)) 169 | 170 | features = np.random.randn(100, 100) 171 | outcome = (np.random.rand(100) <= 0.5).astype(float) 172 | print(p.is_separable(features, outcome)) 173 | 174 | features = np.random.randn(100, 10) 175 | outcome = (np.random.rand(100) <= 0.5).astype(float) 176 | print(p.is_separable(features, outcome)) 177 | 178 | if __name__ == '__main__': 179 | app.run(main) 180 | -------------------------------------------------------------------------------- /sloe_experiments/sweep_coverage.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Run experiment to understand coverage of CIs generated by SLOE. 17 | 18 | Tests the SLOE estimator empirically by computing 19 | confidence intervals (CIs) using it over a bunch of different seeds and aspect 20 | ratios, calculating properties such as coverage and size, and storing in csv 21 | files to be analyzed in a colab. 22 | """ 23 | 24 | 25 | from absl import app 26 | from absl import flags 27 | import apache_beam as beam 28 | from apache_beam.options import pipeline_options 29 | import numpy as np 30 | import sklearn.linear_model 31 | from sklearn.model_selection import LeaveOneOut 32 | 33 | from sloe_logistic import probe_frontier 34 | from sloe_logistic import unbiased_logistic_regression 35 | import sloe_logistic.sloe_experiments.experiment_helpers as exp_helper 36 | 37 | 38 | GAMMA_RANGE = [0.1, 1, 5] 39 | FLAGS = flags.FLAGS 40 | 41 | flags.DEFINE_integer('num_sims', 100, 'number of simulations to run') 42 | flags.DEFINE_string('output_path', '/tmp/counts', 'The output file path') 43 | flags.DEFINE_enum( 44 | 'coverage_target', 'true_preds', ['true_preds', 'calib_ests', 'reg_ests'], 45 | 'Which value to check coverage in prediction intervals?') 46 | flags.DEFINE_boolean('include_bootstrap', False, 47 | 'Include bootstrap CIs as well? 
These are slow.') 48 | flags.DEFINE_float( 49 | 'kappa_spacing', 0.05, 50 | 'Resolution of graph in terms of spacing between kappa evaluated.') 51 | flags.DEFINE_float( 52 | 'coverage_rate', 95, 'What level confidence intervals' 53 | 'should be tested (0-100)?') 54 | 55 | 56 | def run_sim(params): 57 | """Runs simulation and computes properties of the estimated CIs.""" 58 | kappa = params[0] 59 | gamma = params[1] 60 | seed = 201216 + params[2] 61 | 62 | sim_params = exp_helper.SimulationParams.create_from_flags() 63 | sim_params.seed = seed 64 | sim_params.gamma = np.sqrt(gamma) 65 | sim_params.p = int(sim_params.training_n * kappa) 66 | sim = exp_helper.create_sim(sim_params) 67 | 68 | x1, y1 = sim.sample() 69 | 70 | pfr = probe_frontier.ProbeFrontierLogisticRegression() 71 | if pfr.is_separable(x1, y1): 72 | return 73 | 74 | # Draw test data 75 | x2, _ = sim.sample(int(sim_params.training_n / 4)) 76 | true_logits = x2.dot(sim.beta) 77 | bias_selector = np.abs(true_logits) > 1e-2 78 | 79 | # Calculate coverage 80 | if FLAGS.coverage_target == 'true_preds': 81 | target = 1.0 / (1.0 + np.exp(-true_logits)).reshape(-1) 82 | elif FLAGS.coverage_target == 'calib_ests': 83 | ps_logit_model = unbiased_logistic_regression.PlattScaledLogisticRegression( 84 | fit_intercept=sim_params.intercept or sim_params.uncentered) 85 | ps_logit_model.fit(x1, y1) 86 | target = ps_logit_model.predict_proba(x2)[:, 1] 87 | elif FLAGS.coverage_target == 'reg_ests': 88 | ps_logit_model = sklearn.linear_model.LogisticRegressionCV( 89 | cv=LeaveOneOut(), 90 | fit_intercept=False, 91 | Cs=20, 92 | penalty='l2', 93 | solver='newton-cg') 94 | ps_logit_model.fit(x1, y1) 95 | target = ps_logit_model.predict_proba(x2)[:, 1] 96 | else: 97 | raise ValueError("Invalid choice of coverage target '{}'.".format( 98 | FLAGS.coverage_target)) 99 | 100 | try: 101 | new_method_model = exp_helper.create_inference_model('newmethod') 102 | new_method_model.set_coverage(FLAGS.coverage_rate) 103 | _ = new_method_model.fit(x1, y1) 104 | new_pred_int = new_method_model.prediction_intervals(x2) 105 | new_logit_int = new_method_model.prediction_intervals(x2, logit=True) 106 | except ValueError as e: 107 | print(e) 108 | return 109 | 110 | std_method_model = exp_helper.create_inference_model('mle') 111 | std_method_model.set_coverage(FLAGS.coverage_rate) 112 | _ = std_method_model.fit(x1, y1) 113 | std_pred_int = std_method_model.prediction_intervals(x2) 114 | std_logit_int = std_method_model.prediction_intervals(x2, logit=True) 115 | 116 | new_coverage = np.logical_and( 117 | new_pred_int[:, 0].reshape(-1) <= target, 118 | target <= new_pred_int[:, 2].reshape(-1)).astype(float) 119 | std_coverage = np.logical_and( 120 | std_pred_int[:, 0].reshape(-1) <= target, 121 | target <= std_pred_int[:, 2].reshape(-1)).astype(float) 122 | 123 | new_width = np.abs(new_logit_int[:, 2] - new_logit_int[:, 0]) 124 | std_width = np.abs(std_logit_int[:, 2] - std_logit_int[:, 0]) 125 | 126 | new_bias = new_logit_int[bias_selector, 1] / true_logits[bias_selector] 127 | std_bias = std_logit_int[bias_selector, 1] / true_logits[bias_selector] 128 | 129 | results = [ 130 | gamma, kappa, seed, 131 | np.mean(new_coverage), 132 | np.mean(new_width), 133 | np.mean(new_bias), 134 | np.mean(std_coverage), 135 | np.mean(std_width), 136 | np.mean(std_bias) 137 | ] 138 | 139 | if FLAGS.include_bootstrap: 140 | boot_method_model = exp_helper.create_inference_model('bootstrap') 141 | boot_method_model.set_coverage(FLAGS.coverage_rate) 142 | _ = boot_method_model.fit(x1, y1) 
143 | boot_pred_int = boot_method_model.prediction_intervals(x2) 144 | boot_logit_int = boot_method_model.prediction_intervals(x2, logit=True) 145 | 146 | boot_coverage = np.logical_and( 147 | boot_pred_int[:, 0].reshape(-1) <= target, 148 | target <= boot_pred_int[:, 2].reshape(-1)).astype(float) 149 | boot_width = np.abs(boot_logit_int[:, 2] - boot_logit_int[:, 0]) 150 | boot_bias = boot_logit_int[bias_selector, 1] / true_logits[bias_selector] 151 | 152 | results.append(np.mean(boot_coverage)) 153 | results.append(np.mean(boot_width)) 154 | results.append(np.mean(boot_bias)) 155 | 156 | return [np.array(results)] 157 | 158 | 159 | def main(unused_argv): 160 | kappa_range = np.arange(0.05, 0.5 + 0.5 * FLAGS.kappa_spacing, 161 | FLAGS.kappa_spacing) 162 | 163 | # If you have custom beam options add them here. 164 | beam_options = pipeline_options.PipelineOptions() 165 | 166 | with beam.Pipeline(options=beam_options) as pipe: 167 | _ = ( 168 | pipe 169 | | beam.Create(range(FLAGS.num_sims)) 170 | | beam.FlatMap(exp_helper.multiple_sim_params, kappa_range, 171 | GAMMA_RANGE) 172 | | 'PrepShuffle' >> beam.Reshuffle() 173 | | beam.FlatMap(run_sim) 174 | | beam.Map(exp_helper.numpy_array_to_csv) 175 | | beam.Reshuffle() 176 | | 177 | 'WriteToText' >> beam.io.WriteToText(FLAGS.output_path, num_shards=5)) 178 | 179 | 180 | if __name__ == '__main__': 181 | app.run(main) 182 | -------------------------------------------------------------------------------- /third_party/py/scipy/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2001, 2002 Enthought, Inc. 2 | All rights reserved. 3 | 4 | Copyright (c) 2003-2017 SciPy Developers. 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the name of Enthought nor the names of the SciPy Developers 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS 24 | BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 25 | OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 30 | THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | 33 | 34 | SciPy bundles a number of libraries that are compatibly licensed. We list 35 | these here. 
36 | 37 | Name: Numpydoc 38 | Files: doc/sphinxext/numpydoc/* 39 | License: 2-clause BSD 40 | For details, see doc/sphinxext/LICENSE.txt 41 | 42 | Name: scipy-sphinx-theme 43 | Files: doc/scipy-sphinx-theme/* 44 | License: 3-clause BSD, PSF and Apache 2.0 45 | For details, see doc/sphinxext/LICENSE.txt 46 | 47 | Name: Six 48 | Files: scipy/_lib/six.py 49 | License: MIT 50 | For details, see the header inside scipy/_lib/six.py 51 | 52 | Name: Decorator 53 | Files: scipy/_lib/decorator.py 54 | License: 2-clause BSD 55 | For details, see the header inside scipy/_lib/decorator.py 56 | 57 | Name: ID 58 | Files: scipy/linalg/src/id_dist/* 59 | License: 3-clause BSD 60 | For details, see scipy/linalg/src/id_dist/doc/doc.tex 61 | 62 | Name: L-BFGS-B 63 | Files: scipy/optimize/lbfgsb/* 64 | License: BSD license 65 | For details, see scipy/optimize/lbfgsb/README 66 | 67 | Name: SuperLU 68 | Files: scipy/sparse/linalg/dsolve/SuperLU/* 69 | License: 3-clause BSD 70 | For details, see scipy/sparse/linalg/dsolve/SuperLU/License.txt 71 | 72 | Name: ARPACK 73 | Files: scipy/sparse/linalg/eigen/arpack/ARPACK/* 74 | License: 3-clause BSD 75 | For details, see scipy/sparse/linalg/eigen/arpack/ARPACK/COPYING 76 | 77 | Name: Qhull 78 | Files: scipy/spatial/qhull/* 79 | License: Qhull license (BSD-like) 80 | For details, see scipy/spatial/qhull/COPYING.txt 81 | 82 | Name: Cephes 83 | Files: scipy/special/cephes/* 84 | License: 3-clause BSD 85 | Distributed under 3-clause BSD license with permission from the author, 86 | see https://lists.debian.org/debian-legal/2004/12/msg00295.html 87 | 88 | Cephes Math Library Release 2.8: June, 2000 89 | Copyright 1984, 1995, 2000 by Stephen L. Moshier 90 | 91 | This software is derived from the Cephes Math Library and is 92 | incorporated herein by permission of the author. 93 | 94 | All rights reserved. 95 | 96 | Redistribution and use in source and binary forms, with or without 97 | modification, are permitted provided that the following conditions are met: 98 | * Redistributions of source code must retain the above copyright 99 | notice, this list of conditions and the following disclaimer. 100 | * Redistributions in binary form must reproduce the above copyright 101 | notice, this list of conditions and the following disclaimer in the 102 | documentation and/or other materials provided with the distribution. 103 | * Neither the name of the nor the 104 | names of its contributors may be used to endorse or promote products 105 | derived from this software without specific prior written permission. 106 | 107 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 108 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 109 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 110 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 111 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 112 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 113 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 114 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 115 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 116 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
117 | 118 | Name: Faddeeva 119 | Files: scipy/special/Faddeeva.* 120 | License: MIT 121 | Copyright (c) 2012 Massachusetts Institute of Technology 122 | 123 | Permission is hereby granted, free of charge, to any person obtaining 124 | a copy of this software and associated documentation files (the 125 | "Software"), to deal in the Software without restriction, including 126 | without limitation the rights to use, copy, modify, merge, publish, 127 | distribute, sublicense, and/or sell copies of the Software, and to 128 | permit persons to whom the Software is furnished to do so, subject to 129 | the following conditions: 130 | 131 | The above copyright notice and this permission notice shall be 132 | included in all copies or substantial portions of the Software. 133 | 134 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 135 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 136 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 137 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 138 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 139 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 140 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 141 | 142 | Name: qd 143 | Files: scipy/special/cephes/dd_*.[ch] 144 | License: modified BSD license ("BSD-LBNL-License.doc") 145 | This work was supported by the Director, Office of Science, Division 146 | of Mathematical, Information, and Computational Sciences of the 147 | U.S. Department of Energy under contract numbers DE-AC03-76SF00098 and 148 | DE-AC02-05CH11231. 149 | 150 | Copyright (c) 2003-2009, The Regents of the University of California, 151 | through Lawrence Berkeley National Laboratory (subject to receipt of 152 | any required approvals from U.S. Dept. of Energy) All rights reserved. 153 | 154 | 1. Redistribution and use in source and binary forms, with or 155 | without modification, are permitted provided that the following 156 | conditions are met: 157 | 158 | (1) Redistributions of source code must retain the copyright 159 | notice, this list of conditions and the following disclaimer. 160 | 161 | (2) Redistributions in binary form must reproduce the copyright 162 | notice, this list of conditions and the following disclaimer in 163 | the documentation and/or other materials provided with the 164 | distribution. 165 | 166 | (3) Neither the name of the University of California, Lawrence 167 | Berkeley National Laboratory, U.S. Dept. of Energy nor the names 168 | of its contributors may be used to endorse or promote products 169 | derived from this software without specific prior written 170 | permission. 171 | 172 | 2. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 173 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 174 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 175 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
176 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
177 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
178 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
179 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
180 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
181 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
182 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
183 |
184 | 3. You are under no obligation whatsoever to provide any bug fixes,
185 | patches, or upgrades to the features, functionality or performance of
186 | the source code ("Enhancements") to anyone; however, if you choose to
187 | make your Enhancements available either publicly, or directly to
188 | Lawrence Berkeley National Laboratory, without imposing a separate
189 | written license agreement for such Enhancements, then you hereby grant
190 | the following license: a non-exclusive, royalty-free perpetual license
191 | to install, use, modify, prepare derivative works, incorporate into
192 | other computer software, distribute, and sublicense such enhancements
193 | or derivative works thereof, in binary and source code form.
194 | -------------------------------------------------------------------------------- /sloe_experiments/experiment_helpers.py: --------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2021 The SLOE Logistic Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | """Helpers used across many experiments to understand the SLOE estimator.
17 |
18 | Implements the simulation settings studied in the paper
19 | and provides a number of helper functions used throughout to create and analyze
20 | simulations.
21 | """
22 |
23 |
24 | from absl import flags
25 | import numpy as np
26 |
27 | from sloe_logistic import probe_frontier
28 | from sloe_logistic import unbiased_logistic_regression
29 |
30 | FLAGS = flags.FLAGS
31 |
32 | flags.DEFINE_enum(
33 | "covariates", "gaussian", ["gaussian", "gwas"],
34 | "Covariate generating distribution for the simulation. If gaussian, see "
35 | "--covariance for more details about the distribution.")
36 | flags.DEFINE_enum(
37 | "covariance", "isotropic", ["isotropic", "elliptical"],
38 | "Covariance of covariates.")
39 | flags.DEFINE_float("features_per_sample", 0.2,
40 | "number of features per sample (kappa)")
41 | flags.DEFINE_float("intercept", 0, "intercept of logits")
42 | flags.DEFINE_enum(
43 | "method", "newmethod", ["newmethod", "mle", "probe_frontier"],
44 | "Which method for estimation and inference?")
45 | flags.DEFINE_boolean("one_and_none", False,
46 | "Put all of the signal in one (the first) covariate. "
47 | "This does not meet the assumptions of the method, but provides "
48 | "a nice robustness check to see how inaccurate results "
49 | "will be.")
50 | flags.DEFINE_integer("sample_size", 1000, "number of samples per simulation")
51 | flags.DEFINE_float("signal_strength", 5, "variance of logits (gamma^2)")
52 | flags.DEFINE_boolean(
53 | "uncentered", False,
54 | "By default, covariates are centered. If true, covariates are uncentered (without affecting the intercept)."
55 | )
56 |
57 |
58 | class SimulationParams(object):
59 | """Simulation parameters shared across SLOE estimator experiments."""
60 |
61 | def __init__(self,
62 | training_n,
63 | p,
64 | gamma,
65 | covariates="gaussian",
66 | covariance="isotropic",
67 | one_and_none=False,
68 | uncentered=False,
69 | intercept=0,
70 | seed=None):
71 | self.training_n = training_n
72 | self.p = p
73 | self.gamma = gamma
74 | self.covariates = covariates
75 | self.covariance = covariance
76 | self.one_and_none = one_and_none
77 | self.uncentered = uncentered
78 | self.intercept = intercept
79 | self.seed = seed
80 |
81 | @classmethod
82 | def create_from_flags(cls):
83 | """Create a SimulationParams object from FLAGS."""
84 | n = FLAGS.sample_size
85 | kappa = FLAGS.features_per_sample
86 | gamma = np.sqrt(FLAGS.signal_strength)
87 | covariates = FLAGS.covariates
88 | covariance = FLAGS.covariance
89 | one_and_none = FLAGS.one_and_none
90 | uncentered = FLAGS.uncentered
91 | intercept = FLAGS.intercept
92 |
93 | p = int(n * kappa)
94 | return SimulationParams(n, p, gamma, covariates, covariance, one_and_none,
95 | uncentered, intercept)
96 |
97 |
98 | class Simulation(object):
99 | """Standard simulation model used in most experiments in the SLOE paper."""
100 |
101 | def __init__(self, simulation_params):
102 | self.simulation_params = simulation_params
103 |
104 | self._check_sim_params()
105 | self._reset_random_state()
106 | self._initialize_params()
107 |
108 | def _initialize_params(self):
109 | """Initializes statistical params of model from simulation parameters."""
110 | p = self.simulation_params.p
111 |
112 | self.intercept_ = self.simulation_params.intercept
113 |
114 | if self.simulation_params.one_and_none:
115 | self.beta = np.zeros(p)
116 | self.beta[0] = self.simulation_params.gamma * np.sqrt(p)
117 | else:
118 | self.p_positive = int(p / 8)
119 | self.p_negative = self.p_positive
120 | self.p_zero = p - self.p_positive - self.p_negative
121 | self.beta = 2 * np.concatenate((np.ones(
122 | self.p_positive), -np.ones(self.p_negative), np.zeros(self.p_zero)))
123 | self.beta *= self.simulation_params.gamma
124 |
125 | if self.simulation_params.covariance == "isotropic":
126 | self.diag = np.ones(p)
127 | elif self.simulation_params.covariance == "elliptical":
128 | self.diag = self.random_state.rand(p) + 0.5
129 | self.diag /= self.diag[:(self.p_positive + self.p_negative)].mean()
130 | self.diag[0] = 1
131 | else:
132 | raise NotImplementedError("No covariance {}".format(
133 | self.simulation_params.covariance))
134 |
135 | if self.simulation_params.uncentered:
136 | self.centering = np.ones(p)
137 | self.intercept_ -= self.beta.dot(self.centering)
138 | else:
139 | self.centering = 0
140 |
141 | def null_indices(self):
142 | """Get null indices."""
143 | return slice(-self.p_zero, None, None)
144 |
145 | def _check_sim_params(self):
146 | if self.simulation_params.covariates != "gaussian":
147 | raise ValueError(
148 | "Simulation parameters call for a {} covariate distribution, "
149 | "but this class generates Gaussian covariates.".format(
150 | self.simulation_params.covariates)) 151 | 152 | def _reset_random_state(self): 153 | self.random_state = np.random.RandomState(seed=self.simulation_params.seed) 154 | 155 | def _sample_x(self, n): 156 | return self.diag * self.random_state.randn( 157 | n, self.simulation_params.p) / np.sqrt( 158 | self.simulation_params.p) + self.centering 159 | 160 | def sample(self, n=None): 161 | """Sample data from simulation.""" 162 | if n is None: 163 | n = self.simulation_params.training_n 164 | 165 | x1 = self._sample_x(n) 166 | y1 = (self.random_state.rand(n) <= 1.0 / 167 | (1.0 + np.exp(-x1.dot(self.beta) - self.intercept_))).astype(float) 168 | return (x1, y1) 169 | 170 | 171 | class GWASSimulation(Simulation): 172 | """From Sur and Candes, 2019. PNAS. Section 4(g).""" 173 | 174 | def __init__(self, simulation_params): 175 | super().__init__(simulation_params) 176 | 177 | self._initialize_cov_params() 178 | 179 | def _initialize_cov_params(self): 180 | self.equil = 0.5 * self.random_state.rand(self.simulation_params.p) + 0.25 181 | 182 | def _check_sim_params(self): 183 | if self.simulation_params.covariates != "gwas": 184 | raise ValueError( 185 | "Simulation parameters calls for {} covariate distribution, " 186 | "but this class generates GWAS-like covariates.".format( 187 | self.simulation_params.covariates)) 188 | 189 | def covariate_mean(self): 190 | return 2 * (1 - self.equil) 191 | 192 | def covariate_std(self): 193 | return 2 * (1 - self.equil) * self.equil 194 | 195 | def _sample_x(self, n): 196 | p = self.simulation_params.p 197 | x1 = np.zeros((n, p)) 198 | equil = self.equil 199 | for j in range(p): 200 | pj = equil[j] 201 | probs = np.array([pj**2, 2 * pj * (1 - pj), (1 - pj)**2]) 202 | x1[:, j] = self.random_state.choice(3, size=(n,), p=probs) 203 | x1 -= self.covariate_mean().reshape(1, -1) 204 | x1 /= self.covariate_std().reshape(1, -1) * np.sqrt(p) 205 | return x1 206 | 207 | 208 | def multiple_sim_params(seed, kappa_range, gamma_range): 209 | """For each seed, map to a variety of simulation parameters.""" 210 | for kappa in kappa_range: 211 | for gamma in gamma_range: 212 | yield [kappa, gamma, seed] 213 | 214 | 215 | def multiple_sample_sizes(seed, n_range): 216 | """For each seed, map to a variety of sample sizes.""" 217 | for n in n_range: 218 | yield [n, seed] 219 | 220 | 221 | def create_sim(sim_params): 222 | """Create a simulation according to passed params.""" 223 | if sim_params.covariates == "gaussian": 224 | return Simulation(sim_params) 225 | elif sim_params.covariates == "gwas": 226 | return GWASSimulation(sim_params) 227 | else: 228 | raise NotImplementedError("No simulation with covariates {}".format( 229 | FLAGS.covariates)) 230 | 231 | 232 | def create_inference_model(method=None, fit_intercept=False): 233 | """Create a model to use for inference, getting default from FLAGS.""" 234 | if method is None: 235 | method = FLAGS.method 236 | 237 | if method == "probe_frontier": 238 | if fit_intercept: 239 | raise NotImplementedError( 240 | "ProbeFrontier can't fit an intercept right now") 241 | logit_model = probe_frontier.ProbeFrontierLogisticRegression( 242 | num_subsamples=8) 243 | elif method == "mle": 244 | logit_model = unbiased_logistic_regression.LogisticRegressionMLE( 245 | fit_intercept=fit_intercept) 246 | elif method == "bootstrap": 247 | logit_model = unbiased_logistic_regression.LogisticRegressionPercBoot( 248 | fit_intercept=fit_intercept) 249 | elif method == "newmethod": 250 | logit_model = 
unbiased_logistic_regression.UnbiasedLogisticRegression( 251 | fit_intercept=fit_intercept) 252 | else: 253 | raise NotImplementedError("No method {}".format(FLAGS.method)) 254 | return logit_model 255 | 256 | 257 | def numpy_array_to_csv(arr): 258 | return ",".join(["%.5f" % num for num in arr]) 259 | 260 | 261 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /unbiased_logistic_regression.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2021 The SLOE Logistic Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Implements methods for inference for logistic regression based on the MLE. 17 | 18 | Implements SLOE and other methods for inference for logistic regression 19 | based on the MLE. 20 | """ 21 | 22 | import numpy as np 23 | import scipy 24 | import scipy.stats 25 | import sklearn.linear_model 26 | 27 | from sloe_logistic import asymp_system_solve 28 | 29 | 30 | class ScaledLogisticRegression(object): 31 | """Generic class for methods rescaling the logistic regression MLE.""" 32 | 33 | def __init__(self): 34 | pass 35 | 36 | def predict_proba(self, features, *args, **kwargs): 37 | del args 38 | del kwargs 39 | results = np.zeros((features.shape[0], 2)) 40 | log_odds_ratio = features.dot(self.coef_.T).reshape(-1) + self.intercept_ 41 | results[:, 1] = self._expit(log_odds_ratio) 42 | results[:, 0] = 1 - results[:, 1] 43 | return results 44 | 45 | def predict_inv_proba(self, features, *args, **kwargs): 46 | """Provides reciprocal of probability given features.""" 47 | return 1 / self.predict_proba(features, *args, **kwargs) 48 | 49 | def _expit(self, logit, trimmed=False): 50 | if trimmed: 51 | logit = np.minimum(logit, 5) 52 | logit = np.maximum(logit, -5) 53 | return 1.0 / (1.0 + np.exp(-logit)) 54 | 55 | 56 | class PlattScaledLogisticRegression(ScaledLogisticRegression): 57 | """Rescales the logit reg MLE to make it calibrated using approximation.""" 58 | 59 | def __init__(self, fit_intercept=True, **kwargs): 60 | del kwargs 61 | super().__init__() 62 | self.fit_intercept = fit_intercept 63 | self.sm = sklearn.linear_model.LogisticRegression( 64 | fit_intercept=fit_intercept, 65 | penalty="none", 66 | solver="newton-cg", 67 | warm_start=False) 68 | 69 | def fit(self, features, outcome, weights=None, verbose=False): 70 | """Compute MLE and then use Taylor approximation rescale for calibration.""" 71 | del verbose 72 | self.sm.fit(features, outcome, weights) 73 | 74 | refit_weights = None 75 | if refit_weights is None: 76 | refit_weights = 1 77 | 78 | # Get leave-one-out logits to pass in to Platt scaling 79 | pred = self.sm.predict_proba(features)[:, 1] 80 | hessian = -features.T.dot( 81 | (refit_weights * pred * (1 - pred)).reshape(-1, 1) * features) 82 | xihinvxi = np.diag(features.dot(np.linalg.solve(hessian, features.T))) 83 | mod = xihinvxi / (1.0 + xihinvxi * refit_weights * pred * (1 - pred)) 84 | features = mod * refit_weights * ( 85 | outcome - pred) + self.sm.decision_function(features) 86 | 87 | # Fit model for outcome using LOO logit estimates as feature. Coefficient on 88 | # feature is scaling to recalibrate model. 
89 | cm = sklearn.linear_model.LogisticRegression( 90 | penalty="none", fit_intercept=self.fit_intercept) 91 | cm.fit(features.reshape(-1, 1), outcome.reshape(-1), weights) 92 | self.coef_ = self.sm.coef_ * cm.coef_ 93 | if self.fit_intercept: 94 | self.intercept_ = cm.coef_ * self.sm.intercept_ + cm.intercept_ 95 | else: 96 | self.intercept_ = 0 97 | return self 98 | 99 | 100 | class CVRegLogisticRegression(ScaledLogisticRegression): 101 | """Cross-validated regularized logistic regression MLE.""" 102 | 103 | def __init__(self, fit_intercept=True, Cs=10, **kwargs): 104 | super().__init__(**kwargs) 105 | self.fit_intercept = fit_intercept 106 | self.sm = sklearn.linear_model.LogisticRegressionCV( 107 | fit_intercept=fit_intercept, 108 | Cs=Cs, 109 | penalty="l2", 110 | solver="newton-cg") 111 | 112 | def fit(self, features, outcome, weights=None, verbose=False): 113 | """Fit cross-validated model.""" 114 | del verbose 115 | 116 | self.sm.fit(features, outcome, weights) 117 | 118 | if self.fit_intercept: 119 | self.intercept_ = self.sm.intercept_ 120 | else: 121 | self.intercept_ = 0 122 | self.coef_ = self.sm.coef_ 123 | 124 | return self 125 | 126 | 127 | class LogisticRegressionInference(ScaledLogisticRegression): 128 | """Base class inference with logit reg that computes P/CIs from covariance.""" 129 | 130 | def __init__(self, fit_intercept=True, ci=50, **kwargs): 131 | super().__init__(**kwargs) 132 | self.fit_intercept = fit_intercept 133 | self.coef_cov = None 134 | self.hessian = None 135 | self.chi_sq_rescale = 1 136 | self.set_coverage(ci) 137 | 138 | def set_coverage(self, ci): 139 | """Sets expected coverage level.""" 140 | self.ci_coverage = ci / 100.0 141 | self.z = scipy.stats.norm.ppf(0.5 + self.ci_coverage / 2.0) 142 | 143 | def _set_coef_cov(self, *args): 144 | pass 145 | 146 | def _get_prediction_variances(self, features): 147 | if self.fit_intercept: 148 | features_aug = np.ones((features.shape[0], features.shape[1] + 1)) 149 | features_aug[:, :-1] = features 150 | else: 151 | features_aug = features 152 | return (features_aug.dot(self.coef_cov) * 153 | features_aug).sum(axis=-1).reshape(-1) 154 | 155 | def p_values(self): 156 | """Get p-values for a fitted model using Wald test.""" 157 | scale = np.sqrt(np.diag(self.coef_cov)) 158 | if self.fit_intercept: 159 | scale = scale[:-1] 160 | t = np.abs(self.coef_) / scale 161 | t = t.reshape(-1) 162 | p = 2 * scipy.stats.norm.sf(t) 163 | return p 164 | 165 | def decision_function(self, features): 166 | """Compute logits (ie decision function in sklearn parlance.""" 167 | return features.dot(self.coef_.T).reshape(-1) + self.intercept_ 168 | 169 | def prediction_intervals(self, features, logit=False): 170 | """Computes prediction CI for each row of features using coef covariance.""" 171 | if self.coef_cov is None: 172 | raise Exception( 173 | "No covariance matrix defined yet, so can't do inference.") 174 | 175 | logits = self.decision_function(features) 176 | variances = self._get_prediction_variances(features) 177 | 178 | lower_ci = logits - self.z * np.sqrt(variances) 179 | upper_ci = logits + self.z * np.sqrt(variances) 180 | 181 | results = np.zeros((features.shape[0], 3)) 182 | results[:, 0] = lower_ci 183 | results[:, 1] = logits 184 | results[:, 2] = upper_ci 185 | 186 | if not logit: 187 | results = self._expit(results) 188 | 189 | return results 190 | 191 | def predict_proba(self, X): 192 | logits = self.decision_function(X) 193 | 194 | preds = self._expit(logits) 195 | 196 | results = np.zeros((X.shape[0], 2)) 197 | 
results[:, 1] = preds 198 | results[:, 0] = 1 - preds 199 | return results 200 | 201 | def predict_inv_proba(self, X): 202 | logits = self.decision_function(X) 203 | 204 | pos_exps = np.exp(logits) 205 | neg_exps = np.exp(-logits) 206 | 207 | results = np.zeros((X.shape[0], 2)) 208 | results[:, 1] = 1 + neg_exps 209 | results[:, 0] = 1 + pos_exps 210 | 211 | return results 212 | 213 | 214 | class LogisticRegressionMLE(LogisticRegressionInference): 215 | """Computes the un-rescaled MLE and standard large-sample stats inference.""" 216 | 217 | def __init__(self, fit_intercept=True, **kwargs): 218 | super().__init__(fit_intercept=fit_intercept, **kwargs) 219 | self.fit_intercept = fit_intercept 220 | self.sm = sklearn.linear_model.LogisticRegression( 221 | fit_intercept=fit_intercept, 222 | penalty="none", 223 | solver="newton-cg", 224 | warm_start=False) 225 | 226 | def fit(self, features, outcomes, weights=None, verbose=False): 227 | """Fit standard MLE model and compute coefficient covariance matrix.""" 228 | del verbose 229 | 230 | self.sm.fit(features, outcomes, weights) 231 | 232 | self.coef_ = self.sm.coef_ 233 | if self.fit_intercept: 234 | self.intercept_ = self.sm.intercept_ 235 | else: 236 | self.intercept_ = 0 237 | 238 | self._set_coef_cov(features, weights) 239 | 240 | return self 241 | 242 | def _set_coef_cov(self, features, weights): 243 | """Use large-sample asymp. to compute coefficient covariance matrix.""" 244 | if weights is None: 245 | weights = 1 246 | pred = self.sm.predict_proba(features)[:, 1] 247 | _, p = features.shape 248 | if self.fit_intercept: 249 | features_aug = np.ones((features.shape[0], features.shape[1] + 1)) 250 | features_aug[:, :-1] = features 251 | dim = p + 1 252 | else: 253 | features_aug = features 254 | dim = p 255 | hessian = features_aug.T.dot( 256 | (weights * pred * 257 | (1 - pred)).reshape(-1, 1) * features_aug) / np.mean(weights) 258 | self.hessian = -hessian 259 | self.coef_cov = scipy.linalg.solve(hessian, np.eye(dim), assume_a="pos") 260 | 261 | 262 | class LogisticRegressionPercBoot(LogisticRegressionInference): 263 | """Fit standard MLE using multiplier bootstrap and compute percentile CIs. 264 | 265 | It is not recommended to use this method in practice if d / n ~> 0.05. The 266 | results from our paper suggest that it is very biased and has poor precision. 
267 | """ 268 | 269 | def __init__(self, fit_intercept=True, num_boot=20, **kwargs): 270 | super().__init__(fit_intercept=fit_intercept, **kwargs) 271 | self.fit_intercept = fit_intercept 272 | self.sm = sklearn.linear_model.LogisticRegression( 273 | fit_intercept=fit_intercept, 274 | penalty="none", 275 | solver="newton-cg", 276 | warm_start=False) 277 | self.num_boot = num_boot 278 | 279 | def fit(self, features, outcome, weights=None, verbose=False): 280 | """Fit main model and bootstrapped models with multiplier bootstrap.""" 281 | del verbose 282 | self.sm.fit(features, outcome, weights) 283 | 284 | self.coef_ = self.sm.coef_ 285 | if self.fit_intercept: 286 | self.intercept_ = self.sm.intercept_ 287 | else: 288 | self.intercept_ = 0 289 | 290 | if weights is None: 291 | weights = 1.0 292 | 293 | n = features.shape[0] 294 | self.bootstraps = [] 295 | for _ in range(self.num_boot): 296 | self.sm.fit(features, outcome, 297 | weights * np.random.poisson(lam=1.0, size=n)) 298 | if np.linalg.norm(self.sm.coef_) >= 1e6: 299 | continue 300 | d = {"coef": self.sm.coef_.reshape(-1)} 301 | if self.fit_intercept: 302 | d["intercept"] = self.sm.intercept_ 303 | else: 304 | d["intercept"] = 0 305 | self.bootstraps.append(d) 306 | 307 | return self 308 | 309 | def p_values(self): 310 | raise NotImplementedError( 311 | "This form of bootstrap does not lend itself well to p-values") 312 | 313 | def approx_lrt_p_values(self): 314 | raise NotImplementedError( 315 | "This form of bootstrap does not lend itself well to p-values") 316 | 317 | def _predict_with_param_dict(self, params, features): 318 | return features.dot(params["coef"]).reshape(-1) + params["intercept"] 319 | 320 | def prediction_intervals(self, X, logit=False): 321 | """Computes percentile CIs for feature rows using bootstrap samples.""" 322 | all_preds = np.array( 323 | [self._predict_with_param_dict(d, X) for d in self.bootstraps]) 324 | 325 | ci_range = (1 - self.ci_coverage) / 2 326 | results = np.quantile(all_preds, q=(ci_range, 0.5, 1 - ci_range), axis=0).T 327 | 328 | if not logit: 329 | results = self._expit(results) 330 | 331 | return results 332 | 333 | 334 | class UnbiasedLogisticRegression(LogisticRegressionInference): 335 | """Corrected bias and inference with the logitistic regression MLE.""" 336 | 337 | def __init__(self, fit_intercept=False, **kwargs): 338 | super().__init__(fit_intercept, **kwargs) 339 | 340 | self.fit_intercept = fit_intercept 341 | if fit_intercept: 342 | raise ValueError("This model doesn't allow fitting an intercept.") 343 | 344 | self.sm = sklearn.linear_model.LogisticRegression( 345 | fit_intercept=fit_intercept, 346 | penalty="none", 347 | solver="newton-cg", 348 | warm_start=False) 349 | 350 | def fit(self, features, outcome, weights=None, verbose=False): 351 | """Fit MLE, estimate eta with SLOE, de-bias, and estimate covariance.""" 352 | del verbose 353 | kappa = float(features.shape[1]) / features.shape[0] 354 | 355 | self.sm.fit(features, outcome, weights) 356 | 357 | if weights is None: 358 | weights = 1 359 | 360 | pred = self.sm.predict_proba(features)[:, 1] 361 | weights /= np.mean(weights) 362 | diag = weights * pred * (1 - pred) 363 | hessian = -features.T.dot(diag.reshape(-1, 1) * features) 364 | self.hessian = hessian 365 | xihinvxi = np.einsum("ij,ji->i", features, 366 | np.linalg.solve(hessian, features.T)) 367 | mod = xihinvxi / (1.0 + xihinvxi * diag) 368 | infl = mod * weights * (outcome - 369 | pred) + self.sm.decision_function(features) 370 | 371 | eta_hat = np.var(infl) 372 | 
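# Note: infl above approximates the leave-one-out logits, and eta_hat = Var(infl)
# is the SLOE estimate of the corrupted signal strength eta^2 that is passed to
# the asymptotic correction system below via use_eta=True.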
373 | b0 = 0
374 |
375 | self.alpha, lambda_, sigma, intercept_est = asymp_system_solve.correction_factors(
376 | kappa, eta_hat, np.sqrt(eta_hat), b0, use_eta=True)
377 | if (kappa >= 0.05 and self.alpha < 0.999) or self.alpha > 5 \
378 | or lambda_ < 0.1 or sigma < 0.3 or lambda_ > 1e3 or sigma > 1e3:
379 | raise ValueError("Correction factor solve returned implausible values: alpha={}, lambda={}, sigma={}".format(self.alpha, lambda_, sigma))
380 |
381 | self.eta_hat = eta_hat
382 | self.lambda_ = lambda_
383 | self.sigma = sigma
384 | self.intercept_est = intercept_est
385 |
386 | self.chi_sq_rescale = lambda_ * self.alpha**2 / sigma**2
387 | self.coef_ = self.sm.coef_ / self.alpha
388 | self.intercept_ = 0
389 |
390 | self._set_coef_cov(features, sigma / np.sqrt(kappa), self.alpha)
391 |
392 | return self
393 |
394 | def _set_coef_cov(self, features, sigma, alpha):
395 | n, p = features.shape
396 | features_aug = features
397 | dim = p
398 | feature_cov = features_aug.T.dot(features_aug)
399 | one_on_tau_sq = scipy.linalg.solve(feature_cov, np.eye(dim), assume_a="pos")
400 | self.coef_cov = one_on_tau_sq
401 | self.coef_cov *= (1 - float(p) / n) * ((sigma / alpha)**2)
402 | --------------------------------------------------------------------------------
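A minimal usage sketch, not part of the source files above, assuming the package and its pinned dependencies are installed as in run.sh: it fits the SLOE-corrected estimator from unbiased_logistic_regression.py on simulated Gaussian data with kappa = p/n = 0.2 and signal strength gamma^2 = 1, then prints 95% prediction intervals and Wald p-values. The dimensions and seed are illustrative.

import numpy as np

from sloe_logistic import unbiased_logistic_regression

# Simulate n = 1000 observations with p = 200 Gaussian features scaled so that
# Var(x'beta) = gamma^2 = 1, mirroring the Simulation class in
# sloe_experiments/experiment_helpers.py.
rng = np.random.RandomState(0)
n, p = 1000, 200
x = rng.randn(n, p) / np.sqrt(p)
beta = np.zeros(p)
beta[:p // 8] = 2.0
beta[p // 8:p // 4] = -2.0
y = (rng.rand(n) <= 1.0 / (1.0 + np.exp(-x.dot(beta)))).astype(float)

# Fit the SLOE-corrected MLE and request 95% intervals.
model = unbiased_logistic_regression.UnbiasedLogisticRegression()
model.set_coverage(95)
model.fit(x, y)

# prediction_intervals returns columns (lower, debiased estimate, upper) on the
# probability scale; pass logit=True to stay on the logit scale.
print(model.prediction_intervals(x[:5]))
print(model.p_values()[:10])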