├── riskslim
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_risk_slim.py
│   │   └── test_loss_functions.py
│   ├── __init__.py
│   ├── loss_functions
│   │   ├── __init__.py
│   │   ├── build_cython_loss_functions.py
│   │   ├── fast_log_loss.pyx
│   │   ├── log_loss.py
│   │   ├── log_loss_weighted.py
│   │   └── lookup_log_loss.pyx
│   ├── defaults.py
│   ├── bound_tightening.py
│   ├── solution_pool.py
│   ├── setup_functions.py
│   ├── heuristics.py
│   ├── utils.py
│   ├── coefficient_set.py
│   ├── mip.py
│   └── initialization.py
├── MANIFEST.in
├── examples
│   ├── README.txt
│   ├── data
│   │   ├── README.txt
│   │   ├── breastcancer_cvindices.csv
│   │   ├── breastcancer_weights.csv
│   │   └── breastcancer_data.csv
│   ├── ex_01_quickstart.py
│   ├── ex_03_constraints.py
│   └── ex_02_advanced_options.py
├── requirements.txt
├── docs
│   ├── images
│   │   └── risk_score_seizure.png
│   ├── references
│   │   └── ustun2019riskslim.bib
│   └── cplex_instructions.md
├── LICENSE
├── .gitignore
├── batch
│   ├── settings_template.json
│   ├── job_template.sh
│   └── train_risk_slim.py
├── README.md
└── setup.py
-------------------------------------------------------------------------------- /riskslim/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | .. _general_examples: 2 | 3 | General examples 4 | ================ 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cplex 2 | numpy 3 | scipy 4 | Cython 5 | IPython 6 | traitlets 7 | nose 8 | pandas 9 | prettytable -------------------------------------------------------------------------------- /docs/images/risk_score_seizure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ustunb/risk-slim/HEAD/docs/images/risk_score_seizure.png -------------------------------------------------------------------------------- /riskslim/__init__.py: -------------------------------------------------------------------------------- 1 | from .coefficient_set import CoefficientSet 2 | from .lattice_cpa import run_lattice_cpa, setup_lattice_cpa, finish_lattice_cpa 3 | from .utils import load_data_from_csv, print_model -------------------------------------------------------------------------------- /examples/data/README.txt: -------------------------------------------------------------------------------- 1 | .. _datasets:
2 | 3 | Datasets 4 | ================ 5 | 6 | These datasets are used in the examples as well as the experiments in our paper http://arxiv.org/abs/1610.00168 7 | 8 | -------------------------------------------------------------------------------- /docs/references/ustun2019riskslim.bib: -------------------------------------------------------------------------------- 1 | @article{ustun2019jmlr, 2 | author = {Ustun, Berk and Rudin, Cynthia}, 3 | title = {{Learning Optimized Risk Scores}}, 4 | journal = {{Journal of Machine Learning Research}}, 5 | year = {2019}, 6 | volume = {20}, 7 | number = {150}, 8 | pages = {1-75}, 9 | url = {http://jmlr.org/papers/v20/18-615.html} 10 | } -------------------------------------------------------------------------------- /riskslim/loss_functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .log_loss import * 2 | from .log_loss_weighted import * 3 | 4 | try: 5 | from .fast_log_loss import * 6 | except ImportError: 7 | print("warning: could not import fast log loss") 8 | print("warning: returning handle to standard loss functions") 9 | # todo replace with warning object 10 | from . import log_loss as fast_log_loss 11 | 12 | try: 13 | from .lookup_log_loss import * 14 | except ImportError: 15 | print("warning: could not import lookup log loss") 16 | print("warning: returning handle to standard loss functions") 17 | # todo replace with warning object 18 | from . import log_loss as lookup_log_loss 19 | 20 | -------------------------------------------------------------------------------- /docs/cplex_instructions.md: -------------------------------------------------------------------------------- 1 | # Downloading & Installing CPLEX 2 | 3 | CPLEX is a cross-platform optimization solver with a Python API. It is free for students and faculty members at accredited institutions. 4 | 5 | To download CPLEX: 6 | 7 | 1. Register for [IBM OnTheHub](https://ur.us-south.cf.appdomain.cloud/a2mt/email-auth) 8 | 2. Download the *IBM ILOG CPLEX Optimization Studio* from the [software catalog](https://www-03.ibm.com/isc/esd/dswdown/searchPartNumber.wss?partNumber=CJ6BPML) 9 | 3. Install CPLEX Optimization Studio. 10 | 4. Set up the CPLEX Python API [as described here](https://www.ibm.com/support/knowledgecenter/SSSA5P_12.8.0/ilog.odms.cplex.help/CPLEX/GettingStarted/topics/set_up/Python_setup.html). 11 | 12 | If you have problems with CPLEX, please check the [CPLEX user manual](http://www-01.ibm.com/support/knowledgecenter/SSSA5P/welcome) or the [CPLEX forums](https://www.ibm.com/developerworks/community/forums/html/forum?id=11111111-0000-0000-0000-000000002059). 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Berk Ustun 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution.
15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /riskslim/loss_functions/build_cython_loss_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This script builds loss functions using Cython on a local machine. 5 | To run this script 6 | 7 | 1. Change to the directory 8 | 9 | $REPO_DIR/riskslim/loss_functions 10 | 11 | 2. Run the following commands in Bash: 12 | 13 | python2 build_cython_loss_functions.py build_ext --inplace 14 | python3 build_cython_loss_functions.py build_ext --inplace 15 | 16 | """ 17 | import numpy 18 | import scipy 19 | from distutils.core import setup 20 | from distutils.extension import Extension 21 | from Cython.Distutils import build_ext 22 | 23 | 24 | #fast log loss 25 | ext_modules = [Extension(name = "fast_log_loss", 26 | sources=["fast_log_loss.pyx"], 27 | include_dirs=[numpy.get_include(), scipy.get_include()], 28 | libraries=["m"], 29 | extra_compile_args = ["-ffast-math"])] 30 | 31 | setup( 32 | cmdclass = {'build_ext': build_ext}, 33 | include_dirs = [numpy.get_include(), scipy.get_include()], 34 | ext_modules = ext_modules, 35 | ) 36 | 37 | #lookup log loss 38 | ext_modules = [Extension(name = "lookup_log_loss", 39 | sources=["lookup_log_loss.pyx"], 40 | include_dirs=[numpy.get_include(), scipy.get_include()], 41 | libraries=["m"], 42 | extra_compile_args = ["-ffast-math"])] 43 | 44 | setup( 45 | cmdclass = {'build_ext': build_ext}, 46 | include_dirs = [numpy.get_include(), scipy.get_include()], 47 | ext_modules = ext_modules, 48 | ) 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # scikit-learn specific 10 | doc/_build/ 11 | doc/auto_examples/ 12 | doc/modules/generated/ 13 | doc/datasets/generated/ 14 | 15 | # sklearn template 16 | doc/ 17 | *.yml 18 | .nojekyll 19 | ci_scripts/ 20 | skltemplate/ 21 | 22 | # riskslim directories 23 | batch/data/ 24 | batch/results/ 25 | batch/log/ 26 | dev/ 27 | cluster/ 28 | 29 | # riskslim files 30 | riskslim_todo.ft 31 | examples/data/*cvindices.csv 32 | examples/data/*weights.csv 33 | examples/ex_00_tests.py 34 | examples/data/recidivism_v01_*.csv 35 | !examples/data/breastcancer_cvindices.csv 
36 | !examples/data/breastcancer_weights.csv 37 | riskslim/tests/test_common.py 38 | riskslim/tests/test_template.py 39 | riskslim/loss_functions/build/ 40 | riskslim/loss_functions/*.so 41 | riskslim/loss_functions/*.c 42 | 43 | # Distribution / packaging 44 | .Python 45 | env/ 46 | venv/ 47 | build/ 48 | develop-eggs/ 49 | dist/ 50 | downloads/ 51 | eggs/ 52 | .eggs/ 53 | lib/ 54 | lib64/ 55 | parts/ 56 | sdist/ 57 | var/ 58 | *.egg-info/ 59 | .installed.cfg 60 | *.egg 61 | .idea 62 | 63 | # PyInstaller 64 | # Usually these files are written by a python script from a template 65 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 66 | *.manifest 67 | *.spec 68 | 69 | # Installer logs 70 | pip-log.txt 71 | pip-delete-this-directory.txt 72 | 73 | # Unit test / coverage reports 74 | htmlcov/ 75 | .tox/ 76 | .coverage 77 | .coverage.* 78 | .cache 79 | nosetests.xml 80 | coverage.xml 81 | *,cover 82 | .hypothesis/ 83 | 84 | # Translations 85 | *.mo 86 | *.pot 87 | 88 | # Django stuff: 89 | *.log 90 | 91 | # Sphinx documentation 92 | docs/_build/ 93 | 94 | # PyBuilder 95 | target/ 96 | 97 | # emacs 98 | *~ 99 | *.org 100 | \#*# 101 | 102 | # Jupyter NB Checkpoints 103 | .ipynb_checkpoints/ 104 | -------------------------------------------------------------------------------- /batch/settings_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_runtime": 300.0, 3 | "max_tolerance": 1e-06, 4 | "loss_computation": "normal", 5 | "chained_updates_flag": true, 6 | "round_flag": true, 7 | "polish_flag": true, 8 | "initialization_flag": true, 9 | "add_cuts_at_heuristic_solutions": true, 10 | "tight_formulation": true, 11 | 12 | "polish_rounded_solutions": true, 13 | "polishing_max_runtime": 10.0, 14 | "polishing_max_solutions": 5.0, 15 | "polishing_start_cuts": 0, 16 | "polishing_start_gap": Infinity, 17 | "polishing_stop_cuts": Infinity, 18 | "polishing_stop_gap": 5.0, 19 | "polishing_tolerance": 0.1, 20 | 21 | "rounding_start_cuts": 0, 22 | "rounding_start_gap": Infinity, 23 | "rounding_stop_cuts": 20000, 24 | "rounding_stop_gap": 0.2, 25 | "rounding_tolerance": Infinity, 26 | 27 | "init_display_cplex_progress": false, 28 | "init_display_progress": true, 29 | "init_max_cplex_time_per_iteration": 10.0, 30 | "init_max_iterations": 10000, 31 | "init_max_runtime": 300.0, 32 | "init_max_runtime_per_iteration": 300.0, 33 | "init_max_tolerance": 0.0001, 34 | "init_polishing_after": true, 35 | "init_polishing_max_runtime": 30.0, 36 | "init_polishing_max_solutions": 5, 37 | "init_sequential_rounding_max_runtime": 30.0, 38 | "init_sequential_rounding_max_solutions": 5, 39 | "init_use_sequential_rounding": true, 40 | 41 | "display_cplex_progress": true, 42 | "purge_bound_cuts": false, 43 | "purge_loss_cuts": false, 44 | "cplex_absmipgap": 2.2204460492503131e-16, 45 | "cplex_integrality_tolerance": 2.2204460492503131e-16, 46 | "cplex_mipemphasis": 0, 47 | "cplex_mipgap": 2.2204460492503131e-16, 48 | "cplex_n_cores": 1, 49 | "cplex_nodefilesize": 122880, 50 | "cplex_poolrelgap": NaN, 51 | "cplex_poolreplace": 2, 52 | "cplex_poolsize": 100, 53 | "cplex_randomseed": 0, 54 | "cplex_repairtries": 20 55 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | risk-slim 2 | ======== 3 | 4 | risk-slim is a machine learning method to fit simple customized risk scores in python. 
5 | 6 | #### Background 7 | 8 | Risk scores let users make quick risk predictions by adding and subtracting a few small numbers (see e.g., the 500+ medical risk scores at [mdcalc.com](https://www.mdcalc.com/)). 9 | 10 | Here is a risk score for ICU seizure prediction from our [paper](http://www.berkustun.com/docs/ustun_2017_optimized_risk_scores.pdf). 11 | 12 |
13 | ![Risk score for seizure prediction](docs/images/risk_score_seizure.png)
14 |
15 |
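16 | Under the hood, risk-slim fits a logistic regression model with small integer coefficients, so a total score of s (points plus intercept) is converted to a predicted risk of 1 / (1 + exp(-s)). A total score of 2, for example, corresponds to a predicted risk of roughly 88%.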
17 | 18 | #### Video 19 | 20 |

21 | 22 | RiskSLIM KDD (talk video) 23 | 24 |

25 | 26 | 27 | #### Reference 28 | 29 | If you use risk-slim in your research, we would appreciate a citation to the following paper ([bibtex](/docs/references/ustun2019riskslim.bib))! 30 | 31 | Learning Optimized Risk Scores
32 | Berk Ustun and Cynthia Rudin
33 | Journal of Machine Learning Research, 2019. 34 | 35 | ## Installation 36 | 37 | Run the following snippet in a Unix terminal to install risk-slim and complete a test run. 38 | 39 | ``` 40 | git clone https://github.com/ustunb/risk-slim 41 | cd risk-slim 42 | pip install -e . # install in editable mode 43 | bash batch/job_template.sh # batch run 44 | ``` 45 | 46 | ### Requirements 47 | 48 | risk-slim requires Python 3.5+ and CPLEX 12.6+. For instructions on how to download and install, click [here](/docs/cplex_instructions.md). 49 | 50 | 51 | 52 | ## Contributing 53 | 54 | I'm planning to pick up development again in Fall 2020. I can definitely use a hand! If you are interested in contributing, please reach out! 55 | 56 | Here's the current development roadmap: 57 | 58 | - [sci-kit learn interface](http://scikit-learn.org/stable/developers/contributing.html#rolling-your-own-estimator) 59 | - support for open source solver in [python-mip](https://github.com/coin-or/python-mip) 60 | - basic reporting tools (roc curves, calibration plots, model reports) 61 | - documentation 62 | - pip -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # 3 | # Copyright (C) 2017 Berk Ustun 4 | 5 | import os 6 | import sys 7 | from setuptools import setup, find_packages, dist 8 | from setuptools.extension import Extension 9 | 10 | #resources 11 | #setuptools http://setuptools.readthedocs.io/en/latest/setuptools.html 12 | #setuptools + Cython: http://stackoverflow.com/questions/32528560/ 13 | 14 | DISTNAME = 'riskslim' 15 | DESCRIPTION = "optimized risk scores on large-scale datasets" 16 | AUTHOR = 'Berk Ustun' 17 | AUTHOR_EMAIL = 'berk@seas.harvard.edu' 18 | URL = 'https://github.com/ustunb/risk-slim' 19 | LICENSE = 'new BSD' 20 | DOWNLOAD_URL = 'https://github.com/ustunb/risk-slim' 21 | VERSION = '0.0.0' 22 | 23 | # Install setup requirements 24 | dist.Distribution().fetch_build_eggs(['Cython', 'numpy', 'scipy']) 25 | 26 | #read requirements as listed in txt file 27 | try: 28 | import numpy 29 | except ImportError: 30 | print('numpy is required for installation') 31 | sys.exit(1) 32 | 33 | try: 34 | import scipy 35 | except ImportError: 36 | print('scipy is required for installation') 37 | sys.exit(1) 38 | 39 | try: 40 | from Cython.Build import cythonize 41 | except ImportError: 42 | print('Cython is required for installation') 43 | sys.exit(1) 44 | 45 | #fast log loss 46 | extensions =[ 47 | Extension( 48 | DISTNAME + ".loss_functions." + "fast_log_loss", 49 | [DISTNAME + "/loss_functions/fast_log_loss.pyx"], 50 | include_dirs=[numpy.get_include(), scipy.get_include()], 51 | libraries=["m"], 52 | extra_compile_args=["-ffast-math"] 53 | ), 54 | Extension( 55 | DISTNAME + ".loss_functions." 
+ "lookup_log_loss", 56 | [DISTNAME + "/loss_functions/lookup_log_loss.pyx"], 57 | include_dirs=[numpy.get_include(), scipy.get_include()], 58 | libraries=["m"], 59 | extra_compile_args=["-ffast-math"]) 60 | ] 61 | 62 | 63 | if __name__ == "__main__": 64 | 65 | old_path = os.getcwd() 66 | local_path = os.path.dirname(os.path.abspath(sys.argv[0])) 67 | 68 | os.chdir(local_path) 69 | sys.path.insert(0, local_path) 70 | 71 | with open('requirements.txt') as f: 72 | INSTALL_REQUIRES = [l.strip() for l in f.readlines() if l] 73 | 74 | setup( 75 | name=DISTNAME, 76 | packages=find_packages(), 77 | ext_modules=cythonize(extensions), 78 | author=AUTHOR, 79 | author_email=AUTHOR_EMAIL, 80 | description=DESCRIPTION, 81 | install_requires=INSTALL_REQUIRES, 82 | license=LICENSE, 83 | url=URL, 84 | version=VERSION, 85 | download_url=DOWNLOAD_URL, 86 | zip_safe=False, 87 | ) 88 | -------------------------------------------------------------------------------- /batch/job_template.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This is a template to show how to train RiskSLIM from a Bash command shell 3 | # You should adapt this to run RiskSLIM on a batch computing environment (e.g. AWS Batch) 4 | # 5 | # To test the script, run the following command from risk-slim directory: 6 | # 7 | # `bash batch/job_template.sh` 8 | # 9 | # To see a detailed list of all arguments that can be passed into risk_slim, use: 10 | # 11 | # `python "batch/train_risk_slim.py --help` 12 | # 13 | # or 14 | # 15 | # `python2 "batch/train_risk_slim.py --help` 16 | # 17 | # Recommended Directory Structure for Batch Computing: 18 | # 19 | # risk-slim/ 20 | # └──batch/ 21 | # └──data/ location of CSV files for data (ignored in git) 22 | # └──logs/ directory where log files are printed out (ignored in git) 23 | # └──results/ directory where results files are stored (ignored in git) 24 | # └──doc/ 25 | # └──examples/ 26 | # └──riskslim/ directory where code is stored (do not change this to be able to pull from GitHub) 27 | # └──setup.py 28 | # 29 | # Advantaged settings are be configured through a JSON file. 
30 | # The values can be changed directly using a text editor, or programmatically using a tool such as 31 | # `jq` https://stedolan.github.io/jq/ 32 | 33 | #directories 34 | repo_dir=$(pwd) 35 | data_dir="${repo_dir}/examples/data" #change to /batch/data/ for your own data 36 | batch_dir="${repo_dir}/batch" 37 | results_dir="${batch_dir}/results" 38 | log_dir="${batch_dir}/logs" 39 | 40 | #set job parameters 41 | data_name="breastcancer" 42 | data_file="${data_dir}/${data_name}_data.csv" 43 | 44 | cvindices_file="${data_dir}/${data_name}_cvindices.csv" 45 | #weights_file="${data_dir}/${data_name}_weights.csv" 46 | fold=0 47 | 48 | max_coef=5 49 | max_size=5 50 | max_offset=-1 51 | w_pos=1.00 52 | c0_value=1e-6 53 | 54 | timelimit=60 55 | 56 | #results_file and log_file must have a UNIQUE name for each job to avoid overwriting existing files 57 | run_name="${data_name}_fold_${fold}" 58 | run_time=$(date +"%m_%d_%Y_%H_%M_%S") 59 | results_file="${results_dir}/${run_name}_results.p" 60 | log_file="${log_dir}/${run_name}_${run_time}.log" 61 | 62 | #comment out the following line if you are testing / OK with overwriting results 63 | #for safety, train_risk_slim.py will not run if results_file exists on disk 64 | rm -f "${results_file}" 65 | 66 | #create directories that do not exist 67 | mkdir -p "${results_dir}" 68 | mkdir -p "${log_dir}" 69 | 70 | #additional settings can be modified by changing a JSON file 71 | #complete list of settings is in: risk-slim/batch/settings_template.json 72 | settings_file="${results_dir}/${run_name}_settings.json" 73 | cp "${batch_dir}/settings_template.json" "${settings_file}" 74 | 75 | #run command 76 | python3 "${batch_dir}/train_risk_slim.py" \ 77 | --data "${data_file}" \ 78 | --results "${results_file}" \ 79 | --cvindices "${cvindices_file}" \ 80 | --fold "${fold}" \ 81 | --timelimit "${timelimit}" \ 82 | --settings "${settings_file}" \ 83 | --w_pos "${w_pos}" \ 84 | --c0_value "${c0_value}" \ 85 | --max_size "${max_size}" \ 86 | --max_coef "${max_coef}" \ 87 | --max_offset "${max_offset}" \ 88 | --log "${log_file}" 89 | 90 | exit 91 | -------------------------------------------------------------------------------- /examples/ex_01_quickstart.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pprint 3 | import numpy as np 4 | import riskslim 5 | 6 | # data 7 | data_name = "breastcancer" # name of the data 8 | data_dir = os.getcwd() + '/examples/data/' # directory where datasets are stored 9 | data_csv_file = data_dir + data_name + '_data.csv' # csv file for the dataset 10 | sample_weights_csv_file = None # csv file of sample weights for the dataset (optional) 11 | 12 | # problem parameters 13 | max_coefficient = 5 # value of largest/smallest coefficient 14 | max_L0_value = 5 # maximum model size (set to float('inf') for no limit) 15 | max_offset = 50 # maximum value of offset parameter (optional) 16 | c0_value = 1e-6 # L0-penalty parameter such that c0_value > 0; larger values -> sparser models; we set to a small value (1e-6) so that we get a model with max_L0_value terms 17 | 18 | 19 | # load data from disk 20 | data = riskslim.load_data_from_csv(dataset_csv_file = data_csv_file, sample_weights_csv_file = sample_weights_csv_file) 21 | 22 | # create coefficient set and set the value of the offset parameter 23 | coef_set = riskslim.CoefficientSet(variable_names = data['variable_names'], lb = -max_coefficient, ub = max_coefficient, sign = 0) 24 |
coef_set.update_intercept_bounds(X = data['X'], y = data['Y'], max_offset = max_offset) 25 | 26 | constraints = { 27 | 'L0_min': 0, 28 | 'L0_max': max_L0_value, 29 | 'coef_set':coef_set, 30 | } 31 | 32 | # major settings (see ex_02_advanced_options.py for the full set of options) 33 | settings = { 34 | # Problem Parameters 35 | 'c0_value': c0_value, 36 | # 37 | # LCPA Settings 38 | 'max_runtime': 30.0, # max runtime for LCPA 39 | 'max_tolerance': np.finfo('float').eps, # tolerance to stop LCPA (set to 0 to return provably optimal solution) 40 | 'display_cplex_progress': True, # print CPLEX progress on screen 41 | 'loss_computation': 'fast', # how to compute the loss function ('normal','fast','lookup') 42 | # 43 | # LCPA Improvements 44 | 'round_flag': True, # round continuous solutions with SeqRd 45 | 'polish_flag': True, # polish integer feasible solutions with DCD 46 | 'chained_updates_flag': True, # use chained updates 47 | 'add_cuts_at_heuristic_solutions': True, # add cuts at integer feasible solutions found using polishing/rounding 48 | # 49 | # Initialization 50 | 'initialization_flag': True, # use initialization procedure 51 | 'init_max_runtime': 120.0, # max time to run CPA in initialization procedure 52 | 'init_max_coefficient_gap': 0.49, 53 | # 54 | # CPLEX Solver Parameters 55 | 'cplex_randomseed': 0, # random seed 56 | 'cplex_mipemphasis': 0, # cplex MIP strategy 57 | } 58 | 59 | # train model using lattice_cpa 60 | model_info, mip_info, lcpa_info = riskslim.run_lattice_cpa(data, constraints, settings) 61 | 62 | #print the model 63 | riskslim.print_model(model_info['solution'], data) 64 | 65 | #model info contains key results 66 | pprint.pprint(model_info) 67 | 68 | -------------------------------------------------------------------------------- /riskslim/loss_functions/fast_log_loss.pyx: -------------------------------------------------------------------------------- 1 | import cython 2 | import numpy as np 3 | cimport numpy as np 4 | cimport scipy.linalg.cython_blas as blas 5 | cimport libc.math as math 6 | 7 | DTYPE = np.float64 8 | ctypedef np.float64_t DTYPE_T 9 | 10 | @cython.boundscheck(False) 11 | @cython.wraparound(False) 12 | @cython.nonecheck(False) 13 | @cython.cdivision(False) 14 | def log_loss_value(np.ndarray[DTYPE_T, ndim=2, mode="fortran"] Z, np.ndarray[DTYPE_T, ndim=1, mode="fortran"] rho): 15 | 16 | cdef: 17 | int N = Z.shape[0] 18 | int D = Z.shape[1] 19 | int lda = N 20 | int incx = 1 #increments of rho 21 | int incy = 1 #increments of y 22 | double alpha = 1.0 23 | double beta = 0.0 24 | np.ndarray[DTYPE_T, ndim=1, mode = "fortran"] y = np.empty(N, dtype = DTYPE) 25 | Py_ssize_t i 26 | DTYPE_T total_loss = 0.0 27 | int zero_score_cnt = 0 28 | 29 | #compute scores 30 | #calls dgemv from BLAS, which computes y = alpha * Z * rho + beta * y 31 | #see: http://www.nag.com/numeric/fl/nagdoc_fl22/xhtml/F06/f06paf.xml 32 | blas.dgemv("N", &N, &D, &alpha, &Z[0,0], &lda, &rho[0], &incx, &beta, &y[0], &incy) 33 | 34 | #compute loss 35 | for i in range(N): 36 | if (y[i] < 0): 37 | total_loss += math.log(1.0 + math.exp(y[i])) - y[i] 38 | elif (y[i] > 0): 39 | total_loss += math.log1p(math.exp(-y[i])) 40 | else: 41 | zero_score_cnt += 1 42 | 43 | total_loss += zero_score_cnt * math.M_LN2 44 | return total_loss/N 45 | 46 | @cython.boundscheck(False) 47 | @cython.wraparound(False) 48 | @cython.nonecheck(False) 49 | @cython.cdivision(False) 50 | def log_loss_value_and_slope(np.ndarray[DTYPE_T, ndim=2, mode="fortran"] Z, np.ndarray[DTYPE_T, ndim=1, mode="fortran"]
rho): 51 | 52 | cdef: 53 | int N = Z.shape[0] 54 | int D = Z.shape[1] 55 | int lda = N 56 | int incx = 1 #increments of rho 57 | int incy = 1 #increments of y 58 | double alpha = 1.0 59 | double beta = 0.0 60 | Py_ssize_t i 61 | DTYPE_T total_loss = 0.0 62 | DTYPE_T exp_value 63 | np.ndarray[DTYPE_T, ndim=1, mode = "fortran"] y = np.empty(N, dtype = DTYPE) 64 | np.ndarray[DTYPE_T, ndim=1, mode = "fortran"] loss_slope = np.empty(D, dtype = DTYPE) 65 | 66 | #compute scores 67 | #calls dgemv from BLAS which computes y = alpha * trans(Z) + beta * y 68 | #see: http://www.nag.com/numeric/fl/nagdoc_fl22/xhtml/F06/f06paf.xml 69 | blas.dgemv("N", &N, &D, &alpha, &Z[0,0], &lda, &rho[0], &incx, &beta, &y[0], &incy) 70 | 71 | #exponentiate scores, compute mean scores and probabilities 72 | for i in range(N): 73 | if y[i] < 0: 74 | exp_value = math.exp(y[i]) 75 | total_loss += math.log(1.0 + exp_value) - y[i] 76 | y[i] = (exp_value / (1.0 + exp_value)) - 1.0 77 | else: 78 | exp_value = math.exp(-y[i]) 79 | total_loss += math.log1p(exp_value) 80 | y[i] = (1.0 / (1.0 + exp_value)) - 1.0 81 | 82 | #compute loss slope 83 | alpha = 1.0/N 84 | blas.dgemv("T", &N, &D, &alpha, &Z[0,0], &lda, &y[0], &incx, &beta, &loss_slope[0], &incy) 85 | return (total_loss/N), loss_slope 86 | 87 | @cython.boundscheck(False) 88 | @cython.wraparound(False) 89 | @cython.nonecheck(False) 90 | @cython.cdivision(False) 91 | def log_loss_value_from_scores(np.ndarray[DTYPE_T, ndim=1, mode="fortran"] scores): 92 | 93 | cdef: 94 | Py_ssize_t N = scores.shape[0] 95 | DTYPE_T total_loss = 0.0 96 | int zero_score_cnt = 0 97 | Py_ssize_t i 98 | DTYPE_T s 99 | 100 | #compute loss 101 | for i in range(N): 102 | s = scores[i] 103 | if s < 0: 104 | total_loss += math.log(1.0 + math.exp(s)) - s 105 | elif s > 0: 106 | total_loss += math.log1p(math.exp(-s)) 107 | else: 108 | zero_score_cnt += 1 109 | 110 | total_loss += zero_score_cnt * math.M_LN2 111 | return total_loss/N 112 | -------------------------------------------------------------------------------- /riskslim/loss_functions/log_loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def log_loss_value(Z, rho): 4 | """ 5 | computes the value and slope of the logistic loss in a numerically stable way 6 | see also: http://stackoverflow.com/questions/20085768/ 7 | 8 | Parameters 9 | ---------- 10 | Z numpy.array containing training data with shape = (n_rows, n_cols) 11 | rho numpy.array of coefficients with shape = (n_cols,) 12 | 13 | Returns 14 | ------- 15 | loss_value scalar = 1/n_rows * sum(log( 1 .+ exp(-Z*rho)) 16 | 17 | """ 18 | scores = Z.dot(rho) 19 | pos_idx = scores > 0 20 | loss_value = np.empty_like(scores) 21 | loss_value[pos_idx] = np.log1p(np.exp(-scores[pos_idx])) 22 | loss_value[~pos_idx] = -scores[~pos_idx] + np.log1p(np.exp(scores[~pos_idx])) 23 | loss_value = loss_value.mean() 24 | return loss_value 25 | 26 | def log_loss_value_and_slope(Z, rho): 27 | """ 28 | computes the value and slope of the logistic loss in a numerically stable way 29 | this function should only be used when generating cuts in cutting-plane algorithms 30 | (computing both the value and the slope at the same time is slightly cheaper) 31 | 32 | see also: http://stackoverflow.com/questions/20085768/ 33 | 34 | Parameters 35 | ---------- 36 | Z numpy.array containing training data with shape = (n_rows, n_cols) 37 | rho numpy.array of coefficients with shape = (n_cols,) 38 | 39 | Returns 40 | ------- 41 | loss_value scalar = 1/n_rows * 
sum(log( 1 .+ exp(-Z*rho)) 42 | loss_slope: (n_cols x 1) vector = 1/n_rows * sum(-Z*rho ./ (1+exp(-Z*rho)) 43 | 44 | """ 45 | scores = Z.dot(rho) 46 | pos_idx = scores > 0 47 | exp_scores_pos = np.exp(-scores[pos_idx]) 48 | exp_scores_neg = np.exp(scores[~pos_idx]) 49 | 50 | #compute loss value 51 | loss_value = np.empty_like(scores) 52 | loss_value[pos_idx] = np.log1p(exp_scores_pos) 53 | loss_value[~pos_idx] = -scores[~pos_idx] + np.log1p(exp_scores_neg) 54 | loss_value = loss_value.mean() 55 | 56 | #compute loss slope 57 | log_probs = np.empty_like(scores) 58 | log_probs[pos_idx] = 1.0 / (1.0 + exp_scores_pos) 59 | log_probs[~pos_idx] = exp_scores_neg / (1.0 + exp_scores_neg) 60 | loss_slope = Z.T.dot(log_probs - 1.0) / Z.shape[0] 61 | 62 | return loss_value, loss_slope 63 | 64 | def log_loss_value_from_scores(scores): 65 | """ 66 | computes the logistic loss value from a vector of scores in a numerically stable way 67 | where scores = Z.dot(rho) 68 | 69 | see also: http://stackoverflow.com/questions/20085768/ 70 | 71 | this function is used for heuristics (discrete_descent, sequential_rounding). 72 | to save computation when running the heuristics, we store the scores and 73 | call this function to compute the loss directly from the scores 74 | this reduces the need to recompute the dot product. 75 | 76 | Parameters 77 | ---------- 78 | scores numpy.array of scores = Z.dot(rho) 79 | 80 | Returns 81 | ------- 82 | loss_value scalar = 1/n_rows * sum(log( 1 .+ exp(-Z*rho)) 83 | 84 | """ 85 | 86 | pos_idx = scores > 0 87 | loss_value = np.empty_like(scores) 88 | loss_value[pos_idx] = np.log1p(np.exp(-scores[pos_idx])) 89 | loss_value[~pos_idx] = -scores[~pos_idx] + np.log1p(np.exp(scores[~pos_idx])) 90 | loss_value = loss_value.mean() 91 | return loss_value 92 | 93 | def log_probs(Z, rho): 94 | """ 95 | compute the probabilities of the logistic loss function in a way that is numerically stable 96 | 97 | see also: http://stackoverflow.com/questions/20085768/ 98 | Parameters 99 | ---------- 100 | Z numpy.array containing training data with shape = (n_rows, n_cols) 101 | rho numpy.array of coefficients with shape = (n_cols,) 102 | 103 | Returns 104 | ------- 105 | log_probs numpy.array of probabilities under the logit model 106 | """ 107 | 108 | scores = Z.dot(rho) 109 | pos_idx = scores > 0 110 | log_probs = np.empty_like(scores) 111 | log_probs[pos_idx] = 1.0 / (1.0 + np.exp(-scores[pos_idx])) 112 | log_probs[~pos_idx] = np.exp(scores[~pos_idx]) / (1.0 + np.exp(scores[~pos_idx])) 113 | return log_probs 114 | -------------------------------------------------------------------------------- /riskslim/loss_functions/log_loss_weighted.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def log_loss_value(Z, weights, total_weights, rho): 4 | """ 5 | computes the value and slope of the logistic loss in a numerically stable way 6 | supports sample non-negative weights for each example in the training data 7 | see http://stackoverflow.com/questions/20085768/ 8 | 9 | Parameters 10 | ---------- 11 | Z numpy.array containing training data with shape = (n_rows, n_cols) 12 | rho numpy.array of coefficients with shape = (n_cols,) 13 | total_weights numpy.sum(total_weights) (only included to reduce computation) 14 | weights numpy.array of sample weights with shape (n_rows,) 15 | 16 | Returns 17 | ------- 18 | loss_value scalar = 1/n_rows * sum(log( 1 .+ exp(-Z*rho)) 19 | 20 | """ 21 | scores = Z.dot(rho) 22 | pos_idx = scores > 0 23 | 
loss_value = np.empty_like(scores) 24 | loss_value[pos_idx] = np.log1p(np.exp(-scores[pos_idx])) 25 | loss_value[~pos_idx] = -scores[~pos_idx] + np.log1p(np.exp(scores[~pos_idx])) 26 | loss_value = loss_value.dot(weights) / total_weights 27 | return loss_value 28 | 29 | def log_loss_value_and_slope(Z, weights, total_weights, rho): 30 | """ 31 | computes the value and slope of the logistic loss in a numerically stable way 32 | supports sample non-negative weights for each example in the training data 33 | this function should only be used when generating cuts in cutting-plane algorithms 34 | (computing both the value and the slope at the same time is slightly cheaper) 35 | 36 | see http://stackoverflow.com/questions/20085768/ 37 | 38 | Parameters 39 | ---------- 40 | Z numpy.array containing training data with shape = (n_rows, n_cols) 41 | rho numpy.array of coefficients with shape = (n_cols,) 42 | total_weights numpy.sum(total_weights) (only included to reduce computation) 43 | weights numpy.array of sample weights with shape (n_rows,) 44 | 45 | Returns 46 | ------- 47 | loss_value scalar = 1/n_rows * sum(log( 1 .+ exp(-Z*rho)) 48 | loss_slope: (n_cols x 1) vector = 1/n_rows * sum(-Z*rho ./ (1+exp(-Z*rho)) 49 | 50 | """ 51 | scores = Z.dot(rho) 52 | pos_idx = scores > 0 53 | exp_scores_pos = np.exp(-scores[pos_idx]) 54 | exp_scores_neg = np.exp(scores[~pos_idx]) 55 | 56 | #compute loss value 57 | loss_value = np.empty_like(scores) 58 | loss_value[pos_idx] = np.log1p(exp_scores_pos) 59 | loss_value[~pos_idx] = -scores[~pos_idx] + np.log1p(exp_scores_neg) 60 | loss_value = loss_value.dot(weights) / total_weights 61 | 62 | #compute loss slope 63 | log_probs = np.empty_like(scores) 64 | log_probs[pos_idx] = 1.0 / (1.0 + exp_scores_pos) 65 | log_probs[~pos_idx] = (exp_scores_neg / (1.0 + exp_scores_neg)) 66 | log_probs -= 1.0 67 | log_probs *= weights 68 | loss_slope = Z.T.dot(log_probs) / total_weights 69 | 70 | return loss_value, loss_slope 71 | 72 | def log_loss_value_from_scores(weights, total_weights, scores): 73 | """ 74 | computes the logistic loss value from a vector of scores in a numerically stable way 75 | where scores = Z.dot(rho) 76 | 77 | see also: http://stackoverflow.com/questions/20085768/ 78 | 79 | this function is used for heuristics (discrete_descent, sequential_rounding). 80 | to save computation when running the heuristics, we store the scores and 81 | call this function to compute the loss directly from the scores 82 | this reduces the need to recompute the dot product. 
83 | 84 | Parameters 85 | ---------- 86 | scores numpy.array of scores = Z.dot(rho) 87 | total_weights numpy.sum(total_weights) (only included to reduce computation) 88 | weights numpy.array of sample weights with shape (n_rows,) 89 | 90 | Returns 91 | ------- 92 | loss_value scalar = 1/n_rows * sum(log( 1 .+ exp(-Z*rho)) 93 | 94 | """ 95 | pos_idx = scores > 0 96 | loss_value = np.empty_like(scores) 97 | loss_value[pos_idx] = np.log1p(np.exp(-scores[pos_idx])) 98 | loss_value[~pos_idx] = -scores[~pos_idx] + np.log1p(np.exp(scores[~pos_idx])) 99 | loss_value = loss_value.dot(weights) / total_weights 100 | 101 | return loss_value -------------------------------------------------------------------------------- /examples/data/breastcancer_cvindices.csv: -------------------------------------------------------------------------------- 1 | 2 2 | 5 3 | 2 4 | 4 5 | 4 6 | 5 7 | 5 8 | 3 9 | 2 10 | 5 11 | 5 12 | 2 13 | 1 14 | 2 15 | 3 16 | 5 17 | 5 18 | 1 19 | 3 20 | 4 21 | 4 22 | 1 23 | 1 24 | 1 25 | 1 26 | 3 27 | 4 28 | 5 29 | 2 30 | 4 31 | 3 32 | 2 33 | 5 34 | 2 35 | 5 36 | 1 37 | 3 38 | 1 39 | 4 40 | 1 41 | 5 42 | 4 43 | 3 44 | 1 45 | 5 46 | 5 47 | 5 48 | 2 49 | 1 50 | 5 51 | 4 52 | 3 53 | 2 54 | 3 55 | 1 56 | 2 57 | 1 58 | 2 59 | 4 60 | 3 61 | 3 62 | 2 63 | 3 64 | 1 65 | 1 66 | 1 67 | 5 68 | 3 69 | 2 70 | 4 71 | 1 72 | 3 73 | 5 74 | 3 75 | 1 76 | 1 77 | 4 78 | 3 79 | 2 80 | 1 81 | 4 82 | 5 83 | 2 84 | 5 85 | 2 86 | 2 87 | 3 88 | 2 89 | 3 90 | 2 91 | 1 92 | 3 93 | 5 94 | 3 95 | 1 96 | 2 97 | 4 98 | 3 99 | 2 100 | 5 101 | 5 102 | 3 103 | 4 104 | 2 105 | 4 106 | 5 107 | 3 108 | 2 109 | 4 110 | 4 111 | 3 112 | 4 113 | 2 114 | 5 115 | 1 116 | 4 117 | 4 118 | 4 119 | 3 120 | 1 121 | 2 122 | 1 123 | 2 124 | 3 125 | 3 126 | 4 127 | 5 128 | 1 129 | 4 130 | 4 131 | 3 132 | 4 133 | 1 134 | 2 135 | 1 136 | 3 137 | 3 138 | 2 139 | 2 140 | 3 141 | 1 142 | 1 143 | 3 144 | 3 145 | 5 146 | 4 147 | 1 148 | 4 149 | 1 150 | 3 151 | 4 152 | 3 153 | 4 154 | 1 155 | 1 156 | 3 157 | 4 158 | 2 159 | 2 160 | 3 161 | 5 162 | 1 163 | 1 164 | 2 165 | 5 166 | 1 167 | 3 168 | 2 169 | 2 170 | 5 171 | 4 172 | 1 173 | 5 174 | 2 175 | 4 176 | 3 177 | 1 178 | 1 179 | 5 180 | 4 181 | 4 182 | 1 183 | 1 184 | 3 185 | 5 186 | 4 187 | 1 188 | 3 189 | 5 190 | 1 191 | 4 192 | 5 193 | 5 194 | 2 195 | 2 196 | 5 197 | 4 198 | 5 199 | 2 200 | 4 201 | 4 202 | 1 203 | 1 204 | 5 205 | 3 206 | 1 207 | 1 208 | 5 209 | 5 210 | 4 211 | 2 212 | 4 213 | 5 214 | 1 215 | 4 216 | 4 217 | 2 218 | 2 219 | 2 220 | 1 221 | 1 222 | 3 223 | 4 224 | 5 225 | 3 226 | 4 227 | 4 228 | 3 229 | 3 230 | 3 231 | 3 232 | 5 233 | 3 234 | 3 235 | 3 236 | 1 237 | 1 238 | 3 239 | 2 240 | 5 241 | 2 242 | 5 243 | 4 244 | 5 245 | 4 246 | 2 247 | 1 248 | 4 249 | 4 250 | 4 251 | 2 252 | 4 253 | 4 254 | 4 255 | 3 256 | 3 257 | 3 258 | 5 259 | 1 260 | 2 261 | 1 262 | 2 263 | 1 264 | 2 265 | 3 266 | 5 267 | 4 268 | 5 269 | 1 270 | 3 271 | 4 272 | 2 273 | 3 274 | 4 275 | 5 276 | 1 277 | 2 278 | 3 279 | 4 280 | 5 281 | 5 282 | 5 283 | 2 284 | 3 285 | 2 286 | 2 287 | 5 288 | 3 289 | 4 290 | 3 291 | 3 292 | 2 293 | 5 294 | 4 295 | 2 296 | 1 297 | 1 298 | 3 299 | 2 300 | 3 301 | 2 302 | 5 303 | 4 304 | 1 305 | 2 306 | 2 307 | 1 308 | 2 309 | 2 310 | 1 311 | 5 312 | 4 313 | 5 314 | 5 315 | 5 316 | 2 317 | 1 318 | 1 319 | 2 320 | 3 321 | 3 322 | 2 323 | 2 324 | 5 325 | 2 326 | 2 327 | 3 328 | 1 329 | 5 330 | 5 331 | 4 332 | 3 333 | 5 334 | 2 335 | 4 336 | 1 337 | 5 338 | 2 339 | 2 340 | 5 341 | 2 342 | 4 343 | 1 344 | 5 345 | 1 346 | 5 347 | 3 348 | 5 349 | 2 350 | 3 351 | 1 352 | 2 353 
| 4 354 | 3 355 | 3 356 | 2 357 | 3 358 | 4 359 | 2 360 | 5 361 | 2 362 | 2 363 | 5 364 | 2 365 | 4 366 | 3 367 | 1 368 | 1 369 | 4 370 | 4 371 | 3 372 | 3 373 | 1 374 | 1 375 | 5 376 | 5 377 | 4 378 | 3 379 | 3 380 | 4 381 | 5 382 | 2 383 | 5 384 | 2 385 | 5 386 | 1 387 | 1 388 | 1 389 | 4 390 | 2 391 | 2 392 | 1 393 | 4 394 | 3 395 | 5 396 | 3 397 | 3 398 | 2 399 | 5 400 | 5 401 | 3 402 | 2 403 | 1 404 | 2 405 | 4 406 | 1 407 | 2 408 | 5 409 | 1 410 | 5 411 | 4 412 | 4 413 | 5 414 | 1 415 | 3 416 | 3 417 | 2 418 | 1 419 | 4 420 | 3 421 | 3 422 | 4 423 | 3 424 | 4 425 | 5 426 | 2 427 | 5 428 | 2 429 | 1 430 | 4 431 | 5 432 | 1 433 | 5 434 | 3 435 | 5 436 | 5 437 | 5 438 | 5 439 | 3 440 | 4 441 | 2 442 | 1 443 | 5 444 | 3 445 | 2 446 | 4 447 | 4 448 | 4 449 | 1 450 | 1 451 | 4 452 | 4 453 | 2 454 | 5 455 | 1 456 | 1 457 | 4 458 | 1 459 | 3 460 | 5 461 | 1 462 | 1 463 | 4 464 | 2 465 | 2 466 | 4 467 | 2 468 | 1 469 | 3 470 | 2 471 | 1 472 | 1 473 | 3 474 | 1 475 | 3 476 | 2 477 | 5 478 | 5 479 | 2 480 | 1 481 | 4 482 | 3 483 | 3 484 | 5 485 | 3 486 | 3 487 | 3 488 | 1 489 | 3 490 | 1 491 | 5 492 | 5 493 | 4 494 | 2 495 | 2 496 | 3 497 | 1 498 | 4 499 | 3 500 | 1 501 | 1 502 | 5 503 | 4 504 | 1 505 | 3 506 | 4 507 | 4 508 | 1 509 | 3 510 | 5 511 | 5 512 | 1 513 | 1 514 | 4 515 | 4 516 | 5 517 | 4 518 | 1 519 | 2 520 | 4 521 | 5 522 | 1 523 | 4 524 | 2 525 | 5 526 | 4 527 | 5 528 | 1 529 | 2 530 | 1 531 | 2 532 | 3 533 | 5 534 | 5 535 | 4 536 | 2 537 | 4 538 | 3 539 | 4 540 | 4 541 | 2 542 | 2 543 | 4 544 | 5 545 | 4 546 | 4 547 | 2 548 | 1 549 | 5 550 | 1 551 | 2 552 | 5 553 | 1 554 | 2 555 | 3 556 | 3 557 | 1 558 | 2 559 | 3 560 | 4 561 | 2 562 | 3 563 | 4 564 | 5 565 | 4 566 | 1 567 | 2 568 | 3 569 | 2 570 | 1 571 | 2 572 | 5 573 | 1 574 | 5 575 | 1 576 | 2 577 | 1 578 | 3 579 | 5 580 | 2 581 | 5 582 | 5 583 | 1 584 | 2 585 | 3 586 | 3 587 | 4 588 | 5 589 | 5 590 | 4 591 | 3 592 | 5 593 | 2 594 | 4 595 | 5 596 | 2 597 | 5 598 | 4 599 | 4 600 | 1 601 | 3 602 | 5 603 | 2 604 | 2 605 | 4 606 | 1 607 | 2 608 | 4 609 | 3 610 | 3 611 | 4 612 | 3 613 | 2 614 | 3 615 | 1 616 | 4 617 | 3 618 | 4 619 | 3 620 | 5 621 | 2 622 | 5 623 | 4 624 | 4 625 | 4 626 | 3 627 | 2 628 | 1 629 | 5 630 | 3 631 | 1 632 | 3 633 | 2 634 | 4 635 | 3 636 | 5 637 | 5 638 | 1 639 | 2 640 | 3 641 | 4 642 | 2 643 | 5 644 | 4 645 | 1 646 | 5 647 | 3 648 | 2 649 | 5 650 | 5 651 | 3 652 | 2 653 | 4 654 | 3 655 | 4 656 | 5 657 | 4 658 | 4 659 | 4 660 | 4 661 | 1 662 | 4 663 | 5 664 | 3 665 | 2 666 | 1 667 | 2 668 | 3 669 | 1 670 | 1 671 | 2 672 | 3 673 | 2 674 | 1 675 | 5 676 | 1 677 | 5 678 | 1 679 | 4 680 | 3 681 | 3 682 | 5 683 | 3 684 | -------------------------------------------------------------------------------- /examples/ex_03_constraints.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import cplex as cplex 4 | import pprint 5 | import riskslim 6 | 7 | # data 8 | import riskslim.coefficient_set 9 | 10 | data_name = "breastcancer" # name of the data 11 | data_dir = os.getcwd() + '/examples/data/' # directory where datasets are stored 12 | data_csv_file = data_dir + data_name + '_data.csv' # csv file for the dataset 13 | sample_weights_csv_file = None # csv file of sample weights for the dataset (optional) 14 | 15 | # problem parameters 16 | max_coefficient = 5 # value of largest/smallest coefficient 17 | max_L0_value = 5 # maximum model size 18 | max_offset = 50 # maximum value of offset parameter (optional) 19 | c0_value = 1e-6 # L0-penalty 
parameter such that c0_value > 0; larger values -> sparser models; we set to a small value (1e-6) so that we get a model with max_L0_value terms 20 | w_pos = 1.00 # relative weight on examples with y = +1; w_neg = 1.00 (optional) 21 | 22 | # load data from disk 23 | data = riskslim.load_data_from_csv(dataset_csv_file = data_csv_file, sample_weights_csv_file = sample_weights_csv_file) 24 | N, P = data['X'].shape 25 | 26 | # create coefficient set and set the value of the offset parameter 27 | coef_set = riskslim.CoefficientSet(variable_names=data['variable_names'], lb=-max_coefficient, ub=max_coefficient, sign=0) 28 | coef_set.update_intercept_bounds(X = data['X'], y = data['Y'], max_offset = max_offset) 29 | 30 | # create constraint 31 | trivial_L0_max = P - np.sum(coef_set.C_0j == 0) 32 | max_L0_value = min(max_L0_value, trivial_L0_max) 33 | 34 | constraints = { 35 | 'L0_min': 0, 36 | 'L0_max': max_L0_value, 37 | 'coef_set':coef_set, 38 | } 39 | 40 | 41 | # major settings (see riskslim_ex_02_complete for full set of options) 42 | settings = { 43 | # Problem Parameters 44 | 'c0_value': c0_value, 45 | 'w_pos': w_pos, 46 | # 47 | # LCPA Settings 48 | 'max_runtime': 300.0, # max runtime for LCPA 49 | 'max_tolerance': np.finfo('float').eps, # tolerance to stop LCPA (set to 0 to return provably optimal solution) 50 | 'display_cplex_progress': True, # print CPLEX progress on screen 51 | 'loss_computation': 'normal', # how to compute the loss function ('normal','fast','lookup') 52 | # 53 | # RiskSLIM MIP settings 54 | 'drop_variables': False, 55 | # 56 | # LCPA Improvements 57 | 'round_flag': False, # round continuous solutions with SeqRd 58 | 'polish_flag': False, # polish integer feasible solutions with DCD 59 | 'chained_updates_flag': False, # use chained updates 60 | 'initialization_flag': False, # use initialization procedure 61 | 'init_max_runtime': 300.0, # max time to run CPA in initialization procedure 62 | 'add_cuts_at_heuristic_solutions': True, # add cuts at integer feasible solutions found using polishing/rounding 63 | # 64 | # CPLEX Solver Parameters 65 | 'cplex_randomseed': 0, # random seed 66 | 'cplex_mipemphasis': 0, # cplex MIP strategy 67 | } 68 | 69 | # turn on at your own risk 70 | settings['round_flag'] = False 71 | settings['polish_flag'] = False 72 | settings['chained_updates_flag'] = False 73 | settings['initialization_flag'] = False 74 | 75 | 76 | # initialize MIP for lattice CPA 77 | mip_objects = riskslim.setup_lattice_cpa(data, constraints, settings) 78 | 79 | # add operational constraints 80 | mip, indices = mip_objects['mip'], mip_objects['indices'] 81 | get_alpha_name = lambda var_name: 'alpha_' + str(data['variable_names'].index(var_name)) 82 | get_alpha_ind = lambda var_names: [get_alpha_name(v) for v in var_names] 83 | 84 | # to add a constraint like "either "CellSize" or "CellShape" 85 | # you must formulate the constraint in terms of the alpha variables 86 | # alpha[cell_size] + alpha[cell_shape] <= 1 to MIP 87 | mip.linear_constraints.add( 88 | names = ["EitherOr_CellSize_or_CellShape"], 89 | lin_expr = [cplex.SparsePair(ind = get_alpha_ind(['UniformityOfCellSize', 'UniformityOfCellShape']), 90 | val = [1.0, 1.0])], 91 | senses = "L", 92 | rhs = [1.0]) 93 | 94 | mip_objects['mip'] = mip 95 | 96 | # pass MIP back to lattice CPA so that it will solve 97 | model_info, mip_info, lcpa_info = riskslim.finish_lattice_cpa(data, constraints, mip_objects, settings) 98 | 99 | #model info contains key results 100 | pprint.pprint(model_info) 101 | 
riskslim.print_model(model_info['solution'], data) 102 | 103 | # mip_output contains information to access the MIP 104 | mip_info['risk_slim_mip'] #CPLEX mip 105 | mip_info['risk_slim_idx'] #indices of the relevant constraints 106 | 107 | # lcpa_output contains detailed information about LCPA 108 | pprint.pprint(lcpa_info) 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /riskslim/defaults.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | INTERCEPT_NAME = '(Intercept)' 4 | 5 | # Settings 6 | DEFAULT_LCPA_SETTINGS = { 7 | # 8 | 'c0_value': 1e-6, 9 | 'w_pos': 1.00, 10 | # 11 | # MIP Formulation 12 | 'drop_variables': True, #drop variables 13 | 'tight_formulation': True, #use a slightly tighter MIP formulation 14 | 'include_auxillary_variable_for_objval': True, 15 | 'include_auxillary_variable_for_L0_norm': True, 16 | # 17 | # LCPA Settings 18 | 'max_runtime': 300.0, # max runtime for LCPA 19 | 'max_tolerance': 0.000001, # tolerance to stop LCPA 20 | 'display_cplex_progress': True, # setting to True shows CPLEX progress 21 | 'loss_computation': 'normal', # type of loss computation to use ('normal','fast','lookup') 22 | 'chained_updates_flag': True, # use chained updates 23 | 'initialization_flag': False, # use initialization procedure 24 | 'initial_bound_updates': True, # update bounds before solving 25 | 'add_cuts_at_heuristic_solutions': True, #add cuts at integer feasible solutions found using polishing/rounding 26 | # 27 | # LCPA Rounding Heuristic 28 | 'round_flag': True, # round continuous solutions with SeqRd 29 | 'polish_rounded_solutions': True, # polish solutions rounded with SeqRd using DCD 30 | 'rounding_tolerance': float('inf'), # only solutions with objective value < (1 + tol) are rounded 31 | 'rounding_start_cuts': 0, # cuts needed to start using rounding heuristic 32 | 'rounding_start_gap': float('inf'), # optimality gap needed to start using rounding heuristic 33 | 'rounding_stop_cuts': 20000, # cuts needed to stop using rounding heuristic 34 | 'rounding_stop_gap': 0.2, # optimality gap needed to stop using rounding heuristic 35 | # 36 | # LCPA Polishing Heuristic 37 | 'polish_flag': True, # polish integer feasible solutions with DCD 38 | 'polishing_tolerance': 0.1, # only solutions with objective value (1 + polishing_ub_to_objval_relgap) are polished. 
setting to 39 | 'polishing_max_runtime': 10.0, # max time to run polishing each time 40 | 'polishing_max_solutions': 5.0, # max # of solutions to polish each time 41 | 'polishing_start_cuts': 0, # cuts needed to start using polishing heuristic 42 | 'polishing_start_gap': float('inf'), # min optimality gap needed to start using polishing heuristic 43 | 'polishing_stop_cuts': float('inf'), # cuts needed to stop using polishing heuristic 44 | 'polishing_stop_gap': 5.0, # max optimality gap required to stop using polishing heuristic 45 | # 46 | # Internal Parameters 47 | 'purge_loss_cuts': False, 48 | 'purge_bound_cuts': False, 49 | } 50 | 51 | DEFAULT_CPLEX_SETTINGS = { 52 | 'randomseed': 0, # random seed 53 | 'mipemphasis': 0, # cplex MIP strategy 54 | 'mipgap': np.finfo('float').eps, # 55 | 'absmipgap': np.finfo('float').eps, # 56 | 'integrality_tolerance': np.finfo('float').eps, # 57 | 'repairtries': 20, # number of tries to repair user provided solutions 58 | 'poolsize': 100, # number of feasible solutions to keep in solution pool 59 | 'poolrelgap': float('nan'), # discard if solutions 60 | 'poolreplace': 2, # solution pool 61 | 'n_cores': 1, # number of cores to use in B & B (must be 1) 62 | 'nodefilesize': (120 * 1024) / 1, # node file size 63 | } 64 | 65 | DEFAULT_CPA_SETTINGS = { 66 | # 67 | 'type': 'cvx', 68 | 'display_progress': True, # print progress of initialization procedure 69 | 'display_cplex_progress': False, # print of CPLEX during intialization procedure 70 | 'save_progress': False, # print progress of initialization procedure 71 | 'update_bounds': True, 72 | # 73 | 'max_runtime': 300.0, # max time to run CPA in initialization procedure 74 | 'max_runtime_per_iteration': 15.0, # max time per iteration of CPA 75 | # 76 | 'max_coefficient_gap': 0.49, # stopping tolerance for CPA (based on gap between consecutive solutions) 77 | 'min_iterations_before_coefficient_gap_check': 250, 78 | # 79 | 'max_iterations': 10000, # max # of cuts needed to stop CPA 80 | 'max_tolerance': 0.0001, # stopping tolerance for CPA (based on optimality gap) 81 | } 82 | 83 | DEFAULT_INITIALIZATION_SETTINGS = { 84 | 'type': 'cvx', 85 | 'use_rounding': True, # use SeqRd in initialization procedure 86 | 'rounding_max_runtime': 30.0, # max runtime for Rs in initialization procedure 87 | 'rounding_max_solutions': 5, # max solutions to round using Rd 88 | # 89 | 'use_sequential_rounding': True, # use SeqRd in initialization procedure 90 | 'sequential_rounding_max_runtime': 30.0, # max runtime for SeqRd in initialization procedure 91 | 'sequential_rounding_max_solutions': 5, # max solutions to round using SeqRd 92 | # 93 | 'polishing_after': True, # polish after rounding 94 | 'polishing_max_runtime': 30.0, # max runtime for polishing 95 | 'polishing_max_solutions': 5 # max solutions to polish 96 | } 97 | 98 | # Initialization Settings includes CPA Settings 99 | DEFAULT_INITIALIZATION_SETTINGS.update(DEFAULT_CPA_SETTINGS) 100 | 101 | # LCPA Settings includes Initialization and CPLEX settings 102 | DEFAULT_LCPA_SETTINGS.update({'init_%s' % k: v for k,v in DEFAULT_INITIALIZATION_SETTINGS.items()}) 103 | DEFAULT_LCPA_SETTINGS.update({'cplex_%s' % k: v for k,v in DEFAULT_CPLEX_SETTINGS.items()}) -------------------------------------------------------------------------------- /examples/data/breastcancer_weights.csv: -------------------------------------------------------------------------------- 1 | 1 2 | 1 3 | 1 4 | 1 5 | 1 6 | 1 7 | 1 8 | 1 9 | 1 10 | 1 11 | 1 12 | 1 13 | 1 14 | 1 15 | 1 16 | 1 17 | 1 18 | 1 19 | 
1 20 | 1 21 | 1 22 | 1 23 | 1 24 | 1 25 | 1 26 | 1 27 | 1 28 | 1 29 | 1 30 | 1 31 | 1 32 | 1 33 | 1 34 | 1 35 | 1 36 | 1 37 | 1 38 | 1 39 | 1 40 | 1 41 | 1 42 | 1 43 | 1 44 | 1 45 | 1 46 | 1 47 | 1 48 | 1 49 | 1 50 | 1 51 | 1 52 | 1 53 | 1 54 | 1 55 | 1 56 | 1 57 | 1 58 | 1 59 | 1 60 | 1 61 | 1 62 | 1 63 | 1 64 | 1 65 | 1 66 | 1 67 | 1 68 | 1 69 | 1 70 | 1 71 | 1 72 | 1 73 | 1 74 | 1 75 | 1 76 | 1 77 | 1 78 | 1 79 | 1 80 | 1 81 | 1 82 | 1 83 | 1 84 | 1 85 | 1 86 | 1 87 | 1 88 | 1 89 | 1 90 | 1 91 | 1 92 | 1 93 | 1 94 | 1 95 | 1 96 | 1 97 | 1 98 | 1 99 | 1 100 | 1 101 | 1 102 | 1 103 | 1 104 | 1 105 | 1 106 | 1 107 | 1 108 | 1 109 | 1 110 | 1 111 | 1 112 | 1 113 | 1 114 | 1 115 | 1 116 | 1 117 | 1 118 | 1 119 | 1 120 | 1 121 | 1 122 | 1 123 | 1 124 | 1 125 | 1 126 | 1 127 | 1 128 | 1 129 | 1 130 | 1 131 | 1 132 | 1 133 | 1 134 | 1 135 | 1 136 | 1 137 | 1 138 | 1 139 | 1 140 | 1 141 | 1 142 | 1 143 | 1 144 | 1 145 | 1 146 | 1 147 | 1 148 | 1 149 | 1 150 | 1 151 | 1 152 | 1 153 | 1 154 | 1 155 | 1 156 | 1 157 | 1 158 | 1 159 | 1 160 | 1 161 | 1 162 | 1 163 | 1 164 | 1 165 | 1 166 | 1 167 | 1 168 | 1 169 | 1 170 | 1 171 | 1 172 | 1 173 | 1 174 | 1 175 | 1 176 | 1 177 | 1 178 | 1 179 | 1 180 | 1 181 | 1 182 | 1 183 | 1 184 | 1 185 | 1 186 | 1 187 | 1 188 | 1 189 | 1 190 | 1 191 | 1 192 | 1 193 | 1 194 | 1 195 | 1 196 | 1 197 | 1 198 | 1 199 | 1 200 | 1 201 | 1 202 | 1 203 | 1 204 | 1 205 | 1 206 | 1 207 | 1 208 | 1 209 | 1 210 | 1 211 | 1 212 | 1 213 | 1 214 | 1 215 | 1 216 | 1 217 | 1 218 | 1 219 | 1 220 | 1 221 | 1 222 | 1 223 | 1 224 | 1 225 | 1 226 | 1 227 | 1 228 | 1 229 | 1 230 | 1 231 | 1 232 | 1 233 | 1 234 | 1 235 | 1 236 | 1 237 | 1 238 | 1 239 | 1 240 | 1 241 | 1 242 | 1 243 | 1 244 | 1 245 | 1 246 | 1 247 | 1 248 | 1 249 | 1 250 | 1 251 | 1 252 | 1 253 | 1 254 | 1 255 | 1 256 | 1 257 | 1 258 | 1 259 | 1 260 | 1 261 | 1 262 | 1 263 | 1 264 | 1 265 | 1 266 | 1 267 | 1 268 | 1 269 | 1 270 | 1 271 | 1 272 | 1 273 | 1 274 | 1 275 | 1 276 | 1 277 | 1 278 | 1 279 | 1 280 | 1 281 | 1 282 | 1 283 | 1 284 | 1 285 | 1 286 | 1 287 | 1 288 | 1 289 | 1 290 | 1 291 | 1 292 | 1 293 | 1 294 | 1 295 | 1 296 | 1 297 | 1 298 | 1 299 | 1 300 | 1 301 | 1 302 | 1 303 | 1 304 | 1 305 | 1 306 | 1 307 | 1 308 | 1 309 | 1 310 | 1 311 | 1 312 | 1 313 | 1 314 | 1 315 | 1 316 | 1 317 | 1 318 | 1 319 | 1 320 | 1 321 | 1 322 | 1 323 | 1 324 | 1 325 | 1 326 | 1 327 | 1 328 | 1 329 | 1 330 | 1 331 | 1 332 | 1 333 | 1 334 | 1 335 | 1 336 | 1 337 | 1 338 | 1 339 | 1 340 | 1 341 | 1 342 | 1 343 | 1 344 | 1 345 | 1 346 | 1 347 | 1 348 | 1 349 | 1 350 | 1 351 | 1 352 | 1 353 | 1 354 | 1 355 | 1 356 | 1 357 | 1 358 | 1 359 | 1 360 | 1 361 | 1 362 | 1 363 | 1 364 | 1 365 | 1 366 | 1 367 | 1 368 | 1 369 | 1 370 | 1 371 | 1 372 | 1 373 | 1 374 | 1 375 | 1 376 | 1 377 | 1 378 | 1 379 | 1 380 | 1 381 | 1 382 | 1 383 | 1 384 | 1 385 | 1 386 | 1 387 | 1 388 | 1 389 | 1 390 | 1 391 | 1 392 | 1 393 | 1 394 | 1 395 | 1 396 | 1 397 | 1 398 | 1 399 | 1 400 | 1 401 | 1 402 | 1 403 | 1 404 | 1 405 | 1 406 | 1 407 | 1 408 | 1 409 | 1 410 | 1 411 | 1 412 | 1 413 | 1 414 | 1 415 | 1 416 | 1 417 | 1 418 | 1 419 | 1 420 | 1 421 | 1 422 | 1 423 | 1 424 | 1 425 | 1 426 | 1 427 | 1 428 | 1 429 | 1 430 | 1 431 | 1 432 | 1 433 | 1 434 | 1 435 | 1 436 | 1 437 | 1 438 | 1 439 | 1 440 | 1 441 | 1 442 | 1 443 | 1 444 | 1 445 | 1 446 | 1 447 | 1 448 | 1 449 | 1 450 | 1 451 | 1 452 | 1 453 | 1 454 | 1 455 | 1 456 | 1 457 | 1 458 | 1 459 | 1 460 | 1 461 | 1 462 | 1 463 | 1 464 | 1 465 | 1 466 | 1 467 | 1 468 | 1 469 | 1 470 | 1 471 | 1 472 | 1 473 | 1 
474 | 1 475 | 1 476 | 1 477 | 1 478 | 1 479 | 1 480 | 1 481 | 1 482 | 1 483 | 1 484 | 1 485 | 1 486 | 1 487 | 1 488 | 1 489 | 1 490 | 1 491 | 1 492 | 1 493 | 1 494 | 1 495 | 1 496 | 1 497 | 1 498 | 1 499 | 1 500 | 1 501 | 1 502 | 1 503 | 1 504 | 1 505 | 1 506 | 1 507 | 1 508 | 1 509 | 1 510 | 1 511 | 1 512 | 1 513 | 1 514 | 1 515 | 1 516 | 1 517 | 1 518 | 1 519 | 1 520 | 1 521 | 1 522 | 1 523 | 1 524 | 1 525 | 1 526 | 1 527 | 1 528 | 1 529 | 1 530 | 1 531 | 1 532 | 1 533 | 1 534 | 1 535 | 1 536 | 1 537 | 1 538 | 1 539 | 1 540 | 1 541 | 1 542 | 1 543 | 1 544 | 1 545 | 1 546 | 1 547 | 1 548 | 1 549 | 1 550 | 1 551 | 1 552 | 1 553 | 1 554 | 1 555 | 1 556 | 1 557 | 1 558 | 1 559 | 1 560 | 1 561 | 1 562 | 1 563 | 1 564 | 1 565 | 1 566 | 1 567 | 1 568 | 1 569 | 1 570 | 1 571 | 1 572 | 1 573 | 1 574 | 1 575 | 1 576 | 1 577 | 1 578 | 1 579 | 1 580 | 1 581 | 1 582 | 1 583 | 1 584 | 1 585 | 1 586 | 1 587 | 1 588 | 1 589 | 1 590 | 1 591 | 1 592 | 1 593 | 1 594 | 1 595 | 1 596 | 1 597 | 1 598 | 1 599 | 1 600 | 1 601 | 1 602 | 1 603 | 1 604 | 1 605 | 1 606 | 1 607 | 1 608 | 1 609 | 1 610 | 1 611 | 1 612 | 1 613 | 1 614 | 1 615 | 1 616 | 1 617 | 1 618 | 1 619 | 1 620 | 1 621 | 1 622 | 1 623 | 1 624 | 1 625 | 1 626 | 1 627 | 1 628 | 1 629 | 1 630 | 1 631 | 1 632 | 1 633 | 1 634 | 1 635 | 1 636 | 1 637 | 1 638 | 1 639 | 1 640 | 1 641 | 1 642 | 1 643 | 1 644 | 1 645 | 1 646 | 1 647 | 1 648 | 1 649 | 1 650 | 1 651 | 1 652 | 1 653 | 1 654 | 1 655 | 1 656 | 1 657 | 1 658 | 1 659 | 1 660 | 1 661 | 1 662 | 1 663 | 1 664 | 1 665 | 1 666 | 1 667 | 1 668 | 1 669 | 1 670 | 1 671 | 1 672 | 1 673 | 1 674 | 1 675 | 1 676 | 1 677 | 1 678 | 1 679 | 1 680 | 1 681 | 1 682 | 1 683 | 1 -------------------------------------------------------------------------------- /examples/ex_02_advanced_options.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pprint 4 | import riskslim 5 | 6 | # data 7 | data_name = "breastcancer" # name of the data 8 | data_dir = os.getcwd() + '/examples/data/' # directory where datasets are stored 9 | data_csv_file = data_dir + data_name + '_data.csv' # csv file for the dataset 10 | sample_weights_csv_file = None # csv file of sample weights for the dataset (optional) 11 | 12 | # problem parameters 13 | max_coefficient = 5 # value of largest/smallest coefficient 14 | max_L0_value = 5 # maximum model size 15 | max_offset = 50 # maximum value of offset parameter (optional) 16 | c0_value = 1e-6 # L0-penalty parameter such that c0_value > 0; larger values -> sparser models; we set to a small value (1e-6) so that we get a model with max_L0_value terms 17 | w_pos = 1.00 # relative weight on examples with y = +1; w_neg = 1.00 (optional) 18 | 19 | # load dataset 20 | data = riskslim.load_data_from_csv(dataset_csv_file = data_csv_file, sample_weights_csv_file = sample_weights_csv_file) 21 | N, P = data['X'].shape 22 | 23 | # coefficient set 24 | coef_set = riskslim.CoefficientSet(variable_names = data['variable_names'], lb=-max_coefficient, ub=max_coefficient, sign=0) 25 | coef_set.update_intercept_bounds(X = data['X'], y = data['Y'], max_offset = max_offset) 26 | 27 | # create constraint dictionary 28 | N, P = data['X'].shape 29 | trivial_L0_max = P - np.sum(coef_set.C_0j == 0) 30 | max_L0_value = min(max_L0_value, trivial_L0_max) 31 | 32 | constraints = { 33 | 'L0_min': 0, 34 | 'L0_max': max_L0_value, 35 | 'coef_set': coef_set, 36 | } 37 | 38 | # Run RiskSLIM 39 | settings = { 40 | # 41 | 'c0_value': c0_value, 42 | 'w_pos': 
43 | # 44 | # LCPA Settings 45 | 'max_runtime': 300.0, # max runtime for LCPA 46 | 'max_tolerance': np.finfo('float').eps, # tolerance to stop LCPA (set to 0 to return provably optimal solution) 47 | 'display_cplex_progress': True, # set to True to print CPLEX progress 48 | 'loss_computation': 'lookup', # how to compute the loss function ('normal','fast','lookup') 49 | # 50 | # Other LCPA Heuristics 51 | 'chained_updates_flag': True, # use chained updates 52 | 'add_cuts_at_heuristic_solutions': True, # add cuts at integer feasible solutions found using polishing/rounding 53 | # 54 | # LCPA Rounding Heuristic 55 | 'round_flag': False, # round continuous solutions with SeqRd 56 | 'polish_rounded_solutions': True, # polish solutions rounded with SeqRd using DCD 57 | 'rounding_tolerance': float('inf'), # only solutions with objective value < (1 + tol) are rounded 58 | 'rounding_start_cuts': 0, # cuts needed to start using rounding heuristic 59 | 'rounding_start_gap': float('inf'), # optimality gap needed to start using rounding heuristic 60 | 'rounding_stop_cuts': 20000, # cuts needed to stop using rounding heuristic 61 | 'rounding_stop_gap': 0.2, # optimality gap needed to stop using rounding heuristic 62 | # 63 | # LCPA Polishing Heuristic 64 | 'polish_flag': False, # polish integer feasible solutions with DCD 65 | 'polishing_tolerance': 0.1, # only solutions with objective value < (1 + tol) are polished 66 | 'polishing_max_runtime': 10.0, # max time to run polishing each time 67 | 'polishing_max_solutions': 5.0, # max # of solutions to polish each time 68 | 'polishing_start_cuts': 0, # cuts needed to start using polishing heuristic 69 | 'polishing_start_gap': float('inf'), # min optimality gap needed to start using polishing heuristic 70 | 'polishing_stop_cuts': float('inf'), # cuts needed to stop using polishing heuristic 71 | 'polishing_stop_gap': 0.0, # max optimality gap required to stop using polishing heuristic 72 | # 73 | # Initialization Procedure 74 | 'initialization_flag': True, # use initialization procedure 75 | 'init_display_progress': True, # show progress of initialization procedure 76 | 'init_display_cplex_progress': False, # show progress of CPLEX during initialization procedure 77 | # 78 | 'init_max_runtime': 300.0, # max time to run CPA in initialization procedure 79 | 'init_max_iterations': 10000, # max # of cuts needed to stop CPA 80 | 'init_max_tolerance': 0.0001, # tolerance of solution to stop CPA 81 | 'init_max_runtime_per_iteration': 300.0, # max time per iteration of CPA 82 | 'init_max_cplex_time_per_iteration': 10.0, # max time per iteration to solve surrogate problem in CPA 83 | # 84 | 'init_use_rounding': True, # use Rd in initialization procedure 85 | 'init_rounding_max_runtime': 30.0, # max runtime for Rd in initialization procedure 86 | 'init_rounding_max_solutions': 5, # max solutions to round using Rd 87 | # 88 | 'init_use_sequential_rounding': True, # use SeqRd in initialization procedure 89 | 'init_sequential_rounding_max_runtime': 10.0, # max runtime for SeqRd in initialization procedure 90 | 'init_sequential_rounding_max_solutions': 5, # max solutions to round using SeqRd 91 | # 92 | 'init_polishing_after': True, # polish after rounding 93 | 'init_polishing_max_runtime': 30.0, # max runtime for polishing 94 | 'init_polishing_max_solutions': 5, # max solutions to polish 95 | # 96 | # CPLEX Solver Parameters 97 | 'cplex_randomseed': 0, # random seed 98 | 'cplex_mipemphasis': 0, # cplex MIP strategy 99 | } 100 | 101 | # train model using lattice_cpa 102 | model_info, mip_info, lcpa_info = riskslim.run_lattice_cpa(data, constraints, settings) 103 | 104 | # model_info contains key results 105 | pprint.pprint(model_info) 106 | riskslim.print_model(model_info['solution'], data)
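# (editor's sketch, not part of the original example) the fitted solution is a
# vector of integer coefficients; predicted risks follow from the logistic link,
# using only numpy and the variables defined above
scores = data['X'].dot(model_info['solution'])    # integer score for each training example
predicted_risk = 1.0 / (1.0 + np.exp(-scores))    # P(y = +1 | x) under the fitted model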
107 | 108 | # mip_info contains information to access the MIP 109 | mip_info['risk_slim_mip'] # CPLEX MIP object 110 | mip_info['risk_slim_idx'] # indices of the relevant constraints 111 | 112 | # lcpa_info contains detailed information about LCPA 113 | pprint.pprint(lcpa_info) 114 | 115 | 116 | 117 | 118 | 119 | --------------------------------------------------------------------------------
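The start/stop pairs in the settings above control when LCPA is allowed to run each heuristic. A minimal sketch of that gating rule (editor's illustration: heuristic_is_active, n_cuts, and gap are hypothetical names, not part of the riskslim API; only the settings keys are taken from the dictionary above):

import riskslim  # gating uses only the settings dictionary built in the example above

def heuristic_is_active(n_cuts, gap, settings, name = 'rounding'):
    # start once enough cuts have been added and the optimality gap is below the start threshold
    started = (n_cuts >= settings[name + '_start_cuts']) and (gap <= settings[name + '_start_gap'])
    # stop once too many cuts have been added or the gap is small enough that the heuristic no longer helps
    stopped = (n_cuts >= settings[name + '_stop_cuts']) or (gap <= settings[name + '_stop_gap'])
    return started and not stopped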
/riskslim/tests/test_risk_slim.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pprint 3 | 4 | import numpy as np 5 | import riskslim 6 | 7 | # Dataset Strategy 8 | # 9 | # variables: binary, real 10 | # N+: 0, 1, >1 11 | # N-: 0, 1, >1 12 | 13 | 14 | # Testing Strategy 15 | # 16 | # loss_computation normal, fast, lookup 17 | # max_coefficient 0, 1, >1 18 | # max_L0_value 0, 1, >1 19 | # max_offset 0, 1, Inf 20 | # c0_value eps, 1e-8, 0.01, C0_max 21 | # sample_weights no, yes 22 | # w_pos 1.00, < 1.00, > 1.00 23 | # initialization on, off 24 | # chained_updates on, off 25 | # polishing on, off 26 | # seq_rd on, off 27 | 28 | # data 29 | data_name = "breastcancer" # name of the data 30 | data_dir = os.getcwd() + '/examples/data/' # directory where datasets are stored 31 | data_csv_file = data_dir + data_name + '_data.csv' # csv file for the dataset 32 | sample_weights_csv_file = None # csv file of sample weights for the dataset (optional) 33 | 34 | default_settings = { 35 | # 36 | 'c0_value': 1e-6, 37 | 'w_pos': 1.00, 38 | # 39 | # LCPA Settings 40 | 'max_runtime': 300.0, # max runtime for LCPA 41 | 'max_tolerance': np.finfo('float').eps, # tolerance to stop LCPA (set to 0 to return provably optimal solution) 42 | 'display_cplex_progress': True, # set to True to print CPLEX progress 43 | 'loss_computation': 'normal', # how to compute the loss function ('normal','fast','lookup') 44 | 'tight_formulation': True, # use a tighter formulation of the surrogate MIP that provides a slightly improved relaxation 45 | # 46 | # Other LCPA Heuristics 47 | 'chained_updates_flag': True, # use chained updates 48 | 'add_cuts_at_heuristic_solutions': True, # add cuts at integer feasible solutions found using polishing/rounding 49 | # 50 | # LCPA Rounding Heuristic 51 | 'round_flag': True, # round continuous solutions with SeqRd 52 | 'polish_rounded_solutions': True, # polish solutions rounded with SeqRd using DCD 53 | 'rounding_tolerance': float('inf'), # only solutions with objective value < (1 + tol) are rounded 54 | 'rounding_start_cuts': 0, # cuts needed to start using rounding heuristic 55 | 'rounding_start_gap': float('inf'), # optimality gap needed to start using rounding heuristic 56 | 'rounding_stop_cuts': 20000, # cuts needed to stop using rounding heuristic 57 | 'rounding_stop_gap': 0.2, # optimality gap needed to stop using rounding heuristic 58 | # 59 | # LCPA Polishing Heuristic 60 | 'polish_flag': True, # polish integer feasible solutions with DCD 61 | 'polishing_tolerance': 0.1, # only solutions with objective value < (1 + tol) are polished 62 | 'polishing_max_runtime': 10.0, # max time to run polishing each time 63 | 'polishing_max_solutions': 5.0, # max # of solutions to polish each time 64 | 'polishing_start_cuts': 0, # cuts needed to start using polishing heuristic 65 | 'polishing_start_gap': float('inf'), # min optimality gap needed to start using polishing heuristic 66 | 'polishing_stop_cuts': float('inf'), # cuts needed to stop using polishing heuristic 67 | 'polishing_stop_gap': 5.0, # max optimality gap required to stop using polishing heuristic 68 | # 69 | # Initialization Procedure 70 | 'initialization_flag': False, # use initialization procedure 71 | 'init_display_progress': True, # show progress of initialization procedure 72 | 'init_display_cplex_progress': False, # show progress of CPLEX during initialization procedure 73 | # 74 | 'init_max_runtime': 300.0, # max time to run CPA in initialization procedure 75 | 'init_max_iterations': 10000, # max # of cuts needed to stop CPA 76 | 'init_max_tolerance': 0.0001, # tolerance of solution to stop CPA 77 | 'init_max_runtime_per_iteration': 300.0, # max time per iteration of CPA 78 | 'init_max_cplex_time_per_iteration': 10.0, # max time per iteration to solve surrogate problem in CPA 79 | # 80 | 'init_use_sequential_rounding': True, # use SeqRd in initialization procedure 81 | 'init_sequential_rounding_max_runtime': 30.0, # max runtime for SeqRd in initialization procedure 82 | 'init_sequential_rounding_max_solutions': 5, # max solutions to round using SeqRd 83 | 'init_polishing_after': True, # polish after rounding 84 | 'init_polishing_max_runtime': 30.0, # max runtime for polishing 85 | 'init_polishing_max_solutions': 5, # max solutions to polish 86 | # 87 | # CPLEX Solver Parameters 88 | 'cplex_randomseed': 0, # random seed 89 | 'cplex_mipemphasis': 0, # cplex MIP strategy 90 | } 91 | 92 | 93 | def test_risk_slim(data_csv_file, sample_weights_csv_file = None, max_coefficient = 5, max_L0_value = 5, max_offset = 50, c0_value = 1e-6, w_pos = 1.00, settings = None): 94 | 95 | # load dataset 96 | data = riskslim.load_data_from_csv(dataset_csv_file = data_csv_file, sample_weights_csv_file = sample_weights_csv_file) 97 | N, P = data['X'].shape 98 | 99 | # coefficient set with bounded offset 100 | coef_set = riskslim.CoefficientSet(variable_names=data['variable_names'], lb=-max_coefficient, ub=max_coefficient, sign=0) 101 | coef_set.update_intercept_bounds(X = data['X'], y = data['Y'], max_offset = max_offset, max_L0_value = max_L0_value) 102 | 103 | # create constraint dictionary 104 | trivial_L0_max = P - np.sum(coef_set.C_0j == 0) 105 | max_L0_value = min(max_L0_value, trivial_L0_max) 106 | 107 | constraints = { 108 | 'L0_min': 0, 109 | 'L0_max': max_L0_value, 110 | 'coef_set':coef_set, 111 | } 112 | 113 | # Train model using lattice_cpa 114 | model_info, mip_info, lcpa_info = riskslim.run_lattice_cpa(data, constraints, settings) 115 | 116 | # model_info contains key results 117 | pprint.pprint(model_info) 118 | 119 | # lcpa_info contains detailed information about LCPA 120 | pprint.pprint(lcpa_info) 121 | 122 | return True 123 | 124 | 125 | test_risk_slim(data_csv_file = data_csv_file, max_coefficient = 5, max_L0_value = 5, max_offset = 50, settings = default_settings) 126 | test_risk_slim(data_csv_file = data_csv_file, max_coefficient = 5, max_L0_value = 1, max_offset = 50, settings = default_settings) 127 | test_risk_slim(data_csv_file = data_csv_file, max_coefficient = 5, max_L0_value = 0, max_offset = 50, settings = default_settings) 128 | test_risk_slim(data_csv_file = data_csv_file,
max_coefficient = 5, max_L0_value = 0, max_offset = 0, settings = default_settings) 129 | 130 | 131 | -------------------------------------------------------------------------------- /riskslim/loss_functions/lookup_log_loss.pyx: -------------------------------------------------------------------------------- 1 | import cython 2 | import numpy as np 3 | cimport numpy as np 4 | cimport scipy.linalg.cython_blas as blas 5 | cimport libc.math as math 6 | 7 | DTYPE = np.float64 8 | ctypedef np.float64_t DTYPE_t 9 | 10 | #create loss_value_table for logistic loss 11 | @cython.boundscheck(False) 12 | @cython.wraparound(False) 13 | @cython.nonecheck(False) 14 | @cython.cdivision(False) 15 | def get_loss_value_table(int min_score, int max_score): 16 | 17 | cdef: 18 | int lookup_offset = -min_score 19 | np.ndarray[DTYPE_t, ndim=1, mode = "fortran"] loss_value_table = np.empty(max_score - min_score + 1, dtype = DTYPE) 20 | Py_ssize_t i = 0 21 | int s = min_score 22 | 23 | while (s < 0): 24 | loss_value_table[i] = math.log(1.0 + math.exp(s)) - s 25 | i += 1 26 | s += 1 27 | 28 | if s == 0: 29 | loss_value_table[i] = math.M_LN2 30 | i += 1 31 | s += 1 32 | 33 | while s <= max_score: 34 | loss_value_table[i] = math.log1p(math.exp(-s)) 35 | i += 1 36 | s += 1 37 | return loss_value_table, lookup_offset 38 | 39 | #create prob_value_table for logistic loss 40 | @cython.boundscheck(False) 41 | @cython.wraparound(False) 42 | @cython.nonecheck(False) 43 | @cython.cdivision(False) 44 | def get_prob_value_table(int min_score, int max_score): 45 | 46 | cdef: 47 | int lookup_offset = -min_score 48 | np.ndarray[DTYPE_t, ndim=1, mode = "fortran"] prob_value_table = np.empty(max_score - min_score + 1, dtype = DTYPE) 49 | Py_ssize_t i = 0 50 | DTYPE_t exp_value 51 | int s = min_score 52 | 53 | while (s < 0): 54 | exp_value = math.exp(s) 55 | prob_value_table[i] = (exp_value / (1.0 + exp_value)) - 1.0 56 | i += 1 57 | s += 1 58 | 59 | if (s == 0): 60 | prob_value_table[i] = -0.5 61 | i += 1 62 | s += 1 63 | 64 | while (s <= max_score): 65 | exp_value = math.exp(-s) 66 | prob_value_table[i] = (1.0 / (1.0 + exp_value)) - 1.0 67 | i += 1 68 | s += 1 69 | 70 | return prob_value_table, lookup_offset 71 | 72 | #create both loss and prob tables for logistic loss 73 | @cython.boundscheck(False) 74 | @cython.wraparound(False) 75 | @cython.nonecheck(False) 76 | @cython.cdivision(False) 77 | def get_loss_value_and_prob_tables(int min_score, int max_score): 78 | 79 | cdef: 80 | int lookup_offset = -min_score 81 | int table_size = max_score - min_score + 1 82 | np.ndarray[DTYPE_t, ndim=1, mode = "fortran"] loss_value_table = np.empty(table_size, dtype = DTYPE) 83 | np.ndarray[DTYPE_t, ndim=1, mode = "fortran"] prob_value_table = np.empty(table_size, dtype = DTYPE) 84 | Py_ssize_t i = 0 85 | DTYPE_t exp_value 86 | int s = min_score 87 | 88 | while (s < 0): 89 | exp_value = math.exp(s) 90 | loss_value_table[i] = math.log(1.0 + exp_value) - s 91 | prob_value_table[i] = (exp_value / (1.0 + exp_value)) - 1.0 92 | i += 1 93 | s += 1 94 | 95 | if (s == 0): 96 | loss_value_table[i] = math.M_LN2 97 | prob_value_table[i] = -0.5 98 | i += 1 99 | s += 1 100 | 101 | while (s <= max_score): 102 | exp_value = math.exp(-s) 103 | loss_value_table[i] = math.log1p(exp_value) 104 | prob_value_table[i] = (1.0 / (1.0 + exp_value)) - 1.0 105 | i += 1 106 | s += 1 107 | 108 | return loss_value_table, prob_value_table, lookup_offset 109 | 110 | ############################################################################################################## 
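#
# (editor's note) the three builders above precompute the logistic loss and its
# derivative for every integer score s in [min_score, max_score]; the functions
# below then evaluate the loss with O(1) table lookups instead of exp/log calls.
# illustrative use, assuming all scores are known to lie in [-10, 10]:
#
#   tbl, offset = get_loss_value_table(-10, 10)
#   loss_at_score_3 = tbl[3 + offset]   # equals log(1 + exp(-3))
#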
111 | ############################################################################################################## 112 | 113 | @cython.boundscheck(False) 114 | @cython.wraparound(False) 115 | @cython.nonecheck(False) 116 | @cython.cdivision(False) 117 | def log_loss_value(np.ndarray[DTYPE_t, ndim=2, mode="fortran"] Z, 118 | np.ndarray[DTYPE_t, ndim=1, mode="fortran"] rho, 119 | np.ndarray[DTYPE_t, ndim=1, mode="fortran"] loss_value_table, 120 | int lookup_offset): 121 | 122 | cdef: 123 | int N = Z.shape[0] 124 | int D = Z.shape[1] 125 | int incx = 1 #increments of rho 126 | int incy = 1 #increments of y 127 | double alpha = 1.0 128 | double beta = 0.0 129 | np.ndarray[DTYPE_t, ndim=1, mode = "fortran"] y = np.empty(N, dtype = DTYPE) 130 | Py_ssize_t i 131 | DTYPE_t total_loss = 0.0 132 | 133 | #get scores using dgemv, which computes: y <- alpha * Z * rho + beta * y 134 | #see also: (http://www.nag.com/numeric/fl/nagdoc_fl22/xhtml/F06/f06paf.xml) 135 | blas.dgemv("N", &N, &D, &alpha, &Z[0,0], &N, &rho[0], &incx, &beta, &y[0], &incy) 136 | 137 | #compute loss 138 | for i in range(N): 139 | total_loss += loss_value_table[<int>(y[i]) + lookup_offset] 140 | 141 | return total_loss/N 142 | 143 | @cython.boundscheck(False) 144 | @cython.wraparound(False) 145 | @cython.nonecheck(False) 146 | @cython.cdivision(False) 147 | def log_loss_value_from_scores( 148 | np.ndarray[DTYPE_t, ndim=1, mode="fortran"] scores, 149 | np.ndarray[DTYPE_t, ndim=1, mode="fortran"] loss_value_table, 150 | int lookup_offset): 151 | 152 | cdef: 153 | Py_ssize_t i 154 | Py_ssize_t N = scores.shape[0] 155 | DTYPE_t total_loss = 0.0 156 | 157 | #compute loss 158 | for i in range(N): 159 | total_loss += loss_value_table[<int>(scores[i]) + lookup_offset] 160 | 161 | return total_loss/N 162 | 163 | @cython.boundscheck(False) 164 | @cython.wraparound(False) 165 | @cython.nonecheck(False) 166 | @cython.cdivision(False) 167 | def log_loss_value_and_slope( 168 | np.ndarray[DTYPE_t, ndim=2, mode="fortran"] Z, 169 | np.ndarray[DTYPE_t, ndim=1, mode="fortran"] rho, 170 | np.ndarray[DTYPE_t, ndim=1, mode="fortran"] loss_value_table, 171 | np.ndarray[DTYPE_t, ndim=1, mode="fortran"] prob_value_table, 172 | int lookup_offset): 173 | 174 | cdef: 175 | int N = Z.shape[0] 176 | int D = Z.shape[1] 177 | int lda = N 178 | int incx = 1 #increments of rho 179 | int incy = 1 #increments of y 180 | double alpha = 1.0 181 | double beta = 0.0 182 | Py_ssize_t i 183 | int lookup_index 184 | DTYPE_t total_loss = 0.0 185 | np.ndarray[DTYPE_t, ndim=1, mode = "fortran"] y = np.empty(N, dtype = DTYPE) 186 | np.ndarray[DTYPE_t, ndim=1, mode = "fortran"] loss_slope = np.empty(D, dtype = DTYPE) 187 | 188 | #get scores using dgemv, which computes: y <- alpha * Z * rho + beta * y 189 | #see also: (http://www.nag.com/numeric/fl/nagdoc_fl22/xhtml/F06/f06paf.xml) 190 | blas.dgemv("N", &N, &D, &alpha, &Z[0,0], &lda, &rho[0], &incx, &beta, &y[0], &incy) 191 | 192 | #look up the loss value and probability for each score 193 | for i in range(N): 194 | lookup_index = <int>(y[i]) + lookup_offset 195 | total_loss += loss_value_table[lookup_index] 196 | y[i] = prob_value_table[lookup_index] 197 | 198 | #compute loss slope 199 | alpha = 1.0/N 200 | blas.dgemv("T", &N, &D, &alpha, &Z[0,0], &lda, &y[0], &incx, &beta, &loss_slope[0], &incy) 201 | 202 | return (total_loss/N), loss_slope 203 | --------------------------------------------------------------------------------
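Because RiskSLIM coefficients and (for these functions) features are integers, every score Z[i, :].dot(rho) is an integer in a known range, so the logistic loss can be tabulated once and then evaluated exactly by indexing. A pure-numpy sketch of the same idea (editor's illustration; the compiled functions above are the real implementation):

import numpy as np

def lookup_loss_sketch(scores, min_score, max_score):
    # tabulate log(1 + exp(-s)) for every integer s in [min_score, max_score],
    # written in a numerically stable form that works for either sign of s
    s = np.arange(min_score, max_score + 1)
    table = np.log1p(np.exp(-np.abs(s))) + np.maximum(-s, 0)
    offset = -min_score
    # evaluate the mean loss with one table lookup per integer score
    return table[scores.astype(int) + offset].mean()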
/riskslim/bound_tightening.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def chained_updates(bounds, C_0_nnz, new_objval_at_feasible = None, new_objval_at_relaxation = None, MAX_CHAIN_COUNT = 20): 5 | 6 | new_bounds = dict(bounds) 7 | 8 | # update objval_min using new_value (only done once) 9 | if new_objval_at_relaxation is not None: 10 | if new_bounds['objval_min'] < new_objval_at_relaxation: 11 | new_bounds['objval_min'] = new_objval_at_relaxation 12 | 13 | # update objval_max using new_value (only done once) 14 | if new_objval_at_feasible is not None: 15 | if new_bounds['objval_max'] > new_objval_at_feasible: 16 | new_bounds['objval_max'] = new_objval_at_feasible 17 | 18 | # we have already converged 19 | if new_bounds['objval_max'] <= new_bounds['objval_min']: 20 | new_bounds['objval_max'] = max(new_bounds['objval_max'], new_bounds['objval_min']) 21 | new_bounds['objval_min'] = min(new_bounds['objval_max'], new_bounds['objval_min']) 22 | new_bounds['loss_max'] = min(new_bounds['objval_max'], new_bounds['loss_max']) 23 | return new_bounds 24 | 25 | # start update chain 26 | chain_count = 0 27 | improved_bounds = True 28 | 29 | while improved_bounds and chain_count < MAX_CHAIN_COUNT: 30 | 31 | improved_bounds = False 32 | L0_penalty_min = np.sum(np.sort(C_0_nnz)[np.arange(int(new_bounds['L0_min']))]) 33 | L0_penalty_max = np.sum(-np.sort(-C_0_nnz)[np.arange(int(new_bounds['L0_max']))]) 34 | 35 | # loss_min 36 | if new_bounds['objval_min'] > L0_penalty_max: 37 | proposed_loss_min = new_bounds['objval_min'] - L0_penalty_max 38 | if proposed_loss_min > new_bounds['loss_min']: 39 | new_bounds['loss_min'] = proposed_loss_min 40 | improved_bounds = True 41 | 42 | # L0_min 43 | if new_bounds['objval_min'] > new_bounds['loss_max']: 44 | proposed_L0_min = np.ceil((new_bounds['objval_min'] - new_bounds['loss_max']) / np.min(C_0_nnz)) 45 | if proposed_L0_min > new_bounds['L0_min']: 46 | new_bounds['L0_min'] = proposed_L0_min 47 | improved_bounds = True 48 | 49 | # objval_min = max(objval_min, loss_min + L0_penalty_min) 50 | proposed_objval_min = new_bounds['loss_min'] + L0_penalty_min 51 | if proposed_objval_min > new_bounds['objval_min']: 52 | new_bounds['objval_min'] = proposed_objval_min 53 | improved_bounds = True 54 | 55 | # loss max 56 | if new_bounds['objval_max'] > L0_penalty_min: 57 | proposed_loss_max = new_bounds['objval_max'] - L0_penalty_min 58 | if proposed_loss_max < new_bounds['loss_max']: 59 | new_bounds['loss_max'] = proposed_loss_max 60 | improved_bounds = True 61 | 62 | # L0_max 63 | if new_bounds['objval_max'] > new_bounds['loss_min']: 64 | proposed_L0_max = np.floor((new_bounds['objval_max'] - new_bounds['loss_min']) / np.min(C_0_nnz)) 65 | if proposed_L0_max < new_bounds['L0_max']: 66 | new_bounds['L0_max'] = proposed_L0_max 67 | improved_bounds = True 68 | 69 | # objval_max = min(objval_max, loss_max + penalty_max) 70 | proposed_objval_max = new_bounds['loss_max'] + L0_penalty_max 71 | if proposed_objval_max < new_bounds['objval_max']: 72 | new_bounds['objval_max'] = proposed_objval_max 73 | improved_bounds = True 74 | 75 | chain_count += 1 76 | 77 | return new_bounds 78 | 79 |
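# (editor's worked example) with C_0_nnz = [0.1] * 5 and starting bounds
# {'objval_min': 0.0, 'objval_max': 0.4, 'loss_min': 0.05, 'loss_max': 0.7, 'L0_min': 0, 'L0_max': 5},
# one pass of the loop above tightens objval_min -> 0.05 (= loss_min + L0_penalty_min),
# loss_max -> 0.40 (= objval_max - L0_penalty_min), and L0_max -> 3 (= floor((0.40 - 0.05) / 0.1));
# a second pass finds no further improvement, so the chain stops.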
80 | def chained_updates_for_lp(bounds, C_0_nnz, new_objval_at_feasible = None, new_objval_at_relaxation = None, MAX_CHAIN_COUNT = 20): 81 | 82 | new_bounds = dict(bounds) 83 | 84 | # update objval_min using new_value (only done once) 85 | if new_objval_at_relaxation is not None: 86 | if new_bounds['objval_min'] < new_objval_at_relaxation: 87 | new_bounds['objval_min'] = new_objval_at_relaxation 88 | 89 | # update objval_max using new_value (only done once) 90 | if new_objval_at_feasible is not None: 91 | if new_bounds['objval_max'] > new_objval_at_feasible: 92 | new_bounds['objval_max'] = new_objval_at_feasible 93 | 94 | if new_bounds['objval_max'] <= new_bounds['objval_min']: 95 | new_bounds['objval_max'] = max(new_bounds['objval_max'], new_bounds['objval_min']) 96 | new_bounds['objval_min'] = min(new_bounds['objval_max'], new_bounds['objval_min']) 97 | new_bounds['loss_max'] = min(new_bounds['objval_max'], new_bounds['loss_max']) 98 | return new_bounds 99 | 100 | # start update chain 101 | chain_count = 0 102 | improved_bounds = True 103 | C_0_min = np.min(C_0_nnz) 104 | C_0_max = np.max(C_0_nnz) 105 | L0_penalty_min = C_0_min * new_bounds['L0_min'] 106 | L0_penalty_max = min(C_0_max * new_bounds['L0_max'], new_bounds['objval_max']) 107 | 108 | while improved_bounds and chain_count < MAX_CHAIN_COUNT: 109 | 110 | improved_bounds = False 111 | # loss_min 112 | if new_bounds['objval_min'] > L0_penalty_max: 113 | proposed_loss_min = new_bounds['objval_min'] - L0_penalty_max 114 | if proposed_loss_min > new_bounds['loss_min']: 115 | new_bounds['loss_min'] = proposed_loss_min 116 | improved_bounds = True 117 | 118 | # L0_min and L0_penalty_min 119 | if new_bounds['objval_min'] > new_bounds['loss_max']: 120 | proposed_L0_min = (new_bounds['objval_min'] - new_bounds['loss_max']) / C_0_min 121 | if proposed_L0_min > new_bounds['L0_min']: 122 | new_bounds['L0_min'] = proposed_L0_min 123 | L0_penalty_min = max(L0_penalty_min, C_0_min * proposed_L0_min) 124 | improved_bounds = True 125 | 126 | # objval_min = max(objval_min, loss_min + L0_penalty_min) 127 | proposed_objval_min = new_bounds['loss_min'] + L0_penalty_min 128 | if proposed_objval_min > new_bounds['objval_min']: 129 | new_bounds['objval_min'] = proposed_objval_min 130 | improved_bounds = True 131 | 132 | # loss max 133 | if new_bounds['objval_max'] > L0_penalty_min: 134 | proposed_loss_max = new_bounds['objval_max'] - L0_penalty_min 135 | if proposed_loss_max < new_bounds['loss_max']: 136 | new_bounds['loss_max'] = proposed_loss_max 137 | improved_bounds = True 138 | 139 | # L0_max and L0_penalty_max 140 | if new_bounds['objval_max'] > new_bounds['loss_min']: 141 | proposed_L0_max = (new_bounds['objval_max'] - new_bounds['loss_min']) / C_0_min 142 | if proposed_L0_max < new_bounds['L0_max']: 143 | new_bounds['L0_max'] = proposed_L0_max 144 | L0_penalty_max = min(L0_penalty_max, C_0_max * proposed_L0_max) 145 | improved_bounds = True 146 | 147 | # objval_max = min(objval_max, loss_max + penalty_max) 148 | proposed_objval_max = new_bounds['loss_max'] + L0_penalty_max 149 | if proposed_objval_max < new_bounds['objval_max']: 150 | new_bounds['objval_max'] = proposed_objval_max 151 | L0_penalty_max = min(L0_penalty_max, proposed_objval_max) 152 | improved_bounds = True 153 | 154 | chain_count += 1 155 | 156 | return new_bounds 157 | --------------------------------------------------------------------------------
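Both functions take and return a plain dict of bounds keyed as in the code above. A usage sketch (editor's illustration; the numeric values are made up, and log(2) is the loss of the all-zero model, used here as a safe initial upper bound):

import numpy as np
from riskslim.bound_tightening import chained_updates

bounds = {'objval_min': 0.0, 'objval_max': np.log(2), 'loss_min': 0.0, 'loss_max': np.log(2), 'L0_min': 0, 'L0_max': 5}
tightened = chained_updates(bounds, C_0_nnz = np.repeat(1e-6, 5), new_objval_at_feasible = 0.35)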
/riskslim/tests/test_loss_functions.py: -------------------------------------------------------------------------------- 1 | #noinspection 2 | import numpy as np 3 | 4 | import riskslim.loss_functions.fast_log_loss as fast 5 | import riskslim.loss_functions.log_loss as normal 6 | import riskslim.loss_functions.log_loss_weighted as weighted 7 | import riskslim.loss_functions.lookup_log_loss as lookup 8 | from riskslim.setup_functions import _setup_training_weights 9 | 10 | np.random.seed(seed = 0) 11 | 12 | #initialize data matrix X and label vector Y 13 | n_rows = 1000000 14 | n_cols = 20 15 | rho_ub = 100 16 | rho_lb = -100 17 | 18 | #helper functions 19 | def generate_binary_data(n_rows = 1000000, n_cols = 20): 20 | X = np.random.randint(low=0, high=2, size=(n_rows, n_cols)) 21 | Y = np.random.randint(low=0, high=2, size=(n_rows, 1)) 22 | pos_ind = Y == 1 23 | Y[~pos_ind] = -1 24 | return X, Y 25 | 26 | def generate_integer_model(n_cols = 20, rho_ub = 100, rho_lb = -100, sparse_pct = 0.5): 27 | rho = np.random.randint(low=rho_lb, high=rho_ub, size=n_cols) 28 | rho = np.require(rho, dtype=Z.dtype, requirements=['F']) 29 | nnz_count = int(sparse_pct * np.floor(n_cols / 2)) 30 | set_to_zero = np.random.choice(range(0, n_cols), size=nnz_count, replace=False) 31 | rho[set_to_zero] = 0.0 32 | return rho 33 | 34 | def get_score_bounds(Z_min, Z_max, rho): 35 | pos_ind = np.where(rho>0.0)[0] 36 | neg_ind = np.where(rho<0.0)[0] 37 | s_min, s_max = 0, 0 38 | 39 | for j in pos_ind: 40 | s_max += rho[j] * Z_max[j] 41 | s_min += rho[j] * Z_min[j] 42 | 43 | for j in neg_ind: 44 | s_max += rho[j] * Z_min[j] 45 | s_min += rho[j] * Z_max[j] 46 | 47 | return s_min, s_max 48 | 49 | def get_score_bounds_from_range(Z_min, Z_max, rho_lb, rho_ub, L0_max = None): 50 | "uses global variable: L0_reg_ind" 51 | edge_values = np.vstack([Z_min * rho_lb, 52 | Z_max * rho_lb, 53 | Z_min * rho_ub, 54 | Z_max * rho_ub]) 55 | 56 | if L0_max is None or L0_max == Z_min.shape[0]: 57 | s_min = np.sum(np.min(edge_values, axis = 0)) 58 | s_max = np.sum(np.max(edge_values, axis = 0)) 59 | else: 60 | min_values = np.min(edge_values, axis = 0) 61 | s_min_reg = np.sum(np.sort(min_values[L0_reg_ind])[0:L0_max]) 62 | s_min_no_reg = np.sum(min_values[~L0_reg_ind]) 63 | s_min = s_min_reg + s_min_no_reg 64 | 65 | max_values = np.max(edge_values, axis = 0) 66 | s_max_reg = np.sum(-np.sort(-max_values[L0_reg_ind])[0:L0_max]) 67 | s_max_no_reg = np.sum(max_values[~L0_reg_ind]) 68 | s_max = s_max_reg + s_max_no_reg 69 | 70 | return s_min, s_max 71 | 72 | 73 | #generate data 74 | X, Y = generate_binary_data(n_rows, n_cols) 75 | Z = X * Y 76 | Z = np.require(Z, requirements=['F'], dtype=np.float64) 77 | rho = generate_integer_model(n_cols, rho_ub, rho_lb) 78 | L0_reg_ind = np.ones(n_cols, dtype='bool') 79 | L0_reg_ind[0] = False 80 | Z_min = np.min(Z, axis = 0) 81 | Z_max = np.max(Z, axis = 0) 82 | 83 | #setup weights 84 | weights = _setup_training_weights(Y, w_pos = 1.0, w_neg = 1.0, w_total_target = 2.0) 85 | 86 | #create lookup table 87 | min_score, max_score = get_score_bounds_from_range(Z_min, Z_max, rho_lb, rho_ub, L0_max = n_cols) 88 | loss_value_tbl, prob_value_tbl, loss_tbl_offset = lookup.get_loss_value_and_prob_tables(min_score, max_score) 89 | loss_tbl_offset = int(loss_tbl_offset) 90 | 91 | #assert correctness of log_loss_value_from_scores 92 | for s in range(int(min_score), int(max_score)+1): 93 | normal_value = normal.log_loss_value_from_scores(np.array(s, dtype = Z.dtype, ndmin = 1)) 94 | cython_value = fast.log_loss_value_from_scores(np.array(s, dtype = Z.dtype, ndmin = 1)) 95 | table_value = loss_value_tbl[s+loss_tbl_offset] 96 | lookup_value = lookup.log_loss_value_from_scores(np.array(s,dtype = Z.dtype, ndmin = 1), loss_value_tbl, loss_tbl_offset) 97 | assert(np.isclose(normal_value, cython_value, rtol = 1e-06)) 98 | assert(np.isclose(table_value, cython_value, rtol = 1e-06)) 99 | assert(np.isclose(table_value, normal_value, rtol = 1e-06)) 100 | assert(np.equal(table_value, lookup_value)) 101 | 102 |
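# (editor's note) the loop above checks that all four implementations agree pointwise;
# e.g. at s = 0 every variant must return log(2) ~= 0.693147, and the table value is
# exact by construction because the lookup table was built from the same formula.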
103 | #python implementations need to be 'C' (row-major) aligned instead of 'F' (Fortran) aligned 104 | Z_py = np.require(Z, requirements = ['C']) 105 | rho_py = np.require(rho, requirements = ['C']) 106 | scores_py = Z_py.dot(rho_py) 107 | 108 | #define tests 109 | def normal_value_test(): return normal.log_loss_value(Z_py, rho_py) 110 | def fast_value_test(): return fast.log_loss_value(Z, rho) 111 | def lookup_value_test(): return lookup.log_loss_value(Z, rho, loss_value_tbl, loss_tbl_offset) 112 | 113 | def normal_cut_test(): return normal.log_loss_value_and_slope(Z_py, rho_py) 114 | def fast_cut_test(): return fast.log_loss_value_and_slope(Z, rho) 115 | def lookup_cut_test(): return lookup.log_loss_value_and_slope(Z, rho, loss_value_tbl, prob_value_tbl, loss_tbl_offset) 116 | 117 | # def dynamic_lookup_value_test(): 118 | # s_min_dynamic, s_max_dynamic = get_score_bounds(Z_min, Z_max, rho) 119 | # tbl, offset = lookup.get_loss_value_table(s_min_dynamic, s_max_dynamic) 120 | # return lookup.log_loss_value(Z, rho, tbl, offset) 121 | 122 | #check values and cuts 123 | normal_cut = normal_cut_test() 124 | cython_cut = fast_cut_test() 125 | lookup_cut = lookup_cut_test() 126 | assert(np.isclose(fast_value_test(), lookup_value_test())) 127 | assert(np.isclose(normal_cut[0], cython_cut[0])) 128 | assert(np.isclose(lookup_cut[0], cython_cut[0])) 129 | assert(all(np.isclose(normal_cut[1], cython_cut[1]))) 130 | assert(all(np.isclose(lookup_cut[1], cython_cut[1]))) 131 | print("passed cut tests") 132 | 133 | 134 | #weighted tests 135 | def weighted_value_test(weights): return weighted.log_loss_value(Z_py, weights, np.sum(weights), rho_py) 136 | def weighted_cut_test(weights): return weighted.log_loss_value_and_slope(Z_py, weights, np.sum(weights), rho_py) 137 | def weighted_scores_test(weights): return weighted.log_loss_value_from_scores(weights, np.sum(weights), scores_py) 138 | 139 | 140 | #w_pos = w_neg = 1.0 141 | weights = _setup_training_weights(Y, w_pos = 1.0, w_neg = 1.0, w_total_target = 2.0) 142 | 143 | weights_match_unit_weights = all(weights == 1.0) 144 | 145 | if weights_match_unit_weights: 146 | print("tests for match between normal and weighted loss functions") 147 | #value 148 | assert(np.isclose(normal_value_test(), weighted_value_test(weights))) 149 | assert(np.isclose(normal_value_test(), weighted_scores_test(weights))) 150 | 151 | #cut 152 | normal_cut = normal_cut_test() 153 | weighted_cut = weighted_cut_test(weights) 154 | assert(np.isclose(normal_cut[0], weighted_cut[0])) 155 | assert(all(np.isclose(normal_cut[1], weighted_cut[1]))) 156 | 157 | print("passed all tests for weighted implementations when w_pos = w_neg = 1.0") 158 | 159 | 160 | #random w_pos, w_neg = 1.0 161 | w_pos = 0.5 + np.random.rand() 162 | w_neg = 1.0 163 | weights = _setup_training_weights(Y, w_pos = w_pos, w_neg = w_neg, w_total_target = 2.0) 164 | weighted_value = weighted_value_test(weights) 165 | weighted_cut = weighted_cut_test(weights) 166 | weighted_value_from_scores = weighted_scores_test(weights) 167 | 168 | assert(np.isclose(weighted_value, weighted_value_from_scores)) 169 | assert(np.isclose(weighted_value, weighted_cut[0])) 170 | print("passed all tests for weighted loss functions when w_pos = %1.2f and w_neg = %1.2f" % (w_pos, w_neg)) 171 | 172 | 173 | # print 'timing for loss value computation \n' 174 | # %timeit -n 20 normal_value = normal_value_test() 175 | # %timeit -n 20 cython_value = fast_value_test() 176 | # %timeit -n 20 lookup_value =
lookup_value_test() 177 | # 178 | # print 'timing for loss cut computation \n' 179 | # %timeit -n 20 normal_cut = normal_cut_test() 180 | # %timeit -n 20 cython_cut = fast_cut_test() 181 | # %timeit -n 20 lookup_cut = lookup_cut_test() 182 | 183 | 184 | -------------------------------------------------------------------------------- /batch/train_risk_slim.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | This file is to train a RiskSLIM model in a batch computing environment 5 | It parses command line arguments, and can be called as: 6 | 7 | python train_risk_slim.py --data="${data_file}" --results="${results_file}" 8 | 9 | where: 10 | 11 | data_file csv file containing the training data 12 | results_file file name for the save file; needs to be unique and not already exist on disk 13 | 14 | Use "python train_risk_slim.py --help" for a description of additional arguments. 15 | 16 | Copyright (C) 2017 Berk Ustun 17 | """ 18 | import os 19 | import sys 20 | import time 21 | import argparse 22 | import logging 23 | import pickle 24 | import json 25 | import numpy as np 26 | 27 | # add the source directory to search path to avoid module import errors if riskslim has not been installed 28 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 29 | from riskslim.utils import load_data_from_csv, setup_logging 30 | from riskslim.coefficient_set import CoefficientSet 31 | from riskslim.lattice_cpa import run_lattice_cpa, DEFAULT_LCPA_SETTINGS 32 | 33 | # uncomment for debugging 34 | 35 | # TODO: run the following when building 36 | # with open(settings_json, 'w') as outfile: 37 | # json.dump(DEFAULT_LCPA_SETTINGS, outfile, sort_keys = False, indent=4) 38 | 39 | def setup_parser(): 40 | """ 41 | Create an argparse Parser object for RiskSLIM command line arguments. 42 | This object determines all command line arguments, handles input 43 | validation and default values. 44 | 45 | See https://docs.python.org/3/library/argparse.html for configuration 46 | """ 47 | 48 | #parser helper functions 49 | def is_positive_integer(value): 50 | parsed_value = int(value) 51 | if parsed_value <= 0: 52 | raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value) 53 | return parsed_value 54 | 55 | def is_positive_float(value): 56 | parsed_value = float(value) 57 | if parsed_value <= 0.0: 58 | raise argparse.ArgumentTypeError("%s must be a positive value" % value) 59 | return parsed_value 60 | 61 | def is_negative_one_or_positive_integer(value): 62 | parsed_value = int(value) 63 | if not (parsed_value == -1 or parsed_value >= 1): 64 | raise argparse.ArgumentTypeError("%s is an invalid value (must be -1 or >=1)" % value) 65 | else: 66 | return parsed_value 67 | 68 | def is_file_on_disk(file_name): 69 | if not os.path.isfile(file_name): 70 | raise argparse.ArgumentTypeError("the file %s does not exist!" 
% file_name) 71 | else: 72 | return file_name 73 | 74 | def is_file_not_on_disk(file_name): 75 | if os.path.isfile(file_name): 76 | raise argparse.ArgumentTypeError("the file %s already exists on disk" % file_name) 77 | else: 78 | return file_name 79 | 80 | def is_valid_fold(value): 81 | parsed_value = int(value) 82 | if parsed_value < 0: 83 | raise argparse.ArgumentTypeError("%s must be a positive integer" % value) 84 | return parsed_value 85 | 86 | parser = argparse.ArgumentParser( 87 | prog='train_risk_slim', 88 | description='Train a RiskSLIM classifier from the command shell', 89 | epilog='Copyright (C) 2017 Berk Ustun', 90 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 91 | ) 92 | 93 | parser.add_argument('--data', 94 | type=str, 95 | required=True, 96 | help='csv file with training data') 97 | 98 | parser.add_argument('--results', 99 | type=str, 100 | required=True, 101 | help='name of results file (must not already exist)') 102 | 103 | parser.add_argument('--cvindices', 104 | type=is_file_on_disk, 105 | help='csv file with indices for K-fold CV') 106 | 107 | parser.add_argument('--fold', 108 | type=is_valid_fold, 109 | default=0, 110 | help='index of test fold; set as 0 to use all data for training') 111 | 112 | parser.add_argument('--weights', 113 | type=is_file_on_disk, 114 | help='csv file with non-negative weights for each point') 115 | 116 | parser.add_argument('--settings', 117 | type=is_file_on_disk, 118 | help='JSON file with additional settings for LCPA') 119 | 120 | parser.add_argument('--timelimit', 121 | type=is_negative_one_or_positive_integer, 122 | default=300, 123 | help='time limit on training (in seconds); set as -1 for no time limit') 124 | 125 | parser.add_argument('--max_size', 126 | type = is_negative_one_or_positive_integer, 127 | default=-1, 128 | help='maximum number of non-zero coefficients; set as -1 for no limit') 129 | 130 | parser.add_argument('--max_coef', 131 | type=is_positive_integer, 132 | default=5, 133 | help='value of upper and lower bounds for any coefficient') 134 | 135 | parser.add_argument('--max_offset', 136 | type=is_negative_one_or_positive_integer, 137 | default=-1, 138 | help='value of upper and lower bound on offset parameter; set as -1 to use a conservative value') 139 | 140 | parser.add_argument('--c0_value', 141 | type=is_positive_float, 142 | default=1e-6, 143 | help='l0 regularization parameter; set as a positive number between 0.00 and log(2)') 144 | 145 | parser.add_argument('--w_pos', 146 | type=is_positive_float, 147 | default=1.00, 148 | help='w_pos') 149 | 150 | parser.add_argument('--log', 151 | type=str, 152 | help='name of the log file') 153 | 154 | parser.add_argument('--silent', 155 | action='store_true', 156 | help='flag to suppress logging to stderr') 157 | 158 | return parser 159 | 160 | if __name__ == '__main__': 161 | 162 | parser = setup_parser() 163 | parsed = parser.parse_args() 164 | parsed_dict = vars(parsed) 165 | parsed_string = [key + ' : ' + str(parsed_dict[key]) + '\n' for key in parsed_dict] 166 | parsed_string.sort() 167 | 168 | # setup logging 169 | logger = logging.getLogger() 170 | logger = setup_logging(logger, log_to_console =(not parsed.silent), log_file = parsed.log) 171 | logger.setLevel(logging.INFO) 172 | logger.info("running 'train_risk_slim.py'") 173 | logger.info("working directory: %r" % os.getcwd()) 174 | logger.info("parsed the following variables:\n-%s" % '-'.join(parsed_string)) 175 | 176 | # check results_file does not exist 177 | if os.path.isfile(parsed.results): 178 | 
logger.error("results file %s already exists" % parsed.results) 179 | logger.error("either delete %s or choose a different name" % parsed.results) 180 | sys.exit(1) 181 | 182 | # check settings_json exists / or use default settings 183 | settings = dict(DEFAULT_LCPA_SETTINGS) 184 | if parsed.settings is not None: 185 | with open(parsed.settings) as json_file: 186 | loaded_settings = json.load(json_file) 187 | loaded_settings = {str(key): loaded_settings[key] for key in loaded_settings if key in settings} 188 | settings.update(loaded_settings) 189 | 190 | #overwrite parameters specified by the user 191 | settings['max_runtime'] = float('inf') if parsed.timelimit == -1 else parsed.timelimit 192 | settings['c0_value'] = parsed.c0_value 193 | settings['w_pos'] = parsed.w_pos 194 | 195 | # check if sample weights file was specified, if not set as None 196 | logger.info("loading data and sample weights") 197 | 198 | data = load_data_from_csv(dataset_csv_file = parsed.data, 199 | sample_weights_csv_file = parsed.weights, 200 | fold_csv_file = parsed.cvindices, 201 | fold_num = parsed.fold) 202 | N, P = data['X'].shape 203 | 204 | # initialize coefficient set and offset parameter 205 | logger.info("creating coefficient set and constraints") 206 | max_coefficient = parsed.max_coef 207 | max_model_size = parsed.max_size if parsed.max_size >= 0 else float('inf') 208 | max_offset = parsed.max_offset if parsed.max_offset >= 0 else float('inf') 209 | 210 | coef_set = CoefficientSet(variable_names = data['variable_names'], 211 | lb = -max_coefficient, 212 | ub = max_coefficient, 213 | sign = 0) 214 | coef_set.update_intercept_bounds(X = data['X'], y = data['Y'], max_offset = max_offset, max_L0_value = max_model_size) 215 | 216 | #print coefficient set 217 | if not parsed.silent: 218 | print(coef_set) 219 | 220 | constraints = { 221 | 'L0_min': 0, 222 | 'L0_max': max_model_size, 223 | 'coef_set': coef_set, 224 | } 225 | 226 | # fit RiskSLIM model using Lattice Cutting Plane Algorithm 227 | model_info, mip_info, lcpa_info = run_lattice_cpa(data, constraints, settings) 228 | 229 | # save output to disk 230 | results = { 231 | "date": time.strftime("%d/%m/%y", time.localtime()), 232 | "data_file": parsed.data, 233 | "fold_file": parsed.cvindices, 234 | "fold_num": parsed.fold, 235 | "results_file": parsed.results, 236 | } 237 | results.update(model_info) 238 | 239 | coef_set = results.pop('coef_set') 240 | results['coef_set_ub'] = coef_set.ub 241 | results['coef_set_lb'] = coef_set.lb 242 | results['coef_set_signs'] = coef_set.sign 243 | results['coef_set_c0'] = coef_set.c0 244 | 245 | logger.info("saving results...") 246 | with open(parsed.results, 'wb') as outfile: 247 | pickle.dump(results, outfile, protocol=pickle.HIGHEST_PROTOCOL) 248 | 249 | logger.info("saved results as pickle file: %r" % parsed.results) 250 | logger.info('''to access results, use this snippet: 251 | 252 | \t\t\t import pickle 253 | \t\t\t f = open(results_file, 'rb') 254 | \t\t\t results = pickle.load(f) 255 | ''' 256 | ) 257 | logger.info("finished training") 258 | logger.info("quitting\n\n") 259 | sys.exit(0) 260 | -------------------------------------------------------------------------------- /riskslim/solution_pool.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import prettytable as pt 3 | 4 | class SolutionPool(object): 5 | """ 6 | Helper class used to store solutions to the risk slim optimization problem 7 | """ 8 | 9 | def __init__(self, obj): 10 | 11 | if
isinstance(obj, SolutionPool): 12 | 13 | self._P = obj.P 14 | self._objvals = obj.objvals 15 | self._solutions = obj.solutions 16 | 17 | elif isinstance(obj, int): 18 | 19 | assert obj >= 1 20 | self._P = int(obj) 21 | self._objvals = np.empty(0) 22 | self._solutions = np.empty(shape = (0, self._P)) 23 | 24 | elif isinstance(obj, dict): 25 | 26 | assert len(obj) == 2 27 | objvals = np.copy(obj['objvals']).flatten().astype(dtype = np.float_) 28 | solutions = np.copy(obj['solutions']) 29 | n = objvals.size 30 | if solutions.ndim == 2: 31 | assert n in solutions.shape 32 | if solutions.shape[1] == n and solutions.shape[0] != n: 33 | solutions = np.transpose(solutions) 34 | elif solutions.ndim == 1: 35 | assert n == 1 36 | solutions = np.reshape(solutions, (1, solutions.size)) 37 | else: 38 | raise ValueError('solutions has more than 2 dimensions') 39 | 40 | self._P = solutions.shape[1] 41 | self._objvals = objvals 42 | self._solutions = solutions 43 | 44 | else: 45 | raise ValueError('cannot initialize SolutionPool using %s object' % type(obj)) 46 | 47 | 48 | def __len__(self): 49 | return len(self._objvals) 50 | 51 | 52 | @staticmethod 53 | def solution_string(solution, float_fmt = '%1.3f'): 54 | solution_string = '' 55 | for j in range(len(solution)): 56 | if SolutionPool.is_integral(solution[j]): 57 | solution_string += ' ' + str(int(solution[j])) 58 | else: 59 | solution_string += ((' ' + float_fmt) % solution[j]) 60 | return solution_string 61 | 62 | 63 | def table(self): 64 | x = pt.PrettyTable(align = 'r', float_format = '1.3', hrules = pt.ALL) 65 | x.add_column("objval", self._objvals.tolist()) 66 | x.add_column("solution", list(map(self.solution_string, self._solutions))) 67 | return str(x) 68 | 69 | 70 | def __repr__(self): 71 | return self.table() 72 | 73 | 74 | def __str__(self): 75 | return self.table() 76 | 77 | 78 | def copy(self): 79 | return SolutionPool(self) 80 | 81 | 82 | @property 83 | def P(self): 84 | return int(self._P) 85 | 86 | 87 | @property 88 | def objvals(self): 89 | return self._objvals 90 | 91 | 92 | @property 93 | def solutions(self): 94 | return self._solutions 95 | 96 | 97 | @objvals.setter 98 | def objvals(self, objvals): 99 | if hasattr(objvals, "__len__"): 100 | if len(objvals) > 0: 101 | self._objvals = np.copy(list(objvals)).flatten().astype(dtype = np.float_) 102 | elif len(objvals) == 0: 103 | self._objvals = np.empty(0) 104 | else: 105 | self._objvals = float(objvals) 106 | 107 | 108 | @solutions.setter 109 | def solutions(self, solutions): 110 | if solutions.ndim == 2: 111 | assert self._P in solutions.shape 112 | if solutions.shape[0] == self._P and solutions.shape[1] != self._P: 113 | solutions = np.transpose(solutions) 114 | elif solutions.ndim == 1: 115 | solutions = np.reshape(solutions, (1, solutions.size)) 116 | else: 117 | raise ValueError('incorrect solution dimensions') 118 | 119 | self._solutions = np.copy(solutions) 120 | 121 | 122 | def append(self, pool): 123 | if len(pool) == 0: 124 | return self 125 | else: 126 | return self.add(pool.objvals, pool.solutions) 127 | 128 | 129 | def add(self, objvals, solutions): 130 | 131 | if isinstance(objvals, np.ndarray) or isinstance(objvals, list): 132 | n = len(objvals) 133 | if n == 0: 134 | return self 135 | if isinstance(solutions, np.ndarray): 136 | if solutions.ndim == 2: 137 | assert n in solutions.shape 138 | assert self._P in solutions.shape 139 | if solutions.shape[0] == self._P and solutions.shape[1] != self._P: 140 | solutions = np.transpose(solutions) 141 | elif solutions.ndim == 1: 
142 | assert n == 1 143 | solutions = np.reshape(solutions, (1, solutions.size)) 144 | else: 145 | raise ValueError('incorrect solution dimensions') 146 | elif isinstance(solutions, list): 147 | solutions = np.array(solutions) 148 | assert solutions.shape[0] == n 149 | assert solutions.shape[1] == self._P 150 | else: 151 | raise TypeError('incorrect solution type') 152 | else: 153 | objvals = float(objvals) #also assertion 154 | solutions = np.reshape(solutions, (1, self._P)) 155 | 156 | self._objvals = np.append(self._objvals, objvals) 157 | self._solutions = np.append(self._solutions, solutions, axis = 0) 158 | return self 159 | 160 | 161 | def filter(self, filter_ind): 162 | idx = np.require(filter_ind, dtype = 'bool').flatten() 163 | if len(self) > 0 and any(idx == 0): 164 | self._objvals = self._objvals[idx] 165 | self._solutions = self._solutions[idx, :] 166 | return self 167 | 168 | 169 | def distinct(self): 170 | if len(self) > 0: 171 | _, idx = np.unique(self._solutions, return_index = True, axis = 0) 172 | self._objvals = self._objvals[idx] 173 | self._solutions = self._solutions[idx, :] 174 | return self 175 | 176 | 177 | def sort(self): 178 | if len(self) > 0: 179 | idx = np.argsort(self._objvals) 180 | self._objvals = self._objvals[idx] 181 | self._solutions = self._solutions[idx, :] 182 | return self 183 | 184 | 185 | def map(self, mapfun, target = 'all'): 186 | assert callable(mapfun), 'map function must be callable' 187 | if target == 'solutions': 188 | return list(map(mapfun, self.solutions)) 189 | elif target == 'objvals': 190 | return list(map(mapfun, self.objvals)) 191 | elif target == 'all': 192 | return list(map(mapfun, self.objvals, self.solutions)) 193 | else: 194 | raise ValueError('target must be either solutions, objvals, or all') 195 | 196 | 197 | @staticmethod 198 | def is_integral(solution): 199 | return np.all(solution == np.require(solution, dtype = 'int_')) 200 | 201 | 202 | def remove_nonintegral(self): 203 | return self.filter(list(map(self.is_integral, self.solutions))) 204 | 205 | 206 | def compute_objvals(self, get_objval): 207 | compute_idx = np.flatnonzero(np.isnan(self._objvals)) 208 | self._objvals[compute_idx] = np.array(list(map(get_objval, self._solutions[compute_idx, :]))) 209 | return self 210 | 211 | 212 | def remove_suboptimal(self, objval_cutoff): 213 | return self.filter(self.objvals <= objval_cutoff) 214 | 215 | 216 | def remove_infeasible(self, is_feasible): 217 | return self.filter(list(map(is_feasible, self.solutions))) 218 | 219 | 220 | class FastSolutionPool(object): 221 | """ 222 | Helper class used to store solutions to the risk slim optimization problem 223 | FastSolutionPool is designed to work faster than SolutionPool.
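    Illustrative usage (editor's sketch; P is the number of coefficients):

        pool = FastSolutionPool(P = 3)
        pool.add(0.42, np.array([1.0, 0.0, 2.0]))
        objval, solution = pool.get_best_objval_and_solution()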
224 | It is primarily used by the callback functions in risk_slim 225 | """ 226 | 227 | def __init__(self, P): 228 | self._P = int(P) 229 | self._objvals = np.empty(shape = 0) 230 | self._solutions = np.empty(shape = (0, P)) 231 | 232 | 233 | def __len__(self): 234 | return len(self._objvals) 235 | 236 | @property 237 | def P(self): 238 | return self._P 239 | 240 | @property 241 | def objvals(self): 242 | return self._objvals 243 | 244 | @property 245 | def solutions(self): 246 | return self._solutions 247 | 248 | 249 | def add(self, new_objvals, new_solutions): 250 | if isinstance(new_objvals, (np.ndarray, list)): 251 | n = len(new_objvals) 252 | self._objvals = np.append(self._objvals, np.array(new_objvals).astype(dtype = np.float_).flatten()) 253 | else: 254 | n = 1 255 | self._objvals = np.append(self._objvals, float(new_objvals)) 256 | 257 | new_solutions = np.reshape(new_solutions, (n, self._P)) 258 | self._solutions = np.append(self._solutions, new_solutions, axis = 0) 259 | 260 | 261 | def get_best_objval_and_solution(self): 262 | if len(self) > 0: 263 | idx = np.argmin(self._objvals) 264 | return float(self._objvals[idx]), np.copy(self._solutions[idx,]) 265 | else: 266 | return np.empty(shape = 0), np.empty(shape = (0, self.P)) 267 | 268 | 269 | def filter_sort_unique(self, max_objval = float('inf')): 270 | 271 | # filter 272 | if max_objval < float('inf'): 273 | good_idx = np.less_equal(self._objvals, max_objval) 274 | self._objvals = self._objvals[good_idx] 275 | self._solutions = self._solutions[good_idx,] 276 | 277 | if len(self._objvals) >= 2: 278 | _, unique_idx = np.unique(self._solutions, axis = 0, return_index = True) 279 | self._objvals = self._objvals[unique_idx] 280 | self._solutions = self._solutions[unique_idx,] 281 | 282 | if len(self._objvals) >= 2: 283 | sort_idx = np.argsort(self._objvals) 284 | self._objvals = self._objvals[sort_idx] 285 | self._solutions = self._solutions[sort_idx,] 286 | 287 | return self 288 | 289 | 290 | def clear(self): 291 | self._objvals = np.empty(shape = 0) 292 | self._solutions = np.empty(shape = (0, self._P)) 293 | return self 294 | 295 | 296 | def table(self): 297 | x = pt.PrettyTable(align = 'r', float_format = '1.4', hrules=pt.ALL) 298 | x.add_column("objval", self._objvals.tolist()) 299 | x.add_column("solution", list(map(self.solution_string, self._solutions))) 300 | return str(x) 301 | 302 | @staticmethod 303 | def solution_string(solution): 304 | solution_string = '' 305 | for j in range(len(solution)): 306 | if SolutionPool.is_integral(solution[j]): 307 | solution_string += ' ' + str(int(solution[j])) 308 | else: 309 | solution_string += (' %1.4f' % solution[j]) 310 | return solution_string 311 | 312 | def __repr__(self): 313 | return self.table() 314 | 315 | 316 | def __str__(self): 317 | return self.table() -------------------------------------------------------------------------------- /riskslim/setup_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .coefficient_set import CoefficientSet, get_score_bounds 3 | from .utils import print_log 4 | 5 | 6 | def setup_loss_functions(data, coef_set, L0_max = None, loss_computation = None, w_pos = 1.0): 7 | """ 8 | 9 | Parameters 10 | ---------- 11 | data 12 | coef_set 13 | L0_max 14 | loss_computation 15 | w_pos 16 | 17 | Returns 18 | ------- 19 | 20 | """ 21 | #todo check if fast/lookup loss is installed 22 | assert loss_computation in [None, 'weighted', 'normal', 'fast', 'lookup'] 23 | 24 | Z = data['X'] 
* data['Y'] 25 | 26 | if 'sample_weights' in data: 27 | sample_weights = _setup_training_weights(Y = data['Y'], sample_weights = data['sample_weights'], w_pos = w_pos) 28 | use_weighted = not np.all(np.equal(sample_weights, 1.0)) 29 | else: 30 | use_weighted = False 31 | 32 | integer_data_flag = np.all(Z == np.require(Z, dtype = np.int_)) 33 | use_lookup_table = isinstance(coef_set, CoefficientSet) and integer_data_flag 34 | if use_weighted: 35 | final_loss_computation = 'weighted' 36 | elif use_lookup_table: 37 | final_loss_computation = 'lookup' 38 | else: 39 | final_loss_computation = 'fast' 40 | 41 | if final_loss_computation != loss_computation: 42 | print_log("switching loss computation from %s to %s" % (loss_computation, final_loss_computation)) 43 | 44 | if final_loss_computation == 'weighted': 45 | 46 | from riskslim.loss_functions.log_loss_weighted import \ 47 | log_loss_value, \ 48 | log_loss_value_and_slope, \ 49 | log_loss_value_from_scores 50 | 51 | Z = np.require(Z, requirements = ['C']) 52 | total_sample_weights = np.sum(sample_weights) 53 | 54 | compute_loss = lambda rho: log_loss_value(Z, sample_weights, total_sample_weights, rho) 55 | compute_loss_cut = lambda rho: log_loss_value_and_slope(Z, sample_weights, total_sample_weights, rho) 56 | compute_loss_from_scores = lambda scores: log_loss_value_from_scores(sample_weights, total_sample_weights, scores) 57 | 58 | elif final_loss_computation == 'normal': 59 | 60 | from riskslim.loss_functions.log_loss import \ 61 | log_loss_value, \ 62 | log_loss_value_and_slope, \ 63 | log_loss_value_from_scores 64 | 65 | Z = np.require(Z, requirements=['C']) 66 | compute_loss = lambda rho: log_loss_value(Z, rho) 67 | compute_loss_cut = lambda rho: log_loss_value_and_slope(Z, rho) 68 | compute_loss_from_scores = lambda scores: log_loss_value_from_scores(scores) 69 | 70 | elif final_loss_computation == 'fast': 71 | 72 | from riskslim.loss_functions.fast_log_loss import \ 73 | log_loss_value, \ 74 | log_loss_value_and_slope, \ 75 | log_loss_value_from_scores 76 | 77 | Z = np.require(Z, requirements=['F']) 78 | compute_loss = lambda rho: log_loss_value(Z, rho) 79 | compute_loss_cut = lambda rho: log_loss_value_and_slope(Z, rho) 80 | compute_loss_from_scores = lambda scores: log_loss_value_from_scores(scores) 81 | 82 | elif final_loss_computation == 'lookup': 83 | 84 | from riskslim.loss_functions.lookup_log_loss import \ 85 | get_loss_value_and_prob_tables, \ 86 | log_loss_value, \ 87 | log_loss_value_and_slope, \ 88 | log_loss_value_from_scores 89 | 90 | s_min, s_max = get_score_bounds(Z_min = np.min(Z, axis=0), 91 | Z_max = np.max(Z, axis=0), 92 | rho_lb = coef_set.lb, 93 | rho_ub = coef_set.ub, 94 | L0_reg_ind = np.array(coef_set.c0) == 0.0, 95 | L0_max = L0_max) 96 | 97 | 98 | Z = np.require(Z, requirements=['F'], dtype = float) 99 | print_log("%d rows in lookup table" % (s_max - s_min + 1)) 100 | 101 | loss_value_tbl, prob_value_tbl, tbl_offset = get_loss_value_and_prob_tables(s_min, s_max) 102 | compute_loss = lambda rho: log_loss_value(Z, rho, loss_value_tbl, tbl_offset) 103 | compute_loss_cut = lambda rho: log_loss_value_and_slope(Z, rho, loss_value_tbl, prob_value_tbl, tbl_offset) 104 | compute_loss_from_scores = lambda scores: log_loss_value_from_scores(scores, loss_value_tbl, tbl_offset) 105 | 106 | # real loss functions 107 | if final_loss_computation == 'lookup': 108 | 109 | from riskslim.loss_functions.fast_log_loss import \ 110 | log_loss_value as loss_value_real, \ 111 | log_loss_value_and_slope as loss_value_and_slope_real,\ 
112 | log_loss_value_from_scores as loss_value_from_scores_real 113 | 114 | compute_loss_real = lambda rho: loss_value_real(Z, rho) 115 | compute_loss_cut_real = lambda rho: loss_value_and_slope_real(Z, rho) 116 | compute_loss_from_scores_real = lambda scores: loss_value_from_scores_real(scores) 117 | 118 | else: 119 | 120 | compute_loss_real = compute_loss 121 | compute_loss_cut_real = compute_loss_cut 122 | compute_loss_from_scores_real = compute_loss_from_scores 123 | 124 | return (Z, 125 | compute_loss, 126 | compute_loss_cut, 127 | compute_loss_from_scores, 128 | compute_loss_real, 129 | compute_loss_cut_real, 130 | compute_loss_from_scores_real) 131 | 132 | 133 | def _setup_training_weights(Y, sample_weights = None, w_pos = 1.0, w_neg = 1.0, w_total_target = 2.0): 134 | 135 | """ 136 | Parameters 137 | ---------- 138 | Y - N x 1 vector with Y = -1,+1 139 | sample_weights - N x 1 vector 140 | w_pos - positive scalar showing relative weight on examples where Y = +1 141 | w_neg - positive scalar showing relative weight on examples where Y = -1 142 | 143 | Returns 144 | ------- 145 | a vector of N training weights for all points in the training data 146 | 147 | """ 148 | 149 | # todo: throw warning if there is no positive/negative point in Y 150 | 151 | # process class weights 152 | assert w_pos > 0.0, 'w_pos must be strictly positive' 153 | assert w_neg > 0.0, 'w_neg must be strictly positive' 154 | assert np.isfinite(w_pos), 'w_pos must be finite' 155 | assert np.isfinite(w_neg), 'w_neg must be finite' 156 | w_total = w_pos + w_neg 157 | w_pos = w_total_target * (w_pos / w_total) 158 | w_neg = w_total_target * (w_neg / w_total) 159 | 160 | # process case weights 161 | Y = Y.flatten() 162 | N = len(Y) 163 | pos_ind = Y == 1 164 | 165 | if sample_weights is None: 166 | training_weights = np.ones(N) 167 | else: 168 | training_weights = sample_weights.flatten() 169 | assert len(training_weights) == N 170 | assert np.all(training_weights >= 0.0) 171 | #todo: throw warning if any training weights = 0 172 | #todo: throw warning if there are no effective positive/negative points in Y 173 | 174 | # normalization 175 | training_weights = N * (training_weights / sum(training_weights)) 176 | training_weights[pos_ind] *= w_pos 177 | training_weights[~pos_ind] *= w_neg 178 | 179 | return training_weights 180 | 181 | 182 | def setup_penalty_parameters(coef_set, c0_value = 1e-6): 183 | """ 184 | 185 | Parameters 186 | ---------- 187 | coef_set 188 | c0_value 189 | 190 | Returns 191 | ------- 192 | c0_value 193 | C_0 194 | L0_reg_ind 195 | C_0_nnz 196 | """ 197 | assert isinstance(coef_set, CoefficientSet) 198 | assert c0_value > 0.0, 'default L0_parameter should be positive' 199 | c0_value = float(c0_value) 200 | C_0 = np.array(coef_set.c0) 201 | L0_reg_ind = np.isnan(C_0) 202 | C_0[L0_reg_ind] = c0_value 203 | C_0_nnz = C_0[L0_reg_ind] 204 | return c0_value, C_0, L0_reg_ind, C_0_nnz 205 | 206 | 207 | def setup_objective_functions(compute_loss, L0_reg_ind, C_0_nnz): 208 | 209 | get_objval = lambda rho: compute_loss(rho) + np.sum(C_0_nnz * (rho[L0_reg_ind] != 0.0)) 210 | get_L0_norm = lambda rho: np.count_nonzero(rho[L0_reg_ind]) 211 | get_L0_penalty = lambda rho: np.sum(C_0_nnz * (rho[L0_reg_ind] != 0.0)) 212 | get_alpha = lambda rho: np.array(abs(rho[L0_reg_ind]) > 0.0, dtype = np.float_) 213 | get_L0_penalty_from_alpha = lambda alpha: np.sum(C_0_nnz * alpha) 214 | 215 | return (get_objval, get_L0_norm, get_L0_penalty, get_alpha, get_L0_penalty_from_alpha) 216 | 217 | 218 | def get_loss_bounds(Z, 
rho_ub, rho_lb, L0_reg_ind, L0_max = float('nan')):
219 | # min value of loss = log(1+exp(-score)) occurs at max score for each point
220 | # max value of loss = log(1+exp(-score)) occurs at min score for each point
221 | 
222 | rho_lb = np.array(rho_lb)
223 | rho_ub = np.array(rho_ub)
224 | 
225 | # get maximum number of regularized coefficients
226 | L0_max = Z.shape[1] if np.isnan(L0_max) else L0_max
227 | num_max_reg_coefs = min(L0_max, sum(L0_reg_ind))
228 | 
229 | # calculate the smallest and largest score that can be attained by each point
230 | scores_at_lb = Z * rho_lb
231 | scores_at_ub = Z * rho_ub
232 | max_scores_matrix = np.maximum(scores_at_ub, scores_at_lb)
233 | min_scores_matrix = np.minimum(scores_at_ub, scores_at_lb)
234 | assert (np.all(max_scores_matrix >= min_scores_matrix))
235 | 
236 | # for each example, compute max sum of scores from top reg coefficients
237 | max_scores_reg = max_scores_matrix[:, L0_reg_ind]
238 | max_scores_reg = -np.sort(-max_scores_reg, axis=1)
239 | max_scores_reg = max_scores_reg[:, 0:num_max_reg_coefs]
240 | max_score_reg = np.sum(max_scores_reg, axis=1)
241 | 
242 | # for each example, compute max sum of scores from no reg coefficients
243 | max_scores_no_reg = max_scores_matrix[:, ~L0_reg_ind]
244 | max_score_no_reg = np.sum(max_scores_no_reg, axis=1)
245 | 
246 | # max score for each example
247 | max_score = max_score_reg + max_score_no_reg
248 | 
249 | # for each example, compute min sum of scores from top reg coefficients
250 | min_scores_reg = min_scores_matrix[:, L0_reg_ind]
251 | min_scores_reg = np.sort(min_scores_reg, axis=1)
252 | min_scores_reg = min_scores_reg[:, 0:num_max_reg_coefs]
253 | min_score_reg = np.sum(min_scores_reg, axis=1)
254 | 
255 | # for each example, compute min sum of scores from no reg coefficients
256 | min_scores_no_reg = min_scores_matrix[:, ~L0_reg_ind]
257 | min_score_no_reg = np.sum(min_scores_no_reg, axis=1)
258 | 
259 | min_score = min_score_reg + min_score_no_reg
260 | assert (np.all(max_score >= min_score))
261 | 
262 | # compute min loss
263 | idx = max_score > 0
264 | min_loss = np.empty_like(max_score)
265 | min_loss[idx] = np.log1p(np.exp(-max_score[idx]))
266 | min_loss[~idx] = np.log1p(np.exp(max_score[~idx])) - max_score[~idx]
267 | min_loss = min_loss.mean()
268 | 
269 | # compute max loss
270 | idx = min_score > 0
271 | max_loss = np.empty_like(min_score)
272 | max_loss[idx] = np.log1p(np.exp(-min_score[idx]))
273 | max_loss[~idx] = np.log1p(np.exp(min_score[~idx])) - min_score[~idx]
274 | max_loss = max_loss.mean()
275 | 
276 | return min_loss, max_loss
277 | 
--------------------------------------------------------------------------------
/riskslim/heuristics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | #todo: finish specifications
4 | #todo: add input checking (with ability to turn off)
5 | #todo: Cython implementation
6 | 
7 | def sequential_rounding(rho, Z, C_0, compute_loss_from_scores_real, get_L0_penalty, objval_cutoff = float('Inf')):
8 | """
9 | sequentially rounds each coefficient of rho to the floor/ceiling value that yields the best objective value
10 | Parameters
11 | ----------
12 | rho: P x 1 vector of continuous coefficients
13 | Z: N x P data matrix computed as X * Y
14 | C_0: P x 1 vector of L0 penalties. C_0[j] = L0 penalty for rho[j] for j = 0,..., P-1.
15 | compute_loss_from_scores_real: function handle to compute loss using N x 1 vector of scores, where scores = Z.dot(rho)
16 | get_L0_penalty: function handle to compute L0_penalty from rho
17 | objval_cutoff: objective value used for early stopping.
18 | the procedure stops if the objective value of an intermediate solution exceeds objval_cutoff
19 | 
20 | Returns
21 | -------
22 | 
23 | rho: P x 1 vector of integer coefficients (if early_stop_flag = False; partially rounded otherwise)
24 | best_objval: objective value achieved by rho
25 | early_stop_flag: True if the procedure was stopped early (in which case rho is not integer feasible)
26 | 
27 | """
28 | 
29 | assert callable(compute_loss_from_scores_real)
30 | assert callable(get_L0_penalty)
31 | 
32 | P = rho.shape[0]
33 | 
34 | rho_floor = np.floor(rho)
35 | floor_is_zero = np.equal(rho_floor, 0)
36 | dist_from_start_to_floor = rho_floor - rho
37 | 
38 | rho_ceil = np.ceil(rho)
39 | ceil_is_zero = np.equal(rho_ceil, 0)
40 | dist_from_start_to_ceil = rho_ceil - rho
41 | 
42 | dimensions_to_round = np.flatnonzero(np.not_equal(rho_floor, rho_ceil)).tolist()
43 | 
44 | scores = Z.dot(rho)
45 | best_objval = compute_loss_from_scores_real(scores) + get_L0_penalty(rho)
46 | while len(dimensions_to_round) > 0 and best_objval < objval_cutoff:
47 | 
48 | objvals_at_floor = np.repeat(np.nan, P)
49 | objvals_at_ceil = np.repeat(np.nan, P)
50 | current_penalty = get_L0_penalty(rho)
51 | 
52 | for idx in dimensions_to_round:
53 | 
54 | # scores go from center to ceil -> center + dist_from_start_to_ceil
55 | Z_dim = Z[:, idx]
56 | base_scores = scores + dist_from_start_to_ceil[idx] * Z_dim
57 | objvals_at_ceil[idx] = compute_loss_from_scores_real(base_scores)
58 | 
59 | # move from ceil to floor => -1*Z_j
60 | base_scores -= Z_dim
61 | objvals_at_floor[idx] = compute_loss_from_scores_real(base_scores)
62 | 
63 | if ceil_is_zero[idx]:
64 | objvals_at_ceil[idx] -= C_0[idx]
65 | elif floor_is_zero[idx]:
66 | objvals_at_floor[idx] -= C_0[idx]
67 | 
68 | 
69 | # adjust for penalty value
70 | objvals_at_ceil += current_penalty
71 | objvals_at_floor += current_penalty
72 | best_objval_at_ceil = np.nanmin(objvals_at_ceil)
73 | best_objval_at_floor = np.nanmin(objvals_at_floor)
74 | 
75 | if best_objval_at_ceil <= best_objval_at_floor:
76 | best_objval = best_objval_at_ceil
77 | best_dim = np.nanargmin(objvals_at_ceil)
78 | rho[best_dim] += dist_from_start_to_ceil[best_dim]
79 | scores += dist_from_start_to_ceil[best_dim] * Z[:, best_dim]
80 | else:
81 | best_objval = best_objval_at_floor
82 | best_dim = np.nanargmin(objvals_at_floor)
83 | rho[best_dim] += dist_from_start_to_floor[best_dim]
84 | scores += dist_from_start_to_floor[best_dim] * Z[:, best_dim]
85 | 
86 | dimensions_to_round.remove(best_dim)
87 | #assert(np.all(np.isclose(scores, Z.dot(rho))))
88 | 
89 | early_stop_flag = best_objval > objval_cutoff
90 | return rho, best_objval, early_stop_flag
91 | 
92 | 
93 | def discrete_descent(rho, Z, C_0, rho_ub, rho_lb, get_L0_penalty, compute_loss_from_scores, descent_dimensions = None, active_set_flag = True):
94 | 
95 | """
96 | Given an initial feasible solution, rho, produces an improved solution that is 1-OPT
97 | (i.e., the objective value cannot be decreased by moving in any single dimension).
98 | At each iteration, the algorithm moves in the dimension that yields the greatest decrease in objective value.
99 | The best step size in each dimension is computed using a directional search strategy that saves computation.
100 | 
101 | Parameters
102 | ----------
103 | rho: P x 1 vector of coefficients for the initial feasible solution (cast to integers on entry)
104 | Z: N x P data matrix computed as X * Y
105 | C_0: P x 1 vector of L0 penalties. C_0[j] = L0 penalty for rho[j] for j = 0,..., P-1.
106 | rho_ub: P x 1 vector of upper bounds on the coefficients
107 | rho_lb: P x 1 vector of lower bounds on the coefficients
108 | compute_loss_from_scores: function handle to compute loss using N x 1 vector of scores, where scores = Z.dot(rho)
109 | get_L0_penalty: function handle to compute L0_penalty from rho
110 | descent_dimensions: list of dimensions to search over (all P dimensions by default)
111 | 
112 | Returns
113 | -------
114 | rho: P x 1 vector of integer coefficients that is 1-OPT
115 | base_loss: loss value achieved by rho
116 | base_objval: objective value achieved by rho
117 | 
118 | """
119 | assert callable(compute_loss_from_scores)
120 | assert callable(get_L0_penalty)
121 | 
122 | # initialize key variables
123 | MAX_ITERATIONS = 500
124 | MIN_IMPROVEMENT_PER_STEP = float(1e-8)
125 | P = len(rho)
126 | 
127 | # convert solution to integer
128 | rho = np.require(np.require(rho, dtype = np.int_), dtype = np.float_)
129 | 
130 | # convert descent dimensions to integer values
131 | if descent_dimensions is None:
132 | descent_dimensions = np.arange(P)
133 | else:
134 | descent_dimensions = np.require(descent_dimensions, dtype = np.int_)
135 | 
136 | if active_set_flag:
137 | descent_dimensions = np.intersect1d(np.flatnonzero(rho), descent_dimensions)
138 | 
139 | descent_dimensions = descent_dimensions.tolist()
140 | 
141 | base_scores = Z.dot(rho)
142 | base_loss = compute_loss_from_scores(base_scores)
143 | base_objval = base_loss + get_L0_penalty(rho)
144 | n_iterations = 0
145 | 
146 | coefficient_values = {k: np.arange(int(rho_lb[k]), int(rho_ub[k]) + 1) for k in descent_dimensions}
147 | search_dimensions = descent_dimensions
148 | while n_iterations < MAX_ITERATIONS and len(search_dimensions) > 0:
149 | 
150 | # compute the best objective value / step size in each dimension
151 | best_objval_by_dim = np.repeat(np.nan, P)
152 | best_coef_by_dim = np.repeat(np.nan, P)
153 | 
154 | for k in search_dimensions:
155 | 
156 | dim_objvals = _compute_objvals_at_dim(base_rho = rho,
157 | base_scores = base_scores,
158 | base_loss = base_loss,
159 | dim_idx = k,
160 | dim_coefs = coefficient_values[k],
161 | Z = Z,
162 | C_0 = C_0,
163 | compute_loss_from_scores = compute_loss_from_scores)
164 | 
165 | # record the best objective value and coefficient value in this dimension
166 | best_dim_idx = np.nanargmin(dim_objvals)
167 | best_objval_by_dim[k] = dim_objvals[best_dim_idx]
168 | best_coef_by_dim[k] = coefficient_values[k][best_dim_idx]
169 | 
170 | # recompute base objective value/loss/scores
171 | best_idx = np.nanargmin(best_objval_by_dim)
172 | next_objval = best_objval_by_dim[best_idx]
173 | threshold_objval = base_objval - MIN_IMPROVEMENT_PER_STEP
174 | 
175 | if next_objval >= threshold_objval:
176 | break
177 | 
178 | best_step = best_coef_by_dim[best_idx] - rho[best_idx]
179 | rho[best_idx] += best_step
180 | base_objval = next_objval
181 | base_loss = base_objval - get_L0_penalty(rho)
182 | base_scores = base_scores + (best_step * Z[:, best_idx])
183 | 
184 | # remove the current best direction from the set of directions to explore
185 | search_dimensions = list(descent_dimensions)
186 | search_dimensions.remove(best_idx)
187 | n_iterations += 1
188 | 
189 | return rho, base_loss, base_objval
190 | 
191 | 
192 | def _compute_objvals_at_dim(Z, C_0, base_rho, base_scores, base_loss, dim_coefs, dim_idx, compute_loss_from_scores):
193 | 
194 | """
195 | finds the value of rho[j] in dim_coefs that minimizes log_loss(rho) + C_0j
196 | 
197 | Parameters
198 | ----------
199 | Z: N x P data matrix computed as X * Y
200 | C_0: P x 1 vector of L0 penalties
201 | base_rho: P x 1 vector of coefficients at the current solution
202 | base_scores: N x 1 vector of scores at base_rho, i.e. Z.dot(base_rho)
203 | base_loss: loss value at base_rho
204 | dim_coefs: feasible coefficient values for dimension dim_idx
205 | dim_idx: index of the dimension to search over
206 | compute_loss_from_scores: function handle to compute loss using N x 1 vector of scores
207 | 
208 | Returns
209 | -------
210 | objval_at_coef_values: objective value attained at each coefficient value in dim_coefs (NaN where not evaluated)
211 | """
212 | 
213 | # copy stuff because ctypes
214 | scores = np.copy(base_scores)
215 | 
216 | # initialize parameters
217 | P = base_rho.shape[0] 218 | base_coef_value = base_rho[dim_idx] 219 | base_index = np.flatnonzero(dim_coefs == base_coef_value) 220 | loss_at_coef_value = np.repeat(np.nan, len(dim_coefs)) 221 | loss_at_coef_value[base_index] = float(base_loss) 222 | Z_dim = Z[:, dim_idx] 223 | 224 | # start by moving forward 225 | forward_indices = np.flatnonzero(base_coef_value <= dim_coefs) 226 | forward_step_sizes = np.diff(dim_coefs[forward_indices] - base_coef_value) 227 | n_forward_steps = len(forward_step_sizes) 228 | stop_after_first_forward_step = False 229 | 230 | best_loss = base_loss 231 | total_distance_from_base = 0 232 | 233 | for i in range(n_forward_steps): 234 | scores += forward_step_sizes[i] * Z_dim 235 | total_distance_from_base += forward_step_sizes[i] 236 | current_loss = compute_loss_from_scores(scores) 237 | if current_loss >= best_loss: 238 | stop_after_first_forward_step = i == 0 239 | break 240 | loss_at_coef_value[forward_indices[i + 1]] = current_loss 241 | best_loss = current_loss 242 | 243 | # if the first step forward didn't lead to a decrease in loss, then move backwards 244 | move_backward = stop_after_first_forward_step or n_forward_steps == 0 245 | 246 | if move_backward: 247 | 248 | # compute backward steps 249 | backward_indices = np.flipud(np.where(dim_coefs <= base_coef_value)[0]) 250 | backward_step_sizes = np.diff(dim_coefs[backward_indices] - base_coef_value) 251 | n_backward_steps = len(backward_step_sizes) 252 | 253 | # correct size of first backward step if you took 1 step forward 254 | if n_backward_steps > 0 and n_forward_steps > 0: 255 | backward_step_sizes[0] = backward_step_sizes[0] - forward_step_sizes[0] 256 | 257 | best_loss = base_loss 258 | 259 | for i in range(n_backward_steps): 260 | scores += backward_step_sizes[i] * Z_dim 261 | total_distance_from_base += backward_step_sizes[i] 262 | current_loss = compute_loss_from_scores(scores) 263 | if current_loss >= best_loss: 264 | break 265 | loss_at_coef_value[backward_indices[i + 1]] = current_loss 266 | best_loss = current_loss 267 | 268 | # at this point scores == base_scores + step_distance*Z_dim 269 | # assert(all(np.isclose(scores, base_scores + total_distance_from_base * Z_dim))) 270 | 271 | # compute objective values by adding penalty values to all other indices 272 | other_dim_idx = np.flatnonzero(dim_idx != np.arange(P)) 273 | other_dim_penalty = np.sum(C_0[other_dim_idx] * (base_rho[other_dim_idx] != 0)) 274 | objval_at_coef_values = loss_at_coef_value + other_dim_penalty 275 | 276 | if C_0[dim_idx] > 0.0: 277 | 278 | # increase objective value at every non-zero coefficient value by C_0j 279 | nonzero_coef_idx = np.flatnonzero(dim_coefs) 280 | objval_at_coef_values[nonzero_coef_idx] = objval_at_coef_values[nonzero_coef_idx] + C_0[dim_idx] 281 | 282 | # compute value at coef[j] == 0 if needed 283 | zero_coef_idx = np.flatnonzero(dim_coefs == 0) 284 | if np.isnan(objval_at_coef_values[zero_coef_idx]): 285 | # steps_from_here_to_zero: step_from_here_to_base + step_from_base_to_zero 286 | # steps_from_here_to_zero: -step_from_base_to_here + -step_from_zero_to_base 287 | steps_to_zero = -(base_coef_value + total_distance_from_base) 288 | scores += steps_to_zero * Z_dim 289 | objval_at_coef_values[zero_coef_idx] = compute_loss_from_scores(scores) + other_dim_penalty 290 | # assert(all(np.isclose(scores, base_scores - base_coef_value * Z_dim))) 291 | 292 | # return objective value at feasible coefficients 293 | return objval_at_coef_values 294 | 295 | 296 | 
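# ----------------------------------------------------------------------------
# Editor's note: a minimal usage sketch for the two heuristics above; it is
# not part of the package. It assumes `Z`, `C_0`, `rho_lb`, `rho_ub`,
# `get_L0_penalty`, and `compute_loss_from_scores` were already built (e.g.,
# via riskslim.setup_functions) and that `rho_lp` holds a continuous solution;
# all of these names are illustrative.
#
# rho_int, objval, stopped_early = sequential_rounding(
#         rho = np.copy(rho_lp),   # copy: sequential_rounding mutates rho in place
#         Z = Z,
#         C_0 = C_0,
#         compute_loss_from_scores_real = compute_loss_from_scores,
#         get_L0_penalty = get_L0_penalty)
#
# if not stopped_early:
#     # polish the rounded solution until it is 1-OPT
#     rho_int, loss, objval = discrete_descent(
#             rho = rho_int,
#             Z = Z,
#             C_0 = C_0,
#             rho_ub = rho_ub,
#             rho_lb = rho_lb,
#             get_L0_penalty = get_L0_penalty,
#             compute_loss_from_scores = compute_loss_from_scores)
# ----------------------------------------------------------------------------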
--------------------------------------------------------------------------------
/riskslim/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from pathlib import Path
4 | import time
5 | import warnings
6 | import numpy as np
7 | import pandas as pd
8 | import prettytable as pt
9 | from .defaults import INTERCEPT_NAME
10 | 
11 | # DATA
12 | def load_data_from_csv(dataset_csv_file, sample_weights_csv_file = None, fold_csv_file = None, fold_num = 0):
13 | """
14 | 
15 | Parameters
16 | ----------
17 | dataset_csv_file csv file containing the training data
18 | see examples/data/breastcancer_data.csv for an example
19 | training data stored as a table with N+1 rows and d+1 columns
20 | column 1 is the outcome variable; entries must be (-1,1) or (0,1)
21 | column 2 to d+1 are the d input variables
22 | row 1 contains unique names for the outcome variable and the input variables
23 | 
24 | sample_weights_csv_file csv file containing sample weights for the training data
25 | weights stored as a table with N rows and 1 column
26 | all sample weights must be non-negative
27 | 
28 | fold_csv_file csv file containing indices of folds for K-fold cross validation
29 | fold indices stored as a table with N rows and 1 column
30 | folds must be integers between 1 and K
31 | if fold_csv_file is None, then we do not use folds
32 | 
33 | fold_num int between 0 and K, where K is set by the fold_csv_file
34 | let fold_idx be the N x 1 index vector listed in fold_csv_file
35 | samples where fold_idx == fold_num will be used to test
36 | samples where fold_idx != fold_num will be used to train the model
37 | fold_num = 0 means use "all" of the training data (since all values of fold_idx \in [1,K])
38 | if fold_csv_file is None, then fold_num is set to 0
39 | 
40 | 
41 | Returns
42 | -------
43 | dictionary containing training data for a binary classification problem with the fields:
44 | 
45 | - 'X' N x P matrix of features (numpy.ndarray) with a column of 1s for the INTERCEPT_NAME
46 | - 'Y' N x 1 vector of labels (+1/-1) (numpy.ndarray)
47 | - 'variable_names' list of strings containing the names of each feature (list)
48 | - 'outcome_name' string containing the name of the outcome variable (optional)
49 | - 'sample_weights' N x 1 vector of sample weights, must all be positive
50 | 
51 | """
52 | dataset_csv_file = Path(dataset_csv_file)
53 | if not dataset_csv_file.exists():
54 | raise IOError('could not find dataset_csv_file: %s' % dataset_csv_file)
55 | 
56 | df = pd.read_csv(dataset_csv_file, sep = ',')
57 | 
58 | raw_data = df.to_numpy()
59 | data_headers = list(df.columns.values)
60 | N = raw_data.shape[0]
61 | 
62 | # setup Y vector and Y_name
63 | Y_col_idx = [0]
64 | Y = raw_data[:, Y_col_idx]
65 | Y_name = data_headers[Y_col_idx[0]]
66 | Y[Y == 0] = -1
67 | 
68 | # setup X and X_names
69 | X_col_idx = [j for j in range(raw_data.shape[1]) if j not in Y_col_idx]
70 | X = raw_data[:, X_col_idx]
71 | variable_names = [data_headers[j] for j in X_col_idx]
72 | 
73 | # insert a column of ones to X for the intercept
74 | X = np.insert(arr=X, obj=0, values=np.ones(N), axis=1)
75 | variable_names.insert(0, INTERCEPT_NAME)
76 | 
77 | 
78 | if sample_weights_csv_file is None:
79 | sample_weights = np.ones(N)
80 | else:
81 | sample_weights_csv_file = Path(sample_weights_csv_file)
82 | if not sample_weights_csv_file.exists():
83 | raise IOError('could not find sample_weights_csv_file: %s' % sample_weights_csv_file)
84 | sample_weights = pd.read_csv(sample_weights_csv_file, sep=',', header=None)
85 | sample_weights = sample_weights.to_numpy()
86 | 
87 | data = {
88 | 'X': X,
89 | 'Y': Y,
90 | 'variable_names': variable_names,
91 | 'outcome_name': Y_name,
92 | 'sample_weights': sample_weights,
93 | }
94 | 
95 | #load folds
96 | if fold_csv_file is not None:
97 | fold_csv_file = Path(fold_csv_file)
98 | if not fold_csv_file.exists():
99 | raise IOError('could not find fold_csv_file: %s' % fold_csv_file)
100 | 
101 | fold_idx = pd.read_csv(fold_csv_file, sep=',', header=None)
102 | fold_idx = fold_idx.values.flatten()
103 | K = max(fold_idx)
104 | all_fold_nums = np.sort(np.unique(fold_idx))
105 | assert len(fold_idx) == N, "dimension mismatch: read %r fold indices (expected N = %r)" % (len(fold_idx), N)
106 | assert np.all(all_fold_nums == np.arange(1, K+1)), "folds should contain indices between 1 and %r" % K
107 | assert fold_num in np.arange(0, K+1), "fold_num should either be 0 or an integer between 1 and %r" % K
108 | if fold_num >= 1:
109 | #test_idx = fold_num == fold_idx
110 | train_idx = fold_num != fold_idx
111 | data['X'] = data['X'][train_idx, :]
112 | data['Y'] = data['Y'][train_idx]
113 | data['sample_weights'] = data['sample_weights'][train_idx]
114 | 
115 | assert check_data(data)
116 | return data
117 | 
118 | 
119 | def check_data(data):
120 | """
121 | makes sure that 'data' contains training data that is suitable for binary classification problems
122 | and throws an AssertionError if any of the checks below fail
123 | 
124 | 'data' is a dictionary that must contain:
125 | 
126 | - 'X' N x P matrix of features (numpy.ndarray) with a column of 1s for the INTERCEPT_NAME
127 | - 'Y' N x 1 vector of labels (+1/-1) (numpy.ndarray)
128 | - 'variable_names' list of strings containing the names of each feature (list)
129 | 
130 | data can also contain:
131 | 
132 | - 'outcome_name' string containing the name of the output (optional)
133 | - 'sample_weights' N x 1 vector of sample weights, must all be positive
134 | 
135 | Returns
136 | -------
137 | True if data passes checks
138 | 
139 | """
140 | # type checks
141 | assert type(data) is dict, "data should be a dict"
142 | 
143 | assert 'X' in data, "data should contain X matrix"
144 | assert type(data['X']) is np.ndarray, "type(X) should be numpy.ndarray"
145 | 
146 | assert 'Y' in data, "data should contain Y vector"
147 | assert type(data['Y']) is np.ndarray, "type(Y) should be numpy.ndarray"
148 | 
149 | assert 'variable_names' in data, "data should contain variable_names"
150 | assert type(data['variable_names']) is list, "variable_names should be a list"
151 | 
152 | X = data['X']
153 | Y = data['Y']
154 | variable_names = data['variable_names']
155 | 
156 | if 'outcome_name' in data:
157 | assert type(data['outcome_name']) is str, "outcome_name should be a str"
158 | 
159 | # sizes and uniqueness
160 | N, P = X.shape
161 | assert N > 0, 'X matrix must have at least 1 row'
162 | assert P > 0, 'X matrix must have at least 1 column'
163 | assert len(Y) == N, 'dimension mismatch. Y must contain as many entries as X. Need len(Y) = N.'
164 | assert len(list(set(data['variable_names']))) == len(data['variable_names']), 'variable_names is not unique'
165 | assert len(data['variable_names']) == P, 'len(variable_names) should be same as # of cols in X'
166 | 
167 | # feature matrix
168 | assert np.all(~np.isnan(X)), 'X has nan entries'
169 | assert np.all(~np.isinf(X)), 'X has inf entries'
170 | 
171 | # offset in feature matrix
172 | if INTERCEPT_NAME in variable_names:
173 | assert all(X[:, variable_names.index(INTERCEPT_NAME)] == 1.0), "'%s' column should only be composed of 1s" % INTERCEPT_NAME
174 | else:
175 | warnings.warn("there is no column named '%s' in variable_names" % INTERCEPT_NAME)
176 | 
177 | # label values
178 | assert all((Y == 1) | (Y == -1)), 'Need Y[i] = -1 or +1 for all i.'
179 | if all(Y == 1):
180 | warnings.warn('Y does not contain any negative examples. Need Y[i] = -1 for at least 1 i.')
181 | if all(Y == -1):
182 | warnings.warn('Y does not contain any positive examples. Need Y[i] = +1 for at least 1 i.')
183 | 
184 | if 'sample_weights' in data:
185 | sample_weights = data['sample_weights']
186 | assert type(sample_weights) is np.ndarray, 'sample_weights should be a numpy.ndarray'
187 | assert len(sample_weights) == N, 'sample_weights should contain N elements'
188 | assert all(sample_weights > 0.0), 'sample_weights[i] > 0 for all i '
189 | 
190 | # by default, sample_weights is an N x 1 array of ones; uniform weights that differ from 1.0 are a trivial rescaling
191 | if any(sample_weights != 1.0) and len(np.unique(sample_weights)) < 2:
192 | warnings.warn('note: sample_weights only has <2 unique values')
193 | 
194 | return True
195 | 
196 | 
197 | # MODEL PRINTING
198 | def print_model(rho, data, show_omitted_variables = False):
199 | 
200 | variable_names = data['variable_names']
201 | 
202 | rho_values = np.copy(rho)
203 | rho_names = list(variable_names)
204 | 
205 | if INTERCEPT_NAME in rho_names:
206 | intercept_ind = variable_names.index(INTERCEPT_NAME)
207 | intercept_val = int(rho[intercept_ind])
208 | rho_values = np.delete(rho_values, intercept_ind)
209 | rho_names.remove(INTERCEPT_NAME)
210 | else:
211 | intercept_val = 0
212 | 
213 | if 'outcome_name' in data:
214 | predict_string = "Pr(%s = +1) = 1.0/(1.0 + exp(-(%d + score)))" % (data['outcome_name'].upper(), intercept_val)
215 | else:
216 | predict_string = "Pr(Y = +1) = 1.0/(1.0 + exp(-(%d + score)))" % intercept_val
217 | 
218 | if not show_omitted_variables:
219 | selected_ind = np.flatnonzero(rho_values)
220 | rho_values = rho_values[selected_ind]
221 | rho_names = [rho_names[i] for i in selected_ind]
222 | rho_binary = [np.all((data['X'][:,j] == 0) | (data['X'][:,j] == 1)) for j in selected_ind]
223 | 
224 | #sort by most positive to most negative
225 | sort_ind = np.argsort(-np.array(rho_values))
226 | rho_values = [rho_values[j] for j in sort_ind]
227 | rho_names = [rho_names[j] for j in sort_ind]
228 | rho_binary = [rho_binary[j] for j in sort_ind]
229 | rho_values = np.array(rho_values)
230 | 
231 | rho_values_string = [str(int(i)) + " points" for i in rho_values]
232 | n_variable_rows = len(rho_values)
233 | total_string = "ADD POINTS FROM ROWS %d to %d" % (1, n_variable_rows)
234 | 
235 | max_name_col_length = max(len(predict_string), len(total_string), max([len(s) for s in rho_names])) + 2
236 | max_value_col_length = max(7, max([len(s) for s in rho_values_string]) + len("points")) + 2
237 | 
238 | m = pt.PrettyTable()
239 | m.field_names = ["Variable", "Points", "Tally"]
240 | 
241 | m.add_row([predict_string, "", ""])
242 | m.add_row(['=' * max_name_col_length, "=" * max_value_col_length, "========="])
243 | 
244 | for name, value_string in zip(rho_names, rho_values_string):
245 | m.add_row([name, value_string, "+ ....."])
246 | 
247 | m.add_row(['=' * max_name_col_length, "=" * max_value_col_length, "========="])
248 | m.add_row([total_string, "SCORE", "= ....."])
249 | m.header = False
250 | m.align["Variable"] = "l"
251 | m.align["Points"] = "r"
252 | m.align["Tally"] = "r"
253 | 
254 | print(m)
255 | return m
256 | 
257 | 
258 | # LOGGING
259 | def setup_logging(logger, log_to_console = True, log_file = None):
260 | """
261 | Sets up logging to console and file on disk
262 | See https://docs.python.org/2/howto/logging-cookbook.html for details on how to customize
263 | 
264 | Parameters
265 | ----------
266 | log_to_console set to True to print log messages to the console
267 | log_file path to file for logging
268 | 
269 | Returns
270 | -------
271 | Logger object that prints formatted messages to log_file and console
272 | """
273 | 
274 | # quick return if no logging to console or file
275 | if log_to_console is False and log_file is None:
276 | logger.disabled = True
277 | return logger
278 | 
279 | log_format = logging.Formatter(fmt='%(asctime)s | %(levelname)-8s | %(message)s', datefmt='%m-%d-%Y %I:%M:%S %p')
280 | 
281 | # log to file
282 | if log_file is not None:
283 | fh = logging.FileHandler(filename=log_file)
284 | #fh.setLevel(logging.DEBUG)
285 | fh.setFormatter(log_format)
286 | logger.addHandler(fh)
287 | 
288 | if log_to_console:
289 | ch = logging.StreamHandler()
290 | #ch.setLevel(logging.DEBUG)
291 | ch.setFormatter(log_format)
292 | logger.addHandler(ch)
293 | 
294 | return logger
295 | 
296 | 
297 | def print_log(msg, print_flag = True):
298 | """
299 | prints a timestamped message to stdout
300 | Parameters
301 | ----------
302 | msg message to print (str or any object with a printable repr)
303 | print_flag set to False to suppress printing
304 | 
305 | Returns
306 | -------
307 | None
308 | """
309 | if print_flag:
310 | if isinstance(msg, str):
311 | print('%s | %s' % (time.strftime("%m/%d/%y @ %I:%M %p", time.localtime()), msg))
312 | else:
313 | print('%s | %r' % (time.strftime("%m/%d/%y @ %I:%M %p", time.localtime()), msg))
314 | sys.stdout.flush()
315 | 
316 | 
317 | def validate_settings(settings = None, default_settings = None):
318 | 
319 | if settings is None:
320 | settings = dict()
321 | else:
322 | assert isinstance(settings, dict)
323 | settings = dict(settings)
324 | 
325 | if default_settings is not None:
326 | assert isinstance(default_settings, dict)
327 | settings = {k: settings[k] if k in settings else default_settings[k] for k in default_settings}
328 | 
329 | return settings
--------------------------------------------------------------------------------
/riskslim/coefficient_set.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from prettytable import PrettyTable
3 | from .defaults import INTERCEPT_NAME
4 | 
5 | 
6 | class CoefficientSet(object):
7 | """
8 | Class used to represent and manipulate constraints on individual coefficients
9 | including upper bound, lower bound, variable type, and regularization.
10 | a CoefficientSet is composed of _CoefficientElement objects
11 | """
12 | 
13 | _initialized = False
14 | _print_flag = True
15 | _check_flag = True
16 | _correct_flag = True
17 | _variable_names = None
18 | 
19 | def __init__(self, variable_names, **kwargs):
20 | 
21 | # set variables using setter methods
22 | self.variable_names = list(variable_names)
23 | self.print_flag = kwargs.get('print_flag', self._print_flag)
24 | self.check_flag = kwargs.get('check_flag', self._check_flag)
25 | self.correct_flag = kwargs.get('correct_flag', self._correct_flag)
26 | 
27 | ub = kwargs.get('ub', _CoefficientElement._DEFAULT_UB)
28 | lb = kwargs.get('lb', _CoefficientElement._DEFAULT_LB)
29 | c0 = kwargs.get('c0', _CoefficientElement._DEFAULT_c0)
30 | vtype = kwargs.get('type', _CoefficientElement._DEFAULT_TYPE)
31 | 
32 | ub = self._expand_values(value = ub)
33 | lb = self._expand_values(value = lb)
34 | c0 = self._expand_values(value = c0)
35 | vtype = self._expand_values(value = vtype)
36 | 
37 | self._coef_elements = dict()
38 | for idx, name in enumerate(variable_names):
39 | self._coef_elements[name] = _CoefficientElement(name = name, ub = ub[idx], lb = lb[idx], c0 = c0[idx], vtype = vtype[idx])
40 | 
41 | 
42 | self._check_rep()
43 | self._initialized = True
44 | 
45 | 
46 | @property
47 | def P(self):
48 | return len(self._variable_names)
49 | 
50 | 
51 | @property
52 | def print_flag(self):
53 | return bool(self._print_flag)
54 | 
55 | 
56 | @print_flag.setter
57 | def print_flag(self, flag):
58 | self._print_flag = bool(flag)
59 | 
60 | 
61 | @property
62 | def correct_flag(self):
63 | return bool(self._correct_flag)
64 | 
65 | 
66 | @correct_flag.setter
67 | def correct_flag(self, flag):
68 | self._correct_flag = bool(flag)
69 | 
70 | 
71 | @property
72 | def check_flag(self):
73 | return self._check_flag
74 | 
75 | 
76 | @check_flag.setter
77 | def check_flag(self, flag):
78 | self._check_flag = bool(flag)
79 | 
80 | 
81 | @property
82 | def variable_names(self):
83 | return self._variable_names
84 | 
85 | 
86 | @variable_names.setter
87 | def variable_names(self, names):
88 | assert isinstance(names, list), 'variable_names must be a list'
89 | for name in names:
90 | assert isinstance(name, str), 'variable_names must be a list of strings'
91 | assert len(names) == len(set(names)), 'variable_names must contain unique names'
92 | if self._variable_names is not None:
93 | assert len(names) == len(self), 'variable_names must contain %d elements' % len(self)
94 | self._variable_names = list(names)
95 | 
96 | 
97 | def index(self, name):
98 | assert isinstance(name, str)
99 | if name in self._variable_names:
100 | return self._variable_names.index(name)
101 | else:
102 | raise ValueError('no variable named %s in coefficient set' % name)
103 | 
104 | 
105 | def penalized_indices(self):
106 | return np.array(list(map(lambda v: self._coef_elements[v].penalized, self._variable_names)))
107 | 
108 | 
109 | def update_intercept_bounds(self, X, y, max_offset, max_L0_value = None):
110 | """
111 | uses data to set the lower and upper bound on the offset to a conservative value;
112 | the value is guaranteed to avoid a loss in performance
113 | 
114 | optimal_offset = max_abs_score + 1
115 | where max_abs_score is the largest absolute score that can be achieved using the coefficients in coef_set
116 | with the training data.
note:
117 | when offset >= optimal_offset, then we predict y = +1 for every example
118 | when offset <= -optimal_offset, then we predict y = -1 for every example
119 | thus, any feasible model should do better.
120 | 
121 | 
122 | Parameters
123 | ----------
124 | X: N x P matrix of features, with a column of 1s for the offset
125 | y: N x 1 vector of labels (+1/-1)
126 | max_offset: largest absolute value allowed for the offset
127 | max_L0_value: optional limit on the number of non-zero coefficients
128 | 
129 | Returns
130 | -------
131 | None
132 | 
133 | """
134 | if INTERCEPT_NAME not in self._coef_elements:
135 | raise ValueError("coef_set must contain a variable for the offset called %s" % INTERCEPT_NAME)
136 | 
137 | e = self._coef_elements[INTERCEPT_NAME]
138 | 
139 | # get idx of intercept/variables
140 | names = self.variable_names
141 | variable_names = list(names)
142 | variable_names.remove(INTERCEPT_NAME)
143 | variable_idx = np.array([names.index(n) for n in variable_names])
144 | 
145 | # get max # of non-zero coefficients given model size limit
146 | penalized_idx = [self._coef_elements[n].penalized for n in variable_names]
147 | trivial_L0_max = len(penalized_idx)
148 | 
149 | if max_L0_value is None:
150 | max_L0_value = trivial_L0_max
151 | 
152 | if max_L0_value > 0:
153 | max_L0_value = min(trivial_L0_max, max_L0_value)
154 | 
155 | # update intercept bounds
156 | Z = X * y
157 | Z_min = np.min(Z, axis = 0)
158 | Z_max = np.max(Z, axis = 0)
159 | 
160 | # get regularized indices
161 | L0_reg_ind = np.isnan(self.C_0j)[variable_idx]
162 | 
163 | # get smallest / largest score
164 | s_min, s_max = get_score_bounds(Z_min = Z_min[variable_idx],
165 | Z_max = Z_max[variable_idx],
166 | rho_lb = self.lb[variable_idx],
167 | rho_ub = self.ub[variable_idx],
168 | L0_reg_ind = L0_reg_ind,
169 | L0_max = max_L0_value)
170 | 
171 | # set a conservative bound on the offset
172 | conservative_offset = max(abs(s_min), abs(s_max)) + 1
173 | max_offset = min(max_offset, conservative_offset)
174 | e.ub = max_offset
175 | e.lb = -max_offset
176 | 
177 | 
178 | def tabulate(self):
179 | t = PrettyTable()
180 | t.align = "r"
181 | t.add_column("variable_name", self._variable_names)
182 | t.add_column("vtype", self.vtype)
183 | t.add_column("sign", self.sign)
184 | t.add_column("lb", self.lb)
185 | t.add_column("ub", self.ub)
186 | t.add_column("c0", self.c0)
187 | return str(t)
188 | 
189 | 
190 | def __len__(self):
191 | return len(self._variable_names)
192 | 
193 | 
194 | def __str__(self):
195 | return self.tabulate()
196 | 
197 | 
198 | def __repr__(self):
199 | if self.print_flag:
200 | return self.tabulate()
201 | return object.__repr__(self)
202 | 
203 | def __getattr__(self, name):
204 | 
205 | if name == 'C_0j':
206 | name = 'c0'
207 | 
208 | vals = [getattr(self._coef_elements[v], name) for v in self._variable_names]
209 | if name in ['ub', 'lb', 'c0', 'sign', 'vtype']:
210 | return np.array(vals)
211 | else:
212 | return list(vals)
213 | 
214 | 
215 | def __setattr__(self, name, value):
216 | if self._initialized:
217 | assert all(map(lambda e: hasattr(e, name), self._coef_elements.values()))
218 | attr_values = self._expand_values(value)
219 | for e, v in zip(self._coef_elements.values(), attr_values):
220 | setattr(e, name, v)
221 | self._check_rep()
222 | else:
223 | object.__setattr__(self, name, value)
224 | 
225 | 
226 | def __getitem__(self, key):
227 | 
228 | if isinstance(key, int):
229 | assert 0 <= int(key) < self.P
230 | return self._coef_elements[self._variable_names[key]]
231 | elif isinstance(key, str):
232 | return self._coef_elements[key]
233 | else:
234 | raise KeyError('invalid key')
235 | 
236 | 
237 | def __setitem__(self, key, value):
238 | 
239 | if isinstance(key, int):
240 | assert 0 <= int(key) < self.P
241 | key = self._variable_names[key]
242 | elif isinstance(key, str):
243 | assert key in self._variable_names
244 | assert value.name == key
245 | 
246 | else:
247 | raise KeyError('invalid key')
248 | 
249 | assert isinstance(value, _CoefficientElement)
250 | self._coef_elements[key] = value
251 | 
252 | 
253 | def _check_rep(self):
254 | 
255 | if self._check_flag:
256 | 
257 | assert len(self._variable_names) == len(set(self._variable_names))
258 | 
259 | for name in self._variable_names:
260 | assert isinstance(name, str)
261 | assert len(name) >= 1
262 | assert self._coef_elements[name]._check_rep()
263 | 
264 | if self._correct_flag:
265 | 
266 | for name in self._variable_names:
267 | e = self._coef_elements[name]
268 | if name in {'Intercept', '(Intercept)', 'intercept', '(intercept)'}:
269 | if e.c0 > 0 or np.isnan(e.c0):
270 | if self._print_flag:
271 | print("setting c0_value = 0.0 for %s to ensure that intercept is not penalized" % name)
272 | e._c0 = 0.0
273 | 
274 | return True
275 | 
276 | 
277 | def _expand_values(self, value):
278 | 
279 | if isinstance(value, np.ndarray):
280 | if value.size == self.P:
281 | value_array = value
282 | elif value.size == 1:
283 | value_array = np.repeat(value, self.P)
284 | else:
285 | raise ValueError("length mismatch; need either 1 or %d values" % self.P)
286 | 
287 | elif isinstance(value, list):
288 | if len(value) == self.P:
289 | value_array = value
290 | elif len(value) == 1:
291 | value_array = value * self.P
292 | else:
293 | raise ValueError("length mismatch; need either 1 or %d values" % self.P)
294 | 
295 | elif isinstance(value, str):
296 | value_array = [str(value)] * self.P
297 | 
298 | elif isinstance(value, int):
299 | value_array = [int(value)] * self.P
300 | 
301 | elif isinstance(value, float):
302 | value_array = [float(value)] * self.P
303 | 
304 | else:
305 | raise ValueError("unknown variable type %s" % type(value).__name__)
306 | 
307 | return value_array
308 | 
309 | 
310 | class _CoefficientElement(object):
311 | 
312 | _DEFAULT_UB = 5
313 | _DEFAULT_LB = -5
314 | _DEFAULT_c0 = float('nan')
315 | _DEFAULT_TYPE = 'I'
316 | _VALID_TYPES = ['I', 'C']
317 | 
318 | def _is_integer(self, x):
319 | return np.array_equal(x, np.require(x, dtype = np.int_))
320 | 
321 | 
322 | def __init__(self, name, ub = _DEFAULT_UB, lb = _DEFAULT_LB, c0 = _DEFAULT_c0, vtype = _DEFAULT_TYPE):
323 | 
324 | self._name = str(name)
325 | self._ub = float(ub)
326 | self._lb = float(lb)
327 | self._c0 = float(c0)
328 | self._vtype = vtype
329 | assert self._check_rep()
330 | 
331 | 
332 | @property
333 | def name(self):
334 | return self._name
335 | 
336 | 
337 | @property
338 | def vtype(self):
339 | return self._vtype
340 | 
341 | 
342 | @vtype.setter
343 | def vtype(self, value):
344 | assert isinstance(value, str)
345 | assert value in self._VALID_TYPES
346 | self._vtype = str(value)
347 | 
348 | 
349 | @property
350 | def ub(self):
351 | return self._ub
352 | 
353 | 
354 | @ub.setter
355 | def ub(self, value):
356 | if hasattr(value, '__len__'):
357 | assert len(value) == 1
358 | value = value[0]
359 | assert value >= self._lb
360 | self._ub = float(value)
361 | 
362 | 
363 | @property
364 | def lb(self):
365 | return self._lb
366 | 
367 | 
368 | @lb.setter
369 | def lb(self, value):
370 | if hasattr(value, '__len__'):
371 | assert len(value) == 1
372 | value = value[0]
373 | assert value <= self._ub
374 | self._lb = float(value)
375 | 
376 | 
377 | @property
378 | def c0(self):
379 | return self._c0
380 | 
381 | 
382 | @c0.setter
383 | def c0(self,
value): 384 | if np.isnan(value): 385 | self._c0 = float('nan') 386 | else: 387 | assert np.isfinite(value), 'L0 penalty for %s must either be NaN or a finite positive number' % self._name 388 | assert value >= 0.0, 'L0 penalty for %s must either be NaN or a finite positive number' % self._name 389 | self._c0 = float(value) 390 | 391 | 392 | @property 393 | def penalized(self): 394 | return np.isnan(self._c0) or (self._c0 > 0.0) 395 | 396 | 397 | @property 398 | def sign(self): 399 | if self._ub > 0.0 and self._lb >= 0.0: 400 | return 1 401 | elif self._ub <= 0.0 and self._lb < 0.0: 402 | return -1 403 | else: 404 | return 0 405 | 406 | @sign.setter 407 | def sign(self, value): 408 | if value > 0: 409 | self._lb = 0.0 410 | elif value < 0: 411 | self._ub = 0.0 412 | 413 | 414 | def _check_rep(self): 415 | 416 | #name 417 | assert isinstance(self._name, str) 418 | assert len(self._name) >= 1 419 | 420 | #bounds 421 | assert np.isfinite(self.ub) 422 | assert np.isfinite(self.lb) 423 | assert self.ub >= self.lb 424 | 425 | # value 426 | assert self._vtype in self._VALID_TYPES 427 | assert np.isnan(self.c0) or (self.c0 >= 0.0 and np.isfinite(self.c0)) 428 | 429 | return True 430 | 431 | 432 | def __repr__(self): 433 | return self.tabulate() 434 | 435 | 436 | def __str__(self): 437 | return self.tabulate() 438 | 439 | 440 | def tabulate(self): 441 | s = ['-' * 60, 442 | 'variable: %s' % self._name, 443 | '-' * 60, 444 | '%s: %1.1f' % ('ub', self._ub), 445 | '%s: %1.1f' % ('lb', self._lb), 446 | '%s: %1.2g' % ('c0', self._c0), 447 | '%s: %1.0f' % ('sign', self.sign), 448 | '%s: %s' % ('vtype', self._vtype)] 449 | t = '\n' + '\n'.join(s) + '\n' 450 | return t 451 | 452 | 453 | def get_score_bounds(Z_min, Z_max, rho_lb, rho_ub, L0_reg_ind = None, L0_max = None): 454 | 455 | edge_values = np.vstack([Z_min * rho_lb, Z_max * rho_lb, Z_min * rho_ub, Z_max * rho_ub]) 456 | 457 | if (L0_max is None) or (L0_reg_ind is None) or (L0_max == Z_min.shape[0]): 458 | s_min = np.sum(np.min(edge_values, axis=0)) 459 | s_max = np.sum(np.max(edge_values, axis=0)) 460 | else: 461 | min_values = np.min(edge_values, axis=0) 462 | s_min_reg = np.sum(np.sort(min_values[L0_reg_ind])[0:L0_max]) 463 | s_min_no_reg = np.sum(min_values[~L0_reg_ind]) 464 | s_min = s_min_reg + s_min_no_reg 465 | 466 | max_values = np.max(edge_values, axis=0) 467 | s_max_reg = np.sum(-np.sort(-max_values[L0_reg_ind])[0:L0_max]) 468 | s_max_no_reg = np.sum(max_values[~L0_reg_ind]) 469 | s_max = s_max_reg + s_max_no_reg 470 | 471 | return s_min, s_max -------------------------------------------------------------------------------- /examples/data/breastcancer_data.csv: -------------------------------------------------------------------------------- 1 | Benign,ClumpThickness,UniformityOfCellSize,UniformityOfCellShape,MarginalAdhesion,SingleEpithelialCellSize,BareNuclei,BlandChromatin,NormalNucleoli,Mitoses 2 | 0,5,1,1,1,2,1,3,1,1 3 | 0,5,4,4,5,7,10,3,2,1 4 | 0,3,1,1,1,2,2,3,1,1 5 | 0,6,8,8,1,3,4,3,7,1 6 | 0,4,1,1,3,2,1,3,1,1 7 | 1,8,10,10,8,7,10,9,7,1 8 | 0,1,1,1,1,2,10,3,1,1 9 | 0,2,1,2,1,2,1,3,1,1 10 | 0,2,1,1,1,2,1,1,1,5 11 | 0,4,2,1,1,2,1,2,1,1 12 | 0,1,1,1,1,1,1,3,1,1 13 | 0,2,1,1,1,2,1,2,1,1 14 | 1,5,3,3,3,2,3,4,4,1 15 | 0,1,1,1,1,2,3,3,1,1 16 | 1,8,7,5,10,7,9,5,5,4 17 | 1,7,4,6,4,6,1,4,3,1 18 | 0,4,1,1,1,2,1,2,1,1 19 | 0,4,1,1,1,2,1,3,1,1 20 | 1,10,7,7,6,4,10,4,1,2 21 | 0,6,1,1,1,2,1,3,1,1 22 | 1,7,3,2,10,5,10,5,4,4 23 | 1,10,5,5,3,6,7,7,10,1 24 | 0,3,1,1,1,2,1,2,1,1 25 | 0,1,1,1,1,2,1,3,1,1 26 | 1,5,2,3,4,2,7,3,6,1 27 | 0,3,2,1,1,1,1,2,1,1 
28 | 0,5,1,1,1,2,1,2,1,1 29 | 0,2,1,1,1,2,1,2,1,1 30 | 0,1,1,3,1,2,1,1,1,1 31 | 0,3,1,1,1,1,1,2,1,1 32 | 0,2,1,1,1,2,1,3,1,1 33 | 1,10,7,7,3,8,5,7,4,3 34 | 0,2,1,1,2,2,1,3,1,1 35 | 0,3,1,2,1,2,1,2,1,1 36 | 0,2,1,1,1,2,1,2,1,1 37 | 1,10,10,10,8,6,1,8,9,1 38 | 0,6,2,1,1,1,1,7,1,1 39 | 1,5,4,4,9,2,10,5,6,1 40 | 1,2,5,3,3,6,7,7,5,1 41 | 1,10,4,3,1,3,3,6,5,2 42 | 1,6,10,10,2,8,10,7,3,3 43 | 1,5,6,5,6,10,1,3,1,1 44 | 1,10,10,10,4,8,1,8,10,1 45 | 0,1,1,1,1,2,1,2,1,2 46 | 1,3,7,7,4,4,9,4,8,1 47 | 0,1,1,1,1,2,1,2,1,1 48 | 0,4,1,1,3,2,1,3,1,1 49 | 1,7,8,7,2,4,8,3,8,2 50 | 1,9,5,8,1,2,3,2,1,5 51 | 1,5,3,3,4,2,4,3,4,1 52 | 1,10,3,6,2,3,5,4,10,2 53 | 1,5,5,5,8,10,8,7,3,7 54 | 1,10,5,5,6,8,8,7,1,1 55 | 1,10,6,6,3,4,5,3,6,1 56 | 1,8,10,10,1,3,6,3,9,1 57 | 1,8,2,4,1,5,1,5,4,4 58 | 1,5,2,3,1,6,10,5,1,1 59 | 1,9,5,5,2,2,2,5,1,1 60 | 1,5,3,5,5,3,3,4,10,1 61 | 0,1,1,1,1,2,2,2,1,1 62 | 1,9,10,10,1,10,8,3,3,1 63 | 1,6,3,4,1,5,2,3,9,1 64 | 0,1,1,1,1,2,1,2,1,1 65 | 1,10,4,2,1,3,2,4,3,10 66 | 0,4,1,1,1,2,1,3,1,1 67 | 1,5,3,4,1,8,10,4,9,1 68 | 1,8,3,8,3,4,9,8,9,8 69 | 0,1,1,1,1,2,1,3,2,1 70 | 0,5,1,3,1,2,1,2,1,1 71 | 1,6,10,2,8,10,2,7,8,10 72 | 0,1,3,3,2,2,1,7,2,1 73 | 1,9,4,5,10,6,10,4,8,1 74 | 1,10,6,4,1,3,4,3,2,3 75 | 0,1,1,2,1,2,2,4,2,1 76 | 0,1,1,4,1,2,1,2,1,1 77 | 0,5,3,1,2,2,1,2,1,1 78 | 0,3,1,1,1,2,3,3,1,1 79 | 0,2,1,1,1,3,1,2,1,1 80 | 0,2,2,2,1,1,1,7,1,1 81 | 0,4,1,1,2,2,1,2,1,1 82 | 0,5,2,1,1,2,1,3,1,1 83 | 0,3,1,1,1,2,2,7,1,1 84 | 1,3,5,7,8,8,9,7,10,7 85 | 1,5,10,6,1,10,4,4,10,10 86 | 1,3,3,6,4,5,8,4,4,1 87 | 1,3,6,6,6,5,10,6,8,3 88 | 0,4,1,1,1,2,1,3,1,1 89 | 0,2,1,1,2,3,1,2,1,1 90 | 0,1,1,1,1,2,1,3,1,1 91 | 0,3,1,1,2,2,1,1,1,1 92 | 0,4,1,1,1,2,1,3,1,1 93 | 0,1,1,1,1,2,1,2,1,1 94 | 0,2,1,1,1,2,1,3,1,1 95 | 0,1,1,1,1,2,1,3,1,1 96 | 0,2,1,1,2,2,1,1,1,1 97 | 0,5,1,1,1,2,1,3,1,1 98 | 1,9,6,9,2,10,6,2,9,10 99 | 1,7,5,6,10,5,10,7,9,4 100 | 1,10,3,5,1,10,5,3,10,2 101 | 1,2,3,4,4,2,5,2,5,1 102 | 0,4,1,2,1,2,1,3,1,1 103 | 1,8,2,3,1,6,3,7,1,1 104 | 1,10,10,10,10,10,1,8,8,8 105 | 1,7,3,4,4,3,3,3,2,7 106 | 1,10,10,10,8,2,10,4,1,1 107 | 1,1,6,8,10,8,10,5,7,1 108 | 0,1,1,1,1,2,1,2,3,1 109 | 1,6,5,4,4,3,9,7,8,3 110 | 0,1,3,1,2,2,2,5,3,2 111 | 1,8,6,4,3,5,9,3,1,1 112 | 1,10,3,3,10,2,10,7,3,3 113 | 1,10,10,10,3,10,8,8,1,1 114 | 0,3,3,2,1,2,3,3,1,1 115 | 0,1,1,1,1,2,5,1,1,1 116 | 0,8,3,3,1,2,2,3,2,1 117 | 1,4,5,5,10,4,10,7,5,8 118 | 0,1,1,1,1,4,3,1,1,1 119 | 0,3,2,1,1,2,2,3,1,1 120 | 0,1,1,2,2,2,1,3,1,1 121 | 0,4,2,1,1,2,2,3,1,1 122 | 1,10,10,10,2,10,10,5,3,3 123 | 1,5,3,5,1,8,10,5,3,1 124 | 1,5,4,6,7,9,7,8,10,1 125 | 0,1,1,1,1,2,1,2,1,1 126 | 1,7,5,3,7,4,10,7,5,5 127 | 0,3,1,1,1,2,1,3,1,1 128 | 1,8,3,5,4,5,10,1,6,2 129 | 0,1,1,1,1,10,1,1,1,1 130 | 0,5,1,3,1,2,1,2,1,1 131 | 0,2,1,1,1,2,1,3,1,1 132 | 1,5,10,8,10,8,10,3,6,3 133 | 0,3,1,1,1,2,1,2,2,1 134 | 0,3,1,1,1,3,1,2,1,1 135 | 0,5,1,1,1,2,2,3,3,1 136 | 0,4,1,1,1,2,1,2,1,1 137 | 0,3,1,1,1,2,1,1,1,1 138 | 0,4,1,2,1,2,1,2,1,1 139 | 0,3,1,1,1,2,1,1,1,1 140 | 0,2,1,1,1,2,1,1,1,1 141 | 1,9,5,5,4,4,5,4,3,3 142 | 0,1,1,1,1,2,5,1,1,1 143 | 0,2,1,1,1,2,1,2,1,1 144 | 1,3,4,5,2,6,8,4,1,1 145 | 0,1,1,1,1,3,2,2,1,1 146 | 0,3,1,1,3,8,1,5,8,1 147 | 1,8,8,7,4,10,10,7,8,7 148 | 0,1,1,1,1,1,1,3,1,1 149 | 1,7,2,4,1,6,10,5,4,3 150 | 1,10,10,8,6,4,5,8,10,1 151 | 0,4,1,1,1,2,3,1,1,1 152 | 0,1,1,1,1,2,1,1,1,1 153 | 1,5,5,5,6,3,10,3,1,1 154 | 0,1,2,2,1,2,1,2,1,1 155 | 0,2,1,1,1,2,1,3,1,1 156 | 1,9,9,10,3,6,10,7,10,6 157 | 1,10,7,7,4,5,10,5,7,2 158 | 0,4,1,1,1,2,1,3,2,1 159 | 0,3,1,1,1,2,1,3,1,1 160 | 0,1,1,1,2,1,3,1,1,7 161 | 0,4,1,1,1,2,2,3,2,1 162 | 1,5,6,7,8,8,10,3,10,3 163 | 
1,10,8,10,10,6,1,3,1,10 164 | 0,3,1,1,1,2,1,3,1,1 165 | 0,1,1,1,2,1,1,1,1,1 166 | 0,3,1,1,1,2,1,1,1,1 167 | 0,1,1,1,1,2,1,3,1,1 168 | 0,1,1,1,1,2,1,2,1,1 169 | 1,6,10,10,10,8,10,10,10,7 170 | 1,8,6,5,4,3,10,6,1,1 171 | 1,5,8,7,7,10,10,5,7,1 172 | 0,2,1,1,1,2,1,3,1,1 173 | 1,5,10,10,3,8,1,5,10,3 174 | 0,4,1,1,1,2,1,3,1,1 175 | 1,5,3,3,3,6,10,3,1,1 176 | 0,1,1,1,1,1,1,3,1,1 177 | 0,1,1,1,1,2,1,1,1,1 178 | 0,6,1,1,1,2,1,3,1,1 179 | 1,5,8,8,8,5,10,7,8,1 180 | 1,8,7,6,4,4,10,5,1,1 181 | 0,2,1,1,1,1,1,3,1,1 182 | 1,1,5,8,6,5,8,7,10,1 183 | 1,10,5,6,10,6,10,7,7,10 184 | 1,5,8,4,10,5,8,9,10,1 185 | 0,1,2,3,1,2,1,3,1,1 186 | 1,10,10,10,8,6,8,7,10,1 187 | 1,7,5,10,10,10,10,4,10,3 188 | 0,5,1,1,1,2,1,2,1,1 189 | 0,1,1,1,1,2,1,3,1,1 190 | 0,3,1,1,1,2,1,3,1,1 191 | 0,4,1,1,1,2,1,3,1,1 192 | 0,8,4,4,5,4,7,7,8,2 193 | 0,5,1,1,4,2,1,3,1,1 194 | 0,1,1,1,1,2,1,1,1,1 195 | 0,3,1,1,1,2,1,2,1,1 196 | 1,9,7,7,5,5,10,7,8,3 197 | 1,10,8,8,4,10,10,8,1,1 198 | 0,1,1,1,1,2,1,3,1,1 199 | 0,5,1,1,1,2,1,3,1,1 200 | 0,1,1,1,1,2,1,3,1,1 201 | 1,5,10,10,9,6,10,7,10,5 202 | 1,10,10,9,3,7,5,3,5,1 203 | 0,1,1,1,1,1,1,3,1,1 204 | 0,1,1,1,1,1,1,3,1,1 205 | 0,5,1,1,1,1,1,3,1,1 206 | 1,8,10,10,10,5,10,8,10,6 207 | 1,8,10,8,8,4,8,7,7,1 208 | 0,1,1,1,1,2,1,3,1,1 209 | 1,10,10,10,10,7,10,7,10,4 210 | 1,10,10,10,10,3,10,10,6,1 211 | 1,8,7,8,7,5,5,5,10,2 212 | 0,1,1,1,1,2,1,2,1,1 213 | 0,1,1,1,1,2,1,3,1,1 214 | 1,6,10,7,7,6,4,8,10,2 215 | 0,6,1,3,1,2,1,3,1,1 216 | 0,1,1,1,2,2,1,3,1,1 217 | 1,10,6,4,3,10,10,9,10,1 218 | 1,4,1,1,3,1,5,2,1,1 219 | 1,7,5,6,3,3,8,7,4,1 220 | 1,10,5,5,6,3,10,7,9,2 221 | 0,1,1,1,1,2,1,2,1,1 222 | 1,10,5,7,4,4,10,8,9,1 223 | 1,8,9,9,5,3,5,7,7,1 224 | 0,1,1,1,1,1,1,3,1,1 225 | 1,10,10,10,3,10,10,9,10,1 226 | 1,7,4,7,4,3,7,7,6,1 227 | 1,6,8,7,5,6,8,8,9,2 228 | 0,8,4,6,3,3,1,4,3,1 229 | 1,10,4,5,5,5,10,4,1,1 230 | 0,3,3,2,1,3,1,3,6,1 231 | 1,10,8,8,2,8,10,4,8,10 232 | 1,9,8,8,5,6,2,4,10,4 233 | 1,8,10,10,8,6,9,3,10,10 234 | 1,10,4,3,2,3,10,5,3,2 235 | 0,5,1,3,3,2,2,2,3,1 236 | 0,3,1,1,3,1,1,3,1,1 237 | 0,2,1,1,1,2,1,3,1,1 238 | 0,1,1,1,1,2,5,5,1,1 239 | 0,1,1,1,1,2,1,3,1,1 240 | 0,5,1,1,2,2,2,3,1,1 241 | 1,8,10,10,8,5,10,7,8,1 242 | 1,8,4,4,1,2,9,3,3,1 243 | 0,4,1,1,1,2,1,3,6,1 244 | 0,1,2,2,1,2,1,1,1,1 245 | 1,10,4,4,10,2,10,5,3,3 246 | 0,6,3,3,5,3,10,3,5,3 247 | 1,6,10,10,2,8,10,7,3,3 248 | 1,9,10,10,1,10,8,3,3,1 249 | 1,5,6,6,2,4,10,3,6,1 250 | 0,3,1,1,1,2,1,1,1,1 251 | 0,3,1,1,1,2,1,2,1,1 252 | 0,3,1,1,1,2,1,3,1,1 253 | 0,5,7,7,1,5,8,3,4,1 254 | 1,10,5,8,10,3,10,5,1,3 255 | 1,5,10,10,6,10,10,10,6,5 256 | 1,8,8,9,4,5,10,7,8,1 257 | 1,10,4,4,10,6,10,5,5,1 258 | 1,7,9,4,10,10,3,5,3,3 259 | 0,5,1,4,1,2,1,3,2,1 260 | 1,10,10,6,3,3,10,4,3,2 261 | 1,3,3,5,2,3,10,7,1,1 262 | 1,10,8,8,2,3,4,8,7,8 263 | 0,1,1,1,1,2,1,3,1,1 264 | 1,8,4,7,1,3,10,3,9,2 265 | 0,5,1,1,1,2,1,3,1,1 266 | 1,3,3,5,2,3,10,7,1,1 267 | 1,7,2,4,1,3,4,3,3,1 268 | 0,3,1,1,1,2,1,3,2,1 269 | 0,3,1,1,1,2,1,2,1,1 270 | 0,1,1,1,1,2,1,2,1,1 271 | 0,1,1,1,1,2,1,3,1,1 272 | 1,10,5,7,3,3,7,3,3,8 273 | 0,3,1,1,1,2,1,3,1,1 274 | 0,2,1,1,2,2,1,3,1,1 275 | 1,1,4,3,10,4,10,5,6,1 276 | 1,10,4,6,1,2,10,5,3,1 277 | 1,7,4,5,10,2,10,3,8,2 278 | 1,8,10,10,10,8,10,10,7,3 279 | 1,10,10,10,10,10,10,4,10,10 280 | 0,3,1,1,1,3,1,2,1,1 281 | 1,6,1,3,1,4,5,5,10,1 282 | 1,5,6,6,8,6,10,4,10,4 283 | 0,1,1,1,1,2,1,1,1,1 284 | 0,1,1,1,1,2,1,3,1,1 285 | 1,10,4,4,6,2,10,2,3,1 286 | 1,5,5,7,8,6,10,7,4,1 287 | 0,5,3,4,3,4,5,4,7,1 288 | 0,8,2,1,1,5,1,1,1,1 289 | 1,9,1,2,6,4,10,7,7,2 290 | 1,8,4,10,5,4,4,7,10,1 291 | 0,1,1,1,1,2,1,3,1,1 292 | 1,10,10,10,7,9,10,7,10,10 293 | 
0,1,1,1,1,2,1,3,1,1 294 | 1,8,3,4,9,3,10,3,3,1 295 | 1,10,8,4,4,4,10,3,10,4 296 | 0,1,1,1,1,2,1,3,1,1 297 | 0,1,1,1,1,2,1,3,1,1 298 | 1,7,8,7,6,4,3,8,8,4 299 | 0,3,1,1,1,2,5,5,1,1 300 | 0,2,1,1,1,3,1,2,1,1 301 | 0,1,1,1,1,2,1,1,1,1 302 | 1,8,6,4,10,10,1,3,5,1 303 | 0,1,1,1,1,2,1,1,1,1 304 | 0,1,1,1,1,1,1,2,1,1 305 | 1,5,5,5,2,5,10,4,3,1 306 | 1,6,8,7,8,6,8,8,9,1 307 | 0,1,1,1,1,5,1,3,1,1 308 | 0,4,4,4,4,6,5,7,3,1 309 | 1,7,6,3,2,5,10,7,4,6 310 | 0,3,1,1,1,2,1,3,1,1 311 | 1,5,4,6,10,2,10,4,1,1 312 | 0,1,1,1,1,2,1,3,1,1 313 | 0,3,2,2,1,2,1,2,3,1 314 | 1,10,1,1,1,2,10,5,4,1 315 | 0,1,1,1,1,2,1,2,1,1 316 | 1,8,10,3,2,6,4,3,10,1 317 | 1,10,4,6,4,5,10,7,1,1 318 | 1,10,4,7,2,2,8,6,1,1 319 | 0,5,1,1,1,2,1,3,1,2 320 | 0,5,2,2,2,2,1,2,2,1 321 | 1,5,4,6,6,4,10,4,3,1 322 | 1,8,6,7,3,3,10,3,4,2 323 | 0,1,1,1,1,2,1,1,1,1 324 | 1,6,5,5,8,4,10,3,4,1 325 | 0,1,1,1,1,2,1,3,1,1 326 | 0,1,1,1,1,1,1,2,1,1 327 | 1,8,5,5,5,2,10,4,3,1 328 | 1,10,3,3,1,2,10,7,6,1 329 | 0,1,1,1,1,2,1,3,1,1 330 | 0,2,1,1,1,2,1,1,1,1 331 | 0,1,1,1,1,2,1,1,1,1 332 | 1,7,6,4,8,10,10,9,5,3 333 | 0,1,1,1,1,2,1,1,1,1 334 | 0,5,2,2,2,3,1,1,3,1 335 | 0,1,1,1,1,1,1,1,3,1 336 | 1,3,4,4,10,5,1,3,3,1 337 | 1,4,2,3,5,3,8,7,6,1 338 | 0,5,1,1,3,2,1,1,1,1 339 | 0,2,1,1,1,2,1,3,1,1 340 | 0,3,4,5,3,7,3,4,6,1 341 | 1,2,7,10,10,7,10,4,9,4 342 | 0,1,1,1,1,2,1,2,1,1 343 | 0,4,1,1,1,3,1,2,2,1 344 | 1,5,3,3,1,3,3,3,3,3 345 | 1,8,10,10,7,10,10,7,3,8 346 | 1,8,10,5,3,8,4,4,10,3 347 | 1,10,3,5,4,3,7,3,5,3 348 | 1,6,10,10,10,10,10,8,10,10 349 | 1,3,10,3,10,6,10,5,1,4 350 | 0,3,2,2,1,4,3,2,1,1 351 | 0,4,4,4,2,2,3,2,1,1 352 | 0,2,1,1,1,2,1,3,1,1 353 | 0,2,1,1,1,2,1,2,1,1 354 | 1,6,10,10,10,8,10,7,10,7 355 | 1,5,8,8,10,5,10,8,10,3 356 | 0,1,1,3,1,2,1,1,1,1 357 | 0,1,1,3,1,1,1,2,1,1 358 | 0,4,3,2,1,3,1,2,1,1 359 | 0,1,1,3,1,2,1,1,1,1 360 | 0,4,1,2,1,2,1,2,1,1 361 | 0,5,1,1,2,2,1,2,1,1 362 | 0,3,1,2,1,2,1,2,1,1 363 | 0,1,1,1,1,2,1,1,1,1 364 | 0,1,1,1,1,2,1,2,1,1 365 | 0,1,1,1,1,1,1,2,1,1 366 | 0,3,1,1,4,3,1,2,2,1 367 | 0,5,3,4,1,4,1,3,1,1 368 | 0,1,1,1,1,2,1,1,1,1 369 | 1,10,6,3,6,4,10,7,8,4 370 | 0,3,2,2,2,2,1,3,2,1 371 | 0,2,1,1,1,2,1,1,1,1 372 | 0,2,1,1,1,2,1,1,1,1 373 | 0,3,3,2,2,3,1,1,2,3 374 | 1,7,6,6,3,2,10,7,1,1 375 | 0,5,3,3,2,3,1,3,1,1 376 | 0,2,1,1,1,2,1,2,2,1 377 | 0,5,1,1,1,3,2,2,2,1 378 | 0,1,1,1,2,2,1,2,1,1 379 | 1,10,8,7,4,3,10,7,9,1 380 | 0,3,1,1,1,2,1,2,1,1 381 | 0,1,1,1,1,1,1,1,1,1 382 | 0,1,2,3,1,2,1,2,1,1 383 | 0,3,1,1,1,2,1,2,1,1 384 | 0,3,1,1,1,2,1,3,1,1 385 | 0,4,1,1,1,2,1,1,1,1 386 | 0,3,2,1,1,2,1,2,2,1 387 | 0,1,2,3,1,2,1,1,1,1 388 | 1,3,10,8,7,6,9,9,3,8 389 | 0,3,1,1,1,2,1,1,1,1 390 | 0,5,3,3,1,2,1,2,1,1 391 | 0,3,1,1,1,2,4,1,1,1 392 | 0,1,2,1,3,2,1,1,2,1 393 | 0,1,1,1,1,2,1,2,1,1 394 | 0,4,2,2,1,2,1,2,1,1 395 | 0,1,1,1,1,2,1,2,1,1 396 | 0,2,3,2,2,2,2,3,1,1 397 | 0,3,1,2,1,2,1,2,1,1 398 | 0,1,1,1,1,2,1,2,1,1 399 | 1,10,10,10,6,8,4,8,5,1 400 | 0,5,1,2,1,2,1,3,1,1 401 | 1,8,5,6,2,3,10,6,6,1 402 | 0,3,3,2,6,3,3,3,5,1 403 | 1,8,7,8,5,10,10,7,2,1 404 | 0,1,1,1,1,2,1,2,1,1 405 | 0,5,2,2,2,2,2,3,2,2 406 | 0,2,3,1,1,5,1,1,1,1 407 | 0,3,2,2,3,2,3,3,1,1 408 | 1,10,10,10,7,10,10,8,2,1 409 | 0,4,3,3,1,2,1,3,3,1 410 | 0,5,1,3,1,2,1,2,1,1 411 | 0,3,1,1,1,2,1,1,1,1 412 | 1,9,10,10,10,10,10,10,10,1 413 | 0,5,3,6,1,2,1,1,1,1 414 | 1,8,7,8,2,4,2,5,10,1 415 | 0,1,1,1,1,2,1,2,1,1 416 | 0,2,1,1,1,2,1,2,1,1 417 | 0,1,3,1,1,2,1,2,2,1 418 | 0,5,1,1,3,4,1,3,2,1 419 | 0,5,1,1,1,2,1,2,2,1 420 | 0,3,2,2,3,2,1,1,1,1 421 | 0,6,9,7,5,5,8,4,2,1 422 | 1,10,8,10,1,3,10,5,1,1 423 | 1,10,10,10,1,6,1,2,8,1 424 | 0,4,1,1,1,2,1,1,1,1 425 | 0,4,1,3,3,2,1,1,1,1 426 | 
0,5,1,1,1,2,1,1,1,1 427 | 1,10,4,3,10,4,10,10,1,1 428 | 0,5,2,2,4,2,4,1,1,1 429 | 0,1,1,1,3,2,3,1,1,1 430 | 0,1,1,1,1,2,2,1,1,1 431 | 0,5,1,1,6,3,1,2,1,1 432 | 0,2,1,1,1,2,1,1,1,1 433 | 0,1,1,1,1,2,1,1,1,1 434 | 0,5,1,1,1,2,1,1,1,1 435 | 0,1,1,1,1,1,1,1,1,1 436 | 1,5,7,9,8,6,10,8,10,1 437 | 0,4,1,1,3,1,1,2,1,1 438 | 0,5,1,1,1,2,1,1,1,1 439 | 0,3,1,1,3,2,1,1,1,1 440 | 1,4,5,5,8,6,10,10,7,1 441 | 0,2,3,1,1,3,1,1,1,1 442 | 1,10,2,2,1,2,6,1,1,2 443 | 1,10,6,5,8,5,10,8,6,1 444 | 1,8,8,9,6,6,3,10,10,1 445 | 0,5,1,2,1,2,1,1,1,1 446 | 0,5,1,3,1,2,1,1,1,1 447 | 0,5,1,1,3,2,1,1,1,1 448 | 0,3,1,1,1,2,5,1,1,1 449 | 0,6,1,1,3,2,1,1,1,1 450 | 0,4,1,1,1,2,1,1,2,1 451 | 0,4,1,1,1,2,1,1,1,1 452 | 1,10,9,8,7,6,4,7,10,3 453 | 1,10,6,6,2,4,10,9,7,1 454 | 1,6,6,6,5,4,10,7,6,2 455 | 0,4,1,1,1,2,1,1,1,1 456 | 0,1,1,2,1,2,1,2,1,1 457 | 0,3,1,1,1,1,1,2,1,1 458 | 0,6,1,1,3,2,1,1,1,1 459 | 0,6,1,1,1,1,1,1,1,1 460 | 0,4,1,1,1,2,1,1,1,1 461 | 0,5,1,1,1,2,1,1,1,1 462 | 0,3,1,1,1,2,1,1,1,1 463 | 0,4,1,2,1,2,1,1,1,1 464 | 0,4,1,1,1,2,1,1,1,1 465 | 0,5,2,1,1,2,1,1,1,1 466 | 1,4,8,7,10,4,10,7,5,1 467 | 0,5,1,1,1,1,1,1,1,1 468 | 0,5,3,2,4,2,1,1,1,1 469 | 1,9,10,10,10,10,5,10,10,10 470 | 1,8,7,8,5,5,10,9,10,1 471 | 0,5,1,2,1,2,1,1,1,1 472 | 0,1,1,1,3,1,3,1,1,1 473 | 0,3,1,1,1,1,1,2,1,1 474 | 1,10,10,10,10,6,10,8,1,5 475 | 1,3,6,4,10,3,3,3,4,1 476 | 1,6,3,2,1,3,4,4,1,1 477 | 0,1,1,1,1,2,1,1,1,1 478 | 1,5,8,9,4,3,10,7,1,1 479 | 0,4,1,1,1,1,1,2,1,1 480 | 1,5,10,10,10,6,10,6,5,2 481 | 0,5,1,2,10,4,5,2,1,1 482 | 0,3,1,1,1,1,1,2,1,1 483 | 0,1,1,1,1,1,1,1,1,1 484 | 0,4,2,1,1,2,1,1,1,1 485 | 0,4,1,1,1,2,1,2,1,1 486 | 0,4,1,1,1,2,1,2,1,1 487 | 0,6,1,1,1,2,1,3,1,1 488 | 0,4,1,1,1,2,1,2,1,1 489 | 0,4,1,1,2,2,1,2,1,1 490 | 0,4,1,1,1,2,1,3,1,1 491 | 0,1,1,1,1,2,1,1,1,1 492 | 0,3,3,1,1,2,1,1,1,1 493 | 1,8,10,10,10,7,5,4,8,7 494 | 0,1,1,1,1,2,4,1,1,1 495 | 0,5,1,1,1,2,1,1,1,1 496 | 0,2,1,1,1,2,1,1,1,1 497 | 0,1,1,1,1,2,1,1,1,1 498 | 0,5,1,1,1,2,1,2,1,1 499 | 0,5,1,1,1,2,1,1,1,1 500 | 0,3,1,1,1,1,1,2,1,1 501 | 1,6,6,7,10,3,10,8,10,2 502 | 1,4,10,4,7,3,10,9,10,1 503 | 0,1,1,1,1,1,1,1,1,1 504 | 0,1,1,1,1,1,1,2,1,1 505 | 0,3,1,2,2,2,1,1,1,1 506 | 1,4,7,8,3,4,10,9,1,1 507 | 0,1,1,1,1,3,1,1,1,1 508 | 0,4,1,1,1,3,1,1,1,1 509 | 1,10,4,5,4,3,5,7,3,1 510 | 1,7,5,6,10,4,10,5,3,1 511 | 0,3,1,1,1,2,1,2,1,1 512 | 0,3,1,1,2,2,1,1,1,1 513 | 0,4,1,1,1,2,1,1,1,1 514 | 0,4,1,1,1,2,1,3,1,1 515 | 0,6,1,3,2,2,1,1,1,1 516 | 0,4,1,1,1,1,1,2,1,1 517 | 1,7,4,4,3,4,10,6,9,1 518 | 0,4,2,2,1,2,1,2,1,1 519 | 0,1,1,1,1,1,1,3,1,1 520 | 0,3,1,1,1,2,1,2,1,1 521 | 0,2,1,1,1,2,1,2,1,1 522 | 0,1,1,3,2,2,1,3,1,1 523 | 0,5,1,1,1,2,1,3,1,1 524 | 0,5,1,2,1,2,1,3,1,1 525 | 0,4,1,1,1,2,1,2,1,1 526 | 0,6,1,1,1,2,1,2,1,1 527 | 0,5,1,1,1,2,2,2,1,1 528 | 0,3,1,1,1,2,1,1,1,1 529 | 0,5,3,1,1,2,1,1,1,1 530 | 0,4,1,1,1,2,1,2,1,1 531 | 0,2,1,3,2,2,1,2,1,1 532 | 0,5,1,1,1,2,1,2,1,1 533 | 1,6,10,10,10,4,10,7,10,1 534 | 0,2,1,1,1,1,1,1,1,1 535 | 0,3,1,1,1,1,1,1,1,1 536 | 1,7,8,3,7,4,5,7,8,2 537 | 0,3,1,1,1,2,1,2,1,1 538 | 0,1,1,1,1,2,1,3,1,1 539 | 0,3,2,2,2,2,1,4,2,1 540 | 0,4,4,2,1,2,5,2,1,2 541 | 0,3,1,1,1,2,1,1,1,1 542 | 0,4,3,1,1,2,1,4,8,1 543 | 0,5,2,2,2,1,1,2,1,1 544 | 0,5,1,1,3,2,1,1,1,1 545 | 0,2,1,1,1,2,1,2,1,1 546 | 0,5,1,1,1,2,1,2,1,1 547 | 0,5,1,1,1,2,1,3,1,1 548 | 0,5,1,1,1,2,1,3,1,1 549 | 0,1,1,1,1,2,1,3,1,1 550 | 0,3,1,1,1,2,1,2,1,1 551 | 0,4,1,1,1,2,1,3,2,1 552 | 1,5,7,10,10,5,10,10,10,1 553 | 0,3,1,2,1,2,1,3,1,1 554 | 0,4,1,1,1,2,3,2,1,1 555 | 1,8,4,4,1,6,10,2,5,2 556 | 1,10,10,8,10,6,5,10,3,1 557 | 1,8,10,4,4,8,10,8,2,1 558 | 1,7,6,10,5,3,10,9,10,2 559 | 0,3,1,1,1,2,1,2,1,1 
560 | 0,1,1,1,1,2,1,2,1,1 561 | 1,10,9,7,3,4,2,7,7,1 562 | 0,5,1,2,1,2,1,3,1,1 563 | 0,5,1,1,1,2,1,2,1,1 564 | 0,1,1,1,1,2,1,2,1,1 565 | 0,1,1,1,1,2,1,2,1,1 566 | 0,1,1,1,1,2,1,3,1,1 567 | 0,5,1,2,1,2,1,2,1,1 568 | 1,5,7,10,6,5,10,7,5,1 569 | 1,6,10,5,5,4,10,6,10,1 570 | 0,3,1,1,1,2,1,1,1,1 571 | 0,5,1,1,6,3,1,1,1,1 572 | 0,1,1,1,1,2,1,1,1,1 573 | 1,8,10,10,10,6,10,10,10,1 574 | 0,5,1,1,1,2,1,2,2,1 575 | 1,9,8,8,9,6,3,4,1,1 576 | 0,5,1,1,1,2,1,1,1,1 577 | 1,4,10,8,5,4,1,10,1,1 578 | 1,2,5,7,6,4,10,7,6,1 579 | 1,10,3,4,5,3,10,4,1,1 580 | 0,5,1,2,1,2,1,1,1,1 581 | 1,4,8,6,3,4,10,7,1,1 582 | 0,5,1,1,1,2,1,2,1,1 583 | 0,4,1,2,1,2,1,2,1,1 584 | 0,5,1,3,1,2,1,3,1,1 585 | 0,3,1,1,1,2,1,2,1,1 586 | 0,5,2,4,1,1,1,1,1,1 587 | 0,3,1,1,1,2,1,2,1,1 588 | 0,1,1,1,1,1,1,2,1,1 589 | 0,4,1,1,1,2,1,2,1,1 590 | 1,5,4,6,8,4,1,8,10,1 591 | 1,5,3,2,8,5,10,8,1,2 592 | 1,10,5,10,3,5,8,7,8,3 593 | 0,4,1,1,2,2,1,1,1,1 594 | 0,1,1,1,1,2,1,1,1,1 595 | 1,5,10,10,10,10,10,10,1,1 596 | 0,5,1,1,1,2,1,1,1,1 597 | 1,10,4,3,10,3,10,7,1,2 598 | 1,5,10,10,10,5,2,8,5,1 599 | 1,8,10,10,10,6,10,10,10,10 600 | 0,2,3,1,1,2,1,2,1,1 601 | 0,2,1,1,1,1,1,2,1,1 602 | 0,4,1,3,1,2,1,2,1,1 603 | 0,3,1,1,1,2,1,2,1,1 604 | 0,4,1,1,1,2,1,2,1,1 605 | 0,5,1,1,1,2,1,2,1,1 606 | 0,3,1,1,1,2,1,2,1,1 607 | 0,6,3,3,3,3,2,6,1,1 608 | 0,7,1,2,3,2,1,2,1,1 609 | 0,1,1,1,1,2,1,1,1,1 610 | 0,5,1,1,2,1,1,2,1,1 611 | 0,3,1,3,1,3,4,1,1,1 612 | 1,4,6,6,5,7,6,7,7,3 613 | 0,2,1,1,1,2,5,1,1,1 614 | 0,2,1,1,1,2,1,1,1,1 615 | 0,4,1,1,1,2,1,1,1,1 616 | 0,6,2,3,1,2,1,1,1,1 617 | 0,5,1,1,1,2,1,2,1,1 618 | 0,1,1,1,1,2,1,1,1,1 619 | 1,8,7,4,4,5,3,5,10,1 620 | 0,3,1,1,1,2,1,1,1,1 621 | 0,3,1,4,1,2,1,1,1,1 622 | 1,10,10,7,8,7,1,10,10,3 623 | 0,4,2,4,3,2,2,2,1,1 624 | 0,4,1,1,1,2,1,1,1,1 625 | 0,5,1,1,3,2,1,1,1,1 626 | 0,4,1,1,3,2,1,1,1,1 627 | 0,3,1,1,1,2,1,2,1,1 628 | 0,3,1,1,1,2,1,2,1,1 629 | 0,1,1,1,1,2,1,1,1,1 630 | 0,2,1,1,1,2,1,1,1,1 631 | 0,3,1,1,1,2,1,2,1,1 632 | 0,1,2,2,1,2,1,1,1,1 633 | 0,1,1,1,3,2,1,1,1,1 634 | 1,5,10,10,10,10,2,10,10,10 635 | 0,3,1,1,1,2,1,2,1,1 636 | 0,3,1,1,2,3,4,1,1,1 637 | 0,1,2,1,3,2,1,2,1,1 638 | 0,5,1,1,1,2,1,2,2,1 639 | 0,4,1,1,1,2,1,2,1,1 640 | 0,3,1,1,1,2,1,3,1,1 641 | 0,3,1,1,1,2,1,2,1,1 642 | 0,5,1,1,1,2,1,2,1,1 643 | 0,5,4,5,1,8,1,3,6,1 644 | 1,7,8,8,7,3,10,7,2,3 645 | 0,1,1,1,1,2,1,1,1,1 646 | 0,1,1,1,1,2,1,2,1,1 647 | 0,4,1,1,1,2,1,3,1,1 648 | 0,1,1,3,1,2,1,2,1,1 649 | 0,1,1,3,1,2,1,2,1,1 650 | 0,3,1,1,3,2,1,2,1,1 651 | 0,1,1,1,1,2,1,1,1,1 652 | 0,5,2,2,2,2,1,1,1,2 653 | 0,3,1,1,1,2,1,3,1,1 654 | 1,5,7,4,1,6,1,7,10,3 655 | 1,5,10,10,8,5,5,7,10,1 656 | 1,3,10,7,8,5,8,7,4,1 657 | 0,3,2,1,2,2,1,3,1,1 658 | 0,2,1,1,1,2,1,3,1,1 659 | 0,5,3,2,1,3,1,1,1,1 660 | 0,1,1,1,1,2,1,2,1,1 661 | 0,4,1,4,1,2,1,1,1,1 662 | 0,1,1,2,1,2,1,2,1,1 663 | 0,5,1,1,1,2,1,1,1,1 664 | 0,1,1,1,1,2,1,1,1,1 665 | 0,2,1,1,1,2,1,1,1,1 666 | 1,10,10,10,10,5,10,10,10,7 667 | 1,5,10,10,10,4,10,5,6,3 668 | 0,5,1,1,1,2,1,3,2,1 669 | 0,1,1,1,1,2,1,1,1,1 670 | 0,1,1,1,1,2,1,1,1,1 671 | 0,1,1,1,1,2,1,1,1,1 672 | 0,1,1,1,1,2,1,1,1,1 673 | 0,3,1,1,1,2,1,2,3,1 674 | 0,4,1,1,1,2,1,1,1,1 675 | 0,1,1,1,1,2,1,1,1,8 676 | 0,1,1,1,3,2,1,1,1,1 677 | 1,5,10,10,5,4,5,4,4,1 678 | 0,3,1,1,1,2,1,1,1,1 679 | 0,3,1,1,1,2,1,2,1,2 680 | 0,3,1,1,1,3,2,1,1,1 681 | 0,2,1,1,1,2,1,1,1,1 682 | 1,5,10,10,3,7,3,8,10,2 683 | 1,4,8,6,4,3,4,10,6,1 684 | 1,4,8,8,5,4,5,10,4,1 685 | -------------------------------------------------------------------------------- /riskslim/mip.py: -------------------------------------------------------------------------------- 1 | from math import ceil, floor 2 | 
import numpy as np
from cplex import Cplex, SparsePair, infinity as CPX_INFINITY
from .coefficient_set import CoefficientSet
from .utils import print_log

#todo: add loss cut
#todo: add constraint function
#todo: default cplex parameters
#todo: check cores
#todo: pass compute_loss to convert_to_risk_slim_cplex_solution

def create_risk_slim(coef_set, input):
    """
    create RiskSLIM MIP object

    Parameters
    ----------
    coef_set - CoefficientSet for the RiskSLIM instance (coefficient bounds, types, L0 penalties)
    input - dictionary of RiskSLIM parameters and formulation settings

    Returns
    -------
    mip - RiskSLIM surrogate MIP without loss cuts
    indices - dictionary mapping variable groups to their names and MIP indices

    Issues
    ------
    no support for non-integer Lset "values"
    only drops intercept index for variable_names that match '(Intercept)'

    """
    assert isinstance(coef_set, CoefficientSet)
    assert isinstance(input, dict)

    # setup printing
    function_print_flag = input.get('print_flag', False)
    print_from_function = print_log if function_print_flag else (lambda msg: None)

    # set default parameters
    input.setdefault('w_pos', 1.0)
    input.setdefault('w_neg', 2.0 - input['w_pos'])
    input.setdefault('C_0', 0.01)
    input.setdefault('include_auxillary_variable_for_objval', True)
    input.setdefault('include_auxillary_variable_for_L0_norm', True)
    input.setdefault('loss_min', 0.00)
    input.setdefault('loss_max', float(CPX_INFINITY))
    input.setdefault('L0_min', 0)
    input.setdefault('L0_max', len(coef_set))
    input.setdefault('objval_min', 0.00)
    input.setdefault('objval_max', float(CPX_INFINITY))
    input.setdefault('relax_integer_variables', False)
    input.setdefault('drop_variables', True)
    input.setdefault('tight_formulation', False)
    input.setdefault('set_cplex_cutoffs', True)

    # variables
    P = len(coef_set)
    w_pos, w_neg = input['w_pos'], input['w_neg']
    C_0j = np.copy(coef_set.c0)
    L0_reg_ind = np.isnan(C_0j)
    C_0j[L0_reg_ind] = input['C_0']
    C_0j = C_0j.tolist()
    C_0_rho = np.copy(C_0j)
    trivial_L0_min = 0
    trivial_L0_max = np.sum(L0_reg_ind)

    rho_ub = list(coef_set.ub)
    rho_lb = list(coef_set.lb)
    rho_type = ''.join(list(coef_set.vtype))

    # calculate min/max values for loss
    loss_min = max(0.0, float(input['loss_min']))
    loss_max = min(CPX_INFINITY, float(input['loss_max']))

    # calculate min/max values for model size
    L0_min = max(input['L0_min'], 0.0)
    L0_max = min(input['L0_max'], trivial_L0_max)
    L0_min = ceil(L0_min)
    L0_max = floor(L0_max)
    assert L0_min <= L0_max

    # calculate min/max values for objval
    objval_min = max(input['objval_min'], 0.0)
    objval_max = min(input['objval_max'], CPX_INFINITY)
    assert objval_min <= objval_max

    # include constraint on min/max model size?
    nontrivial_L0_min = L0_min > trivial_L0_min
    nontrivial_L0_max = L0_max < trivial_L0_max
    include_auxillary_variable_for_L0_norm = input['include_auxillary_variable_for_L0_norm'] or nontrivial_L0_min or nontrivial_L0_max

    # include constraint on min/max objective value?
    nontrivial_objval_min = objval_min > 0.0
    nontrivial_objval_max = objval_max < CPX_INFINITY
    include_auxillary_variable_for_objval = input['include_auxillary_variable_for_objval'] or nontrivial_objval_min or nontrivial_objval_max

    has_intercept = '(Intercept)' in coef_set.variable_names
    """
    RiskSLIM MIP Formulation

    minimize w_pos * loss_pos + w_neg * loss_neg + 0 * rho_j + C_0j * alpha_j

    such that

    L0_min ≤ L0 ≤ L0_max
    rho_lb_j * alpha_j ≤ rho_j ≤ rho_ub_j * alpha_j

    L_0 in 0 to P
    rho_j in [rho_lb_j, rho_ub_j]
    alpha_j in {0,1}

    x = [loss_pos, loss_neg, rho_j, alpha_j]

    optional constraints:
    objval = w_pos * loss_pos + w_neg * loss_neg + sum(C_0j * alpha_j) (required for callback)
    L0_norm = sum(alpha_j) (required for callback)


    Changes for Tight Formulation (included when input['tight_formulation'] = True):

    sigma_j in {0,1} for j s.t. rho_j has free sign and alpha_j exists
    rho_j ≥ delta_pos_j if alpha_j = 1 and sigma_j = 1
    rho_j ≤ -delta_neg_j if alpha_j = 1 and sigma_j = 0
    rho_j ≥ alpha_j for j such that rho_j >= 0
    rho_j ≤ -alpha_j for j such that rho_j <= 0

    """
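
    # Toy example of the formulation above (illustrative values): with P = 2,
    # rho_0 in [-5, 5], rho_1 in [0, 5], and both auxiliary variables enabled,
    # the flattened CPLEX variable vector built below is
    #
    #   x = [loss, rho_0, rho_1, alpha_0, alpha_1, objval, L0_norm]
    #
    # and the indicator constraints added below reduce to
    #
    #   -5 * alpha_0 <= rho_0 <= 5 * alpha_0
    #    0 * alpha_1 <= rho_1 <= 5 * alpha_1
    #
    # so rho_j can be nonzero only if alpha_j = 1, and model size is priced in the
    # objective through the penalty terms C_0j * alpha_j.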

    # create MIP object
    mip = Cplex()
    vars = mip.variables
    cons = mip.linear_constraints

    # set sense
    mip.objective.set_sense(mip.objective.sense.minimize)

    # add main variables
    loss_obj = [w_pos]
    loss_ub = [loss_max]
    loss_lb = [loss_min]
    loss_type = 'C'
    loss_names = ['loss']

    obj = loss_obj + [0.0] * P + C_0j
    ub = loss_ub + rho_ub + [1.0] * P
    lb = loss_lb + rho_lb + [0.0] * P
    ctype = loss_type + rho_type + 'B' * P

    rho_names = ['rho_%d' % j for j in range(P)]
    alpha_names = ['alpha_%d' % j for j in range(P)]
    varnames = loss_names + rho_names + alpha_names

    if include_auxillary_variable_for_objval:
        objval_auxillary_name = ['objval']
        objval_auxillary_ub = [objval_max]
        objval_auxillary_lb = [objval_min]
        objval_type = 'C'

        print_from_function("adding auxiliary variable for objval s.t. %1.4f <= objval <= %1.4f" % (objval_min, objval_max))
        obj += [0.0]
        ub += objval_auxillary_ub
        lb += objval_auxillary_lb
        varnames += objval_auxillary_name
        ctype += objval_type

    if include_auxillary_variable_for_L0_norm:
        L0_norm_auxillary_name = ['L0_norm']
        L0_norm_auxillary_ub = [L0_max]
        L0_norm_auxillary_lb = [L0_min]
        L0_norm_type = 'I'

        print_from_function("adding auxiliary variable for L0_norm s.t. %d <= L0_norm <= %d" % (L0_min, L0_max))
        obj += [0.0]
        ub += L0_norm_auxillary_ub
        lb += L0_norm_auxillary_lb
        varnames += L0_norm_auxillary_name
        ctype += L0_norm_type

    if input['relax_integer_variables']:
        ctype = ctype.replace('I', 'C')
        ctype = ctype.replace('B', 'C')

    vars.add(obj = obj, lb = lb, ub = ub, types = ctype, names = varnames)

    # 0-Norm LB Constraints:
    # rho_lb_j * alpha_j ≤ rho_j
    # 0 ≤ rho_j - rho_lb_j * alpha_j
    for j in range(P):
        cons.add(names = ["L0_norm_lb_" + str(j)],
                 lin_expr = [SparsePair(ind = [rho_names[j], alpha_names[j]], val = [1.0, -rho_lb[j]])],
                 senses = "G",
                 rhs = [0.0])

    # 0-Norm UB Constraints:
    # rho_j ≤ rho_ub_j * alpha_j
    # 0 ≤ -rho_j + rho_ub_j * alpha_j
    for j in range(P):
        cons.add(names = ["L0_norm_ub_" + str(j)],
                 lin_expr = [SparsePair(ind = [rho_names[j], alpha_names[j]], val = [-1.0, rho_ub[j]])],
                 senses = "G",
                 rhs = [0.0])

    # Auxiliary objval variable definition:
    # objval = loss + sum(C_0j * alpha_j)
    # (objval_min <= objval <= objval_max is enforced through the variable bounds)
    if include_auxillary_variable_for_objval:
        print_from_function("adding constraint so that objective value <= " + str(objval_max))
        cons.add(names = ["objval_def"],
                 lin_expr = [SparsePair(ind = objval_auxillary_name + loss_names + alpha_names, val = [-1.0] + loss_obj + C_0j)],
                 senses = "E",
                 rhs = [0.0])

    # Auxiliary L0_norm variable definition:
    # L0_norm = sum(alpha_j)
    # L0_norm - sum(alpha_j) = 0
    if include_auxillary_variable_for_L0_norm:
        cons.add(names = ["L0_norm_def"],
                 lin_expr = [SparsePair(ind = L0_norm_auxillary_name + alpha_names, val = [1.0] + [-1.0] * P)],
                 senses = "E",
                 rhs = [0.0])

    dropped_variables = []
    constraints_to_drop = []

    if input['drop_variables']:
        # drop the redundant L0_norm_lb constraint for variables with sign > 0 (rho_lb >= 0)
        # and the redundant L0_norm_ub constraint for variables with sign < 0 (rho_ub <= 0)
        sign_pos_ind = np.flatnonzero(coef_set.sign > 0)
        sign_neg_ind = np.flatnonzero(coef_set.sign < 0)
        constraints_to_drop.extend(["L0_norm_lb_" + str(j) for j in sign_pos_ind])
        constraints_to_drop.extend(["L0_norm_ub_" + str(j) for j in sign_neg_ind])

        # drop alpha for any variable where rho_ub = rho_lb = 0
        fixed_value_ind = np.flatnonzero(coef_set.ub == coef_set.lb)
        variables_to_drop = ["alpha_" + str(j) for j in fixed_value_ind]
        vars.delete(variables_to_drop)
        dropped_variables += variables_to_drop
        alpha_names = [alpha_names[j] for j in range(P) if alpha_names[j] not in dropped_variables]

    # drop alpha / L0_norm_ub / L0_norm_lb for '(Intercept)'
    if has_intercept:
        intercept_idx = coef_set.variable_names.index('(Intercept)')
        intercept_alpha_name = 'alpha_' + str(intercept_idx)
        vars.delete([intercept_alpha_name])

        alpha_names.remove(intercept_alpha_name)
        dropped_variables.append(intercept_alpha_name)

        print_from_function("dropped L0 indicator for '(Intercept)'")
        constraints_to_drop.extend(["L0_norm_ub_" + str(intercept_idx), "L0_norm_lb_" + str(intercept_idx)])

    if len(constraints_to_drop) > 0:
        constraints_to_drop = list(set(constraints_to_drop))
        cons.delete(constraints_to_drop)

    # indices
    indices = {
        'n_variables': vars.get_num(),
        'n_constraints': cons.get_num(),
        'names': vars.get_names(),
        'loss_names': loss_names,
        'rho_names': rho_names,
        'alpha_names': alpha_names,
        'loss': vars.get_indices(loss_names),
        'rho': vars.get_indices(rho_names),
        'alpha': vars.get_indices(alpha_names),
        'L0_reg_ind': L0_reg_ind,
        'C_0_rho': C_0_rho,
        'C_0_alpha': mip.objective.get_linear(alpha_names) if len(alpha_names) > 0 else [],
        }

    if include_auxillary_variable_for_objval:
        indices.update({
            'objval_name': objval_auxillary_name,
            'objval': vars.get_indices(objval_auxillary_name)[0],
            })

    if include_auxillary_variable_for_L0_norm:
        indices.update({
            'L0_norm_name': L0_norm_auxillary_name,
            'L0_norm': vars.get_indices(L0_norm_auxillary_name)[0],
            })

    # officially change the problem to an LP if the integer variables are relaxed
    if input['relax_integer_variables']:
        old_problem_type = mip.problem_type[mip.get_problem_type()]
        mip.set_problem_type(mip.problem_type.LP)
        new_problem_type = mip.problem_type[mip.get_problem_type()]
        print_from_function("changed problem type from %s to %s" % (old_problem_type, new_problem_type))

    if input['set_cplex_cutoffs'] and not input['relax_integer_variables']:
        mip.parameters.mip.tolerances.lowercutoff.set(objval_min)
        mip.parameters.mip.tolerances.uppercutoff.set(objval_max)

    return mip, indices


def set_cplex_mip_parameters(cpx, param, display_cplex_progress = False):
    """
    Helper function to set the parameters of a CPLEX MIP object

    Parameters
    ----------
    cpx - Cplex object
    param - dictionary of parameter values (e.g., 'randomseed', 'n_cores', 'mipgap')
    display_cplex_progress - if False or None, suppress all CPLEX progress output

    Returns
    -------
    cpx with parameters set

    """
    p = cpx.parameters
    p.randomseed.set(param['randomseed'])
    p.threads.set(param['n_cores'])
    p.output.clonelog.set(0)
    p.parallel.set(1)

    if display_cplex_progress is None or display_cplex_progress is False:
        cpx = set_cpx_display_options(cpx, display_mip = False, display_lp = False, display_parameters = False)

    problem_type = cpx.problem_type[cpx.get_problem_type()]
    if problem_type == 'MIP':
        # CPLEX Memory Parameters
        # MIP.Param.workdir.Cur = exp_workdir;
        # MIP.Param.workmem.Cur = cplex_workingmem;
        # MIP.Param.mip.strategy.file.Cur = 2; %nodefile uncompressed
        # MIP.Param.mip.limits.treememory.Cur = cplex_nodefilesize;

        # CPLEX MIP Parameters
        p.emphasis.mip.set(param['mipemphasis'])
        p.mip.tolerances.mipgap.set(param['mipgap'])
        p.mip.tolerances.absmipgap.set(param['absmipgap'])
        p.mip.tolerances.integrality.set(param['integrality_tolerance'])

        # CPLEX Solution Pool Parameters
        p.mip.limits.repairtries.set(param['repairtries'])
        p.mip.pool.capacity.set(param['poolsize'])
        p.mip.pool.replace.set(param['poolreplace'])
        # 0 = replace oldest | 1 = replace worst objective | 2 = replace least diverse solutions

    return cpx


def set_cpx_display_options(cpx, display_mip = True, display_parameters = False, display_lp = False):

    cpx.parameters.mip.display.set(display_mip)
    cpx.parameters.simplex.display.set(display_lp)

    try:
        cpx.parameters.paramdisplay.set(display_parameters)
    except AttributeError:
        pass

    if not (display_mip or display_lp):
        cpx.set_results_stream(None)
        cpx.set_log_stream(None)
        cpx.set_error_stream(None)
        cpx.set_warning_stream(None)

    return cpx


def add_mip_starts(mip, indices, pool, max_mip_starts = float('inf'), mip_start_effort_level = 4):
    """
    Adds MIP starts (warm starts) to a RiskSLIM MIP from a pool of solutions

    Parameters
    ----------
    mip - RiskSLIM surrogate MIP
    indices - indices of RiskSLIM surrogate MIP
    pool - solution pool
    max_mip_starts - max number of mip starts to add (optional; default is to add all)
    mip_start_effort_level - effort that CPLEX will spend to repair each MIP start (optional; default is 4)

    Returns
    -------
    mip with MIP starts added

    """
    # todo remove suboptimal using pool filter
    assert isinstance(mip, Cplex)

    try:
        obj_cutoff = mip.parameters.mip.tolerances.uppercutoff.get()
    except Exception:
        obj_cutoff = float('inf')

    pool = pool.distinct().sort()

    n_added = 0
    for objval, rho in zip(pool.objvals, pool.solutions):
        if np.less_equal(objval, obj_cutoff):
            mip_start_name = "mip_start_" + str(n_added)
            mip_start_obj, _ = convert_to_risk_slim_cplex_solution(rho = rho, indices = indices, objval = objval)
            mip_start_obj = cast_mip_start(mip_start_obj, mip)
            mip.MIP_starts.add(mip_start_obj, mip_start_effort_level, mip_start_name)
            n_added += 1

        if n_added >= max_mip_starts:
            break

    return mip


def cast_mip_start(mip_start, cpx):
    """
    casts the solution values and indices in a CPLEX SparsePair

    Parameters
    ----------
    mip_start - CPLEX SparsePair
    cpx - Cplex object

    Returns
    -------
    CPLEX SparsePair where the indices are integers and the values for each variable
    match the variable type specified in the Cplex object
    """
    assert isinstance(cpx, Cplex)
    assert isinstance(mip_start, SparsePair)
    vals = list(mip_start.val)
    idx = np.array(list(mip_start.ind), dtype = int).tolist()
    types = cpx.variables.get_types(idx)

    for j, t in enumerate(types):
        if t in ['B', 'I']:
            vals[j] = int(vals[j])
        elif t in ['C']:
            vals[j] = float(vals[j])

    return SparsePair(ind = idx, val = vals)
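

# Usage sketch for warm starts (illustrative; assumes a SolutionPool `pool` holding
# feasible coefficient vectors and a MIP built by create_risk_slim):
#
#   mip, indices = create_risk_slim(coef_set, input)
#   mip = add_mip_starts(mip, indices, pool, max_mip_starts = 10)
#
# each pool solution is expanded to a full variable assignment with
# convert_to_risk_slim_cplex_solution, cast to the declared variable types with
# cast_mip_start, and registered as a MIP start that CPLEX will try to repair.
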
def convert_to_risk_slim_cplex_solution(rho, indices, loss = None, objval = None):
    """
    Convert a coefficient vector 'rho' into a solution for the RiskSLIM CPLEX MIP

    Parameters
    ----------
    rho - coefficient vector
    indices - indices of RiskSLIM surrogate MIP
    loss - loss value at rho (optional; computed if not provided)
    objval - objective value at rho (optional; computed if not provided)

    Returns
    -------
    solution_cpx - CPLEX SparsePair over all MIP variables
    objval - objective value of the solution

    """
    # note: relies on a module-level compute_loss handle (see todo at top of file)
    global compute_loss
    n_variables = indices['n_variables']
    solution_idx = np.arange(n_variables)
    solution_val = np.zeros(n_variables)

    # rho
    solution_val[indices['rho']] = rho

    # alpha
    alpha = np.zeros(len(indices['alpha']))
    alpha[np.flatnonzero(rho[indices['L0_reg_ind']])] = 1.0
    solution_val[indices['alpha']] = alpha
    L0_penalty = np.sum(indices['C_0_alpha'] * alpha)

    # add loss / objval
    need_loss = 'loss' in indices
    need_objective_val = 'objval' in indices
    need_L0_norm = 'L0_norm' in indices
    need_sigma = 'sigma_names' in indices

    # check that we have the right length
    # COMMENT THIS OUT FOR DEPLOYMENT
    # if need_sigma:
    #     pass
    # else:
    #     assert (indices['n_variables'] == (len(rho) + len(alpha) + need_loss + need_objective_val + need_L0_norm))

    if need_loss:
        if loss is None:
            if objval is None:
                loss = compute_loss(rho)
            else:
                loss = objval - L0_penalty

        solution_val[indices['loss']] = loss

    if need_objective_val:
        if objval is None:
            if loss is None:
                objval = compute_loss(rho) + L0_penalty
            else:
                objval = loss + L0_penalty

        solution_val[indices['objval']] = objval

    if need_L0_norm:
        solution_val[indices['L0_norm']] = np.sum(alpha)

    if need_sigma:
        rho_for_sigma = np.array([indices['rho'][int(s.replace('sigma_', ''))] for s in indices['sigma_names']])
        solution_val[indices['sigma']] = np.abs(solution_val[rho_for_sigma])

    solution_cpx = SparsePair(ind = solution_idx, val = solution_val.tolist())
    return solution_cpx, objval
--------------------------------------------------------------------------------
/riskslim/initialization.py:
--------------------------------------------------------------------------------
import time
import numpy as np
from cplex import Cplex, SparsePair, infinity as CPX_INFINITY
from .setup_functions import setup_penalty_parameters
from .mip import create_risk_slim, set_cplex_mip_parameters
from .solution_pool import SolutionPool
from .bound_tightening import chained_updates, chained_updates_for_lp
from .heuristics import discrete_descent, sequential_rounding
from .defaults import DEFAULT_CPA_SETTINGS, DEFAULT_INITIALIZATION_SETTINGS
from .utils import print_log, validate_settings


def initialize_lattice_cpa(Z,
                           c0_value,
                           constraints,
                           bounds,
                           settings,
                           risk_slim_settings,
                           cplex_settings,
                           compute_loss_real,
                           compute_loss_cut_real,
                           compute_loss_from_scores_real,
                           compute_loss_from_scores,
                           get_objval,
                           get_L0_penalty,
                           is_feasible):
    """
    Initialization procedure for lattice CPA: solves the LP relaxation with standard
    CPA, then rounds and polishes the LP solutions into feasible integer solutions

    Returns
    -------
    pool - solution pool of feasible integer solutions
    cuts - cutting planes generated while solving the LP relaxation
    bounds - updated bounds on the objective value

    """
    #todo: recompute function handles here if required
    assert callable(compute_loss_real)
    assert callable(compute_loss_cut_real)
    assert callable(compute_loss_from_scores_real)
    assert callable(compute_loss_from_scores)
    assert callable(get_objval)
    assert callable(get_L0_penalty)
    assert callable(is_feasible)

    print_log('-' * 60)
    print_log('running initialization procedure')
    print_log('-' * 60)

    # trade-off parameter
    _, C_0, L0_reg_ind, C_0_nnz = setup_penalty_parameters(c0_value = c0_value, coef_set = constraints['coef_set'])

    settings = validate_settings(settings, default_settings = DEFAULT_INITIALIZATION_SETTINGS)
    settings['type'] = 'cvx'

    # create RiskSLIM LP
    risk_slim_settings = dict(risk_slim_settings)
    risk_slim_settings.update(bounds)
    risk_slim_settings['relax_integer_variables'] = True
    risk_slim_lp, risk_slim_lp_indices = create_risk_slim(coef_set = constraints['coef_set'], input = risk_slim_settings)
    risk_slim_lp = set_cplex_mip_parameters(risk_slim_lp, cplex_settings, display_cplex_progress = settings['display_cplex_progress'])

    # solve the LP relaxation using standard CPA
    cpa_stats, cuts, cpa_pool = run_standard_cpa(cpx = risk_slim_lp,
                                                 cpx_indices = risk_slim_lp_indices,
                                                 compute_loss = compute_loss_real,
                                                 compute_loss_cut = compute_loss_cut_real,
                                                 settings = settings)

    # update bounds
    bounds = chained_updates(bounds, C_0_nnz, new_objval_at_relaxation = cpa_stats['lowerbound'])
    print_log('CPA produced %d cuts' % len(cuts['coefs']))

    def rounded_model_size_is_ok(rho):
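        # keep an LP solution only if every possible rounding obeys the model size
        # bounds; e.g., for penalized rho = [0.4, -2.6, 0.0], only -2.6 can never be
        # rounded to zero, so any rounding has between 1 and 2 nonzero coefficients,
        # and rho is kept only when [1, 2] lies within [L0_min, L0_max]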
        zero_idx_rho_ceil = np.equal(np.ceil(rho), 0)
        zero_idx_rho_floor = np.equal(np.floor(rho), 0)
        cannot_round_to_zero = np.logical_not(np.logical_or(zero_idx_rho_ceil, zero_idx_rho_floor))
        rounded_rho_L0_min = np.count_nonzero(cannot_round_to_zero[L0_reg_ind])
        rounded_rho_L0_max = np.count_nonzero(rho[L0_reg_ind])
        return rounded_rho_L0_min >= constraints['L0_min'] and rounded_rho_L0_max <= constraints['L0_max']

    cpa_pool = cpa_pool.remove_infeasible(rounded_model_size_is_ok).distinct().sort()

    if len(cpa_pool) == 0:
        print_log('all CPA solutions are infeasible')

    pool = SolutionPool(cpa_pool.P)

    # round CPA solutions
    if settings['use_rounding'] and len(cpa_pool) > 0:
        print_log('running naive rounding on %d solutions' % len(cpa_pool))
        print_log('best objective value: %1.4f' % np.min(cpa_pool.objvals))
        rnd_pool, _, _ = round_solution_pool(cpa_pool,
                                             constraints,
                                             max_runtime = settings['rounding_max_runtime'],
                                             max_solutions = settings['rounding_max_solutions'])

        rnd_pool = rnd_pool.compute_objvals(get_objval).remove_infeasible(is_feasible)
        print_log('rounding produced %d integer solutions' % len(rnd_pool))
        if len(rnd_pool) > 0:
            pool.append(rnd_pool)
            print_log('best objective value: %1.4f' % np.min(rnd_pool.objvals))

    # sequentially round CPA solutions
    if settings['use_sequential_rounding'] and len(cpa_pool) > 0:
        print_log('running sequential rounding on %d solutions' % len(cpa_pool))
        print_log('best objective value: %1.4f' % np.min(cpa_pool.objvals))
        sqrnd_pool, _, _ = sequential_round_solution_pool(pool = cpa_pool,
                                                          Z = Z,
                                                          C_0 = C_0,
                                                          compute_loss_from_scores_real = compute_loss_from_scores_real,
                                                          get_L0_penalty = get_L0_penalty,
                                                          max_runtime = settings['sequential_rounding_max_runtime'],
                                                          max_solutions = settings['sequential_rounding_max_solutions'],
                                                          objval_cutoff = bounds['objval_max'])

        sqrnd_pool = sqrnd_pool.remove_infeasible(is_feasible)
        print_log('sequential rounding produced %d integer solutions' % len(sqrnd_pool))
        if len(sqrnd_pool) > 0:
            pool = pool.append(sqrnd_pool)
            print_log('best objective value: %1.4f' % np.min(pool.objvals))

    # polish rounded solutions
    if settings['polishing_after'] and len(pool) > 0:
        print_log('polishing %d solutions' % len(pool))
        print_log('best objective value: %1.4f' % np.min(pool.objvals))
        dcd_pool, _, _ = discrete_descent_solution_pool(pool = pool,
                                                        Z = Z,
                                                        C_0 = C_0,
                                                        constraints = constraints,
                                                        compute_loss_from_scores = compute_loss_from_scores,
                                                        get_L0_penalty = get_L0_penalty,
                                                        max_runtime = settings['polishing_max_runtime'],
                                                        max_solutions = settings['polishing_max_solutions'])

        dcd_pool = dcd_pool.remove_infeasible(is_feasible)
        if len(dcd_pool) > 0:
            print_log('polishing produced %d integer solutions' % len(dcd_pool))
            pool.append(dcd_pool)

    # keep only distinct integral solutions
    if len(pool) > 0:
        pool = pool.remove_nonintegral().distinct().sort()

    # update upper and lower bounds
    print_log('initialization produced %d feasible solutions' % len(pool))
    if len(pool) > 0:
        bounds = chained_updates(bounds, C_0_nnz, new_objval_at_feasible = np.min(pool.objvals))
        print_log('best objective value: %1.4f' % np.min(pool.objvals))

    print_log('-' * 60)
    print_log('completed initialization procedure')
    print_log('-' * 60)
    return pool, cuts, bounds


def run_standard_cpa(cpx,
                     cpx_indices,
                     compute_loss,
                     compute_loss_cut,
                     settings = DEFAULT_CPA_SETTINGS,
                     print_flag = False):

    assert isinstance(cpx, Cplex)
    assert isinstance(cpx_indices, dict)
    assert callable(compute_loss)
    assert callable(compute_loss_cut)
    assert isinstance(settings, dict)

    settings = validate_settings(settings, default_settings = DEFAULT_CPA_SETTINGS)

    rho_idx = cpx_indices["rho"]
    loss_idx = cpx_indices["loss"]
    alpha_idx = cpx_indices["alpha"]
    cut_idx = loss_idx + rho_idx
    objval_idx = cpx_indices["objval"]
    L0_idx = cpx_indices["L0_norm"]

    P = len(cpx_indices["rho"])
    C_0_alpha = np.array(cpx_indices['C_0_alpha'])
    C_0_nnz = C_0_alpha[np.flatnonzero(C_0_alpha)]

    if isinstance(loss_idx, list) and len(loss_idx) == 1:
        loss_idx = loss_idx[0]

    if len(alpha_idx) > 0:
        get_alpha = lambda: np.array(cpx.solution.get_values(alpha_idx))
    else:
        get_alpha = lambda: np.array([])

    bounds = {
        'loss_min': cpx.variables.get_lower_bounds(loss_idx),
        'loss_max': cpx.variables.get_upper_bounds(loss_idx),
        'objval_min': cpx.variables.get_lower_bounds(objval_idx),
        'objval_max': cpx.variables.get_upper_bounds(objval_idx),
        'L0_min': cpx.variables.get_lower_bounds(L0_idx),
        'L0_max': cpx.variables.get_upper_bounds(L0_idx),
        }

    if settings['update_bounds'] and settings['type'] == 'cvx':
        update_bounds = lambda bounds, lb, ub: chained_updates_for_lp(bounds, C_0_nnz, ub, lb)
    elif settings['update_bounds'] and settings['type'] == 'ntree':
        update_bounds = lambda bounds, lb, ub: chained_updates(bounds, C_0_nnz, ub, lb)
    else:
        update_bounds = lambda bounds, lb, ub: bounds

    objval = 0.0
    upperbound = CPX_INFINITY
    lowerbound = 0.0
    n_iterations = 0
    n_simplex_iterations = 0
    max_runtime = float(settings['max_runtime'])
    max_cplex_time = float(settings['max_runtime_per_iteration'])
    remaining_total_time = max_runtime
    solutions = []
    objvals = []

    progress_stats = {
        'upperbounds': [],
        'lowerbounds': [],
        'simplex_iterations': [],
        'cut_times': [],
        'total_times': []
        }

    run_start_time = time.time()
    while True:

        iteration_start_time = time.time()
        cpx.parameters.timelimit.set(min(remaining_total_time, max_cplex_time))
        cpx.solve()
        solution_status = cpx.solution.status[cpx.solution.get_status()]

        # stop if the solver failed to find an optimal solution
        if solution_status not in ('optimal', 'optimal_tolerance', 'MIP_optimal'):
            stop_reason = solution_status
            stop_msg = 'stopping CPA | no optimal solution found (status = %s)' % solution_status
            break

        # get solution
        rho = np.array(cpx.solution.get_values(rho_idx))
        alpha = get_alpha()
        simplex_iterations = int(cpx.solution.progress.get_num_iterations())

        # compute cut
        cut_start_time = time.time()
        loss_value, loss_slope = compute_loss_cut(rho)
        cut_lhs = [float(loss_value - loss_slope.dot(rho))]
        cut_constraint = [SparsePair(ind = cut_idx, val = [1.0] + (-loss_slope).tolist())]
        cut_time = time.time() - cut_start_time

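        # the cut added at the bottom of this loop enforces the first-order lower
        # bound on the convex loss at the current iterate rho_t:
        #
        #   loss >= loss(rho_t) + loss_slope(rho_t) . (rho - rho_t)
        #
        # i.e., the CPLEX row: loss - loss_slope . rho >= loss(rho_t) - loss_slope . rho_t
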
        # compute objective bounds
        objval = float(loss_value + alpha.dot(C_0_alpha))
        upperbound = min(upperbound, objval)
        lowerbound = cpx.solution.get_objective_value()
        relative_gap = (upperbound - lowerbound) / (upperbound + np.finfo('float').eps)
        bounds = update_bounds(bounds, lb = lowerbound, ub = upperbound)

        # store solutions
        solutions.append(rho)
        objvals.append(objval)

        # update run stats
        n_iterations += 1
        n_simplex_iterations += simplex_iterations
        current_time = time.time()
        total_time = current_time - run_start_time
        iteration_time = current_time - iteration_start_time
        remaining_total_time = max(max_runtime - total_time, 0.0)

        # print progress
        if print_flag and settings['display_progress']:
            print_log("cuts = %d \t UB = %.4f \t LB = %.4f \t GAP = %.4f%%\n" % (n_iterations, upperbound, lowerbound, 100.0 * relative_gap))

        # save progress
        if settings['save_progress']:
            progress_stats['upperbounds'].append(upperbound)
            progress_stats['lowerbounds'].append(lowerbound)
            progress_stats['total_times'].append(total_time)
            progress_stats['cut_times'].append(cut_time)
            progress_stats['simplex_iterations'].append(simplex_iterations)

        # check termination conditions
        if n_iterations >= settings['max_iterations']:
            stop_reason = 'aborted:reached_max_cuts'
            stop_msg = 'reached max iterations'
            break

        if n_iterations >= settings['min_iterations_before_coefficient_gap_check'] and len(solutions) >= 2:
            prior_rho = solutions[-2]
            coef_gap = np.abs(np.max(rho - prior_rho))
            if np.all(np.round(rho) == np.round(prior_rho)) and coef_gap < settings['max_coefficient_gap']:
                stop_reason = 'aborted:coefficient_gap_within_tolerance'
                stop_msg = 'stopping CPA | coef gap is within tolerance (%1.4f < %1.4f)' % (coef_gap, settings['max_coefficient_gap'])
                break

        if relative_gap < settings['max_tolerance']:
            stop_reason = 'converged:gap_within_tolerance'
            stop_msg = 'stopping CPA | optimality gap is within tolerance (%1.1f%% < %1.1f%%)' % (100 * relative_gap, 100 * settings['max_tolerance'])
            break

        if iteration_time > settings['max_runtime_per_iteration']:
            stop_reason = 'aborted:reached_max_train_time'
            stop_msg = 'stopping CPA (reached max training time per iteration of %1.0f secs)' % settings['max_runtime_per_iteration']
            break

        if (total_time > settings['max_runtime']) or (remaining_total_time == 0.0):
            stop_reason = 'aborted:reached_max_train_time'
            stop_msg = 'stopping CPA (reached max training time of %1.0f secs)' % settings['max_runtime']
            break

        # update variable bounds
        if settings['update_bounds']:
            cpx.variables.set_lower_bounds(L0_idx, bounds['L0_min'])
            cpx.variables.set_upper_bounds(L0_idx, bounds['L0_max'])
            cpx.variables.set_lower_bounds(loss_idx, bounds['loss_min'])
            cpx.variables.set_upper_bounds(loss_idx, bounds['loss_max'])
            cpx.variables.set_lower_bounds(objval_idx, bounds['objval_min'])
            cpx.variables.set_upper_bounds(objval_idx, bounds['objval_max'])

        # add loss cut
        cpx.linear_constraints.add(lin_expr = cut_constraint, senses = ["G"], rhs = cut_lhs)

    if print_flag:
        print_log(stop_msg)

    # collect stats
    stats = {
        'solution': rho,
        'stop_reason': stop_reason,
        'n_iterations': n_iterations,
        'n_simplex_iterations': n_simplex_iterations,
        'objval': objval,
        'upperbound': upperbound,
        'lowerbound': lowerbound,
        'cut_time': cut_time,
        'total_time': total_time,
        'cplex_time': total_time - cut_time,
        }

    stats.update(bounds)
    if settings['save_progress']:
        progress_stats['cplex_times'] = (np.array(progress_stats['total_times']) - np.array(progress_stats['cut_times'])).tolist()
        progress_stats['objvals'] = objvals
        progress_stats['solutions'] = solutions
        stats.update(progress_stats)

    # collect cuts
    idx = list(range(cpx_indices['n_constraints'], cpx.linear_constraints.get_num(), 1))
    cuts = {
        'coefs': cpx.linear_constraints.get_rows(idx),
        'lhs': cpx.linear_constraints.get_rhs(idx)
        }

    # create solution pool
    pool = SolutionPool(P)
    if len(objvals) > 0:
        pool.add(objvals, solutions)

    return stats, cuts, pool


def round_solution_pool(pool,
                        constraints,
                        max_runtime = float('inf'),
                        max_solutions = float('inf')):
    """
    rounds each solution in a solution pool to the nearest integer vector,
    zeroing out the smallest coefficients as needed to meet the model size limit

    Parameters
    ----------
    pool - solution pool
    constraints - dictionary with 'coef_set' and 'L0_max'
    max_runtime - max runtime before stopping early (optional)
    max_solutions - max number of solutions to round (optional)

    Returns
    -------
    rounded_pool - pool of rounded solutions
    total_runtime - time spent rounding
    total_rounded - number of solutions rounded

    """
    # quick return
    if len(pool) == 0:
        return pool, 0.0, 0

    pool = pool.distinct().sort()
    P = pool.P
    L0_reg_ind = np.isnan(constraints['coef_set'].c0)
    L0_max = constraints['L0_max']

    total_runtime = 0.0
    total_rounded = 0
    rounded_pool = SolutionPool(P)

    for rho in pool.solutions:

        start_time = time.time()
        # sort from largest to smallest coefficients
        feature_order = np.argsort([-abs(x) for x in rho])
        rounded_solution = np.zeros(shape = (1, P))
        l0_norm_count = 0

        for k in range(P):
            j = feature_order[k]
            if not L0_reg_ind[j]:
                rounded_solution[0, j] = np.round(rho[j], 0)
            elif l0_norm_count < L0_max:
                rounded_solution[0, j] = np.round(rho[j], 0)
                l0_norm_count += 1

        total_runtime += time.time() - start_time
        total_rounded += 1
        rounded_pool.add(objvals = np.nan, solutions = rounded_solution)

        if total_runtime > max_runtime or total_rounded >= max_solutions:
            break

    rounded_pool = rounded_pool.distinct().sort()
    return rounded_pool, total_runtime, total_rounded


def sequential_round_solution_pool(pool,
                                   Z,
                                   C_0,
                                   compute_loss_from_scores_real,
                                   get_L0_penalty,
                                   max_runtime = float('inf'),
                                   max_solutions = float('inf'),
                                   objval_cutoff = float('inf')):
    """
    runs sequential rounding for all solutions in a solution pool
    can be stopped early using max_runtime or max_solutions

    Parameters
    ----------
    pool
    Z
    C_0
    compute_loss_from_scores_real
    get_L0_penalty
    max_runtime
    max_solutions
    objval_cutoff

    Returns
    -------
    rounded_pool - pool of sequentially rounded solutions
    total_runtime - time spent rounding
    total_rounded - number of solutions rounded

    """
    # quick return
    if len(pool) == 0:
        return pool, 0.0, 0

    assert callable(get_L0_penalty)
    assert callable(compute_loss_from_scores_real)

    pool = pool.distinct().sort()
    rounding_handle = lambda rho: sequential_rounding(rho = rho,
                                                      Z = Z,
                                                      C_0 = C_0,
                                                      compute_loss_from_scores_real = compute_loss_from_scores_real,
                                                      get_L0_penalty = get_L0_penalty,
                                                      objval_cutoff = objval_cutoff)

    # apply sequential rounding to all solutions
    total_runtime = 0.0
    total_rounded = 0
    rounded_pool = SolutionPool(pool.P)

    for rho in pool.solutions:

        start_time = time.time()
        solution, objval, early_stop = rounding_handle(rho)
        total_runtime += time.time() - start_time
        total_rounded += 1

        if not early_stop:
            rounded_pool = rounded_pool.add(objvals = objval, solutions = solution)

        if total_runtime > max_runtime or total_rounded >= max_solutions:
            break

    rounded_pool = rounded_pool.distinct().sort()
    return rounded_pool, total_runtime, total_rounded


def discrete_descent_solution_pool(pool,
                                   Z,
                                   C_0,
                                   constraints,
                                   get_L0_penalty,
                                   compute_loss_from_scores,
                                   max_runtime = float('inf'),
                                   max_solutions = float('inf')):
    """
    runs dcd polishing for all solutions in a solution pool
    can be stopped early using max_runtime or max_solutions

    Parameters
    ----------
    pool
    Z
    C_0
    constraints
    get_L0_penalty
    compute_loss_from_scores
    max_runtime
    max_solutions

    Returns
    -------
    polished_pool - pool of polished solutions
    total_runtime - time spent polishing
    total_polished - number of solutions polished

    """
    pool = pool.remove_nonintegral()
    if len(pool) == 0:
        return pool, 0.0, 0

    assert callable(get_L0_penalty)
    assert callable(compute_loss_from_scores)

    rho_ub = constraints['coef_set'].ub
    rho_lb = constraints['coef_set'].lb

    polishing_handle = lambda rho: discrete_descent(rho,
                                                    Z = Z,
                                                    C_0 = C_0,
                                                    rho_ub = rho_ub,
                                                    rho_lb = rho_lb,
                                                    get_L0_penalty = get_L0_penalty,
                                                    compute_loss_from_scores = compute_loss_from_scores)
    pool = pool.distinct().sort()

    polished_pool = SolutionPool(pool.P)
    total_runtime = 0.0
    total_polished = 0
    start_time = time.time()
    for rho in pool.solutions:
        polished_solution, _, polished_objval = polishing_handle(rho)
        total_runtime = time.time() - start_time
        total_polished += 1
        polished_pool = polished_pool.add(objvals = polished_objval, solutions = polished_solution)
        if total_runtime > max_runtime or total_polished >= max_solutions:
            break

    polished_pool = polished_pool.distinct().sort()
    return polished_pool, total_runtime, total_polished
--------------------------------------------------------------------------------
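
A rough sketch of how the two modules above fit together during initialization
(illustrative only; `coef_set`, `cplex_settings`, and the loss callables
`compute_loss` / `compute_loss_cut` are assumed to be built elsewhere, e.g. via
riskslim.coefficient_set, riskslim.defaults, and riskslim.loss_functions):

    from riskslim.mip import create_risk_slim, set_cplex_mip_parameters
    from riskslim.initialization import run_standard_cpa

    # build the surrogate MIP as an LP relaxation and keep its variable indices
    lp, indices = create_risk_slim(coef_set, input = {'C_0': 0.01, 'relax_integer_variables': True})
    lp = set_cplex_mip_parameters(lp, cplex_settings, display_cplex_progress = False)

    # run standard CPA on the relaxation to collect cuts, bounds, and solutions
    stats, cuts, pool = run_standard_cpa(cpx = lp,
                                         cpx_indices = indices,
                                         compute_loss = compute_loss,
                                         compute_loss_cut = compute_loss_cut)

    # feasible integer solutions recovered by rounding can then be passed to the
    # integer RiskSLIM MIP as warm starts via add_mip_starts(mip, indices, pool)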