├── .github
└── workflows
│ └── python-package.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.rst
├── doc
├── Makefile
├── make.bat
└── source
│ ├── _images
│ ├── binning_2d_readme.png
│ ├── binning_2d_readme_example.png
│ ├── binning_2d_readme_woe.png
│ ├── binning_binary.png
│ ├── binning_data_stream.gif
│ ├── binning_readme_example_split_woe.png
│ ├── binning_readme_example_woe.png
│ ├── logo.svg
│ ├── logo_optbinning.ico
│ └── logo_optbinning.svg
│ ├── _static
│ └── css
│ │ └── custom.css
│ ├── binning_2d_binary.rst
│ ├── binning_2d_continuous.rst
│ ├── binning_2d_tables.rst
│ ├── binning_binary.rst
│ ├── binning_continuous.rst
│ ├── binning_multiclass.rst
│ ├── binning_process.rst
│ ├── binning_process_sketch.rst
│ ├── binning_scenarios.rst
│ ├── binning_sketch.rst
│ ├── binning_tables.rst
│ ├── binning_utilities.rst
│ ├── conf.py
│ ├── counterfactual.rst
│ ├── index.rst
│ ├── installation.rst
│ ├── mdlp.rst
│ ├── outlier.rst
│ ├── piecewise_binary.rst
│ ├── piecewise_continuous.rst
│ ├── release_notes.rst
│ ├── scorecard.rst
│ ├── tutorials.rst
│ └── tutorials
│ ├── tutorial_binary.ipynb
│ ├── tutorial_binary_large_scale.ipynb
│ ├── tutorial_binary_localsolver.ipynb
│ ├── tutorial_binary_under_uncertainty.ipynb
│ ├── tutorial_binning_2d.ipynb
│ ├── tutorial_binning_process_FICO_update_binning.ipynb
│ ├── tutorial_binning_process_FICO_xAI.ipynb
│ ├── tutorial_binning_process_sklearn_pipeline.ipynb
│ ├── tutorial_binning_process_telco_churn.ipynb
│ ├── tutorial_continuous.ipynb
│ ├── tutorial_continuous_2d.ipynb
│ ├── tutorial_counterfactual_binary_target.ipynb
│ ├── tutorial_counterfactual_continuous_target.ipynb
│ ├── tutorial_multiclass.ipynb
│ ├── tutorial_piecewise_binary.ipynb
│ ├── tutorial_piecewise_continuous.ipynb
│ ├── tutorial_scorecard_binary_target.ipynb
│ ├── tutorial_scorecard_continuous_target.ipynb
│ ├── tutorial_scorecard_monitoring.ipynb
│ ├── tutorial_sketch_binary.ipynb
│ └── tutorial_sketch_binary_pyspark.rst
├── optbinning
├── __init__.py
├── _version.py
├── binning
│ ├── __init__.py
│ ├── auto_monotonic.py
│ ├── base.py
│ ├── binning.py
│ ├── binning_information.py
│ ├── binning_process.py
│ ├── binning_process_information.py
│ ├── binning_statistics.py
│ ├── continuous_binning.py
│ ├── continuous_cp.py
│ ├── cp.py
│ ├── distributed
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── binning_process_sketch.py
│ │ ├── binning_process_sketch_information.py
│ │ ├── binning_sketch.py
│ │ ├── bsketch.py
│ │ ├── bsketch_information.py
│ │ ├── gk.py
│ │ └── plots.py
│ ├── ls.py
│ ├── mdlp.py
│ ├── metrics.py
│ ├── mip.py
│ ├── model_data.py
│ ├── multiclass_binning.py
│ ├── multiclass_cp.py
│ ├── multiclass_mip.py
│ ├── multidimensional
│ │ ├── __init__.py
│ │ ├── binning_2d.py
│ │ ├── binning_statistics_2d.py
│ │ ├── continuous_binning_2d.py
│ │ ├── cp_2d.py
│ │ ├── mip_2d.py
│ │ ├── model_data_2d.py
│ │ ├── model_data_cart_2d.py
│ │ ├── preprocessing_2d.py
│ │ └── transformations_2d.py
│ ├── outlier.py
│ ├── piecewise
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── binning.py
│ │ ├── binning_information.py
│ │ ├── binning_statistics.py
│ │ ├── continuous_binning.py
│ │ ├── metrics.py
│ │ └── transformations.py
│ ├── prebinning.py
│ ├── preprocessing.py
│ ├── transformations.py
│ └── uncertainty
│ │ ├── __init__.py
│ │ └── binning_scenarios.py
├── exceptions.py
├── formatting.py
├── information.py
├── logging.py
├── metrics
│ ├── __init__.py
│ ├── classification.py
│ └── regression.py
├── options.py
└── scorecard
│ ├── __init__.py
│ ├── counterfactual
│ ├── __init__.py
│ ├── base.py
│ ├── counterfactual.py
│ ├── counterfactual_information.py
│ ├── mip.py
│ ├── model_data.py
│ ├── multi_mip.py
│ ├── problem_data.py
│ └── utils.py
│ ├── monitoring.py
│ ├── monitoring_information.py
│ ├── plots.py
│ ├── rounding.py
│ ├── scorecard.py
│ └── scorecard_information.py
├── requirements.txt
├── setup.py
├── test_requirements.txt
└── tests
├── __init__.py
├── data
├── breast_cancer.csv
└── breast_cancer.parquet
├── datasets
├── __init__.py
└── datasets.py
├── results
├── breast_cancer_woe.csv
├── breast_cancer_woe_2.csv
├── plot_auc_roc.png
├── plot_cap.png
├── plot_ks.png
├── psi_plot_binary.png
├── psi_plot_continuous.png
├── test_binning.png
├── test_binning_2d_event_rate.png
├── test_binning_2d_woe.png
├── test_binning_no_missing.png
├── test_binning_no_special.png
├── test_binning_process_information.txt
├── test_binning_process_verbose.txt
├── test_continuous_binning.png
├── test_continuous_binning_2d.png
├── test_continuous_binning_no_missing.png
├── test_continuous_binning_no_special.png
├── test_multiclass_binning.png
├── test_multiclass_binning_no_missing.png
├── test_multiclass_binning_no_special.png
├── test_scorecard_information.txt
├── test_scorecard_monitoring_default.txt
├── test_scorecard_monitoring_default_continuous.txt
├── test_scorecard_monitoring_information.txt
├── test_scorecard_monitoring_verbose.txt
└── test_scorecard_verbose.txt
├── test_binning.py
├── test_binning_2d.py
├── test_binning_piecewise.py
├── test_binning_process.py
├── test_binning_process_sketch.py
├── test_binning_scenarios.py
├── test_binning_sketch.py
├── test_continuous_binning.py
├── test_continuous_binning_2d.py
├── test_continuous_binning_piecewise.py
├── test_counterfactual.py
├── test_mdlp.py
├── test_multiclass_binning.py
├── test_outlier.py
├── test_scorecard.py
├── test_scorecard_monitoring.py
└── test_scorecard_plots.py
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: CI
5 |
6 | on:
7 | push:
8 | branches: [master, develop]
9 | pull_request:
10 | branches: [master, develop]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ${{ matrix.os }}
16 | strategy:
17 | matrix:
18 |
 19 |         python-version: ['3.9', '3.10', '3.11', '3.12']
20 | os: [ubuntu-latest, windows-latest, macos-latest]
21 |
22 | steps:
 23 |     - uses: actions/checkout@v4
24 | - name: Set up Python ${{ matrix.python-version }}
 25 |       uses: actions/setup-python@v5
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 | - name: Install dependencies
29 | run: |
30 | python -m pip install --upgrade pip
31 | pip install -r test_requirements.txt
32 | pip install -r requirements.txt
33 | - name: Install package
34 | run: |
35 | pip install -e .[distributed,test,ecos]
36 | - name: Lint with flake8
37 | run: |
38 | # stop the build if there are Python syntax errors or undefined names
39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
41 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
42 | - name: Test with pytest
43 | run: |
44 | pytest
45 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | # OS: Linux
3 | dist: xenial
4 | python:
5 | - "3.6"
6 | - "3.7"
7 |
8 | matrix:
9 | include:
10 | # OS: Windows
11 | - os: windows
12 | language: shell
13 | before_install:
14 | - choco install python --version 3.6.8
15 | - python --version
16 | - python -m pip install --upgrade pip
17 | - pip3 install --upgrade pytest
18 | # - pip3 install codecov
19 | env: PATH=/c/Python36:/c/Python36/Scripts:$PATH
20 |
21 | - os: windows
22 | language: shell
23 | before_install:
24 | - choco install python --version 3.7.4
25 | - python --version
26 | - python -m pip install --upgrade pip
27 | - pip3 install --upgrade pytest
28 | # - pip3 install codecov
29 | env: PATH=/c/Python37:/c/Python37/Scripts:$PATH
30 |
31 | # command to install dependencies
32 | install:
33 | - pip install codecov
34 | # - pip install coveralls
35 | - pip install -r requirements.txt
36 | - pip install -e .
37 | # command to run tests
38 | script:
39 | - coverage run --source optbinning -m pytest tests/
40 |
41 | after_success:
42 | - codecov
43 | # - coveralls
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/doc/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/doc/source/_images/binning_2d_readme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_2d_readme.png
--------------------------------------------------------------------------------
/doc/source/_images/binning_2d_readme_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_2d_readme_example.png
--------------------------------------------------------------------------------
/doc/source/_images/binning_2d_readme_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_2d_readme_woe.png
--------------------------------------------------------------------------------
/doc/source/_images/binning_binary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_binary.png
--------------------------------------------------------------------------------
/doc/source/_images/binning_data_stream.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_data_stream.gif
--------------------------------------------------------------------------------
/doc/source/_images/binning_readme_example_split_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_readme_example_split_woe.png
--------------------------------------------------------------------------------
/doc/source/_images/binning_readme_example_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_readme_example_woe.png
--------------------------------------------------------------------------------
/doc/source/_images/logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/doc/source/_images/logo_optbinning.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/logo_optbinning.ico
--------------------------------------------------------------------------------
/doc/source/_images/logo_optbinning.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/doc/source/_static/css/custom.css:
--------------------------------------------------------------------------------
1 | /* Navigator and sidebar colors */
2 | .wy-side-nav-search, .wy-nav-top {
3 | background: #326d62;
4 | }
5 |
6 | .wy-menu-vertical a:active {
7 | background: #30bba3;
8 | }
9 |
10 | .wy-side-nav-search>div.version {
11 | color: #d8d8d8;
12 | }
13 |
14 | .wy-side-nav-search>a img.logo, .wy-side-nav-search .wy-dropdown>a img.logo {
15 | display: block;
16 | margin: 0 auto;
17 | height: 20%;
18 | width: 20%;
19 | border-radius: 0;
20 | max-width: 100%;
21 | background: transparent;
22 | }
23 |
24 | .wy-menu-vertical header, .wy-menu-vertical p.caption {
25 | color: #30bba3;
26 | }
27 |
28 | /* Class/function declaration colors */
29 | .rst-content dl:not(.docutils) dt {
30 | background: #daf2ee;
31 | color: #326d62;
32 | border-top: solid 3px #30bba3;
33 | }
34 |
35 |
36 | /* Links colors */
37 | a {
38 | color: #30bba3;
39 | }
40 |
41 | .wy-nav-content a:hover {
42 | color: #326d62;
43 | }
44 |
45 | .wy-nav-content a:visited:hover {
46 | color: #9B59B6;
47 | }
48 |
--------------------------------------------------------------------------------
/doc/source/binning_2d_binary.rst:
--------------------------------------------------------------------------------
1 | Optimal binning 2D with binary target
2 | =====================================
3 |
4 | .. autoclass:: optbinning.OptimalBinning2D
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
--------------------------------------------------------------------------------
/doc/source/binning_2d_continuous.rst:
--------------------------------------------------------------------------------
1 | Optimal binning 2D with continuous target
2 | =========================================
3 |
4 | .. autoclass:: optbinning.ContinuousOptimalBinning2D
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
--------------------------------------------------------------------------------
/doc/source/binning_2d_tables.rst:
--------------------------------------------------------------------------------
1 | Binning 2D tables
2 | =================
3 |
4 | Binning table 2D: binary target
5 | -------------------------------
6 |
7 | .. autoclass:: optbinning.binning.multidimensional.binning_statistics_2d.BinningTable2D
8 | :members:
9 | :inherited-members:
10 | :show-inheritance:
11 |
12 | Binning table 2D: continuous target
13 | -----------------------------------
14 |
15 | .. autoclass:: optbinning.binning.multidimensional.binning_statistics_2d.ContinuousBinningTable2D
16 | :members:
17 | :inherited-members:
18 | :show-inheritance:
--------------------------------------------------------------------------------
/doc/source/binning_binary.rst:
--------------------------------------------------------------------------------
1 | Optimal binning with binary target
2 | ==================================
3 |
4 | .. autoclass:: optbinning.OptimalBinning
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/binning_continuous.rst:
--------------------------------------------------------------------------------
1 | Optimal binning with continuous target
2 | ======================================
3 |
4 | .. autoclass:: optbinning.ContinuousOptimalBinning
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/binning_multiclass.rst:
--------------------------------------------------------------------------------
1 | Optimal binning with multiclass target
2 | ======================================
3 |
4 |
5 | .. autoclass:: optbinning.MulticlassOptimalBinning
6 | :members:
7 | :inherited-members:
8 | :show-inheritance:
--------------------------------------------------------------------------------
/doc/source/binning_process.rst:
--------------------------------------------------------------------------------
1 | Binning process
2 | ===============
3 |
4 | .. autoclass:: optbinning.BinningProcess
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/binning_process_sketch.rst:
--------------------------------------------------------------------------------
1 | Binning process sketch with binary target
2 | =========================================
3 |
4 | .. autoclass:: optbinning.BinningProcessSketch
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/binning_scenarios.rst:
--------------------------------------------------------------------------------
1 | Stochastic optimal binning
2 | ==========================
3 |
4 | Introduction
5 | ------------
  6 | The data used when performing optimal binning is generally assumed to be known accurately and to be fully representative of past, present, and future data. This confidence might produce misleading results, especially with data representing future events such as product demand, churn rate, or probability of default.
7 |
8 | Stochastic programming is a framework for explicitly incorporating uncertainty. Stochastic programming uses random variables to account for data variability and optimizes the expected value of the objective function. Optbinning implements the stochastic programming approach using the two-stage scenario-based formulation (also known as extensive form or deterministic equivalent), obtaining a deterministic mixed-integer linear programming formulation. The scenario-based formulation guarantees the nonanticipativity constraint and a solution that must be feasible for each scenario, leading to a more **robust** solution.
9 |
10 |
11 | Scenario-based optimal binning
12 | ------------------------------
13 |
14 | .. autoclass:: optbinning.binning.uncertainty.SBOptimalBinning
15 | :members:
16 | :inherited-members:
17 | :show-inheritance:
18 |
19 |
--------------------------------------------------------------------------------
/doc/source/binning_sketch.rst:
--------------------------------------------------------------------------------
1 | Optimal binning sketch with binary target
2 | =========================================
3 |
4 | Introduction
5 | ------------
6 |
  7 | The optimal binning is the constrained discretization of a numerical feature into bins given a binary target, maximizing a statistic such as Jeffrey's divergence or Gini. Binning is a data preprocessing technique commonly used in binary classification, but the current list of existing binning algorithms supporting constraints lacks a method to handle streaming data. The class OptimalBinningSketch implements a new scalable, memory-efficient and robust algorithm for performing optimal binning in streaming settings. Algorithmic details are discussed in http://gnpalencia.org/blog/2020/binning_data_streams/.
8 |
9 |
10 | Algorithms
11 | ----------
12 |
13 | OptimalBinningSketch
14 | """"""""""""""""""""
15 |
16 | .. autoclass:: optbinning.binning.distributed.OptimalBinningSketch
17 | :members:
18 | :inherited-members:
19 | :show-inheritance:
20 |
21 |
22 | GK: Greenwald-Khanna's algorithm
23 | """"""""""""""""""""""""""""""""
24 |
25 | .. autoclass:: optbinning.binning.distributed.GK
26 | :members:
27 | :inherited-members:
28 | :show-inheritance:
29 |
30 |
31 | Binning sketch: numerical variable - binary target
32 | """"""""""""""""""""""""""""""""""""""""""""""""""
33 |
34 | .. autoclass:: optbinning.binning.distributed.BSketch
35 | :members:
36 | :inherited-members:
37 | :show-inheritance:
38 |
39 |
40 | Binning sketch: categorical variable - binary target
41 | """"""""""""""""""""""""""""""""""""""""""""""""""""
42 |
43 | .. autoclass:: optbinning.binning.distributed.BCatSketch
44 | :members:
45 | :inherited-members:
46 | :show-inheritance:
--------------------------------------------------------------------------------
/doc/source/binning_tables.rst:
--------------------------------------------------------------------------------
1 | Binning tables
2 | ==============
3 |
4 | Binning table: binary target
5 | ----------------------------
6 |
7 | .. autoclass:: optbinning.binning.binning_statistics.BinningTable
8 | :members:
9 | :inherited-members:
10 | :show-inheritance:
11 |
12 | Binning table: continuous target
13 | --------------------------------
14 |
15 | .. autoclass:: optbinning.binning.binning_statistics.ContinuousBinningTable
16 | :members:
17 | :inherited-members:
18 | :show-inheritance:
19 |
20 | Binning table: multiclass target
21 | --------------------------------
22 |
23 | .. autoclass:: optbinning.binning.binning_statistics.MulticlassBinningTable
24 | :members:
25 | :inherited-members:
26 | :show-inheritance:
--------------------------------------------------------------------------------
/doc/source/binning_utilities.rst:
--------------------------------------------------------------------------------
1 | Utilities
2 | =========
3 |
4 |
5 | Pre-binning
6 | -----------
7 |
8 | .. autoclass:: optbinning.binning.prebinning.PreBinning
9 | :members:
10 | :inherited-members:
11 | :show-inheritance:
12 |
13 |
14 | Transformations
15 | ---------------
16 |
17 | The Weight of Evidence :math:`\text{WoE}_i` and event rate :math:`D_i` for each bin are related by means of the functional equations
18 |
19 | .. math::
20 |
21 | \begin{align}
22 | \text{WoE}_i &= \log\left(\frac{1 - D_i}{D_i}\right) + \log\left(\frac{N_T^{E}}{N_T^{NE}}\right) =
23 | \log\left(\frac{N_T^{E}}{N_T^{NE}}\right) - \text{logit}(D_i)\\
24 | D_i &= \left(1 + \frac{N_T^{NE}}{N_T^{E}} e^{\text{WoE}_i}\right)^{-1} = \left(1 + e^{\text{WoE}_i - \log\left(\frac{N_T^{E}}{N_T^{NE}}\right)}\right)^{-1},
25 | \end{align}
26 |
27 | where :math:`D_i` can be characterized as a logistic function of :math:`\text{WoE}_i`, and :math:`\text{WoE}_i` can be expressed in terms of the logit function of :math:`D_i`.
28 | The constant term :math:`\log(N_T^{E} / N_T^{NE})` is the log ratio of the total
29 | number of event :math:`N_T^{E}` and the total number of non-events :math:`N_T^{NE}`. This shows that WoE is inversely related to the event rate.
30 |
31 | .. autofunction:: optbinning.binning.transformations.transform_event_rate_to_woe
32 |
33 | .. autofunction:: optbinning.binning.transformations.transform_woe_to_event_rate
34 |
35 |
36 | Metrics
37 | -------
38 |
39 | Gini coefficient
40 | """"""""""""""""
41 |
42 | The Gini coefficient or Accuracy Ratio is a quantitative measure of discriminatory and predictive power given a distribution of events and non-events. The Gini coefficient
43 | ranges from 0 to 1, and is defined by
44 |
45 | .. math::
46 |
47 | Gini = 1 - \frac{2 \sum_{i=2}^n \left(N_i^{E} \sum_{j=1}^{i-1} N_j^{NE}\right) + \sum_{k=1}^n N_k^{E} N_k^{NE}}{N_T^{E} N_T^{NE}},
48 |
49 | where :math:`N_i^{E}` and :math:`N_i^{NE}` are the number of events and non-events per
50 | bin, respectively, and :math:`N_T^{E}` and :math:`N_T^{NE}` are the total number of
51 | events and non-events, respectively.
52 |
53 | .. autofunction:: optbinning.binning.metrics.gini
54 |
55 | Divergence measures
56 | """""""""""""""""""
57 |
58 | Given two discrete probability distributions :math:`P` and :math:`Q`. The Shannon entropy
59 | is defined as
60 |
61 | .. math::
62 |
63 | S(P) = - \sum_{i=1}^n p_i \log(p_i).
64 |
65 | The Kullback-Leibler divergence, denoted as :math:`D_{KL}(P||Q)`, is given by
66 |
67 | .. math::
68 |
69 | D_{KL}(P || Q) = \sum_{i=1}^n p_i \log \left(\frac{p_i}{q_i}\right).
70 |
71 | The Jeffrey's divergence or Information Value (IV), is a symmetric measure expressible in terms of the Kullback-Leibler divergence defined by
72 |
73 | .. math::
74 |
75 | \begin{align*}
76 | J(P|| Q) &= D_{KL}(P || Q) + D_{KL}(Q || P) = \sum_{i=1}^n p_i \log \left(\frac{p_i}{q_i}\right) + \sum_{i=1}^n q_i \log \left(\frac{q_i}{p_i}\right)\\
77 | &= \sum_{i=1}^n (p_i - q_i) \log \left(\frac{p_i}{q_i}\right).
78 | \end{align*}
79 |
80 | The Jensen-Shannon divergence is a bounded symmetric measure also expressible in
81 | terms of the Kullback-Leibler divergence
82 |
83 | .. math::
84 |
85 | \begin{equation}
86 | JSD(P || Q) = \frac{1}{2}\left(D(P || M) + D(Q || M)\right), \quad M = \frac{1}{2}(P + Q),
87 | \end{equation}
88 |
89 | and bounded by :math:`JSD(P||Q) \in [0, \log(2)]`. We note that these measures cannot be directly used whenever :math:`p_i = 0` and/or :math:`q_i = 0`.
90 |
91 | .. autofunction:: optbinning.binning.metrics.entropy
92 |
93 | .. autofunction:: optbinning.binning.metrics.kullback_leibler
94 |
95 | .. autofunction:: optbinning.binning.metrics.jeffrey
96 |
97 | .. autofunction:: optbinning.binning.metrics.jensen_shannon
98 |
99 | .. autofunction:: optbinning.binning.metrics.jensen_shannon_multivariate
100 |
101 | .. autofunction:: optbinning.binning.metrics.hellinger
102 |
103 | .. autofunction:: optbinning.binning.metrics.triangular
104 |
--------------------------------------------------------------------------------
/doc/source/conf.py:
--------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('../..'))


# -- Project information -----------------------------------------------------

project = 'optbinning'
copyright = '2019 - 2024, Guillermo Navas-Palencia'
author = 'Guillermo Navas-Palencia'

# The short X.Y version
version = '0.20.0'
# The full version, including alpha/beta/rc tags
release = '0.20.0'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.doctest',
    'sphinx.ext.mathjax',
    'sphinx.ext.napoleon',
    'sphinxcontrib.bibtex',
    'nbsphinx',
    'sphinx.ext.autosectionlabel'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
#
# NOTE: Sphinx >= 5 no longer accepts None here (it emits a warning and
# falls back to 'en'); set the language code explicitly.
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme_options = {
    'logo_only': True
}

html_show_sourcelink = False

html_theme = 'sphinx_rtd_theme'
html_logo = '_images/logo_optbinning.svg'
html_favicon = '_images/logo_optbinning.ico'


# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']


# These paths are either relative to html_static_path
# or fully qualified paths (eg. https://...)
html_css_files = [
    'css/custom.css',
]
--------------------------------------------------------------------------------
/doc/source/counterfactual.rst:
--------------------------------------------------------------------------------
1 | Counterfactual explanations
2 | ===========================
3 |
4 | .. autoclass:: optbinning.scorecard.Counterfactual
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | .. optbinning documentation master file, created by
2 | sphinx-quickstart on Thu Dec 19 10:54:06 2019.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 |
7 | OptBinning: The Python Optimal Binning library
8 | ==============================================
9 |
Optimal binning is the optimal discretization of a variable into bins given a discrete or continuous numeric target. **OptBinning** is a library
written in Python implementing a **rigorous** and **flexible** mathematical programming formulation to solve the optimal binning problem for binary, continuous and multiclass target types, incorporating constraints not previously addressed.
12 |
13 | .. toctree::
14 | :maxdepth: 1
15 | :caption: Getting started
16 |
17 | installation
18 | tutorials
19 | release_notes
20 |
21 | .. toctree::
22 | :maxdepth: 1
23 | :caption: Optimal binning algorithms
24 |
25 | binning_binary
26 | binning_continuous
27 | binning_multiclass
28 | binning_process
29 | binning_tables
30 | binning_utilities
31 |
32 | .. toctree::
33 | :maxdepth: 1
34 | :caption: Scorecard development
35 |
36 | scorecard
37 | counterfactual
38 |
39 | .. toctree::
40 | :maxdepth: 1
41 | :caption: Optimal piecewise binning
42 |
43 | piecewise_binary
44 | piecewise_continuous
45 |
46 | .. toctree::
47 | :maxdepth: 1
48 | :caption: Batch and stream optimal binning
49 |
50 | binning_sketch
51 | binning_process_sketch
52 |
53 | .. toctree::
54 | :maxdepth: 1
55 | :caption: Binning under uncertainty
56 |
57 | binning_scenarios
58 |
59 | .. toctree::
60 | :maxdepth: 1
61 | :caption: Optimal binning 2D
62 |
63 | binning_2d_binary
64 | binning_2d_continuous
65 | binning_2d_tables
66 |
67 | .. toctree::
68 | :maxdepth: 1
69 | :caption: Other binning algorithms
70 |
71 | mdlp
72 |
73 | .. toctree::
74 | :maxdepth: 1
75 | :caption: Utilities
76 |
77 | outlier
--------------------------------------------------------------------------------
/doc/source/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | Install release
5 | ---------------
6 |
7 | To install the current release of OptBinning:
8 |
9 | .. code-block:: text
10 |
11 | pip install optbinning
12 |
13 | Optionally, download a different release
14 | from https://github.com/guillermo-navas-palencia/optbinning/releases and install
15 | using
16 |
17 | .. code-block:: text
18 |
19 | python setup.py install
20 |
21 | Install from source
22 | -------------------
23 |
24 | To install from source, download or clone the git repository https://github.com/guillermo-navas-palencia/optbinning
25 |
26 | .. code-block:: text
27 |
28 | cd optbinning
29 | python setup.py install
--------------------------------------------------------------------------------
/doc/source/mdlp.rst:
--------------------------------------------------------------------------------
1 | MDLP discretization algorithm
2 | =============================
3 |
4 | .. autoclass:: optbinning.MDLP
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/doc/source/outlier.rst:
--------------------------------------------------------------------------------
1 | Outlier detection
2 | =================
3 |
4 | .. autoclass:: optbinning.binning.outlier.OutlierDetector
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
9 |
10 | .. autoclass:: optbinning.binning.outlier.RangeDetector
11 | :members:
12 | :inherited-members:
13 | :show-inheritance:
14 |
15 |
16 | .. autoclass:: optbinning.binning.outlier.ModifiedZScoreDetector
17 | :members:
18 | :inherited-members:
19 | :show-inheritance:
20 |
21 |
22 | .. autoclass:: optbinning.binning.outlier.YQuantileDetector
23 | :members:
24 | :inherited-members:
25 | :show-inheritance:
26 |
--------------------------------------------------------------------------------
/doc/source/piecewise_binary.rst:
--------------------------------------------------------------------------------
1 | Optimal piecewise binning with binary target
2 | ============================================
3 |
4 | .. autoclass:: optbinning.OptimalPWBinning
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
--------------------------------------------------------------------------------
/doc/source/piecewise_continuous.rst:
--------------------------------------------------------------------------------
1 | Optimal piecewise binning with continuous target
2 | ================================================
3 |
4 | .. autoclass:: optbinning.ContinuousOptimalPWBinning
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
--------------------------------------------------------------------------------
/doc/source/scorecard.rst:
--------------------------------------------------------------------------------
1 | Scorecard
2 | =========
3 |
4 | .. autoclass:: optbinning.scorecard.Scorecard
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
9 |
10 | Monitoring
11 | ----------
12 |
13 | .. autoclass:: optbinning.scorecard.ScorecardMonitoring
14 | :members:
15 | :inherited-members:
16 | :show-inheritance:
17 |
18 |
19 | Plot functions
20 | --------------
21 |
22 | .. autofunction:: optbinning.scorecard.plot_auc_roc
23 |
24 | .. autofunction:: optbinning.scorecard.plot_cap
25 |
26 | .. autofunction:: optbinning.scorecard.plot_ks
--------------------------------------------------------------------------------
/doc/source/tutorials.rst:
--------------------------------------------------------------------------------
1 | Tutorials
2 | =========
3 |
4 | This is a guide for optbinning new users with tutorials ranging from basic
5 | to advanced level for each supported target type.
6 |
7 | Optimal binning tutorials
8 | -------------------------
9 |
10 | .. toctree::
11 | :maxdepth: 1
12 |
13 | tutorials/tutorial_binary
14 | tutorials/tutorial_binary_localsolver
15 | tutorials/tutorial_binary_large_scale
16 | tutorials/tutorial_continuous
17 | tutorials/tutorial_multiclass
18 |
19 |
20 | Binning process tutorials
21 | -------------------------
22 |
23 | .. toctree::
24 | :maxdepth: 1
25 |
26 | tutorials/tutorial_binning_process_sklearn_pipeline
27 | tutorials/tutorial_binning_process_FICO_xAI
28 | tutorials/tutorial_binning_process_FICO_update_binning
29 | tutorials/tutorial_binning_process_telco_churn
30 |
31 |
32 | Scorecard tutorials
33 | -------------------
34 |
35 | .. toctree::
36 | :maxdepth: 1
37 |
38 | tutorials/tutorial_scorecard_binary_target
39 | tutorials/tutorial_scorecard_continuous_target
40 | tutorials/tutorial_scorecard_monitoring
41 | tutorials/tutorial_counterfactual_binary_target
42 | tutorials/tutorial_counterfactual_continuous_target
43 |
44 |
45 | Optimal piecewise binning tutorials
46 | -----------------------------------
47 |
48 | .. toctree::
49 | :maxdepth: 1
50 |
51 | tutorials/tutorial_piecewise_binary
52 | tutorials/tutorial_piecewise_continuous
53 |
54 |
55 | Optimal binning for batch and streaming data processing
56 | -------------------------------------------------------
57 |
58 | .. toctree::
59 | :maxdepth: 1
60 |
61 | tutorials/tutorial_sketch_binary
62 | tutorials/tutorial_sketch_binary_pyspark
63 |
64 |
65 | Optimal binning under uncertainty
66 | ---------------------------------
67 |
68 | .. toctree::
69 | :maxdepth: 1
70 |
71 | tutorials/tutorial_binary_under_uncertainty
72 |
73 |
74 | Optimal binning 2D
75 | ------------------
76 |
77 | .. toctree::
78 | :maxdepth: 1
79 |
80 | tutorials/tutorial_binning_2d
81 | tutorials/tutorial_continuous_2d
82 |
--------------------------------------------------------------------------------
/doc/source/tutorials/tutorial_sketch_binary_pyspark.rst:
--------------------------------------------------------------------------------
1 | Tutorial: optimal binning sketch with binary target using PySpark
2 | =================================================================
3 |
4 | In this example, we use PySpark mapPartitions function to compute the optimal
5 | binning of a single variable from a large dataset in a distributed fashion. The dataset is split into 4 partitions.
6 |
.. code::

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    df = spark.read.csv("data/kaggle/HomeCreditDefaultRisk/application_train.csv",
                        sep=",", header=True, inferSchema=True)
15 |
16 | n_partitions = 4
17 | df = df.repartition(n_partitions)
18 |
19 |
20 | We prepare the MapReduce structure
21 |
.. code::
23 |
24 | import pandas as pd
25 | from optbinning import OptimalBinningSketch
26 |
27 | variable = "EXT_SOURCE_3"
28 | target = "TARGET"
29 | columns = [variable, target]
30 |
31 |
32 | def add(partition):
33 | df_pandas = pd.DataFrame.from_records(partition, columns=columns)
34 | x = df_pandas[variable]
35 | y = df_pandas[target]
36 | optbsketch = OptimalBinningSketch(eps=0.001)
37 | optbsketch.add(x, y)
38 |
39 | return [optbsketch]
40 |
41 | def merge(optbsketch, other_optbsketch):
42 | optbsketch.merge(other_optbsketch)
43 |
44 | return optbsketch
45 |
46 | Finally, with the required columns, we use mapPartitions and method
47 | treeReduce to aggregate the ``OptimalBinningSketch`` instance of each partition.
48 |
.. code::
50 |
51 | optbsketch = df.select(columns).rdd.mapPartitions(lambda partition: add(partition)
52 | ).treeReduce(merge)
--------------------------------------------------------------------------------
/optbinning/__init__.py:
--------------------------------------------------------------------------------
"""Public API of the OptBinning package.

Re-exports the main estimator classes from the subpackages so users can
write ``from optbinning import OptimalBinning`` instead of importing from
the internal module layout.
"""

from ._version import __version__
from .binning import BinningProcess
from .binning import ContinuousOptimalBinning
from .binning import MDLP
from .binning import MulticlassOptimalBinning
from .binning import OptimalBinning
from .binning.distributed import BinningProcessSketch
from .binning.distributed import OptimalBinningSketch
from .binning.multidimensional import ContinuousOptimalBinning2D
from .binning.multidimensional import OptimalBinning2D
from .binning.piecewise import ContinuousOptimalPWBinning
from .binning.piecewise import OptimalPWBinning
from .binning.uncertainty import SBOptimalBinning
from .scorecard import Scorecard


# Names exported by ``from optbinning import *``.
__all__ = ['__version__',
           'BinningProcess',
           'BinningProcessSketch',
           'ContinuousOptimalBinning',
           'ContinuousOptimalBinning2D',
           'ContinuousOptimalPWBinning',
           'MDLP',
           'MulticlassOptimalBinning',
           'OptimalBinning',
           'OptimalBinningSketch',
           'OptimalBinning2D',
           'OptimalPWBinning',
           'SBOptimalBinning',
           'Scorecard']
31 |
--------------------------------------------------------------------------------
/optbinning/_version.py:
--------------------------------------------------------------------------------
1 | """Version information."""
2 |
3 | __version__ = "0.20.1"
4 |
--------------------------------------------------------------------------------
/optbinning/binning/__init__.py:
--------------------------------------------------------------------------------
"""Public API of the core binning subpackage."""

from .binning import OptimalBinning
from .binning_process import BinningProcess
from .continuous_binning import ContinuousOptimalBinning
from .mdlp import MDLP
from .multiclass_binning import MulticlassOptimalBinning


# Names exported by ``from optbinning.binning import *``.
__all__ = ['BinningProcess',
           'ContinuousOptimalBinning',
           'MDLP',
           'MulticlassOptimalBinning',
           'OptimalBinning']
13 |
--------------------------------------------------------------------------------
/optbinning/binning/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Base optimal binning algorithm class.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | from abc import ABCMeta
9 | from abc import abstractmethod
10 |
11 | from sklearn.base import BaseEstimator
12 | from sklearn.exceptions import NotFittedError
13 |
14 |
class Base:
    def _check_is_fitted(self):
        """Raise ``NotFittedError`` unless the estimator has been fitted.

        Relies on the ``_is_fitted`` flag set by concrete subclasses.
        """
        if self._is_fitted:
            return

        raise NotFittedError("This {} instance is not fitted yet. Call "
                             "'fit' with appropriate arguments."
                             .format(self.__class__.__name__))
21 |
22 |
class BaseOptimalBinning(Base, BaseEstimator, metaclass=ABCMeta):
    """Abstract interface implemented by every optimal binning estimator.

    Combines the fitted-state check from ``Base`` with the scikit-learn
    ``BaseEstimator`` API. Concrete subclasses must implement the methods
    and properties below.
    """

    @abstractmethod
    def fit(self):
        """Fit the optimal binning according to the given training data."""

    @abstractmethod
    def fit_transform(self):
        """Fit the optimal binning according to the given training data, then
        transform it."""

    @abstractmethod
    def transform(self):
        """Transform given data using bins from the fitted optimal binning."""

    @abstractmethod
    def information(self):
        """Print overview information about the options settings, problem
        statistics, and the solution of the computation."""

    @property
    @abstractmethod
    def binning_table(self):
        """Return an instantiated binning table."""

    @property
    @abstractmethod
    def splits(self):
        """List of optimal split points."""

    @property
    @abstractmethod
    def status(self):
        """The status of the underlying optimization solver."""
56 |
--------------------------------------------------------------------------------
/optbinning/binning/binning_information.py:
--------------------------------------------------------------------------------
1 | """
2 | Optimal binning information.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2019
7 |
8 | from ..information import print_header
9 | from ..information import print_optional_parameters
10 | from ..information import print_solver_statistics
11 | from ..options import continuous_optimal_binning_default_options
12 | from ..options import multiclass_optimal_binning_default_options
13 | from ..options import optimal_binning_default_options
14 | from ..options import sboptimal_binning_default_options
15 | from ..options import continuous_optimal_binning_2d_default_options
16 | from ..options import optimal_binning_2d_default_options
17 |
18 |
def print_prebinning_statistics(n_prebins, n_refinement):
    """Print the number of pre-bins and refinement iterations."""
    template = (
        "  Pre-binning statistics\n"
        "    Number of pre-bins            {:>10}\n"
        "    Number of refinements         {:>10}\n"
    )

    print(template.format(n_prebins, n_refinement))
27 |
28 |
def print_timing(solver_type, solver, time_total, time_preprocessing,
                 time_prebinning, time_solver, time_optimizer,
                 time_postprocessing):
    """Print a timing breakdown of an optimal binning run.

    Each stage is reported in seconds together with its fraction of the
    total time. For CP solvers, the solver time is further split into model
    generation time and pure optimizer time.
    """
    # Fraction of total time consumed by each stage.
    p_preprocessing = time_preprocessing / time_total
    p_prebinning = time_prebinning / time_total
    p_solver = time_solver / time_total
    p_postprocessing = time_postprocessing / time_total

    if solver_type == "cp" and solver is not None:
        # Model generation is the part of the solver time not spent inside
        # the optimizer itself.
        time_model_generation = time_solver - time_optimizer
        p_model_generation = time_model_generation / time_solver
        p_optimizer = time_optimizer / time_solver

        time_stats = (
            "  Timing\n"
            "    Total time            {:>18.2f} sec\n"
            "    Pre-processing        {:>18.2f} sec   ({:>7.2%})\n"
            "    Pre-binning           {:>18.2f} sec   ({:>7.2%})\n"
            "    Solver                {:>18.2f} sec   ({:>7.2%})\n"
            "      model generation    {:>18.2f} sec   ({:>7.2%})\n"
            "      optimizer           {:>18.2f} sec   ({:>7.2%})\n"
            "    Post-processing       {:>18.2f} sec   ({:>7.2%})\n"
        ).format(time_total, time_preprocessing, p_preprocessing,
                 time_prebinning, p_prebinning, time_solver, p_solver,
                 time_model_generation, p_model_generation, time_optimizer,
                 p_optimizer, time_postprocessing, p_postprocessing)
    else:
        time_stats = (
            "  Timing\n"
            "    Total time            {:>18.2f} sec\n"
            "    Pre-processing        {:>18.2f} sec   ({:>7.2%})\n"
            "    Pre-binning           {:>18.2f} sec   ({:>7.2%})\n"
            "    Solver                {:>18.2f} sec   ({:>7.2%})\n"
            "    Post-processing       {:>18.2f} sec   ({:>7.2%})\n"
        ).format(time_total, time_preprocessing, p_preprocessing,
                 time_prebinning, p_prebinning, time_solver, p_solver,
                 time_postprocessing, p_postprocessing)

    print(time_stats)
69 |
70 |
def print_name_status(name, status):
    """Print the variable name (or UNKNOWN when empty) and solver status."""
    shown_name = name if name else "UNKNOWN"

    print("  Name    : {:<32}\n"
          "  Status  : {:<32}\n".format(shown_name, status))
77 |
78 |
def print_main_info(name, status, time_total):
    """Print a minimal report: variable name, solver status and total time."""
    print_name_status(name, status)

    print("  Time    : {:<7.4f} sec\n".format(time_total))
83 |
84 |
def print_binning_information(binning_type, print_level, name, status,
                              solver_type, solver, time_total,
                              time_preprocessing, time_prebinning, time_solver,
                              time_optimizer, time_postprocessing, n_prebins,
                              n_refinements, dict_user_options):
    """Print the information report for an optimal binning run.

    The amount of detail depends on ``print_level``: 0 prints a one-line
    summary, >= 1 adds pre-binning/solver/timing statistics, and 2
    additionally prints the option settings.
    """
    print_header()

    if print_level == 2:
        # Pick the default options of the concrete binning class so that
        # user-overridden options can be reported.
        # NOTE(review): an unknown binning_type would leave
        # d_default_options unassigned (NameError) — callers pass only the
        # types listed here.
        if binning_type == "optimalbinning":
            d_default_options = optimal_binning_default_options
        elif binning_type == "multiclassoptimalbinning":
            d_default_options = multiclass_optimal_binning_default_options
        elif binning_type == "continuousoptimalbinning":
            d_default_options = continuous_optimal_binning_default_options
        elif binning_type == "sboptimalbinning":
            d_default_options = sboptimal_binning_default_options
        elif binning_type == "optimalbinning2d":
            d_default_options = optimal_binning_2d_default_options
        elif binning_type == "continuousoptimalbinning2d":
            d_default_options = continuous_optimal_binning_2d_default_options

        print_optional_parameters(d_default_options, dict_user_options)

    if print_level == 0:
        print_main_info(name, status, time_total)
    elif print_level >= 1:
        print_name_status(name, status)

        print_prebinning_statistics(n_prebins, n_refinements)

        # Solver and timing details are only meaningful when a solution
        # was actually found.
        if status in ("OPTIMAL", "FEASIBLE"):
            if solver is not None:
                print_solver_statistics(solver_type, solver)

            print_timing(solver_type, solver, time_total, time_preprocessing,
                         time_prebinning, time_solver, time_optimizer,
                         time_postprocessing)
123 |
--------------------------------------------------------------------------------
/optbinning/binning/binning_process_information.py:
--------------------------------------------------------------------------------
1 | """
2 | Binning process information.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | from ..information import print_header
9 | from ..information import print_optional_parameters
10 | from ..options import binning_process_default_options
11 |
12 |
def print_main_info(n_records, n_variables, time_total):
    """Print record count, variable count and total elapsed time."""
    summary = (
        "  Number of records   : {}".format(n_records),
        "  Number of variables : {}".format(n_variables),
        "  Time                : {:<10.4f} sec\n".format(time_total),
    )

    for line in summary:
        print(line)
17 |
18 |
def print_binning_process_statistics(n_records, n_variables, target_dtype,
                                     n_numerical, n_categorical, n_selected,
                                     time_total):
    """Print binning process statistics.

    Reports the dataset size, the target type, the variable-type
    composition, the number of selected variables and the total time.
    """
    stats = (
        "  Statistics\n"
        "    Number of records               {:>10}\n"
        "    Number of variables             {:>10}\n"
        "    Target type                     {:>10}\n\n"
        "    Number of numerical             {:>10}\n"
        "    Number of categorical           {:>10}\n"
        "    Number of selected              {:>10}\n\n"
        "    Time                            {:>10.4f} sec\n"
    ).format(n_records, n_variables, target_dtype, n_numerical,
             n_categorical, n_selected, time_total)

    print(stats)
35 |
36 |
def print_binning_process_information(print_level, n_records, n_variables,
                                      target_dtype, n_numerical, n_categorical,
                                      n_selected, time_total,
                                      dict_user_options):
    """Print the information report for a binning process run.

    ``print_level`` 0 prints a short summary, >= 1 prints full statistics,
    and 2 additionally prints the option settings.
    """
    print_header()

    if print_level == 2:
        dict_default_options = binning_process_default_options
        print_optional_parameters(dict_default_options, dict_user_options)

    if print_level == 0:
        print_main_info(n_records, n_variables, time_total)
    elif print_level >= 1:
        print_binning_process_statistics(n_records, n_variables, target_dtype,
                                         n_numerical, n_categorical,
                                         n_selected, time_total)
53 |
--------------------------------------------------------------------------------
/optbinning/binning/distributed/__init__.py:
--------------------------------------------------------------------------------
"""Public API of the distributed (batch/streaming) binning subpackage."""

from .gk import GK
from .bsketch import BSketch
from .bsketch import BCatSketch
from .binning_process_sketch import BinningProcessSketch
from .binning_sketch import OptimalBinningSketch


# Names exported by ``from optbinning.binning.distributed import *``.
__all__ = ['BSketch',
           'BCatSketch',
           'GK',
           'OptimalBinningSketch',
           'BinningProcessSketch']
13 |
--------------------------------------------------------------------------------
/optbinning/binning/distributed/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Base optimal binning sketch algorithm class.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 | from ...exceptions import NotSolvedError
9 |
10 |
class BaseSketch:
    def _check_is_solved(self):
        """Raise ``NotSolvedError`` unless the sketch has been solved.

        Relies on the ``_is_solved`` flag set by concrete subclasses.
        """
        if self._is_solved:
            return

        raise NotSolvedError("This {} instance is not solved yet. Call "
                             "'solve' with appropriate arguments."
                             .format(self.__class__.__name__))
17 |
--------------------------------------------------------------------------------
/optbinning/binning/distributed/binning_process_sketch_information.py:
--------------------------------------------------------------------------------
1 | """
2 | Binning process sketch information.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 | from ...information import print_header
9 | from ...information import print_optional_parameters
10 | from ...options import binning_process_sketch_default_options
11 |
12 |
def print_main_info(n_records, n_variables, time_add, time_solve):
    """Print record/variable counts and streaming add/solve timings."""
    summary = (
        "  Number of records   : {}".format(n_records),
        "  Number of variables : {}".format(n_variables),
        "  Time add            : {:<10.4f} sec".format(time_add),
        "  Time solve          : {:<10.4f} sec\n".format(time_solve),
    )

    for line in summary:
        print(line)
18 |
19 |
def print_binning_process_sketch_statistics(
        n_records, n_variables, target_dtype, n_numerical, n_categorical,
        n_selected, n_add, time_add, n_solve, time_solve):
    """Print binning process sketch statistics.

    Reports the dataset composition plus streaming add/solve operation
    counts and their average per-operation timings.
    """
    # Guard against ZeroDivisionError when no add/solve operation has been
    # performed yet; report a 0 average in that case.
    r_add = time_add / n_add if n_add else 0.0
    r_solve = time_solve / n_solve if n_solve else 0.0

    stats = (
        "  Statistics\n"
        "    Number of records               {:>10}\n"
        "    Number of variables             {:>10}\n"
        "    Target type                     {:>10}\n\n"
        "    Number of numerical             {:>10}\n"
        "    Number of categorical           {:>10}\n"
        "    Number of selected              {:>10}\n"
    ).format(n_records, n_variables, target_dtype, n_numerical,
             n_categorical, n_selected)

    records_stats = (
        "  Streaming statistics\n"
        "    Add operations        {:>18}\n"
        "    Solve operations      {:>18}\n"
    ).format(n_add, n_solve)

    time_stats = (
        "  Streaming timing\n"
        "    Time add              {:>18.2f} sec   ({:6.4f} sec / add)\n"
        "    Time solve            {:>18.2f} sec   ({:6.4f} sec / solve)\n"
    ).format(time_add, r_add, time_solve, r_solve)

    print(stats)
    print(records_stats)
    print(time_stats)
53 |
54 |
def print_binning_process_sketch_information(
        print_level, n_records, n_variables, target_dtype, n_numerical,
        n_categorical, n_selected, n_add, time_add, n_solve, time_solve,
        dict_user_options):
    """Print the information report for a binning process sketch run.

    ``print_level`` 0 prints a short summary, >= 1 prints full streaming
    statistics, and 2 additionally prints the option settings.
    """
    print_header()

    if print_level == 2:
        dict_default_options = binning_process_sketch_default_options
        print_optional_parameters(dict_default_options, dict_user_options)

    if print_level == 0:
        print_main_info(n_records, n_variables, time_add, time_solve)
    elif print_level >= 1:
        print_binning_process_sketch_statistics(
            n_records, n_variables, target_dtype, n_numerical, n_categorical,
            n_selected, n_add, time_add, n_solve, time_solve)
72 |
--------------------------------------------------------------------------------
/optbinning/binning/distributed/bsketch_information.py:
--------------------------------------------------------------------------------
1 | """
2 | Binning sketch information.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | from ...binning.binning_information import print_main_info
9 | from ...binning.binning_information import print_name_status
10 | from ...binning.binning_information import print_prebinning_statistics
11 | from ...information import print_header
12 | from ...information import print_optional_parameters
13 | from ...information import print_solver_statistics
14 | from ...options import optimal_binning_sketch_options
15 |
16 |
def print_timing(solver_type, solver, time_total, time_prebinning, time_solver,
                 time_optimizer, time_postprocessing):
    """Print a timing breakdown of an optimal binning sketch solve.

    Unlike the batch variant, there is no pre-processing stage. For CP
    solvers, the solver time is split into model generation and optimizer
    time.
    """
    # Fraction of total time consumed by each stage.
    p_prebinning = time_prebinning / time_total
    p_solver = time_solver / time_total
    p_postprocessing = time_postprocessing / time_total

    if solver_type == "cp" and solver is not None:
        # Model generation is the solver time not spent in the optimizer.
        time_model_generation = time_solver - time_optimizer
        p_model_generation = time_model_generation / time_solver
        p_optimizer = time_optimizer / time_solver

        time_stats = (
            "  Timing\n"
            "    Total time            {:>18.2f} sec\n"
            "    Pre-binning           {:>18.2f} sec   ({:>7.2%})\n"
            "    Solver                {:>18.2f} sec   ({:>7.2%})\n"
            "      model generation    {:>18.2f} sec   ({:>7.2%})\n"
            "      optimizer           {:>18.2f} sec   ({:>7.2%})\n"
            "    Post-processing       {:>18.2f} sec   ({:>7.2%})\n"
        ).format(time_total, time_prebinning, p_prebinning, time_solver,
                 p_solver, time_model_generation, p_model_generation,
                 time_optimizer, p_optimizer, time_postprocessing,
                 p_postprocessing)
    else:
        time_stats = (
            "  Timing\n"
            "    Total time            {:>18.2f} sec\n"
            "    Pre-binning           {:>18.2f} sec   ({:>7.2%})\n"
            "    Solver                {:>18.2f} sec   ({:>7.2%})\n"
            "    Post-processing       {:>18.2f} sec   ({:>7.2%})\n"
        ).format(time_total, time_prebinning, p_prebinning, time_solver,
                 p_solver, time_postprocessing, p_postprocessing)

    print(time_stats)
52 |
53 |
def print_streaming_timing(memory_usage, n_records, n_add, time_add, n_solve,
                           time_solve):
    """Print streaming statistics and add/solve timing averages.

    Parameters include the sketch memory usage (MB), the number of records
    processed and the add/solve operation counts and cumulative times.
    """
    # Guard against ZeroDivisionError when no add/solve operation has been
    # performed yet; report a 0 average in that case.
    r_add = time_add / n_add if n_add else 0.0
    r_solve = time_solve / n_solve if n_solve else 0.0

    records_stats = (
        "  Streaming statistics\n"
        "    Sketch memory usage   {:>18.5f} MB\n"
        "    Processed records     {:>18}\n"
        "    Add operations        {:>18}\n"
        "    Solve operations      {:>18}\n"
    ).format(memory_usage, n_records, n_add, n_solve)

    time_stats = (
        "  Streaming timing\n"
        "    Time add              {:>18.2f} sec   ({:6.4f} sec / add)\n"
        "    Time solve            {:>18.2f} sec   ({:6.4f} sec / solve)\n"
    ).format(time_add, r_add, time_solve, r_solve)

    print(records_stats)
    print(time_stats)
75 |
76 |
def print_binning_information(binning_type, print_level, name, status,
                              solver_type, solver, time_total, time_prebinning,
                              time_solver, time_optimizer, time_postprocessing,
                              n_prebins, n_refinements, n_records, n_add,
                              time_add, n_solve, time_solve, memory_usage,
                              dict_user_options):
    """Print the information report for an optimal binning sketch run.

    ``print_level`` 0 prints a short summary, >= 1 prints pre-binning,
    solver, timing and streaming statistics, and 2 additionally prints the
    option settings.
    """
    print_header()

    if print_level == 2:
        # NOTE(review): binning_type other than "optimalbinningsketch"
        # would leave dict_default_options unassigned (NameError) — callers
        # pass only that type.
        if binning_type == "optimalbinningsketch":
            dict_default_options = optimal_binning_sketch_options

        print_optional_parameters(dict_default_options, dict_user_options)

    if print_level == 0:
        print_main_info(name, status, time_total)
    elif print_level >= 1:
        print_name_status(name, status)

        print_prebinning_statistics(n_prebins, n_refinements)

        # Solver and timing details only make sense after a solution.
        if status in ("OPTIMAL", "FEASIBLE"):
            if solver is not None:
                print_solver_statistics(solver_type, solver)

            print_timing(solver_type, solver, time_total, time_prebinning,
                         time_solver, time_optimizer, time_postprocessing)

        print_streaming_timing(memory_usage, n_records, n_add, time_add,
                               n_solve, time_solve)
108 |
--------------------------------------------------------------------------------
/optbinning/binning/distributed/gk.py:
--------------------------------------------------------------------------------
1 | """
2 | Greenwald-Khanna's streaming quantiles.
3 |
4 | References:
5 | [1] M. Greenwald and S. Khanna, "Space-Efficient Online Computation of
6 | Quantile Summaries", (2001).
7 |
8 | Comment: + improvements (~ 30% faster for large arrays)
9 |
10 | [2] https://github.com/DataDog/sketches-py/tree/master/gkarray
11 | """
12 |
13 | import numpy as np
14 |
15 |
class Entry:
    """Summary tuple t = (v, g, delta) of the GK quantile sketch.

    Parameters
    ----------
    value : float
        value that corresponds to one of the elements of the sequence.

    g : float
        g = r_min(value_[i]) - r_min(value_[i-1])

    delta : float
        r_max - r_min
    """
    def __init__(self, value, g, delta):
        self.delta = delta
        self.g = g
        self.value = value
35 |
36 |
37 | class GK:
38 | """Greenwald-Khanna's streaming quantiles.
39 |
40 | Parameters
41 | ----------
42 | eps : float (default=0.01)
43 | Relative error epsilon.
44 | """
45 | def __init__(self, eps=0.01):
46 | self.eps = eps
47 |
48 | self.entries = []
49 | self.incoming = []
50 | self._min = np.inf
51 | self._max = -np.inf
52 | self._count = 0
53 | self._sum = 0
54 |
55 | self._compress_threshold = int(1.0 / self.eps) + 1
56 |
57 | def __len__(self):
58 | if len(self.incoming):
59 | self.merge_compress()
60 | return len(self.entries)
61 |
62 | def add(self, value):
63 | """Add value to sketch."""
64 | self.incoming.append(value)
65 | self._count += 1
66 | self._sum += value
67 |
68 | if value < self._min:
69 | self._min = value
70 | if value > self._max:
71 | self._max = value
72 |
73 | if self._count % self._compress_threshold == 0:
74 | self.merge_compress()
75 |
76 | def copy(self, gk):
77 | """Copy GK sketch."""
78 | self.entries = [Entry(e.value, e.g, e.delta) for e in gk.entries]
79 | self.incoming = gk.incoming[:]
80 | self._count = gk._count
81 | self._min = gk._min
82 | self._max = gk._max
83 | self._sum = gk._sum
84 |
    def merge(self, gk):
        """Merge sketch with another sketch gk.

        Absorbs gk's summary into this sketch: gk's entries are converted
        into fresh entries with adjusted g values and then merged/compressed
        together with the current summary.
        """
        if not self.mergeable(gk):
            raise Exception("gk does not share signature.")

        # Nothing to merge.
        if gk._count == 0:
            return

        # This sketch is empty: become a copy of gk.
        if self._count == 0:
            self.copy(gk)
            return

        entries = []
        # NOTE(review): ``gk.n`` is not assigned in __init__; presumably a
        # property defined later in the class (not visible here) — confirm
        # it returns the processed-value count.
        spread = int(gk.eps * (gk.n - 1))
        gk.merge_compress()

        # upper bound elements(gk.v0, gk.v1) - spread
        g = gk.entries[0].g + gk.entries[0].delta - 1 - spread

        if g > 0:
            entries.append(Entry(gk._min, g, 0))

        # Re-derive g for the interior entries from consecutive deltas.
        n_gk = len(gk)
        for i in range(n_gk - 1):
            tp1 = gk.entries[i + 1]
            t = gk.entries[i]
            g = tp1.g + (tp1.delta - t.delta)
            if g > 0:
                entries.append(Entry(t.value, g, 0))

        last_t = gk.entries[n_gk - 1]
        g = spread + 1 - last_t.delta
        if g > 0:
            entries.append(Entry(last_t.value, g, 0))

        # Combine running statistics of both sketches.
        self._count += gk._count
        self._min = min(self._min, gk._min)
        self._max = max(self._max, gk._max)
        self._sum += gk._sum

        self.merge_compress(entries)
126 |
127 | def merge_compress(self, entries=[]):
128 | """Compress sketch."""
129 | remove_threshold = float(2.0 * self.eps * (self._count - 1))
130 |
131 | incoming = [Entry(value, 1, 0) for value in self.incoming]
132 |
133 | if len(entries):
134 | incoming.extend(Entry(e.value, e.g, e.delta) for e in entries)
135 |
136 | incoming = sorted(incoming, key=lambda e: e.value)
137 |
138 | merged = []
139 | i = 0
140 | j = 0
141 | n_incoming = len(incoming)
142 | n_entries = len(self.entries)
143 |
144 | while i < n_incoming or j < n_entries:
145 | if i == n_incoming:
146 | t = self.entries[j]
147 | j += 1
148 | if j < n_entries:
149 | tn = self.entries[j]
150 | if t.g + tn.g + tn.delta <= remove_threshold:
151 | tn.g += t.g
152 | continue
153 | merged.append(t)
154 | elif j == n_entries:
155 | t = incoming[i]
156 | i += 1
157 | if i < n_incoming:
158 | tn = incoming[i]
159 | if t.g + tn.g + tn.delta <= remove_threshold:
160 | tn.g += t.g
161 | continue
162 | merged.append(t)
163 | elif incoming[i].value < self.entries[j].value:
164 | ti = incoming[i]
165 | tj = self.entries[j]
166 | if ti.g + tj.g + tj.delta <= remove_threshold:
167 | tj.g += ti.g
168 | else:
169 | ti.delta = tj.g + tj.delta - ti.g
170 | merged.append(ti)
171 | i += 1
172 | else:
173 | t = self.entries[j]
174 | j += 1
175 | if j < n_entries:
176 | tn = self.entries[j]
177 | if t.g + tn.g + tn.delta <= remove_threshold:
178 | tn.g += t.g
179 | continue
180 | merged.append(t)
181 |
182 | self.entries = merged
183 | self.incoming = []
184 |
185 | def mergeable(self, gk):
186 | """Check whether a sketch gk is mergeable."""
187 | return self.eps == gk.eps
188 |
189 | def quantile(self, q):
190 | """Calculate quantile q."""
191 | if not (0 <= q <= 1):
192 | raise ValueError("q must be a value in [0, 1].")
193 |
194 | if self._count == 0:
195 | raise ValueError("GK sketch does not contain values.")
196 |
197 | if len(self.incoming):
198 | self.merge_compress()
199 |
200 | rank = int(q * (self._count - 1) + 1)
201 | spread = int(self.eps * (self._count - 1))
202 | g_sum = 0.0
203 | i = 0
204 |
205 | n_entries = len(self.entries)
206 | while i < n_entries:
207 | g_sum += self.entries[i].g
208 | if g_sum + self.entries[i].delta > rank + spread:
209 | break
210 | i += 1
211 | if i == 0:
212 | return self._min
213 |
214 | return self.entries[i - 1].value
215 |
216 | @property
217 | def n(self):
218 | """Number of records in sketch."""
219 | return self._count
220 |
--------------------------------------------------------------------------------
/optbinning/binning/distributed/plots.py:
--------------------------------------------------------------------------------
1 | """
2 | Binning sketch plots.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | import matplotlib.pyplot as plt
9 | import numpy as np
10 |
11 |
def plot_progress_divergence(df, divergence):
    """Plot the divergence progress of a binning sketch.

    Parameters
    ----------
    df : pandas.DataFrame
        Progress data; assumes columns ``n_add``, ``n_records`` and
        ``divergence`` (one row per add call).

    divergence : str
        Name of the divergence measure, used for the y-axis label.
    """
    # Two-sided 95% critical value of the standard normal distribution.
    z_crit = 1.959963984540054

    n_obs = len(df)
    records = df.n_records
    div = df.divergence

    # Cumulative (expanding) mean and standard error of the divergence.
    rolling = div.rolling(n_obs, min_periods=1)
    mean_div = rolling.mean()
    se_div = rolling.std() / np.sqrt(np.arange(1, n_obs + 1))

    lower = np.maximum(0, div - se_div * z_crit)
    upper = div + se_div * z_crit

    plt.plot(records, div,
             label="divergence ({:.5f})".format(div.values[-1]))
    plt.plot(records, mean_div, linestyle="-.", color="green",
             label="moving mean ({:.5f})".format(mean_div.values[-1]))
    plt.fill_between(records, lower, upper, alpha=0.2, color="green",
                     label="standard error ({:.5f})".format(se_div.values[-1]))

    title = "Progress after {:} add and {} processed records".format(
        int(df.n_add.values[-1]), int(records.values[-1]))
    plt.title(title, fontsize=14)
    plt.xlabel("Processed records", fontsize=12)
    plt.ylabel("Divergence: {}".format(divergence), fontsize=12)
    plt.legend(fontsize=12)

    plt.show()
43 |
--------------------------------------------------------------------------------
/optbinning/binning/mdlp.py:
--------------------------------------------------------------------------------
1 | """
2 | Minimum Description Length Principle (MDLP)
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | import numbers
9 |
10 | import numpy as np
11 |
12 | from scipy import special
13 | from sklearn.base import BaseEstimator
14 | from sklearn.exceptions import NotFittedError
15 | from sklearn.utils import check_array
16 |
17 |
18 | def _check_parameters(min_samples_split, min_samples_leaf, max_candidates):
19 | if (not isinstance(min_samples_split, numbers.Integral) or
20 | min_samples_split < 2):
21 | raise ValueError("min_samples_split must be a positive integer >= 2; "
22 | "got {}.".format(min_samples_split))
23 |
24 | if (not isinstance(min_samples_leaf, numbers.Integral) or
25 | min_samples_leaf < 1):
26 | raise ValueError("min_samples_leaf must be a positive integer >= 1; "
27 | "got {}.".format(min_samples_leaf))
28 |
29 | if not isinstance(max_candidates, numbers.Integral) or max_candidates < 1:
30 | raise ValueError("max_candidates must be a positive integer >= 1; "
31 | "got {}.".format(max_candidates))
32 |
33 |
class MDLP(BaseEstimator):
    """
    Minimum Description Length Principle (MDLP) discretization algorithm.

    Parameters
    ----------
    min_samples_split : int (default=2)
        The minimum number of samples required to split an internal node.

    min_samples_leaf : int (default=2)
        The minimum number of samples required to be at a leaf node.

    max_candidates : int (default=32)
        The maximum number of split points to evaluate at each partition.

    Notes
    -----
    Implementation of the discretization algorithm in [FI93]. A dynamic
    split strategy based on binning the number of candidate splits [CMR2001]
    is implemented to increase efficiency. For large size datasets, it is
    recommended to use a smaller ``max_candidates`` (e.g. 16) to get a
    significant speed up.

    The entropy computation assumes a binary target encoded as 0/1
    (``_entropy`` sums the target to count events).

    References
    ----------

    .. [FI93] U. M. Fayyad and K. B. Irani. "Multi-Interval Discretization of
              Continuous-Valued Attributes for Classification Learning".
              International Joint Conferences on Artificial Intelligence,
              13:1022–1027, 1993.

    .. [CMR2001] D. M. Chickering, C. Meek and R. Rounthwaite. "Efficient
                 Determination of Dynamic Split Points in a Decision Tree". In
                 Proceedings of the 2001 IEEE International Conference on Data
                 Mining, 91-98, 2001.
    """
    def __init__(self, min_samples_split=2, min_samples_leaf=2,
                 max_candidates=32):

        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_candidates = max_candidates

        # auxiliary: accumulated split points, in discovery order
        self._splits = []

        self._is_fitted = None

    def fit(self, x, y):
        """Fit MDLP discretization algorithm.

        Parameters
        ----------
        x : array-like, shape = (n_samples)
            Data samples, where n_samples is the number of samples.

        y : array-like, shape = (n_samples)
            Target vector relative to x.

        Returns
        -------
        self : MDLP
        """
        return self._fit(x, y)

    def _fit(self, x, y):
        _check_parameters(**self.get_params())

        x = check_array(x, ensure_2d=False, force_all_finite=True)
        y = check_array(y, ensure_2d=False, force_all_finite=True)

        # Sort once by x; the recursion relies on sorted partitions.
        idx = np.argsort(x)
        x = x[idx]
        y = y[idx]

        self._recurse(x, y, 0)

        self._is_fitted = True

        return self

    def _recurse(self, x, y, level):
        """Recursively split the sorted partition (x, y).

        ``level`` is a recursion identifier (renamed from ``id``, which
        shadowed the builtin).
        """
        u_x = np.unique(x)
        n_x = len(u_x)
        n_y = len(np.bincount(y))

        split = self._find_split(x, y)

        if split is not None:
            self._splits.append(split)
            t = np.searchsorted(x, split, side="right")

            # Recurse only while the MDLP criterion accepts the split and
            # the partition remains splittable.
            if not self._terminate(n_x, n_y, y, y[:t], y[t:]):
                self._recurse(x[:t], y[:t], level + 1)
                self._recurse(x[t:], y[t:], level + 2)

    def _find_split(self, x, y):
        """Return the candidate split with maximum entropy gain, or None.

        The unused ``u_x`` parameter of the original signature was removed;
        candidates are computed here from class-boundary midpoints.
        """
        n_x = len(x)
        # Candidate cut points: midpoints between consecutive samples
        # where the target value changes (class boundary points).
        u_x = np.unique(0.5 * (x[1:] + x[:-1])[(y[1:] - y[:-1]) != 0])

        if len(u_x) > self.max_candidates:
            # Dynamic split strategy [CMR2001]: subsample candidates
            # via evenly spaced percentiles.
            percentiles = np.linspace(1, 100, self.max_candidates)
            splits = np.percentile(u_x, percentiles)
        else:
            splits = u_x

        max_entropy_gain = 0
        best_split = None

        tt = np.searchsorted(x, splits, side="right")
        for i, t in enumerate(tt):
            # Enforce the minimum leaf size on both sides of the cut.
            samples_l = t >= self.min_samples_leaf
            samples_r = n_x - t >= self.min_samples_leaf

            if samples_l and samples_r:
                entropy_gain = self._entropy_gain(y, y[:t], y[t:])
                if entropy_gain > max_entropy_gain:
                    max_entropy_gain = entropy_gain
                    best_split = splits[i]

        return best_split

    def _entropy(self, x):
        """Shannon entropy of a binary 0/1 target vector (nats)."""
        n = len(x)
        ns1 = np.sum(x)
        ns0 = n - ns1
        p = np.array([ns0, ns1]) / n
        # xlogy handles p == 0 without warnings (0 * log 0 == 0).
        return -special.xlogy(p, p).sum()

    def _entropy_gain(self, y, y1, y2):
        """Information gain of splitting y into y1 and y2."""
        n = len(y)
        n1 = len(y1)
        n2 = n - n1
        ent_y = self._entropy(y)
        ent_y1 = self._entropy(y1)
        ent_y2 = self._entropy(y2)
        return ent_y - (n1 * ent_y1 + n2 * ent_y2) / n

    def _terminate(self, n_x, n_y, y, y1, y2):
        """MDLP stopping criterion [FI93]: True when splitting must stop."""
        splittable = (n_x >= self.min_samples_split) and (n_y >= 2)

        n = len(y)
        n1 = len(y1)
        n2 = n - n1
        ent_y = self._entropy(y)
        ent_y1 = self._entropy(y1)
        ent_y2 = self._entropy(y2)
        gain = ent_y - (n1 * ent_y1 + n2 * ent_y2) / n

        # k, k1, k2: number of distinct target values in each partition
        # (bincount length; assumes non-negative integer labels).
        k = len(np.bincount(y))
        k1 = len(np.bincount(y1))
        k2 = len(np.bincount(y2))

        # delta(A, T; S) term of the MDLP acceptance criterion.
        t0 = np.log(3**k - 2)
        t1 = k * ent_y
        t2 = k1 * ent_y1
        t3 = k2 * ent_y2
        delta = t0 - (t1 - t2 - t3)

        return gain <= (np.log(n - 1) + delta) / n or not splittable

    @property
    def splits(self):
        """List of split points

        Returns
        -------
        splits : numpy.ndarray
        """
        if not self._is_fitted:
            raise NotFittedError("This {} instance is not fitted yet. Call "
                                 "'fit' with appropriate arguments."
                                 .format(self.__class__.__name__))

        return np.sort(self._splits)
209 |
--------------------------------------------------------------------------------
/optbinning/binning/multiclass_cp.py:
--------------------------------------------------------------------------------
1 | """
Generalized assignment problem: solve constrained multiclass optimal binning
3 | problem. Constraint programming implementation.
4 | """
5 |
6 | # Guillermo Navas-Palencia
7 | # Copyright (C) 2019
8 |
9 | from ortools.sat.python import cp_model
10 |
11 | from .cp import BinningCP
12 | from .model_data import multiclass_model_data
13 |
14 |
class MulticlassBinningCP(BinningCP):
    """Constraint programming model for multiclass optimal binning.

    Extends ``BinningCP`` with one monotonic trend per class:
    ``monotonic_trend`` is a sequence with one entry per class.
    """
    def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size,
                 max_bin_size, min_event_rate_diff, max_pvalue,
                 max_pvalue_policy, user_splits_fixed, time_limit):

        self.monotonic_trend = monotonic_trend

        self.min_n_bins = min_n_bins
        self.max_n_bins = max_n_bins
        self.min_bin_size = min_bin_size
        self.max_bin_size = max_bin_size

        self.min_event_rate_diff = min_event_rate_diff
        self.max_pvalue = max_pvalue
        self.max_pvalue_policy = max_pvalue_policy
        self.user_splits_fixed = user_splits_fixed
        self.time_limit = time_limit

        self.solver_ = None

        # Auxiliary
        self._is_scenario_binning = False
        self._model = None
        self._n = None
        self._x = None

    def build_model(self, n_nonevent, n_event, trend_changes):
        """Build the CP-SAT model: objective and binning constraints."""
        # Parameters: big-M constant shared with the model data.
        M = int(1e6)
        (D, V, pvalue_violation_indices,
         min_diff_violation_indices) = multiclass_model_data(
            n_nonevent, n_event, self.max_pvalue, self.max_pvalue_policy,
            self.min_event_rate_diff, M)

        n = len(n_nonevent)
        n_records = n_nonevent + n_event
        n_classes = len(self.monotonic_trend)

        # Initialize model
        model = cp_model.CpModel()

        # Decision variables
        x, y, t, d, u, bin_size_diff = self.decision_variables(
            model, n, n_classes)

        # Objective function: total value summed over all classes.
        model.Maximize(sum([sum([(V[c][i][i] * x[i, i]) +
                            sum([(V[c][i][j] - V[c][i][j+1]) * x[i, j]
                                 for j in range(i)]) for i in range(n)])
                            for c in range(n_classes)]))

        # Constraint: unique assignment
        self.add_constraint_unique_assignment(model, n, x)

        # Constraint: continuity
        self.add_constraint_continuity(model, n, x)

        # Constraint: min / max bins
        self.add_constraint_min_max_bins(model, n, x, d)

        # Constraint: min / max bin size
        self.add_constraint_min_max_bin_size(model, n, x, u, n_records,
                                             bin_size_diff)

        # Constraints: monotonicity (one trend per class)
        for c in range(n_classes):
            trend = self.monotonic_trend[c]

            if trend == "ascending":
                self.add_constraint_monotonic_ascending(model, n, D[c], x, M)

            elif trend == "descending":
                self.add_constraint_monotonic_descending(model, n, D[c], x, M)

            elif trend in ("peak", "valley"):
                # Link the change-point variable t[c] to the indicator
                # variables y[c, i].
                for i in range(n):
                    model.Add(t[c] >= i - n * (1 - y[c, i]))
                    model.Add(t[c] <= i + n * y[c, i])

                if trend == "peak":
                    self.add_constraint_monotonic_peak(
                        model, n, D[c], x, c, y, M)
                else:
                    self.add_constraint_monotonic_valley(
                        model, n, D[c], x, c, y, M)

            # Bug fix: compare the per-class trend (monotonic_trend[c]),
            # not the whole list; the previous comparison was always False
            # so these heuristic constraints were never added.
            elif trend == "peak_heuristic":
                self.add_constraint_monotonic_peak_heuristic(
                    model, n, D[c], x, trend_changes[c], M)

            elif trend == "valley_heuristic":
                self.add_constraint_monotonic_valley_heuristic(
                    model, n, D[c], x, trend_changes[c], M)

        # Constraint: max-pvalue
        for c in range(n_classes):
            self.add_constraint_violation(model, x,
                                          pvalue_violation_indices[c])

        # Constraint: min diff
        for c in range(n_classes):
            self.add_constraint_violation(model, x,
                                          min_diff_violation_indices[c])

        # Constraint: fixed splits
        self.add_constraint_fixed_splits(model, n, x)

        self._model = model
        self._x = x
        self._n = n

    def decision_variables(self, model, n, n_classes):
        """Create decision variables.

        Returns the assignment variables x plus auxiliary variables
        (y, t for peak/valley trends; d, u, bin_size_diff for range
        constraints), each None when not needed.
        """
        x = {}
        for i in range(n):
            for j in range(i + 1):
                x[i, j] = model.NewBoolVar("x[{}, {}]".format(i, j))

        y = None
        t = None
        d = None
        u = None
        bin_size_diff = None

        if "peak" in self.monotonic_trend or "valley" in self.monotonic_trend:
            # Auxiliary binary variables
            y = {}
            t = {}
            for c in range(n_classes):
                if self.monotonic_trend[c] in ("peak", "valley"):
                    for i in range(n):
                        y[c, i] = model.NewBoolVar("y[{}]".format(i))

                    # Change points
                    t[c] = model.NewIntVar(0, n, "t[{}]".format(c))

        if self.min_n_bins is not None and self.max_n_bins is not None:
            n_bin_diff = self.max_n_bins - self.min_n_bins

            # Range constraints auxiliary variables
            d = model.NewIntVar(0, n_bin_diff, "n_bin_diff")

        if self.min_bin_size is not None and self.max_bin_size is not None:
            bin_size_diff = self.max_bin_size - self.min_bin_size

            # Range constraints auxiliary variables
            u = {}
            for i in range(n):
                u[i] = model.NewIntVar(0, bin_size_diff, "u[{}]".format(i))

        return x, y, t, d, u, bin_size_diff

    def add_constraint_monotonic_peak(self, model, n, D, x, c, y, M):
        """Big-M peak monotonicity constraints for class c: ascending
        before the change point, descending after."""
        for i in range(1, n):
            for z in range(i):
                model.Add(
                    M * (y[c, i] + y[c, z]) + M + (D[z][z] - M) * x[z, z] +
                    sum([(D[z][j] - D[z][j+1]) * x[z, j]
                         for j in range(z)]) -
                    sum([(D[i][j] - D[i][j + 1]) * x[i, j]
                         for j in range(i)]) -
                    D[i][i] * x[i, i] >= 0)

                model.Add(
                    M * (2 - y[c, i] - y[c, z]) + M + (D[i][i] - M) * x[i, i] +
                    sum([(D[i][j] - D[i][j + 1]) * x[i, j]
                         for j in range(i)]) -
                    sum([(D[z][j] - D[z][j+1]) * x[z, j]
                         for j in range(z)]) -
                    D[z][z] * x[z, z] >= 0)

    def add_constraint_monotonic_valley(self, model, n, D, x, c, y, M):
        """Big-M valley monotonicity constraints for class c: descending
        before the change point, ascending after."""
        for i in range(1, n):
            for z in range(i):
                model.Add(
                    M * (y[c, i] + y[c, z]) + M + (D[i][i] - M) * x[i, i] +
                    sum([(D[i][j] - D[i][j + 1]) * x[i, j]
                         for j in range(i)]) -
                    sum([(D[z][j] - D[z][j+1]) * x[z, j]
                         for j in range(z)]) -
                    D[z][z] * x[z, z] >= 0)

                model.Add(
                    M * (2 - y[c, i] - y[c, z]) + M + (D[z][z] - M) * x[z, z] +
                    sum([(D[z][j] - D[z][j+1]) * x[z, j]
                         for j in range(z)]) -
                    sum([(D[i][j] - D[i][j + 1]) * x[i, j]
                         for j in range(i)]) -
                    D[i][i] * x[i, i] >= 0)
201 |
--------------------------------------------------------------------------------
/optbinning/binning/multidimensional/__init__.py:
--------------------------------------------------------------------------------
1 | from .binning_2d import OptimalBinning2D
2 | from .continuous_binning_2d import ContinuousOptimalBinning2D
3 |
4 | __all__ = ['ContinuousOptimalBinning2D',
5 | 'OptimalBinning2D']
6 |
--------------------------------------------------------------------------------
/optbinning/binning/multidimensional/cp_2d.py:
--------------------------------------------------------------------------------
1 | """
Generalized assignment problem: solve constrained optimal 2D binning problem.
3 | Constraint programming implementation.
4 | """
5 |
6 | # Guillermo Navas-Palencia
7 | # Copyright (C) 2021
8 |
9 | import numpy as np
10 |
11 | from ortools.sat.python import cp_model
12 |
13 |
class Binning2DCP:
    """Constraint programming (CP-SAT) model for optimal 2D binning.

    Builds and solves a generalized assignment model over candidate
    rectangles of the 2D grid.
    """
    def __init__(self, monotonic_trend_x, monotonic_trend_y, min_n_bins,
                 max_n_bins, min_diff_x, min_diff_y, gamma, n_jobs,
                 time_limit):

        self.monotonic_trend_x = monotonic_trend_x
        self.monotonic_trend_y = monotonic_trend_y
        self.min_n_bins = min_n_bins
        self.max_n_bins = max_n_bins
        self.min_diff_x = min_diff_x
        self.min_diff_y = min_diff_y
        self.gamma = gamma

        self.n_jobs = n_jobs
        self.time_limit = time_limit

        self.solver_ = None
        self.event_rate_ = None
        self.iv_ = None

        self._model = None
        self._x = None
        self._n_rectangles = None

    def build_model(self, n_grid, n_rectangles, cols, c, d_connected_x,
                    d_connected_y, er, n_records):
        """Build the CP model: objective and binning constraints.

        CP-SAT works on integers, so the regularization coefficient is
        scaled before rounding.
        """
        # Parameters
        scale = int(1e6)

        # Initialize model
        model = cp_model.CpModel()

        # Decision variables
        x, d = self.decision_variables(model, n_rectangles)

        # Objective function
        if self.gamma:
            # Regularization term penalizing the spread between the
            # largest (pmax) and smallest (pmin) selected bin sizes.
            total_records = int(n_records.sum())
            regularization = int(np.ceil(scale * self.gamma / total_records))
            pmax = model.NewIntVar(0, total_records, "pmax")
            pmin = model.NewIntVar(0, total_records, "pmin")

            model.Maximize(sum([c[i] * x[i] for i in range(n_rectangles)]) -
                           regularization * (pmax - pmin))
        else:
            model.Maximize(sum([c[i] * x[i] for i in range(n_rectangles)]))

        # Constraint: unique assignment
        self.add_constraint_unique_assignment(model, x, n_grid, cols)

        # Constraint: min / max bins
        self.add_constraint_min_max_bins(model, n_rectangles, x, d)

        # Constraint: monotonicity
        self.add_constraint_monotonic(
            model, n_rectangles, x, er, d_connected_x, d_connected_y,
            self.min_diff_x, self.min_diff_y)

        # Constraint: reduction of dominating bins
        if self.gamma:
            for i in range(n_rectangles):
                bin_size = n_records[i] * x[i]

                model.Add(pmin <= total_records * (1 - x[i]) + bin_size)
                model.Add(pmax >= bin_size)
            model.Add(pmin <= pmax)

        # Save data for post-processing
        self._model = model
        self._x = x
        self._n_rectangles = n_rectangles

    def solve(self):
        """Solve the model.

        Returns
        -------
        status_name : str
            CP-SAT status name.

        solution : numpy.ndarray of bool
            Selected rectangles; all False when no feasible solution.
        """
        # Solve
        self.solver_ = cp_model.CpSolver()
        if self.n_jobs > 1:
            self.solver_.parameters.num_search_workers = self.n_jobs
        else:
            self.solver_.parameters.linearization_level = 2

        self.solver_.parameters.max_time_in_seconds = self.time_limit

        status = self.solver_.Solve(self._model)
        status_name = self.solver_.StatusName(status)

        if status in (cp_model.OPTIMAL, cp_model.FEASIBLE):
            solution = np.array([self.solver_.BooleanValue(self._x[i])
                                 for i in range(self._n_rectangles)])
        else:
            # Bug fix: np.bool was deprecated and removed in NumPy >= 1.24;
            # use the builtin bool dtype (consistent with Binning2DMIP).
            solution = np.zeros(self._n_rectangles, dtype=bool)

        return status_name, solution

    def decision_variables(self, model, n_rectangles):
        """Create rectangle selection variables x and the optional range
        auxiliary variable d."""
        x = {}
        for i in range(n_rectangles):
            x[i] = model.NewBoolVar("x[{}]".format(i))

        d = None

        if self.min_n_bins is not None and self.max_n_bins is not None:
            n_bin_diff = self.max_n_bins - self.min_n_bins

            # Range constraints auxiliary variables
            d = model.NewIntVar(0, n_bin_diff, "n_bin_diff")

        return x, d

    def add_constraint_unique_assignment(self, model, x, n_grid, cols):
        """Each grid cell is covered by exactly one selected rectangle."""
        for j in range(n_grid):
            model.Add(sum([x[i] for i in cols[j]]) == 1)

    def add_constraint_min_max_bins(self, model, n_rectangles, x, d):
        """Bound the number of selected rectangles (bins)."""
        if self.min_n_bins is not None or self.max_n_bins is not None:
            n_bins = sum([x[i] for i in range(n_rectangles)])

            if self.min_n_bins is not None and self.max_n_bins is not None:
                model.Add(d + n_bins - self.max_n_bins == 0)
            elif self.min_n_bins is not None:
                model.Add(n_bins >= self.min_n_bins)
            elif self.max_n_bins is not None:
                model.Add(n_bins <= self.max_n_bins)

    def add_constraint_monotonic(self, model, n_rectangles, x,
                                 er, d_connected_x, d_connected_y, min_diff_x,
                                 min_diff_y):
        """Event-rate monotonicity along x and/or y: a rectangle excludes
        any connected rectangle that would violate the trend."""
        if (self.monotonic_trend_x is not None and
                self.monotonic_trend_y is not None):
            for i in range(n_rectangles):
                ind_x = []
                ind_y = []
                for j in d_connected_x[i]:
                    if self.monotonic_trend_x == "ascending":
                        if er[i] + min_diff_x >= er[j]:
                            ind_x.append(j)
                    elif self.monotonic_trend_x == "descending":
                        if er[i] <= er[j] + min_diff_x:
                            ind_x.append(j)

                if ind_x:
                    model.Add(sum([x[j] for j in ind_x]) <=
                              len(ind_x) * (1 - x[i]))

                for j in d_connected_y[i]:
                    if self.monotonic_trend_y == "ascending":
                        if er[i] + min_diff_y >= er[j]:
                            ind_y.append(j)
                    elif self.monotonic_trend_y == "descending":
                        if er[i] <= er[j] + min_diff_y:
                            ind_y.append(j)

                if ind_y:
                    model.Add(sum([x[j] for j in ind_y]) <=
                              len(ind_y) * (1 - x[i]))

        elif self.monotonic_trend_x is not None:
            for i in range(n_rectangles):
                ind_x = []
                for j in d_connected_x[i]:
                    if self.monotonic_trend_x == "ascending":
                        if er[i] + min_diff_x >= er[j]:
                            ind_x.append(j)
                    elif self.monotonic_trend_x == "descending":
                        if er[i] <= er[j] + min_diff_x:
                            ind_x.append(j)

                if ind_x:
                    model.Add(sum([x[j] for j in ind_x]) <=
                              len(ind_x) * (1 - x[i]))

        elif self.monotonic_trend_y is not None:
            for i in range(n_rectangles):
                ind_y = []
                for j in d_connected_y[i]:
                    if self.monotonic_trend_y == "ascending":
                        if er[i] + min_diff_y >= er[j]:
                            ind_y.append(j)
                    elif self.monotonic_trend_y == "descending":
                        if er[i] <= er[j] + min_diff_y:
                            ind_y.append(j)

                if ind_y:
                    model.Add(sum([x[j] for j in ind_y]) <=
                              len(ind_y) * (1 - x[i]))
199 |
--------------------------------------------------------------------------------
/optbinning/binning/multidimensional/mip_2d.py:
--------------------------------------------------------------------------------
1 | """
Generalized assignment problem: solve constrained optimal 2D binning problem.
3 | Mixed-Integer programming implementation.
4 | """
5 |
6 | # Guillermo Navas-Palencia
7 | # Copyright (C) 2021
8 |
9 | import numpy as np
10 |
11 | from ortools.linear_solver import pywraplp
12 |
13 |
class Binning2DMIP:
    """Mixed-integer programming (CBC) model for optimal 2D binning.

    Mirrors ``Binning2DCP`` with the same constraints, built on the
    OR-Tools linear solver instead of CP-SAT.
    """
    def __init__(self, monotonic_trend_x, monotonic_trend_y, min_n_bins,
                 max_n_bins, min_diff_x, min_diff_y, gamma, n_jobs,
                 time_limit):

        self.monotonic_trend_x = monotonic_trend_x
        self.monotonic_trend_y = monotonic_trend_y
        self.min_n_bins = min_n_bins
        self.max_n_bins = max_n_bins
        self.min_diff_x = min_diff_x
        self.min_diff_y = min_diff_y
        self.gamma = gamma

        self.n_jobs = n_jobs
        self.time_limit = time_limit

        self.solver_ = None
        self.event_rate_ = None
        self.iv_ = None

        # Auxiliary model data filled by build_model.
        self._model = None
        self._x = None
        self._n_rectangles = None

    def build_model(self, n_grid, n_rectangles, cols, c, d_connected_x,
                    d_connected_y, er, n_records):
        """Build the MIP model: objective and binning constraints."""
        # Initialize solver
        solver = pywraplp.Solver(
            'BinningMIP', pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING)

        # Decision variables
        x, d = self.decision_variables(solver, n_rectangles)

        # Objective function
        if self.gamma:
            # Regularization term penalizing the spread between the
            # largest (pmax) and smallest (pmin) selected bin sizes.
            total_records = int(n_records.sum())
            regularization = self.gamma / total_records
            pmax = solver.NumVar(0, total_records, "pmax")
            pmin = solver.NumVar(0, total_records, "pmin")

            solver.Maximize(
                solver.Sum([c[i] * x[i] for i in range(n_rectangles)]) -
                regularization * (pmax - pmin))
        else:
            solver.Maximize(
                solver.Sum([c[i] * x[i] for i in range(n_rectangles)]))

        # Constraint: unique assignment
        self.add_constraint_unique_assignment(solver, x, n_grid, cols)

        # Constraint: min / max bins
        self.add_constraint_min_max_bins(solver, n_rectangles, x, d)

        # Constraint: monotonicity
        self.add_constraint_monotonic(
            solver, n_rectangles, x, er, d_connected_x, d_connected_y,
            self.min_diff_x, self.min_diff_y)

        # Constraint: reduction of dominating bins
        if self.gamma:
            for i in range(n_rectangles):
                bin_size = n_records[i] * x[i]

                solver.Add(pmin <= total_records * (1 - x[i]) + bin_size)
                solver.Add(pmax >= bin_size)
            solver.Add(pmin <= pmax)

        # Save data for post-processing
        self.solver_ = solver
        self._x = x
        self._n_rectangles = n_rectangles

    def solve(self):
        """Solve the model.

        Returns
        -------
        status_name : str
            Solver status name.

        solution : numpy.ndarray of bool
            Selected rectangles; all False when no feasible solution.
        """
        # Solve
        self.solver_.SetTimeLimit(self.time_limit * 1000)
        self.solver_.SetNumThreads(self.n_jobs)
        status = self.solver_.Solve()

        if status in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE):
            if status == pywraplp.Solver.OPTIMAL:
                status_name = "OPTIMAL"
            else:
                status_name = "FEASIBLE"

            solution = np.array([self._x[i].solution_value()
                                 for i in range(self._n_rectangles)])

            solution = solution.astype(bool)
        else:
            if status == pywraplp.Solver.ABNORMAL:
                status_name = "ABNORMAL"
            elif status == pywraplp.Solver.INFEASIBLE:
                status_name = "INFEASIBLE"
            elif status == pywraplp.Solver.UNBOUNDED:
                status_name = "UNBOUNDED"
            else:
                status_name = "UNKNOWN"

            solution = np.zeros(self._n_rectangles, dtype=bool)

        return status_name, solution

    def decision_variables(self, solver, n_rectangles):
        """Create rectangle selection variables x and the optional range
        auxiliary variable d."""
        x = {}

        for i in range(n_rectangles):
            x[i] = solver.BoolVar("x[{}]".format(i))

        d = None

        if self.min_n_bins is not None and self.max_n_bins is not None:
            n_bin_diff = self.max_n_bins - self.min_n_bins

            # Range constraints auxiliary variables
            d = solver.NumVar(0, n_bin_diff, "n_bin_diff")

        return x, d

    def add_constraint_unique_assignment(self, solver, x, n_grid, cols):
        """Each grid cell is covered by exactly one selected rectangle."""
        for j in range(n_grid):
            solver.Add(solver.Sum([x[i] for i in cols[j]]) == 1)

    def add_constraint_min_max_bins(self, solver, n_rectangles, x, d):
        """Bound the number of selected rectangles (bins)."""
        if self.min_n_bins is not None or self.max_n_bins is not None:
            n_bins = solver.Sum([x[i] for i in range(n_rectangles)])

            if self.min_n_bins is not None and self.max_n_bins is not None:
                solver.Add(d + n_bins - self.max_n_bins == 0)
            elif self.min_n_bins is not None:
                solver.Add(n_bins >= self.min_n_bins)
            elif self.max_n_bins is not None:
                solver.Add(n_bins <= self.max_n_bins)

    def add_constraint_monotonic(self, solver, n_rectangles, x, er,
                                 d_connected_x, d_connected_y, min_diff_x,
                                 min_diff_y):
        """Event-rate monotonicity along x and/or y: a rectangle excludes
        any connected rectangle that would violate the trend."""
        if (self.monotonic_trend_x is not None and
                self.monotonic_trend_y is not None):
            for i in range(n_rectangles):
                ind_x = []
                ind_y = []
                for j in d_connected_x[i]:
                    if self.monotonic_trend_x == "ascending":
                        if er[i] + min_diff_x >= er[j]:
                            ind_x.append(j)
                    elif self.monotonic_trend_x == "descending":
                        if er[i] <= er[j] + min_diff_x:
                            ind_x.append(j)

                if ind_x:
                    solver.Add(solver.Sum([x[j] for j in ind_x]) <=
                               len(ind_x) * (1 - x[i]))

                for j in d_connected_y[i]:
                    if self.monotonic_trend_y == "ascending":
                        if er[i] + min_diff_y >= er[j]:
                            ind_y.append(j)
                    elif self.monotonic_trend_y == "descending":
                        if er[i] <= er[j] + min_diff_y:
                            ind_y.append(j)

                if ind_y:
                    solver.Add(solver.Sum([x[j] for j in ind_y]) <=
                               len(ind_y) * (1 - x[i]))

        elif self.monotonic_trend_x is not None:
            for i in range(n_rectangles):
                ind_x = []
                for j in d_connected_x[i]:
                    if self.monotonic_trend_x == "ascending":
                        if er[i] + min_diff_x >= er[j]:
                            ind_x.append(j)
                    elif self.monotonic_trend_x == "descending":
                        if er[i] <= er[j] + min_diff_x:
                            ind_x.append(j)

                if ind_x:
                    solver.Add(solver.Sum([x[j] for j in ind_x]) <=
                               len(ind_x) * (1 - x[i]))

        elif self.monotonic_trend_y is not None:
            for i in range(n_rectangles):
                ind_y = []
                for j in d_connected_y[i]:
                    if self.monotonic_trend_y == "ascending":
                        if er[i] + min_diff_y >= er[j]:
                            ind_y.append(j)
                    elif self.monotonic_trend_y == "descending":
                        if er[i] <= er[j] + min_diff_y:
                            ind_y.append(j)

                if ind_y:
                    solver.Add(solver.Sum([x[j] for j in ind_y]) <=
                               len(ind_y) * (1 - x[i]))
209 |
--------------------------------------------------------------------------------
/optbinning/binning/multidimensional/preprocessing_2d.py:
--------------------------------------------------------------------------------
1 | """
2 | Preprocessing 2D functions.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 | from sklearn.utils import check_array
12 | from sklearn.utils import check_consistent_length
13 |
14 | from ..preprocessing import categorical_transform
15 |
16 |
def split_data_2d(dtype_x, dtype_y, x, y, z, special_codes_x=None,
                  special_codes_y=None, check_input=True):
    """Split 2d data into clean, missing and special values data.

    Parameters
    ----------
    dtype_x : str, optional (default="numerical")
        The data type of variable x. Supported data type is "numerical" for
        continuous and ordinal variables.

    dtype_y : str, optional (default="numerical")
        The data type of variable y. Supported data type is "numerical" for
        continuous and ordinal variables.

    x : array-like, shape = (n_samples,)
        Training vector x, where n_samples is the number of samples.

    y : array-like, shape = (n_samples,)
        Training vector y, where n_samples is the number of samples.

    z : array-like, shape = (n_samples,)
        Target vector relative to x and y.

    special_codes_x : array-like or None, optional (default=None)
        List of special codes for the variable x. Use special codes to specify
        the data values that must be treated separately.

    special_codes_y : array-like or None, optional (default=None)
        List of special codes for the variable y. Use special codes to specify
        the data values that must be treated separately.

    check_input : bool, (default=True)
        If False, the input arrays x and y will not be checked.

    Returns
    -------
    data : tuple
        Tuple ``(x_clean, y_clean, z_clean, x_missing, y_missing, z_missing,
        x_special, y_special, z_special, x_categories, y_categories)`` where
        a record is "missing" when x, y or z is null, "special" when x or y
        matches its special codes, and "clean" otherwise. The categories
        lists are empty unless the corresponding dtype is "categorical".
    """
    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

        y = check_array(y, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

        z = check_array(z, ensure_2d=False, dtype=None,
                        force_all_finite=True)

        check_consistent_length(x, y, z)

    x = np.asarray(x)
    y = np.asarray(y)
    z = np.asarray(z)

    def _pair_missing(v):
        # np.isnan is only valid on numeric dtypes; pd.isnull also handles
        # object arrays (e.g. mixed or string data).
        if np.issubdtype(v.dtype, np.number) and np.issubdtype(
                z.dtype, np.number):
            return np.isnan(v) | np.isnan(z)
        return pd.isnull(v) | pd.isnull(z)

    def _special(v, codes):
        # No special codes means no record is special.
        if codes is None:
            return np.zeros(len(v), dtype=bool)
        return pd.Series(v).isin(codes).values

    missing_mask = _pair_missing(x) | _pair_missing(y)
    special_mask = _special(x, special_codes_x) | _special(y, special_codes_y)

    # A record is clean only when it is neither missing nor special. Note a
    # record can belong to both the missing and special outputs.
    clean_mask = ~missing_mask & ~special_mask

    x_clean, y_clean, z_clean = x[clean_mask], y[clean_mask], z[clean_mask]
    x_missing, y_missing, z_missing = (x[missing_mask], y[missing_mask],
                                       z[missing_mask])
    x_special, y_special, z_special = (x[special_mask], y[special_mask],
                                       z[special_mask])

    x_categories = []
    if dtype_x == "categorical":
        x_categories, x_clean = categorical_transform(x_clean, z_clean)

    y_categories = []
    if dtype_y == "categorical":
        y_categories, y_clean = categorical_transform(y_clean, z_clean)

    return (x_clean, y_clean, z_clean, x_missing, y_missing, z_missing,
            x_special, y_special, z_special, x_categories, y_categories)
119 |
--------------------------------------------------------------------------------
/optbinning/binning/outlier.py:
--------------------------------------------------------------------------------
1 | """
2 | Univariate outlier detection methods.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | import numbers
9 |
10 | import numpy as np
11 |
12 | from sklearn.base import BaseEstimator
13 | from sklearn.exceptions import NotFittedError
14 |
15 |
class OutlierDetector:
    """Base class for all outlier detectors."""

    def __init__(self):
        # Boolean mask of samples flagged as outliers; populated by _fit.
        self._support = None

        # Fitted-state flag checked by get_support.
        self._is_fitted = False

    def fit(self, x, y=None):
        """Fit outlier detector.

        Parameters
        ----------
        x : array-like, shape = (n_samples)

        y : array-like, shape = (n_samples) or None (default=None)

        Returns
        -------
        self : OutlierDetector
        """
        # Subclasses implement the actual detection in _fit.
        self._fit(x, y)
        return self

    def get_support(self, indices=False):
        """Get a mask, or integer index, of the samples excluded, i.e,
        samples detected as outliers.

        Parameters
        ----------
        indices : boolean (default False)
            If True, the return value will be an array of integers, rather
            than a boolean mask.

        Returns
        -------
        support : array, shape = (n_samples)
            An index that selects the excluded samples from a vector.
            If `indices` is False, this is a boolean array, in which an
            element is True iff its corresponding sample is excluded. If
            `indices` is True, this is an integer array whose values are
            indices into the input vector.
        """
        if not self._is_fitted:
            raise NotFittedError("This {} instance is not fitted yet. Call "
                                 "'fit' with appropriate arguments."
                                 .format(self.__class__.__name__))

        if indices:
            return np.where(self._support)[0]
        return self._support
67 |
68 |
class RangeDetector(BaseEstimator, OutlierDetector):
    r"""Interquartile range or interval based outlier detection method.

    The default settings compute the usual interquartile range method.

    Parameters
    ----------
    interval_length : float (default=0.5)
        Compute ``interval_length``\% credible interval. This is a value in
        [0, 1].

    k : float (default=1.5)
        Tukey's factor.

    method : str (default="ETI")
        Method to compute credible intervals. Supported methods are Highest
        Density interval (``method="HDI"``) and Equal-tailed interval
        (``method="ETI"``).
    """
    def __init__(self, interval_length=0.5, k=1.5, method="ETI"):
        self.interval_length = interval_length
        self.k = k
        self.method = method

    def _fit(self, x, y=None):
        # Parameter validation: method first, then interval length.
        if self.method not in ("ETI", "HDI"):
            raise ValueError('Invalid value for method. Allowed string '
                             'values are "ETI" and "HDI".')

        length_ok = (isinstance(self.interval_length, numbers.Number) and
                     0 <= self.interval_length <= 1)
        if not length_ok:
            raise ValueError("Interval length must a value in [0, 1]; got {}."
                             .format(self.interval_length))

        if self.method == "ETI":
            # Equal-tailed interval: symmetric percentile cut.
            lower = 100 * (1 - self.interval_length) / 2
            upper = 100 * (1 + self.interval_length) / 2

            lb, ub = np.percentile(x, [lower, upper])
        else:
            # Highest density interval: the narrowest window of the sorted
            # sample containing ceil(interval_length * n) observations.
            n = len(x)
            xsorted = np.sort(x)
            n_included = int(np.ceil(self.interval_length * n))
            widths = xsorted[n_included:] - xsorted[:n - n_included]
            j = np.argmin(widths)

            lb = xsorted[j]
            ub = xsorted[j + n_included]

        # Tukey's fences around the chosen interval.
        spread = ub - lb
        lower_fence = lb - self.k * spread
        upper_fence = ub + self.k * spread

        self._support = (x < lower_fence) | (x > upper_fence)

        self._is_fitted = True
128 |
129 |
class ModifiedZScoreDetector(BaseEstimator, OutlierDetector):
    """Modified Z-score method.

    Parameters
    ----------
    threshold : float (default=3.5)
        Modified Z-scores with an absolute value of greater than the threshold
        are labeled as outliers.

    References
    ----------

    .. [IH93] B. Iglewicz and D. Hoaglin. "Volume 16: How to Detect and Handle
              Outliers", The ASQC Basic References in Quality Control:
              Statistical Techniques, Edward F. Mykytka, Ph.D., Editor, 1993.
    """
    def __init__(self, threshold=3.5):
        self.threshold = threshold

    def _fit(self, x, y=None):
        threshold_ok = (isinstance(self.threshold, numbers.Number) and
                        self.threshold >= 0)
        if not threshold_ok:
            raise ValueError("threshold must be a value >= 0; got {}".
                             format(self.threshold))

        data = np.asarray(x)
        center = np.median(data)
        # Median absolute deviation; 0.6745 rescales the MAD so it is
        # consistent with the standard deviation for normal data [IH93].
        mad = np.median(np.abs(data - center))
        # NOTE(review): mad == 0 (half or more of the sample equal to the
        # median) produces a divide-by-zero here — confirm callers avoid it.
        scores = 0.6745 * (data - center) / mad

        self._support = np.abs(scores) > self.threshold

        self._is_fitted = True
163 |
164 |
class YQuantileDetector(BaseEstimator, OutlierDetector):
    """Outlier detector on the y-axis over quantiles.

    Splits x into (at most) ``n_bins`` equal-frequency bins and runs a
    univariate outlier detector on the y values within each bin.

    Parameters
    ----------
    outlier_detector : str, optional (default="zscore")
        The outlier detection method. Supported methods are "range" to use
        the interquartile range based method or "zscore" to use the modified
        Z-score method.

    outlier_params : dict or None, optional (default=None)
        Dictionary of parameters to pass to the outlier detection method.

    n_bins : int (default=5)
        The maximum number of bins to consider.
    """
    def __init__(self, outlier_detector="zscore", outlier_params=None,
                 n_bins=5):
        self.outlier_detector = outlier_detector
        self.outlier_params = outlier_params
        self.n_bins = n_bins

    def _fit(self, x, y):
        # Validate parameters before touching the data.
        if self.outlier_detector not in ("range", "zscore"):
            raise ValueError('Invalid value for outlier_detector. Allowed '
                             'string values are "range" and "zscore".')

        if self.outlier_params is not None:
            if not isinstance(self.outlier_params, dict):
                raise TypeError("outlier_params must be a dict or None; "
                                "got {}.".format(self.outlier_params))

        # NOTE(review): the message says "bins" but validates n_bins.
        if not isinstance(self.n_bins, numbers.Integral) or self.n_bins <= 0:
            raise ValueError("bins must be a positive integer; got {}."
                             .format(self.n_bins))

        x = np.asarray(x)
        y = np.asarray(y)

        # Equal-frequency split points on x; np.unique collapses duplicated
        # quantiles, so the effective number of bins can be < n_bins.
        q = np.linspace(0, 1, self.n_bins + 1)
        splits = np.unique(np.quantile(x, q))[1:-1]
        n_bins = len(splits) + 1
        indices = np.digitize(x, splits, right=False)

        self._support = np.zeros(x.size, dtype=bool)
        idx_support = np.arange(x.size)

        if self.outlier_detector == "zscore":
            detector = ModifiedZScoreDetector()
        elif self.outlier_detector == "range":
            detector = RangeDetector()

        if self.outlier_params is not None:
            detector.set_params(**self.outlier_params)

        # Run the univariate detector on y within each x-bin and mark the
        # flagged samples in the global support mask.
        for i in range(n_bins):
            mask_x = indices == i
            detector.fit(y[mask_x])
            mask_out = detector.get_support()
            idx_out = idx_support[mask_x][mask_out]
            self._support[idx_out] = True

        self._is_fitted = True
228 |
--------------------------------------------------------------------------------
/optbinning/binning/piecewise/__init__.py:
--------------------------------------------------------------------------------
1 | from .binning import OptimalPWBinning
2 | from .continuous_binning import ContinuousOptimalPWBinning
3 |
4 |
5 | __all__ = ['OptimalPWBinning',
6 | 'ContinuousOptimalPWBinning']
7 |
--------------------------------------------------------------------------------
/optbinning/binning/piecewise/binning_information.py:
--------------------------------------------------------------------------------
1 | """
2 | Optimal piecewise binning information.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | from ...binning.binning_information import print_header
9 | from ...binning.binning_information import print_optional_parameters
10 | from ...binning.binning_information import print_name_status
11 | from ...binning.binning_information import print_main_info
12 | from ...options import optimal_pw_binning_options
13 |
14 |
def print_prebinning_statistics(n_prebins):
    """Print the pre-binning statistics section of the binning report."""
    report = (
        "  Pre-binning statistics\n"
        "    Number of bins            {:>10}\n"
    ).format(n_prebins)

    print(report)
22 |
23 |
def print_solver_statistics(solver_type, solver):
    """Print solver size statistics.

    ``solver.stats`` is either a single dict or a list of dicts (one per
    sub-problem); list entries are aggregated by summation.
    """
    stats = solver.stats
    if isinstance(stats, list):
        n_constraints = sum(info["n_constraints"] for info in stats)
        n_variables = sum(info["n_variables"] for info in stats)
    else:
        n_constraints = stats["n_constraints"]
        n_variables = stats["n_variables"]

    report = (
        "  Solver statistics\n"
        "    Type                          {:>10}\n"
        "    Number of variables           {:>10}\n"
        "    Number of constraints         {:>10}\n"
    ).format(solver_type, n_variables, n_constraints)

    print(report)
40 |
41 |
def print_timing(solver_type, solver, time_total, time_preprocessing,
                 time_estimator, time_prebinning, time_solver,
                 time_postprocessing):
    """Print the timing breakdown of a piecewise binning run.

    Each stage time is also reported as a fraction of ``time_total``.
    ``solver_type`` and ``solver`` are accepted for interface compatibility
    but not used here.
    """
    stage_times = (time_preprocessing, time_estimator, time_prebinning,
                   time_solver, time_postprocessing)
    (p_preprocessing, p_estimator, p_prebinning,
     p_solver, p_postprocessing) = (t / time_total for t in stage_times)

    report = (
        "  Timing\n"
        "    Total time            {:>18.2f} sec\n"
        "    Pre-processing        {:>18.2f} sec   ({:>7.2%})\n"
        "    Estimator             {:>18.2f} sec   ({:>7.2%})\n"
        "    Pre-binning           {:>18.2f} sec   ({:>7.2%})\n"
        "    Solver                {:>18.2f} sec   ({:>7.2%})\n"
        "    Post-processing       {:>18.2f} sec   ({:>7.2%})\n"
    ).format(time_total, time_preprocessing, p_preprocessing,
             time_estimator, p_estimator, time_prebinning, p_prebinning,
             time_solver, p_solver, time_postprocessing, p_postprocessing)

    print(report)
65 |
66 |
def retrieve_status(status):
    """Normalize solver status strings to OPTIMAL/FEASIBLE/UNBOUNDED.

    ``status`` is a single solver status string or a list of them (one per
    sub-problem). For a list where all sub-problems agree, the common label
    is returned; otherwise the per-category counts are concatenated.
    Returns None for a single string matching no category.
    """
    # Categories checked in priority order; "optimal" wins over "feasible"
    # (note "infeasible" matches the "feasible" keyword).
    categories = (("optimal", "OPTIMAL"),
                  ("feasible", "FEASIBLE"),
                  ("unbounded", "UNBOUNDED"))

    if not isinstance(status, list):
        for keyword, label in categories:
            if keyword in status:
                return label
        return None

    n_status = len(status)
    counts = {label: 0 for _, label in categories}
    for s in status:
        for keyword, label in categories:
            if keyword in s:
                counts[label] += 1
                break

    # Unanimous outcome.
    for _, label in categories:
        if counts[label] == n_status:
            return label

    # Mixed outcome: report each non-empty category with its count.
    mixed = ""
    for _, label in categories:
        if counts[label] > 0:
            mixed += "{} ({}/{})".format(label, counts[label], n_status)
    return mixed
102 |
103 |
def print_binning_information(print_level, name, status, solver_type, solver,
                              time_total, time_preprocessing, time_estimator,
                              time_prebinning, time_solver,
                              time_postprocessing, n_prebins,
                              dict_user_options):
    """Print the full optimal piecewise binning report.

    ``print_level`` controls verbosity: 0 prints a one-line summary; >= 1
    adds the pre-binning, solver and timing sections; 2 additionally lists
    all options (defaults vs user-supplied).
    """
    print_header()

    if print_level == 2:
        print_optional_parameters(optimal_pw_binning_options,
                                  dict_user_options)

    if print_level == 0:
        print_main_info(name, status, time_total)
    elif print_level >= 1:
        print_name_status(name, status)

        print_prebinning_statistics(n_prebins)

        # Solver statistics are only meaningful when a solution was found.
        if status in ("OPTIMAL", "FEASIBLE") and solver is not None:
            print_solver_statistics(solver_type, solver)

        print_timing(solver_type, solver, time_total, time_preprocessing,
                     time_estimator, time_prebinning, time_solver,
                     time_postprocessing)
131 |
--------------------------------------------------------------------------------
/optbinning/binning/piecewise/metrics.py:
--------------------------------------------------------------------------------
1 | """
2 | Optimal piecewise binning metrics.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | import numpy as np
9 |
10 | from sklearn.metrics import average_precision_score
11 | from sklearn.metrics import brier_score_loss
12 |
13 | from ...binning.metrics import jeffrey
14 | from ...binning.metrics import jensen_shannon
15 | from ...binning.metrics import hellinger
16 | from ...binning.metrics import triangular
17 | from ...metrics.classification import gini
18 | from ...metrics.classification import ks
19 | from ...metrics.regression import regression_metrics
20 | from .transformations import transform_binary_target
21 | from .transformations import transform_continuous_target
22 |
23 |
24 | def _fun_divergence(fun, n, pi, qi, pi_special, qi_special, pi_missing,
25 | qi_missing, flag_special, flag_missing, n_special):
26 |
27 | div_value = fun(pi, qi, return_sum=True) / n
28 |
29 | if flag_special:
30 | div_value += fun(pi_special, qi_special, return_sum=True) / n_special
31 |
32 | if flag_missing:
33 | div_value += fun([pi_missing], [qi_missing])
34 |
35 | return float(div_value)
36 |
37 |
def divergences_asymptotic(event_rate, n_nonevent_special, n_event_special,
                           n_nonevent_missing, n_event_missing, t_n_nonevent,
                           t_n_event):
    """Compute asymptotic divergence measures (IV, Jensen-Shannon, Hellinger
    and triangular) between the event and non-event distributions.

    ``event_rate`` holds the per-record predicted event rate. Special and
    missing bins only contribute when they contain both events and
    non-events.
    """
    n = t_n_nonevent + t_n_event
    p = t_n_event / n

    # Per-record densities under the non-event (pi) and event (qi) views.
    pi = (1.0 - event_rate) / (1.0 - p)
    qi = event_rate / p

    if isinstance(n_event_special, (np.ndarray, list)):
        # One special bin per entry; keep only bins with both classes.
        # NOTE(review): a plain list would fail on ``.size`` and boolean
        # indexing below — callers appear to pass ndarrays; confirm.
        n_special = n_event_special.size
        mask = (n_event_special > 0) & (n_nonevent_special > 0)
        flag_special = np.any(mask)

        pi_special = n_nonevent_special[mask] / t_n_nonevent
        qi_special = n_event_special[mask] / t_n_event
    else:
        # Single aggregated special bin (scalar counts).
        n_special = 1
        flag_special = (n_event_special > 0 and n_nonevent_special > 0)
        pi_special = n_nonevent_special / t_n_nonevent
        qi_special = n_event_special / t_n_event

    flag_missing = (n_event_missing > 0 and n_nonevent_missing > 0)
    pi_missing = n_nonevent_missing / t_n_nonevent
    qi_missing = n_event_missing / t_n_event

    d_divergences = {}

    d_divergences["IV (Jeffrey)"] = _fun_divergence(
        jeffrey, n, pi, qi, pi_special, qi_special, pi_missing, qi_missing,
        flag_special, flag_missing, n_special)

    d_divergences["JS (Jensen-Shannon)"] = _fun_divergence(
        jensen_shannon, n, pi, qi, pi_special, qi_special, pi_missing,
        qi_missing, flag_special, flag_missing, n_special)

    d_divergences["Hellinger"] = _fun_divergence(
        hellinger, n, pi, qi, pi_special, qi_special, pi_missing, qi_missing,
        flag_special, flag_missing, n_special)

    d_divergences["Triangular"] = _fun_divergence(
        triangular, n, pi, qi, pi_special, qi_special, pi_missing, qi_missing,
        flag_special, flag_missing, n_special)

    return d_divergences
84 |
85 |
def binary_metrics(x, y, splits, c, t_n_nonevent, t_n_event,
                   n_nonevent_special, n_event_special, n_nonevent_missing,
                   n_event_missing, special_codes):
    """Compute performance and divergence metrics for a binary-target
    piecewise binning.

    The fitted piecewise polynomial ``c`` is evaluated on ``x`` to obtain
    predicted event rates, which are then scored against the target ``y``.
    Returns a dict of metric name -> value.
    """
    d_metrics = {}

    n_nonevent_special = np.asarray(n_nonevent_special)
    n_event_special = np.asarray(n_event_special)

    # Metrics using predicted probability of Y=1.
    # min_pred/max_pred act as the lb/ub clipping bounds of the transform,
    # keeping predicted probabilities strictly inside (0, 1).
    min_pred = 1e-8
    max_pred = 1 - min_pred

    event_rate = transform_binary_target(
        splits, x, c, min_pred, max_pred, t_n_nonevent, t_n_event,
        n_nonevent_special, n_event_special, n_nonevent_missing,
        n_event_missing, special_codes, "event_rate", "empirical", "empirical")

    d_metrics["Gini index"] = gini(y, event_rate)

    # Divergence metrics
    d_divergences = divergences_asymptotic(
        event_rate, n_nonevent_special, n_event_special, n_nonevent_missing,
        n_event_missing, t_n_nonevent, t_n_event)

    for dk, dv in d_divergences.items():
        d_metrics[dk] = dv

    d_metrics["KS"] = ks(y, event_rate)[0]
    d_metrics["Avg precision"] = average_precision_score(y, event_rate)
    d_metrics["Brier score"] = brier_score_loss(y, event_rate)

    return d_metrics
119 |
120 |
def continuous_metrics(x, y, splits, c, lb, ub, n_records_special, sum_special,
                       n_records_missing, sum_missing, special_codes):
    """Compute regression performance metrics for a continuous-target
    piecewise binning, using the empirical transform of ``x`` as the
    prediction."""
    prediction = transform_continuous_target(
        splits, x, c, lb, ub, n_records_special, sum_special,
        n_records_missing, sum_missing, special_codes, "empirical",
        "empirical")

    return regression_metrics(y, prediction)
132 |
--------------------------------------------------------------------------------
/optbinning/binning/piecewise/transformations.py:
--------------------------------------------------------------------------------
1 | """
2 | Piecewise binning transformations.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 | from sklearn.utils import check_array
12 |
13 | from ...binning.transformations import transform_event_rate_to_woe
14 | from ...binning.transformations import _check_metric_special_missing
15 | from ...binning.transformations import _mask_special_missing
16 |
17 |
18 | def _apply_transform(x, c, lb, ub, special_codes, metric_special,
19 | metric_missing, clean_mask, special_mask, missing_mask,
20 | indices, x_clean, n_bins, n_special, event_rate_special,
21 | event_rate_missing):
22 |
23 | x_transform = np.zeros(x.shape)
24 | x_clean_transform = np.zeros(x_clean.shape)
25 |
26 | for i in range(n_bins):
27 | mask = (indices == i)
28 | x_clean_transform[mask] = np.polyval(c[i, :][::-1], x_clean[mask])
29 |
30 | # Clip values using LB/UB
31 | bounded = (lb is not None or ub is not None)
32 | if bounded:
33 | x_clean_transform = np.clip(x_clean_transform, lb, ub)
34 |
35 | x_transform[clean_mask] = x_clean_transform
36 |
37 | if special_codes:
38 | if isinstance(special_codes, dict):
39 | xt = pd.Series(x)
40 | for i, (k, s) in enumerate(special_codes.items()):
41 | sl = s if isinstance(s, (list, np.ndarray)) else [s]
42 | mask = xt.isin(sl).values
43 | if metric_special == "empirical":
44 | x_transform[mask] = event_rate_special[i]
45 | else:
46 | x_transform[mask] = metric_special
47 | else:
48 | if metric_special == "empirical":
49 | x_transform[special_mask] = event_rate_special
50 | else:
51 | x_transform[special_mask] = metric_special
52 |
53 | if metric_missing == "empirical":
54 | x_transform[missing_mask] = event_rate_missing
55 | else:
56 | x_transform[missing_mask] = metric_missing
57 |
58 | return x_transform
59 |
60 |
def transform_binary_target(splits, x, c, lb, ub, n_nonevent, n_event,
                            n_event_special, n_nonevent_special,
                            n_event_missing, n_nonevent_missing,
                            special_codes, metric, metric_special,
                            metric_missing, check_input=False):
    """Transform a variable using a binary-target piecewise binning.

    Clean values are mapped through the fitted piecewise polynomial ``c``
    (clipped to [lb, ub] when bounds are given); special and missing values
    receive either their empirical event rate or a fixed value, depending on
    ``metric_special`` / ``metric_missing``. With ``metric="woe"`` the
    resulting event rates are converted to weight-of-evidence.
    """
    if metric not in ("event_rate", "woe"):
        raise ValueError('Invalid value for metric. Allowed string '
                         'values are "event_rate" and "woe".')

    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    special_mask, missing_mask, clean_mask, n_special = _mask_special_missing(
        x, special_codes)

    x_clean = x[clean_mask]

    # Assign each clean value to its polynomial piece.
    if len(splits):
        indices = np.digitize(x_clean, splits, right=False)
    else:
        indices = np.zeros(x_clean.shape)

    n_bins = len(splits) + 1

    # Compute event rate for special and missing bin
    event_rate_special = metric_special
    event_rate_missing = metric_missing

    if metric_special == "empirical":
        n_event_special = np.asarray(n_event_special)
        n_nonevent_special = np.asarray(n_nonevent_special)

        event_rate_special = np.zeros(n_special)
        n_records_special = n_event_special + n_nonevent_special

        # Only bins containing both classes get a non-zero event rate.
        mask = (n_event_special > 0) & (n_nonevent_special > 0)

        if n_special > 1:
            # One event rate per special bin.
            event_rate_special[mask] = (
                n_event_special[mask] / n_records_special[mask])
        elif mask:
            # Single aggregated special bin (mask is a scalar here).
            event_rate_special = n_event_special / n_records_special

        if metric == "woe":
            event_rate_special = transform_event_rate_to_woe(
                event_rate_special, n_nonevent, n_event)

    if metric_missing == "empirical":
        n_records_missing = n_event_missing + n_nonevent_missing

        if n_records_missing > 0:
            event_rate_missing = n_event_missing / n_records_missing
        else:
            event_rate_missing = 0

        if metric == "woe":
            event_rate_missing = transform_event_rate_to_woe(
                event_rate_missing, n_nonevent, n_event)

    x_transform = _apply_transform(
        x, c, lb, ub, special_codes, metric_special, metric_missing,
        clean_mask, special_mask, missing_mask, indices, x_clean, n_bins,
        n_special, event_rate_special, event_rate_missing)

    # Special/missing values were already converted above; only the clean
    # part still holds event rates at this point.
    if metric == "woe":
        x_transform[clean_mask] = transform_event_rate_to_woe(
            x_transform[clean_mask], n_nonevent, n_event)

    return x_transform
136 |
137 |
def transform_continuous_target(splits, x, c, lb, ub, n_records_special,
                                sum_special, n_records_missing, sum_missing,
                                special_codes, metric_special, metric_missing,
                                check_input=False):
    """Transform a variable using a continuous-target piecewise binning.

    Clean values are mapped through the fitted piecewise polynomial ``c``
    (clipped to [lb, ub] when bounds are given); special and missing values
    receive either their empirical target mean or a fixed value, depending
    on ``metric_special`` / ``metric_missing``.
    """
    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    special_mask, missing_mask, clean_mask, n_special = _mask_special_missing(
        x, special_codes)

    x_clean = x[clean_mask]

    # Assign each clean value to its polynomial piece.
    if len(splits):
        indices = np.digitize(x_clean, splits, right=False)
    else:
        indices = np.zeros(x_clean.shape)

    n_bins = len(splits) + 1

    # Compute event rate for special and missing bin
    mean_special = metric_special
    mean_missing = metric_missing

    if metric_special == "empirical":
        sum_special = np.asarray(sum_special)
        n_records_special = np.asarray(n_records_special)

        mean_special = np.zeros(n_special)

        mask = (n_records_special > 0)

        if n_special > 1:
            # One mean per special bin; empty bins keep mean 0.
            mean_special[mask] = sum_special[mask] / n_records_special[mask]
        elif mask:
            # Single aggregated special bin (mask is a scalar here).
            mean_special = sum_special / n_records_special

    if metric_missing == "empirical":
        if n_records_missing > 0:
            mean_missing = sum_missing / n_records_missing
        else:
            mean_missing = 0

    x_transform = _apply_transform(
        x, c, lb, ub, special_codes, metric_special, metric_missing,
        clean_mask, special_mask, missing_mask, indices, x_clean, n_bins,
        n_special, mean_special, mean_missing)

    return x_transform
192 |
--------------------------------------------------------------------------------
/optbinning/binning/prebinning.py:
--------------------------------------------------------------------------------
1 | """
2 | Pre-binning class.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2019
7 |
8 | import numpy as np
9 |
10 | from sklearn.preprocessing import KBinsDiscretizer
11 | from sklearn.tree import _tree
12 | from sklearn.tree import DecisionTreeClassifier
13 | from sklearn.tree import DecisionTreeRegressor
14 |
15 | from .mdlp import MDLP
16 |
17 |
class PreBinning:
    """Prebinning algorithms.

    Parameters
    ----------
    problem_type : str
        The problem type depending on the target type: "classification" or
        "regression".

    method : str
        Available methods are 'uniform', 'quantile', 'cart' and 'mdlp'.

    n_bins : int
        The number of bins to produce.

    min_bin_size : int, float
        The minimum bin size.

    class_weight : dict, "balanced" or None, optional (default=None)
        Class weights; only forwarded to the CART classifier.

    **kwargs : keyword arguments
        Keyword arguments for prebinning method. See notes.

    Notes
    -----
    Keyword arguments are those available in the following classes:

    * ``method="uniform"``: `sklearn.preprocessing.KBinsDiscretizer.

    * ``method="quantile"``: `sklearn.preprocessing.KBinsDiscretizer.

    * ``method="cart"``: sklearn.tree.DecisionTreeClassifier.

    * ``method="mdlp"``: optbinning.binning.mdlp.MDLP.

    """
    def __init__(self, problem_type, method, n_bins, min_bin_size,
                 class_weight=None, **kwargs):

        self.problem_type = problem_type
        self.method = method
        self.n_bins = n_bins
        self.min_bin_size = min_bin_size
        self.class_weight = class_weight
        self.kwargs = kwargs

        # Computed split points; set by fit.
        self._splits = None

    def fit(self, x, y, sample_weight=None):
        """Fit PreBinning algorithm.

        Parameters
        ----------
        x : array-like, shape = (n_samples)
            Data samples, where n_samples is the number of samples.

        y : array-like, shape = (n_samples)
            Target vector relative to x.

        sample_weight : array-like of shape (n_samples,) (default=None)
            Array of weights that are assigned to individual samples.
            Only used with ``method="cart"``.

        Returns
        -------
        self : PreBinning
        """
        if self.method not in ("uniform", "quantile", "cart", "mdlp"):
            raise ValueError('Invalid value for prebinning method. Allowed '
                             'string values are "cart", "mdlp", "quantile" '
                             'and "uniform".')

        if self.problem_type not in ("classification", "regression"):
            raise ValueError('Invalid value for problem_type. Allowed '
                             'string values are "classification" and '
                             '"regression".')

        if self.problem_type == "regression" and self.method == "mdlp":
            raise ValueError("mdlp method can only handle binary "
                             "classification problems.")

        if self.method in ("uniform", "quantile"):
            unsup_kwargs = {"n_bins": self.n_bins, "strategy": self.method}
            unsup_kwargs.update(**self.kwargs)

            est = KBinsDiscretizer(**unsup_kwargs)
            est.fit(x.reshape(-1, 1), y)
            # Keep interior bin edges only (drop the outer min/max edges).
            self._splits = est.bin_edges_[0][1:-1]

        elif self.method == "cart":
            cart_kwargs = {
                "min_samples_leaf": self.min_bin_size,
                "max_leaf_nodes": self.n_bins}

            if self.problem_type == "classification":
                cart_kwargs["class_weight"] = self.class_weight
                cart_kwargs.update(**self.kwargs)

                est = DecisionTreeClassifier(**cart_kwargs)
            else:
                cart_kwargs.update(**self.kwargs)
                est = DecisionTreeRegressor(**cart_kwargs)

            est.fit(x.reshape(-1, 1), y, sample_weight=sample_weight)
            # Leaf nodes carry the TREE_UNDEFINED sentinel; keep only the
            # thresholds of internal (split) nodes.
            splits = np.unique(est.tree_.threshold)
            self._splits = splits[splits != _tree.TREE_UNDEFINED]

        elif self.method == "mdlp":
            mdlp_kwargs = {"min_samples_leaf": self.min_bin_size}
            mdlp_kwargs.update(**self.kwargs)

            est = MDLP(**mdlp_kwargs)
            est.fit(x, y)
            self._splits = est.splits

        return self

    @property
    def splits(self):
        """List of split points.

        Returns
        -------
        splits : numpy.ndarray
        """
        return self._splits
140 |
--------------------------------------------------------------------------------
/optbinning/binning/uncertainty/__init__.py:
--------------------------------------------------------------------------------
1 | from .binning_scenarios import SBOptimalBinning
2 |
3 |
4 | __all__ = ['SBOptimalBinning']
5 |
--------------------------------------------------------------------------------
/optbinning/exceptions.py:
--------------------------------------------------------------------------------
1 | """
2 | Custom error and warning exceptions.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 |
class NotDataAddedError(ValueError, AttributeError):
    """Exception class to raise if a binning sketch is solved before any
    data has been added.

    This class inherits from both ValueError and AttributeError to help with
    exception handling and backward compatibility.
    """
16 |
17 |
class NotSolvedError(ValueError, AttributeError):
    """Exception class to raise if binning sketch methods are called before
    the sketch has been solved.

    This class inherits from both ValueError and AttributeError to help with
    exception handling and backward compatibility.
    """
25 |
26 |
class NotGeneratedError(ValueError, AttributeError):
    """Exception class to raise if counterfactual information is requested
    before generating explanations.

    This class inherits from both ValueError and AttributeError to help with
    exception handling and backward compatibility.
    """
34 |
35 |
class CounterfactualsFoundWarning(UserWarning):
    """Warning used to notify that no feasible counterfactual explanations
    were found.
    """
40 |
--------------------------------------------------------------------------------
/optbinning/formatting.py:
--------------------------------------------------------------------------------
1 | """
2 | Printing utilities.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | import numbers
9 | import textwrap
10 |
11 | import pandas as pd
12 |
13 |
def dataframe_to_string(df, tab=None):
    """Render a pandas DataFrame as a plain string without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to render.

    tab : int or None, optional (default=None)
        Number of spaces prepended to every line. If None, no
        indentation is applied.

    Returns
    -------
    df_string : str

    Raises
    ------
    TypeError
        If ``df`` is not a pandas.DataFrame.

    ValueError
        If ``tab`` is not a nonnegative integer.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas.DataFrame.")

    if tab is not None:
        # tab == 0 is accepted (no indentation), hence "nonnegative",
        # matching the check below.
        if not isinstance(tab, numbers.Integral) or tab < 0:
            raise ValueError("tab must be a nonnegative integer; got {}."
                             .format(tab))

    df_string = textwrap.dedent(df.to_string(index=False))

    if tab is None:
        return df_string

    return textwrap.indent(df_string, " " * tab)
29 |
--------------------------------------------------------------------------------
/optbinning/information.py:
--------------------------------------------------------------------------------
1 | """
2 | General information routines.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 | import numpy as np
9 |
10 | from sklearn.base import BaseEstimator
11 |
12 | from ._version import __version__
13 |
14 | try:
15 | from localsolver import LSStatistics
16 | LOCALSOLVER_AVAILABLE = True
17 | except ImportError:
18 | LOCALSOLVER_AVAILABLE = False
19 |
20 |
def print_header():
    """Print the optbinning version and copyright banner."""
    banner = ("optbinning (Version {})\n"
              "Copyright (c) 2019-2024 Guillermo Navas-Palencia, "
              "Apache License 2.0\n").format(__version__)
    print(banner)
28 |
29 |
def print_optional_parameters(dict_default_options, dict_user_options):
    """Print the options table, flagging user-modified values.

    Each option is flagged "d" when it keeps its default value and "U"
    when user-provided; containers (list, ndarray, dict) are always
    treated as user-provided, and containers/estimators are displayed
    simply as "yes".
    """
    row_format = "    {:<24} {:>15}   * {}\n"
    report = "  Begin options\n"

    for name, default in dict_default_options.items():
        current = dict_user_options[name]

        is_container = isinstance(current, (list, np.ndarray, dict))
        flag = "U" if (is_container or default != current) else "d"

        if current is None:
            display = "no"
        elif is_container or isinstance(current, BaseEstimator):
            display = "yes"
        else:
            display = current

        report += row_format.format(name, str(display), flag)

    report += "  End options\n"
    print(report)
52 |
53 |
def solver_statistics(solver_type, solver):
    """Collect statistics from a solver instance into a dictionary.

    Parameters
    ----------
    solver_type : str
        One of "cp", "mip", "ls" or "lp".

    solver : object
        The underlying solver instance.

    Returns
    -------
    d_solver : dict
        Solver statistics (insertion order matters for reporting).

    time_optimizer : float or None
        Wall time reported by the solver ("cp" only).
    """
    time_optimizer = None
    d_solver = {}

    if solver_type == "cp":
        d_solver = {
            "n_booleans": solver.NumBooleans(),
            "n_branches": solver.NumBranches(),
            "n_conflicts": solver.NumConflicts(),
            "objective": int(solver.ObjectiveValue()),
            "best_objective_bound": int(solver.BestObjectiveBound()),
        }
        time_optimizer = solver.WallTime()

    elif solver_type == "mip":
        d_solver = {
            "n_constraints": solver.NumConstraints(),
            "n_variables": solver.NumVariables(),
            "objective": solver.Objective().Value(),
            "best_bound": solver.Objective().BestBound(),
        }

    elif solver_type == "ls":
        if not LOCALSOLVER_AVAILABLE:
            raise ImportError('Cannot import localsolver. Install LocalSolver '
                              'or choose another solver, options are "cp" and '
                              '"mip".')

        d_solver = {
            "n_iterations": LSStatistics.get_nb_iterations(solver.statistics)
        }

    elif solver_type == "lp":
        d_solver = {
            "n_variables": solver.n_variables,
            "n_constraints": solver.n_constraints,
            "n_iterations": solver.n_iterations,
            "objective": solver.objective,
        }

    return d_solver, time_optimizer
89 |
90 |
def print_solver_statistics(solver_type, d_solver):
    """Print a formatted summary of solver statistics.

    Parameters
    ----------
    solver_type : str
        One of "cp", "mip", "ls" or "lp".

    d_solver : dict
        Statistics as returned by ``solver_statistics``. Values are
        consumed positionally, so insertion order must match.

    Raises
    ------
    ValueError
        If ``solver_type`` is not supported. (Previously an unknown
        type left ``solver_stats`` unbound and crashed with an
        UnboundLocalError.)
    """
    if solver_type == "cp":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of booleans            {:>10}\n"
            "    Number of branches            {:>10}\n"
            "    Number of conflicts           {:>10}\n"
            "    Objective value               {:>10}\n"
            "    Best objective bound          {:>10}\n"
            ).format(solver_type, *d_solver.values())

    elif solver_type == "mip":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of variables           {:>10}\n"
            "    Number of constraints         {:>10}\n"
            "    Objective value               {:>10.4f}\n"
            "    Best objective bound          {:>10.4f}\n"
            ).format(solver_type, *d_solver.values())

    elif solver_type == "ls":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of iterations          {:>10}\n"
            ).format(solver_type, *d_solver.values())

    elif solver_type == "lp":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of variables           {:>10}\n"
            "    Number of constraints         {:>10}\n"
            "    Number of iterations          {:>10}\n"
            "    Objective value               {:>10.4f}\n"
            ).format(solver_type, *d_solver.values())

    else:
        raise ValueError('Invalid solver_type "{}"; supported types are '
                         '"cp", "mip", "ls" and "lp".'.format(solver_type))

    print(solver_stats)
131 |
--------------------------------------------------------------------------------
/optbinning/logging.py:
--------------------------------------------------------------------------------
1 | """
2 | Logging class.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2019
7 |
8 | import logging
9 | import sys
10 |
11 |
class Logger:
    """Thin wrapper around :mod:`logging` that writes to stdout and,
    optionally, to a file.

    Parameters
    ----------
    logger_name : str or None, optional (default=None)
        Name passed to ``logging.getLogger``.

    filename : str or None, optional (default=None)
        If given, log records are also written to this file.
    """
    def __init__(self, logger_name=None, filename=None):
        self.logger = logging.getLogger(logger_name)
        self.logger.setLevel(logging.INFO)
        self.logger.propagate = False

        formatter = logging.Formatter(
            '%(asctime)s | %(levelname)s : %(message)s')

        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)

        if filename is not None:
            fhandler = logging.FileHandler(filename)
            fhandler.setFormatter(formatter)
            self.logger.addHandler(fhandler)

    def close(self):
        """Close and detach all handlers attached to this logger.

        Iterates over a *copy* of the handler list: removing handlers
        while iterating the live list skips every other handler, which
        left some handlers attached (and their files open).
        """
        for handler in list(self.logger.handlers):
            handler.close()
            self.logger.removeHandler(handler)
34 |
--------------------------------------------------------------------------------
/optbinning/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/optbinning/metrics/__init__.py
--------------------------------------------------------------------------------
/optbinning/metrics/classification.py:
--------------------------------------------------------------------------------
1 | """
Metrics to assess performance of classification models.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 | import numpy as np
9 |
10 | from sklearn.metrics import auc
11 | from sklearn.metrics import confusion_matrix
12 | from sklearn.metrics import roc_curve
13 |
14 |
def gini(y_true, y_pred_proba):
    """Compute the Gini index, also known as Accuracy Ratio (AR).

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Ground truth (correct) target values.

    y_pred_proba : array-like, shape (n_samples,)
        Probability estimates of the positive class.

    Returns
    -------
    gini : float
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_proba)
    area = auc(false_pos_rate, true_pos_rate)
    return 2 * area - 1
32 |
33 |
def ks(y_true, y_pred_proba):
    """Compute the Kolmogorov-Smirnov statistic (KS).

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Ground truth (correct) target values.

    y_pred_proba : array-like, shape (n_samples,)
        Probability estimates of the positive class.

    Returns
    -------
    ks : tuple(ks_score, ks_position)
    """
    n_samples = y_true.shape[0]
    n_event = np.sum(y_true)
    n_nonevent = n_samples - n_event

    # Targets ordered by ascending predicted probability.
    y_sorted = y_true[np.argsort(y_pred_proba)]

    cum_event = np.cumsum(y_sorted)
    # NOTE(review): the cumulative population count starts at 0 rather
    # than 1; preserved as-is to keep results identical to the original.
    cum_nonevent = np.arange(n_samples) - cum_event

    p_diff = cum_nonevent / n_nonevent - cum_event / n_event

    ks_position = np.argmax(p_diff)
    return p_diff[ks_position], ks_position
68 |
69 |
def imbalanced_classification_metrics(y_true, y_pred):
    """Compute imbalanced binary classification metrics.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like, shape (n_samples,)
        Estimated target values.

    Returns
    -------
    metrics : dict
        Dictionary of metrics.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    # Sensitivity (TPR) and specificity (TNR).
    sensitivity = tp / (tp + fn)
    specificity = tn / (fp + tn)

    # Discriminant power combines both rates on a log-odds scale.
    discriminant_power = np.sqrt(3) / np.pi * (
        np.log(sensitivity / (1 - specificity)) +
        np.log(specificity / (1 - sensitivity)))

    return {
        "True positive rate": sensitivity,
        "True negative rate": specificity,
        "False positive rate": 1.0 - specificity,
        "False negative rate": 1.0 - sensitivity,
        "Balanced accuracy": 0.5 * (sensitivity + specificity),
        "Discriminant power": discriminant_power
    }
116 |
--------------------------------------------------------------------------------
/optbinning/metrics/regression.py:
--------------------------------------------------------------------------------
1 | """
Metrics to assess performance of regression models.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 | import numpy as np
9 |
10 | from sklearn.metrics import explained_variance_score
11 | from sklearn.metrics import mean_absolute_error
12 | from sklearn.metrics import mean_squared_error
13 | from sklearn.metrics import median_absolute_error
14 | from sklearn.metrics import r2_score
15 |
16 |
def mean_absolute_percentage_error(y_true, y_pred):
    """Compute the mean absolute percentage error (MAPE).

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like, shape (n_samples,)
        Estimated target values.

    Returns
    -------
    mape : float
    """
    relative_error = (y_true - y_pred) / y_true
    return np.mean(np.abs(relative_error))
33 |
34 |
def median_absolute_percentage_error(y_true, y_pred):
    """Compute the median absolute percentage error (MdAPE).

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like, shape (n_samples,)
        Estimated target values.

    Returns
    -------
    mdape : float
    """
    relative_error = np.abs((y_true - y_pred) / y_true)
    return np.median(relative_error)
51 |
52 |
def mean_percentage_error(y_true, y_pred):
    """Compute the mean percentage error (MPE).

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like, shape (n_samples,)
        Estimated target values.

    Returns
    -------
    mpe : float
    """
    # Signed relative errors; positive and negative errors cancel.
    return np.mean((y_true - y_pred) / y_true)
69 |
70 |
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Compute the symmetric mean absolute percentage error (SMAPE).

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like, shape (n_samples,)
        Estimated target values.

    Returns
    -------
    smape : float
    """
    abs_error = np.abs(y_true - y_pred)
    scale = np.abs(y_true) + np.abs(y_pred)
    return np.mean(abs_error / scale)
88 |
89 |
def symmetric_median_absolute_percentage_error(y_true, y_pred):
    """Compute the symmetric median absolute percentage error (SMdAPE).

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like, shape (n_samples,)
        Estimated target values.

    Returns
    -------
    smdape : float
    """
    abs_error = np.abs(y_true - y_pred)
    scale = np.abs(y_true) + np.abs(y_pred)
    return np.median(abs_error / scale)
107 |
108 |
def regression_metrics(y_true, y_pred):
    """Compute regression metrics.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like, shape (n_samples,)
        Estimated target values.

    Returns
    -------
    metrics : dict
        Dictionary of metrics: absolute errors, explained variance,
        R^2 and the (symmetric/mean/median) percentage errors.
    """
    return {
        "Mean absolute error": mean_absolute_error(y_true, y_pred),
        "Mean squared error": mean_squared_error(y_true, y_pred),
        "Median absolute error": median_absolute_error(y_true, y_pred),
        "Explained variance": explained_variance_score(y_true, y_pred),
        "R^2": r2_score(y_true, y_pred),
        "MPE": mean_percentage_error(y_true, y_pred),
        "MAPE": mean_absolute_percentage_error(y_true, y_pred),
        "SMAPE": symmetric_mean_absolute_percentage_error(y_true, y_pred),
        "MdAPE": median_absolute_percentage_error(y_true, y_pred),
        "SMdAPE": symmetric_median_absolute_percentage_error(y_true, y_pred)
    }
170 |
--------------------------------------------------------------------------------
/optbinning/scorecard/__init__.py:
--------------------------------------------------------------------------------
1 | from .counterfactual import Counterfactual
2 | from .monitoring import ScorecardMonitoring
3 | from .plots import plot_auc_roc, plot_cap, plot_ks
4 | from .scorecard import Scorecard
5 |
6 |
7 | __all__ = ["Scorecard",
8 | "ScorecardMonitoring",
9 | "plot_auc_roc",
10 | "plot_cap",
11 | "plot_ks",
12 | "Counterfactual"]
13 |
--------------------------------------------------------------------------------
/optbinning/scorecard/counterfactual/__init__.py:
--------------------------------------------------------------------------------
1 | from .counterfactual import Counterfactual
2 |
3 |
4 | __all__ = ['Counterfactual']
5 |
--------------------------------------------------------------------------------
/optbinning/scorecard/counterfactual/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Base counterfactual algorithm class.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 | from abc import ABCMeta
9 | from abc import abstractmethod
10 |
11 | from sklearn.base import BaseEstimator
12 |
13 | from ...binning.base import Base
14 | from ...exceptions import CounterfactualsFoundWarning
15 | from ...exceptions import NotGeneratedError
16 |
17 |
class BaseCounterfactual(Base, BaseEstimator, metaclass=ABCMeta):
    """Abstract interface shared by counterfactual explanation algorithms."""

    @abstractmethod
    def fit(self):
        """Fit counterfactual with training data."""

    @abstractmethod
    def generate(self):
        """Generate counterfactual explanations."""

    @abstractmethod
    def display(self):
        """Display counterfactual explanations."""

    @property
    @abstractmethod
    def status(self):
        """The status of the underlying optimization solver."""

    def _check_is_generated(self):
        # Guard used by result accessors; concrete subclasses are expected
        # to set self._is_generated when generate() completes.
        if not self._is_generated:
            raise NotGeneratedError("This {} instance has not generated "
                                    "counterfactuals yet. Call "
                                    "'generate' with appropriate arguments."
                                    .format(self.__class__.__name__))

    def _check_counterfactual_is_found(self):
        # NOTE(review): this *raises* a Warning subclass rather than
        # emitting it via warnings.warn -- preserved as-is.
        if not self._cfs:
            raise CounterfactualsFoundWarning(
                "Neither optimal or feasible counterfactuals were found.")
47 |
--------------------------------------------------------------------------------
/optbinning/scorecard/counterfactual/counterfactual_information.py:
--------------------------------------------------------------------------------
1 | """
2 | Counterfactual information.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 | from ...information import print_header
9 | from ...information import print_optional_parameters
10 | from ...information import print_solver_statistics
11 | from ...options import counterfactual_default_options
12 |
13 |
def print_status(status):
    """Print the optimization status line."""
    line = "  Status  : {:<32}\n".format(status)
    print(line)
16 |
17 |
def print_main_info(status, time_total):
    """Print a short summary: solver status and total elapsed time."""
    print_status(status)

    time_line = "  Time    : {:<7.4f} sec\n"
    print(time_line.format(time_total))
22 |
23 |
def print_objectives(objectives):
    """Print each objective term and its solution value.

    Diversity objective values are reported in absolute value.
    """
    lines = ["  Objectives\n"]

    for name, expression in objectives.items():
        value = expression.solution_value()
        if name in ("diversity_features", "diversity_values"):
            value = abs(value)

        lines.append("    {:<18} {:>10.4f}\n".format(name, value))

    print("".join(lines))
36 |
37 |
def print_timing(time_total, time_fit, time_solver, time_postprocessing):
    """Print a timing breakdown of the counterfactual generation.

    NOTE(review): post-processing is reported as a fraction of the
    *solver* time, not of the total time -- preserved as-is; confirm
    this is intentional.
    """
    frac_fit = time_fit / time_total
    frac_solver = time_solver / time_total
    frac_post = time_postprocessing / time_solver

    report = (
        "  Timing\n"
        "    Total time            {:>18.2f} sec\n"
        "    Fit                   {:>18.2f} sec   ({:>7.2%})\n"
        "    Solver                {:>18.2f} sec   ({:>7.2%})\n"
        "    Post-processing       {:>18.2f} sec   ({:>7.2%})\n"
        ).format(time_total, time_fit, frac_fit, time_solver, frac_solver,
                 time_postprocessing, frac_post)

    print(report)
53 |
54 |
def print_counterfactual_information(print_level, status, solver, objectives,
                                     time_total, time_fit, time_solver,
                                     time_postprocessing, dict_user_options):
    """Print the counterfactual report at the requested verbosity.

    Level 0 prints a short summary; level >= 1 prints status, solver
    statistics, objectives and timing; level 2 additionally lists all
    options.
    """
    print_header()

    if print_level == 2:
        print_optional_parameters(counterfactual_default_options,
                                  dict_user_options)

    if print_level == 0:
        print_main_info(status, time_total)
    elif print_level >= 1:
        print_status(status)

        if status in ("OPTIMAL", "FEASIBLE"):
            if solver is not None:
                print_solver_statistics("mip", solver)
            print_objectives(objectives)

        print_timing(time_total, time_fit, time_solver, time_postprocessing)
76 |
--------------------------------------------------------------------------------
/optbinning/scorecard/counterfactual/model_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Counterfactual model data.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 |
def model_data(scorecard, x, special_missing):
    """Collect per-variable candidate bins for the counterfactual model.

    For every variable selected by the binning process, gather the bin
    metric values (WoE for a binary target, Mean otherwise) that differ
    from the current value ``x[i]``, together with their bin indices.

    Returns
    -------
    nbins, metric, indices : tuple of lists
        Number of candidate bins, their metric values and their indices,
        one entry per selected variable.
    """
    selected = scorecard.binning_process_.get_support(names=True)
    table = scorecard.table(style="detailed")
    metric_name = "WoE" if scorecard._target_dtype == "binary" else "Mean"

    nbins = []
    metric = []
    indices = []

    for i, name in enumerate(selected):
        values = table[table.Variable == name][metric_name].values

        if not special_missing:
            # Drop the last two rows of the variable's table.
            values = values[:-2]

        candidate_idx = [j for j, m in enumerate(values) if m != x[i]]
        candidate_metric = [values[j] for j in candidate_idx]

        nbins.append(len(candidate_metric))
        metric.append(candidate_metric)
        indices.append(candidate_idx)

    return nbins, metric, indices
37 |
--------------------------------------------------------------------------------
/optbinning/scorecard/counterfactual/problem_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Counterfactual problem data.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 | import numpy as np
9 |
10 |
def problem_data(scorecard, X):
    """Extract optimization problem data from a fitted scorecard.

    Parameters
    ----------
    scorecard : object
        Fitted scorecard exposing ``table``, ``estimator_`` and
        ``binning_process_``.

    X : pandas.DataFrame
        Input samples.

    Returns
    -------
    intercept, coef, min_p, max_p, wrange, F, mu : tuple
        Linear model intercept and coefficients, big-M bounds on the
        total points, proximity weights, and the Cholesky factor/mean
        used for the Mahalanobis distance.
    """
    s_vars = X.columns
    n_vars = X.shape[1]

    # Scorecard table
    sc = scorecard.table(style="detailed")

    # Points per bin: bin metric weighted by the model coefficient.
    if scorecard._target_dtype == "binary":
        sc["Points"] = sc["WoE"] * sc["Coefficient"]
    else:
        sc["Points"] = sc["Mean"] * sc["Coefficient"]

    # Linear model coefficients

    # Only index into the intercept if it is an array, it is a scalar otherwise
    if isinstance(scorecard.estimator_.intercept_, np.ndarray):
        intercept = float(scorecard.estimator_.intercept_[0])
    else:
        intercept = float(scorecard.estimator_.intercept_)

    coef = scorecard.estimator_.coef_.ravel()

    # Big-M parameters (min, max) points.
    # Proximity weights. Inverse value range for each feature
    min_p = 0
    max_p = 0
    wrange = np.empty(n_vars)

    for i, v in enumerate(s_vars):
        v_points = sc[sc["Variable"] == v]["Points"]
        _min = np.min(v_points)
        _max = np.max(v_points)
        min_p += _min
        max_p += _max

        wrange[i] = 1.0 / (_max - _min)

    # Shift the point bounds by the intercept to bound the total score.
    min_p += intercept
    max_p += intercept

    # Mahalanobis distance: F is the Cholesky factor of the inverse
    # covariance of the transformed data; mu is the feature-wise mean.
    Xt = scorecard.binning_process_.transform(X).values
    F = np.linalg.cholesky(np.linalg.inv(np.cov(Xt.T)))
    mu = Xt.mean(axis=0)

    return intercept, coef, min_p, max_p, wrange, F, mu
57 |
--------------------------------------------------------------------------------
/optbinning/scorecard/counterfactual/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Piecewise linear approximation of logistic function.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 | import numpy as np
9 |
10 | from ropwr import RobustPWRegression
11 |
12 |
def logistic_pw(min_p, max_p, n_bins):
    """Fit a piecewise linear approximation of the logistic function.

    The sigmoid is sampled at 100 equally spaced points on
    [min_p, max_p] and approximated with ``n_bins`` linear pieces.

    Returns
    -------
    b_pw : list of tuple
        (lower, upper) bounds of each linear piece.

    c_pw : array
        Coefficients of the fitted pieces.
    """
    sample_x = np.linspace(min_p, max_p, 100)
    sample_y = 1.0 / (1 + np.exp(-sample_x))

    inner_splits = np.linspace(min_p, max_p, n_bins + 1)[1:-1]

    pw = RobustPWRegression(objective="l1", degree=1, monotonic_trend=None)
    pw.fit(sample_x, sample_y, inner_splits)

    bounds = np.array([min_p] + list(inner_splits) + [max_p])
    b_pw = [(lo, hi) for lo, hi in zip(bounds[:-1], bounds[1:])]

    return b_pw, pw.coef_
27 |
--------------------------------------------------------------------------------
/optbinning/scorecard/monitoring_information.py:
--------------------------------------------------------------------------------
1 | """
2 | Monitoring information.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | from ..binning.binning_information import print_header
9 | from ..binning.binning_information import print_optional_parameters
10 | from ..options import scorecard_monitoring_default_options
11 |
12 |
def print_main_info(n_records_a, n_records_e, n_variables, time_total):
    """Print a brief summary of the monitoring run."""
    counts = ("  Number of records A : {}\n"
              "  Number of records E : {}\n"
              "  Number of variables : {}").format(
                  n_records_a, n_records_e, n_variables)
    print(counts)
    print("  Time                : {:<7.4f} sec\n".format(time_total))
18 |
19 |
def print_monitoring_statistics(n_records_a, n_records_e, n_variables,
                                target_dtype, time_total, time_system,
                                time_variables):
    """Print monitoring statistics followed by a timing breakdown.

    Both timing percentages are fractions of the total elapsed time.
    """
    summary = (
        "  Statistics\n"
        "    Number of records Actual          {:>10}\n"
        "    Number of records Expected        {:>10}\n"
        "    Number of scorecard variables     {:>10}\n"
        "    Target type                       {:>10}\n"
        ).format(n_records_a, n_records_e, n_variables, target_dtype)

    frac_system = time_system / time_total
    frac_variables = time_variables / time_total

    timing = (
        "  Timing\n"
        "    Total time            {:>18.2f} sec\n"
        "    System stability      {:>18.2f} sec   ({:>7.2%})\n"
        "    Variables stability   {:>18.2f} sec   ({:>7.2%})\n"
        ).format(time_total, time_system, frac_system, time_variables,
                 frac_variables)

    print(summary)
    print(timing)
46 |
47 |
def print_monitoring_information(print_level, n_records_a, n_records_e,
                                 n_variables, target_dtype, time_total,
                                 time_system, time_variables,
                                 dict_user_options):
    """Print the monitoring report at the requested verbosity.

    Level 0 prints a short summary, level >= 1 full statistics, and
    level 2 additionally lists all options.
    """
    print_header()

    if print_level == 2:
        print_optional_parameters(scorecard_monitoring_default_options,
                                  dict_user_options)

    if print_level == 0:
        print_main_info(n_records_a, n_records_e, n_variables, time_total)
    elif print_level >= 1:
        print_monitoring_statistics(n_records_a, n_records_e, n_variables,
                                    target_dtype, time_total, time_system,
                                    time_variables)
65 |
--------------------------------------------------------------------------------
/optbinning/scorecard/rounding.py:
--------------------------------------------------------------------------------
1 | """
2 | Rounding strategy.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | import numpy as np
9 |
10 | from ortools.linear_solver import pywraplp
11 |
12 |
class RoundingMIP:
    """MIP model to round scorecard points to integers.

    Minimizes the total absolute rounding error subject to the sums of
    per-variable minimum and maximum points matching the rounded global
    minimum and maximum score.
    """
    def __init__(self):
        # OR-tools MIP solver instance; created in build_model.
        self.solver_ = None

        # Model data saved for solve(): number of variables, number of
        # bins per variable and the integer point decision variables.
        self._nb = None
        self._nn = None
        self._p = None

    def build_model(self, df_scorecard):
        """Build the rounding MIP from a scorecard table.

        Parameters
        ----------
        df_scorecard : pandas.DataFrame
            Scorecard table with at least "Variable" and "Points"
            columns.
        """
        # Parameters: points per variable with their min/max.
        points = []
        mins = []
        maxs = []
        for variable in df_scorecard.Variable.unique():
            mask = df_scorecard.Variable == variable
            p = df_scorecard[mask].Points.values
            mins.append(p.min())
            maxs.append(p.max())
            points.append(p)

        nb = len(points)
        nn = [len(p) for p in points]

        # Rounded total minimum/maximum score to preserve.
        min_point = np.rint(np.sum(mins))
        max_point = np.rint(np.sum(maxs))

        # Global bounds for the decision variables.
        min_p = np.min(mins)
        max_p = np.max(maxs)

        # Initialize solver
        solver = pywraplp.Solver(
            'RoundingMIP', pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING)

        # Decision variables: p = rounded points; tp/tm = positive and
        # negative parts of the rounding error; min_b/max_b = per-variable
        # min/max rounded points.
        p = {}
        tp = {}
        tm = {}
        min_b = {}
        max_b = {}
        for i in range(nb):
            min_b[i] = solver.IntVar(min_p, max_p, "min_b[{}]".format(i))
            max_b[i] = solver.IntVar(min_p, max_p, "max_b[{}]".format(i))
            for j in range(nn[i]):
                p[i, j] = solver.IntVar(min_p, max_p, "p[{}, {}]".format(i, j))
                tp[i, j] = solver.NumVar(0, np.inf, "tp[{}, {}]".format(i, j))
                tm[i, j] = solver.NumVar(0, np.inf, "tm[{}, {}]".format(i, j))

        # Objective function: minimize total absolute rounding error.
        solver.Minimize(solver.Sum([solver.Sum([tp[i, j] + tm[i, j]
                        for j in range(nn[i])]) for i in range(nb)]))

        # Constraints
        for i in range(nb):
            for j in range(nn[i]):
                # Split the rounding error into positive/negative parts.
                solver.Add(tp[i, j] - tm[i, j] == points[i][j] - p[i, j])

                # Max score constraint for each variable
                solver.Add(max_b[i] >= p[i, j])

                # Min score constraints for each variable
                solver.Add(min_b[i] <= p[i, j])

        # Sum of minimum/maximum point by variable must be min_point/max_point
        solver.Add(solver.Sum([min_b[i] for i in range(nb)]) == min_point)
        solver.Add(solver.Sum([max_b[i] for i in range(nb)]) == max_point)

        self.solver_ = solver
        self._nb = nb
        self._nn = nn
        self._p = p

    def solve(self):
        """Solve the MIP built by :meth:`build_model`.

        Returns
        -------
        status_name : str
            One of "OPTIMAL", "FEASIBLE", "ABNORMAL", "INFEASIBLE",
            "UNBOUNDED" or "UNKNOWN".

        solution : list or None
            Rounded points flattened in (variable, bin) order, or None
            when no feasible solution was found.
        """
        status = self.solver_.Solve()

        if status in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE):
            if status == pywraplp.Solver.OPTIMAL:
                status_name = "OPTIMAL"
            else:
                status_name = "FEASIBLE"

            # compute solution
            solution = []
            for i in range(self._nb):
                for j in range(self._nn[i]):
                    solution.append(self._p[i, j].solution_value())
        else:
            if status == pywraplp.Solver.ABNORMAL:
                status_name = "ABNORMAL"
            elif status == pywraplp.Solver.INFEASIBLE:
                status_name = "INFEASIBLE"
            elif status == pywraplp.Solver.UNBOUNDED:
                status_name = "UNBOUNDED"
            else:
                status_name = "UNKNOWN"

            solution = None

        return status_name, solution
111 |
--------------------------------------------------------------------------------
/optbinning/scorecard/scorecard_information.py:
--------------------------------------------------------------------------------
1 | """
2 | Scorecard information.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | from ..information import print_header
9 | from ..information import print_optional_parameters
10 | from ..options import scorecard_default_options
11 |
12 |
def print_main_info(n_records, n_variables, time_total):
    """Print a brief summary of the scorecard build."""
    counts = ("  Number of records   : {}\n"
              "  Number of variables : {}").format(n_records, n_variables)
    print(counts)
    print("  Time                : {:<7.4f} sec\n".format(time_total))
17 |
18 |
def print_scorecard_statistics(n_records, n_variables, target_dtype,
                               n_numerical, n_categorical, n_selected,
                               time_total, time_binning_process,
                               time_estimator, time_build_scorecard,
                               time_rounding):
    """Print scorecard statistics followed by a timing breakdown.

    The rounding percentage is relative to the build-scorecard time
    (rounding is a sub-step of the build); the others are relative to
    the total time.
    """
    summary = (
        "  Statistics\n"
        "    Number of records             {:>10}\n"
        "    Number of variables           {:>10}\n"
        "    Target type                   {:>10}\n\n"
        "    Number of numerical           {:>10}\n"
        "    Number of categorical         {:>10}\n"
        "    Number of selected            {:>10}\n"
        ).format(n_records, n_variables, target_dtype, n_numerical,
                 n_categorical, n_selected)

    frac_binning = time_binning_process / time_total
    frac_estimator = time_estimator / time_total
    frac_build = time_build_scorecard / time_total
    frac_rounding = time_rounding / time_build_scorecard

    timing = (
        "  Timing\n"
        "    Total time            {:>18.2f} sec\n"
        "    Binning process       {:>18.2f} sec   ({:>7.2%})\n"
        "    Estimator             {:>18.2f} sec   ({:>7.2%})\n"
        "    Build scorecard       {:>18.2f} sec   ({:>7.2%})\n"
        "      rounding            {:>18.2f} sec   ({:>7.2%})\n"
        ).format(time_total, time_binning_process, frac_binning,
                 time_estimator, frac_estimator, time_build_scorecard,
                 frac_build, time_rounding, frac_rounding)

    print(summary)
    print(timing)
55 |
56 |
def print_scorecard_information(print_level, n_records, n_variables,
                                target_dtype, n_numerical, n_categorical,
                                n_selected, time_total, time_binning_process,
                                time_estimator, time_build_scorecard,
                                time_rounding, dict_user_options):
    """Print the scorecard build report at the requested verbosity.

    Level 0 prints a short summary, level >= 1 full statistics, and
    level 2 additionally lists all options.
    """
    print_header()

    if print_level == 2:
        print_optional_parameters(scorecard_default_options,
                                  dict_user_options)

    if print_level == 0:
        print_main_info(n_records, n_variables, time_total)
    elif print_level >= 1:
        print_scorecard_statistics(n_records, n_variables, target_dtype,
                                   n_numerical, n_categorical, n_selected,
                                   time_total, time_binning_process,
                                   time_estimator, time_build_scorecard,
                                   time_rounding)
76 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | numpy>=1.16.1
3 | ortools>=9.4,<9.12
4 | pandas
5 | ropwr>=1.0.0
6 | scikit-learn>=1.0.2
7 | scipy>=1.6.0
8 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 |
5 | from setuptools import find_packages, setup, Command
6 |
# Long description shown on the package index (passed to setup() below).
long_description = '''
The optimal binning is the optimal discretization of a variable into bins
given a discrete or continuous numeric target. OptBinning is a library
written in Python implementing a rigorous and flexible mathematical
programming formulation to solving the optimal binning problem for a binary,
continuous and multiclass target type, incorporating constraints not
previously addressed.

Read the documentation at: http://gnpalencia.org/optbinning/

OptBinning is distributed under the Apache Software License (Apache 2.0).
'''
19 |
20 |
class CleanCommand(Command):
    """Custom ``python setup.py clean`` removing build artifacts.

    Portable replacement for shelling out to ``rm -vrf``: the previous
    implementation silently did nothing on platforms without ``rm``
    (e.g. Windows) and ignored any failure of the shell command.
    """

    user_options = []

    def initialize_options(self):
        pass

    def finalize_options(self):
        pass

    def run(self):
        # Local imports keep the module-level imports of setup.py
        # unchanged.
        import glob
        import shutil

        patterns = ['./build', './dist', './*.pyc', './*.tgz',
                    './*.egg-info']
        for pattern in patterns:
            for path in glob.glob(pattern):
                # Mirror the verbosity of the old ``rm -v``.
                print("removing '%s'" % path)
                if os.path.isdir(path):
                    shutil.rmtree(path, ignore_errors=True)
                else:
                    os.remove(path)
32 |
33 |
# install requirements
# NOTE: keep this list in sync with requirements.txt (same pins).
install_requires = [
    'matplotlib',
    'numpy>=1.16.1',
    'ortools>=9.4,<9.12',
    'pandas',
    'ropwr>=1.0.0',
    'scikit-learn>=1.0.2',
    'scipy>=1.6.0',
]

# extra requirements
extras_require = {
    # Optional backends used by the sketch-based binning classes
    # (tdigest is exercised by tests/test_binning_process_sketch.py).
    'distributed': ['pympler', 'tdigest'],
    # Test tooling; keep in sync with test_requirements.txt.
    'test': [
        'coverage',
        'flake8',
        'pytest',
        'pyarrow',
        'pympler',
        'tdigest',
    ],
    # For ecos support: https://github.com/embotech/ecos
    'ecos': ['ecos']
}
59 |
60 |
# Read version file
# The version lives in optbinning/_version.py as ``__version__ = ...``;
# executing that file avoids importing the package (and thereby its
# runtime dependencies) at build time. Explicit encoding makes the read
# independent of the platform's default locale.
version_info = {}
with open("optbinning/_version.py", encoding="utf-8") as f:
    exec(f.read(), version_info)
65 |
66 |
# Package metadata and build configuration.
setup(
    name="optbinning",
    version=version_info['__version__'],
    description="OptBinning: The Python Optimal Binning library",
    long_description=long_description,
    author="Guillermo Navas-Palencia",
    author_email="g.navas.palencia@gmail.com",
    packages=find_packages(exclude=['tests', 'tests.*']),
    platforms="any",
    include_package_data=True,
    # NOTE(review): "Licence" spelling differs from the classifier's
    # "License" below -- metadata string left as-is; confirm intent.
    license="Apache Licence 2.0",
    url="https://github.com/guillermo-navas-palencia/optbinning",
    cmdclass={'clean': CleanCommand},
    # NOTE(review): python_requires allows 3.7/3.8 while the classifiers
    # below only advertise 3.9-3.12 -- confirm the intended minimum
    # supported version and align the two.
    python_requires='>=3.7',
    install_requires=install_requires,
    extras_require=extras_require,
    classifiers=[
        'Topic :: Scientific/Engineering :: Mathematics',
        'Topic :: Software Development :: Libraries',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Intended Audience :: Developers',
        'Intended Audience :: Education',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: Apache Software License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Programming Language :: Python :: 3.12',
    ]
)
98 |
--------------------------------------------------------------------------------
/test_requirements.txt:
--------------------------------------------------------------------------------
1 | coverage
2 | flake8
3 | pytest
4 | pyarrow
5 | pympler
6 | tdigest
7 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/__init__.py
--------------------------------------------------------------------------------
/tests/data/breast_cancer.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/data/breast_cancer.parquet
--------------------------------------------------------------------------------
/tests/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .datasets import load_boston
2 |
3 |
4 | __all__ = ['load_boston']
5 |
--------------------------------------------------------------------------------
/tests/datasets/datasets.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
class Data:
    """Lightweight dataset container mimicking sklearn's Bunch objects.

    Parameters
    ----------
    data : array-like
        Feature matrix (rows are samples, columns are features).
    target : array-like
        Target values, one per sample.
    feature_names : list of str
        Column names matching ``data``.
    """

    def __init__(self, data, target, feature_names):
        self.data = data
        self.target = target
        self.feature_names = feature_names

    def __repr__(self):
        # Debug-friendly representation; previously the default
        # ``<object at 0x...>`` was all pytest failures showed.
        return "{}(n_features={})".format(
            type(self).__name__, len(self.feature_names))
10 |
11 |
def load_boston():
    """Load the Boston housing dataset from the CMU StatLib archive.

    Presumably a stand-in for the loader removed from scikit-learn --
    downloads the raw file over HTTP on every call, so network access is
    required. Each record in the raw file is split across two physical
    lines; the code re-joins them below.
    """
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    # skiprows=22 drops the textual header preceding the data rows.
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    # Re-join the two-line records: all columns of the even rows plus
    # the first two columns of the odd rows -> 13 feature columns.
    raw_data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    # The target value is the third column of the odd rows.
    target = raw_df.values[1::2, 2]
    feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
                     'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']

    return Data(raw_data, target, feature_names)
21 |
--------------------------------------------------------------------------------
/tests/results/plot_auc_roc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/plot_auc_roc.png
--------------------------------------------------------------------------------
/tests/results/plot_cap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/plot_cap.png
--------------------------------------------------------------------------------
/tests/results/plot_ks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/plot_ks.png
--------------------------------------------------------------------------------
/tests/results/psi_plot_binary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/psi_plot_binary.png
--------------------------------------------------------------------------------
/tests/results/psi_plot_continuous.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/psi_plot_continuous.png
--------------------------------------------------------------------------------
/tests/results/test_binning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning.png
--------------------------------------------------------------------------------
/tests/results/test_binning_2d_event_rate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_2d_event_rate.png
--------------------------------------------------------------------------------
/tests/results/test_binning_2d_woe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_2d_woe.png
--------------------------------------------------------------------------------
/tests/results/test_binning_no_missing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_no_missing.png
--------------------------------------------------------------------------------
/tests/results/test_binning_no_special.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_no_special.png
--------------------------------------------------------------------------------
/tests/results/test_binning_process_information.txt:
--------------------------------------------------------------------------------
1 | optbinning (Version 0.14.0)
2 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0
3 |
4 | Number of records : 569
5 | Number of variables : 30
6 | Time : 4.2282 sec
7 |
8 | optbinning (Version 0.14.0)
9 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0
10 |
11 | Statistics
12 | Number of records 569
13 | Number of variables 30
14 | Target type binary
15 |
16 | Number of numerical 30
17 | Number of categorical 0
18 | Number of selected 30
19 |
20 | Time 4.2282 sec
21 |
22 | optbinning (Version 0.14.0)
23 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0
24 |
25 | Begin options
26 | max_n_prebins 20 * d
27 | min_prebin_size 0.05 * d
28 | min_n_bins no * d
29 | max_n_bins no * d
30 | min_bin_size no * d
31 | max_bin_size no * d
32 | max_pvalue no * d
33 | max_pvalue_policy consecutive * d
34 | selection_criteria no * d
35 | fixed_variables no * d
36 | categorical_variables no * d
37 | special_codes no * d
38 | split_digits no * d
39 | binning_fit_params no * d
40 | binning_transform_params no * d
41 | verbose False * d
42 | End options
43 |
44 | Statistics
45 | Number of records 569
46 | Number of variables 30
47 | Target type binary
48 |
49 | Number of numerical 30
50 | Number of categorical 0
51 | Number of selected 30
52 |
53 | Time 4.2282 sec
54 |
55 |
--------------------------------------------------------------------------------
/tests/results/test_binning_process_verbose.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_process_verbose.txt
--------------------------------------------------------------------------------
/tests/results/test_continuous_binning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning.png
--------------------------------------------------------------------------------
/tests/results/test_continuous_binning_2d.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning_2d.png
--------------------------------------------------------------------------------
/tests/results/test_continuous_binning_no_missing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning_no_missing.png
--------------------------------------------------------------------------------
/tests/results/test_continuous_binning_no_special.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning_no_special.png
--------------------------------------------------------------------------------
/tests/results/test_multiclass_binning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_multiclass_binning.png
--------------------------------------------------------------------------------
/tests/results/test_multiclass_binning_no_missing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_multiclass_binning_no_missing.png
--------------------------------------------------------------------------------
/tests/results/test_multiclass_binning_no_special.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_multiclass_binning_no_special.png
--------------------------------------------------------------------------------
/tests/results/test_scorecard_information.txt:
--------------------------------------------------------------------------------
1 | optbinning (Version 0.14.0)
2 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0
3 |
4 | Number of records : 569
5 | Number of variables : 30
6 | Time : 4.5420 sec
7 |
8 | optbinning (Version 0.14.0)
9 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0
10 |
11 | Statistics
12 | Number of records 569
13 | Number of variables 30
14 | Target type binary
15 |
16 | Number of numerical 30
17 | Number of categorical 0
18 | Number of selected 30
19 |
20 | Timing
21 | Total time 4.54 sec
22 | Binning process 4.18 sec ( 92.04%)
23 | Estimator 0.04 sec ( 0.94%)
24 | Build scorecard 0.32 sec ( 7.01%)
25 | rounding 0.00 sec ( 0.00%)
26 |
27 | optbinning (Version 0.14.0)
28 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0
29 |
30 | Begin options
31 | binning_process yes * U
32 | estimator yes * U
33 | scaling_method no * d
34 | scaling_method_params no * d
35 | intercept_based False * d
36 | reverse_scorecard False * d
37 | rounding False * d
38 | verbose False * d
39 | End options
40 |
41 | Statistics
42 | Number of records 569
43 | Number of variables 30
44 | Target type binary
45 |
46 | Number of numerical 30
47 | Number of categorical 0
48 | Number of selected 30
49 |
50 | Timing
51 | Total time 4.54 sec
52 | Binning process 4.18 sec ( 92.04%)
53 | Estimator 0.04 sec ( 0.94%)
54 | Build scorecard 0.32 sec ( 7.01%)
55 | rounding 0.00 sec ( 0.00%)
56 |
57 |
--------------------------------------------------------------------------------
/tests/results/test_scorecard_monitoring_default.txt:
--------------------------------------------------------------------------------
1 | -----------------------------------
2 | Monitoring: System Stability Report
3 | -----------------------------------
4 |
5 | Population Stability Index (PSI)
6 |
7 |
8 | PSI total: 0.0018 (No significant change)
9 |
10 | PSI bin Count Count (%)
11 | [0.00, 0.10) 3 1.0
12 | [0.10, 0.25) 0 0.0
13 | [0.25, Inf+) 0 0.0
14 |
15 | Significance tests (H0: actual == expected)
16 |
17 | p-value bin Count Count (%)
18 | [0.00, 0.05) 1 0.333333
19 | [0.05, 0.10) 0 0.000000
20 | [0.10, 0.50) 1 0.333333
21 | [0.50, 1.00) 1 0.333333
22 |
23 | Target analysis
24 |
25 | Metric Actual Actual (%) Expected Expected (%)
26 | Number of records 171 - 398 -
27 | Event records 108 0.631579 249 0.625628
28 | Non-event records 63 0.368421 149 0.374372
29 |
30 | Performance metrics
31 |
32 | Metric Actual Expected Diff A - E
33 | True positive rate 0.990741 1.000000 -0.009259
34 | True negative rate 0.968254 0.979866 -0.011612
35 | False positive rate 0.031746 0.020134 0.011612
36 | False negative rate 0.009259 0.000000 0.009259
37 | Balanced accuracy 0.979497 0.989933 -0.010436
38 | Discriminant power 4.460557 inf -inf
39 | Gini 0.986185 0.999838 -0.013654
40 |
41 |
--------------------------------------------------------------------------------
/tests/results/test_scorecard_monitoring_default_continuous.txt:
--------------------------------------------------------------------------------
1 | -----------------------------------
2 | Monitoring: System Stability Report
3 | -----------------------------------
4 |
5 | Population Stability Index (PSI)
6 |
7 |
8 | PSI total: 0.1630 (Requires investigation)
9 |
10 | PSI bin Count Count (%)
11 | [0.00, 0.10) 14 0.933333
12 | [0.10, 0.25) 1 0.066667
13 | [0.25, Inf+) 0 0.000000
14 |
15 | Significance tests (H0: actual == expected)
16 |
17 | p-value bin Count Count (%)
18 | [0.00, 0.05) 1 0.066667
19 | [0.05, 0.10) 0 0.000000
20 | [0.10, 0.50) 5 0.333333
21 | [0.50, 1.00) 9 0.600000
22 |
23 | Target analysis
24 |
25 | Metric Actual Expected
26 | Mean 21.407895 23.015819
27 | Std 8.632097 9.375315
28 | p25 16.325000 17.400000
29 | Median 20.000000 21.750000
30 | p75 24.125000 26.600000
31 |
32 | Performance metrics
33 |
34 | Metric Actual Expected Diff A - E
35 | Mean absolute error 2.482286 2.546775 -0.064488
36 | Mean squared error 12.583966 12.187764 0.396202
37 | Median absolute error 2.059913 1.947342 0.112571
38 | Explained variance 0.831908 0.861340 -0.029432
39 | R^2 0.831117 0.861340 -0.030222
40 | MPE -0.032197 -0.024922 -0.007275
41 | MAPE 0.125897 0.125992 -0.000095
42 | SMAPE 0.061339 0.060410 0.000929
43 | MdAPE 0.097021 0.091783 0.005238
44 | SMdAPE 0.049889 0.046868 0.003021
45 |
46 |
--------------------------------------------------------------------------------
/tests/results/test_scorecard_monitoring_information.txt:
--------------------------------------------------------------------------------
1 | optbinning (Version 0.14.0)
2 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0
3 |
4 | Number of records A : 152
5 | Number of records E : 354
6 | Number of variables : 13
7 | Time : 0.1124 sec
8 |
9 | optbinning (Version 0.14.0)
10 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0
11 |
12 | Statistics
13 | Number of records Actual 152
14 | Number of records Expected 354
15 | Number of scorecard variables 13
16 | Target type continuous
17 |
18 | Timing
19 | Total time 0.11 sec
20 | System stability 0.07 sec ( 60.67%)
21 | Variables stability 0.04 sec ( 38.99%)
22 |
23 | optbinning (Version 0.14.0)
24 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0
25 |
26 | Begin options
27 | scorecard yes * U
28 | psi_method cart * d
29 | psi_n_bins 20 * d
30 | psi_min_bin_size 0.05 * d
31 | show_digits 2 * d
32 | verbose False * d
33 | End options
34 |
35 | Statistics
36 | Number of records Actual 152
37 | Number of records Expected 354
38 | Number of scorecard variables 13
39 | Target type continuous
40 |
41 | Timing
42 | Total time 0.11 sec
43 | System stability 0.07 sec ( 60.67%)
44 | Variables stability 0.04 sec ( 38.99%)
45 |
46 |
--------------------------------------------------------------------------------
/tests/results/test_scorecard_monitoring_verbose.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_scorecard_monitoring_verbose.txt
--------------------------------------------------------------------------------
/tests/results/test_scorecard_verbose.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_scorecard_verbose.txt
--------------------------------------------------------------------------------
/tests/test_binning_piecewise.py:
--------------------------------------------------------------------------------
1 | """
2 | OptimalPWBinning testing.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2022
7 |
8 | import pandas as pd
9 |
10 | from pytest import approx, raises
11 |
12 | from optbinning import OptimalPWBinning
13 | from sklearn.datasets import load_breast_cancer
14 | from sklearn.exceptions import NotFittedError
15 |
16 |
# Shared fixtures: breast-cancer data provides the numeric variable
# "mean radius" and a binary target used by every test below.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)

variable = "mean radius"
x = df[variable].values
y = data.target
23 |
24 |
def test_params():
    """Invalid constructor arguments must raise on fit."""
    # (expected exception, invalid constructor kwargs) pairs.
    invalid_cases = [
        (TypeError, {"name": 1}),
        (TypeError, {"estimator": 2}),
        (ValueError, {"objective": "new"}),
        (ValueError, {"degree": 0.2}),
        (TypeError, {"continuous": 1}),
        (ValueError, {"prebinning_method": "new"}),
        (ValueError, {"min_prebin_size": 0.9}),
        (ValueError, {"min_n_bins": 1.2}),
        (ValueError, {"max_n_bins": 1.2}),
        (ValueError, {"min_n_bins": 10, "max_n_bins": 5}),
        (ValueError, {"min_bin_size": 0.6}),
        (ValueError, {"max_bin_size": 1.1}),
        (ValueError, {"min_bin_size": 0.3, "max_bin_size": 0.2}),
        (ValueError, {"monotonic_trend": "new"}),
        (ValueError, {"monotonic_trend": "convex", "degree": 2}),
        (ValueError, {"n_subsamples": 1001.2}),
        (ValueError, {"max_pvalue": 1.1}),
        (ValueError, {"max_pvalue_policy": "new_policy"}),
        (ValueError, {"outlier_detector": "new_method"}),
        (TypeError, {"outlier_detector": "range",
                     "outlier_params": "pass"}),
        (TypeError, {"user_splits": {"a": [1, 2]}}),
        (ValueError, {"user_splits": None,
                      "user_splits_fixed": [True, True]}),
        (TypeError, {"user_splits": [1, 2],
                     "user_splits_fixed": (True, True)}),
        (ValueError, {"user_splits": [1, 2],
                      "user_splits_fixed": [True, 1]}),
        (ValueError, {"user_splits": [1, 2],
                      "user_splits_fixed": [True]}),
        (TypeError, {"special_codes": {1, 2, 3}}),
        (ValueError, {"split_digits": 9}),
        (ValueError, {"solver": None}),
        (ValueError, {"h_epsilon": 0.9}),
        (ValueError, {"quantile": 0}),
        (ValueError, {"regularization": 'l3'}),
        (ValueError, {"reg_l1": -0.5}),
        (ValueError, {"reg_l2": -0.5}),
        (TypeError, {"random_state": 'None'}),
        (TypeError, {"verbose": 1}),
    ]

    for exception, params in invalid_cases:
        with raises(exception):
            OptimalPWBinning(**params).fit(x, y)
170 |
171 |
def test_default():
    """Default piecewise binning: information value and plotting."""
    binning = OptimalPWBinning(name=variable)
    binning.fit(x, y)

    binning.binning_table.build()
    assert binning.binning_table.iv == approx(5.87474602, rel=1e-6)

    # Unknown metric names are rejected by plot.
    with raises(ValueError):
        binning.binning_table.plot(metric="new_metric")

    binning.binning_table.plot(
        metric="woe", savefig="tests/results/test_binning_piecewise.png")
184 |
185 |
def test_default_discontinuous():
    """Discontinuous piecewise binning yields a slightly different IV."""
    binning = OptimalPWBinning(name=variable, continuous=False)
    binning.fit(x, y)

    binning.binning_table.build()
    assert binning.binning_table.iv == approx(5.84465825, rel=1e-6)
192 |
193 |
def test_bounds_transform():
    """Transform after fitting with probability bounds lb/ub."""
    binning = OptimalPWBinning(name=variable)
    binning.fit(x, y, lb=0.001, ub=0.999)

    woe = binning.transform(x, metric="woe")
    assert woe[:4] == approx(
        [3.99180564, 4.28245092, 4.17407503, -3.2565373], rel=1e-6)

    event_rate = binning.transform(x, metric="event_rate")
    assert event_rate[:4] == approx(
        [0.03015878, 0.02272502, 0.02526056, 0.97763604], rel=1e-6)
205 |
206 |
def test_bounds_fit_transform():
    """fit_transform must match fit followed by transform."""
    binning = OptimalPWBinning(name=variable)

    woe = binning.fit_transform(x, y, lb=0.001, ub=0.999, metric="woe")
    assert woe[:4] == approx(
        [3.9918056, 4.2824509, 4.17407503, -3.25653732], rel=1e-6)

    event_rate = binning.fit_transform(
        x, y, lb=0.001, ub=0.999, metric="event_rate")
    assert event_rate[:4] == approx(
        [0.03015878, 0.02272502, 0.02526056, 0.97763604], rel=1e-6)
219 |
220 |
def test_solvers():
    """All supported solvers reach the same information value."""
    for solver_name in ["auto", "ecos", "osqp"]:
        binning = OptimalPWBinning(name=variable, solver=solver_name)
        binning.fit(x, y)

        binning.binning_table.build()
        assert binning.binning_table.iv == approx(5.87474602, rel=1e-6)
228 |
229 |
def test_user_splits():
    """User-provided splits, some of them fixed, are accepted."""
    texture = df["mean texture"].values

    splits = [14, 15, 16, 17, 20, 21, 22, 27]
    fixed = [False, True, True, False, False, False, False, False]

    binning = OptimalPWBinning(name="mean texture", user_splits=splits,
                               user_splits_fixed=fixed)

    binning.fit(texture, y)
241 |
242 |
def test_information():
    """information() requires a fitted model and a valid print level."""
    binning = OptimalPWBinning()

    with raises(NotFittedError):
        binning.information()

    binning.fit(x, y)

    with raises(ValueError):
        binning.information(print_level=-1)

    for level in (0, 1, 2):
        binning.information(print_level=level)

    refit = OptimalPWBinning()
    refit.fit(x, y)
    refit.information(print_level=2)
261 |
262 |
def test_verbose():
    """Verbose mode does not affect the optimal status."""
    binning = OptimalPWBinning(verbose=True)
    binning.fit(x, y)

    assert binning.status == "OPTIMAL"
268 |
--------------------------------------------------------------------------------
/tests/test_binning_process_sketch.py:
--------------------------------------------------------------------------------
1 | """
2 | BinningProcessSketch testing.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2021
7 |
8 | import pandas as pd
9 |
10 | from pytest import approx, raises
11 |
12 | from optbinning import BinningProcessSketch
13 | from optbinning import OptimalBinningSketch
14 | from optbinning.exceptions import NotSolvedError
15 | from optbinning.exceptions import NotDataAddedError
16 | from sklearn.datasets import load_breast_cancer
17 |
# Shared fixtures: breast-cancer feature matrix and binary target.
data = load_breast_cancer()
variable_names = data.feature_names
df = pd.DataFrame(data.data, columns=variable_names)
y = data.target
22 |
23 |
def test_params():
    """Invalid settings must raise at construction time."""
    # (expected exception, constructor kwargs) pairs.
    invalid_cases = [
        (TypeError, {"variable_names": 1}),
        (ValueError, {"variable_names": [], "max_n_prebins": -2}),
        (ValueError, {"variable_names": [], "min_n_bins": -2}),
        (ValueError, {"variable_names": [], "max_n_bins": -2.2}),
        (ValueError, {"variable_names": [], "min_n_bins": 3,
                      "max_n_bins": 2}),
        (ValueError, {"variable_names": [], "min_bin_size": 0.6}),
        (ValueError, {"variable_names": [], "max_bin_size": -0.6}),
        (ValueError, {"variable_names": [], "min_bin_size": 0.5,
                      "max_bin_size": 0.3}),
        (ValueError, {"variable_names": [], "max_pvalue": 1.1}),
        (ValueError, {"variable_names": [],
                      "max_pvalue_policy": "new_policy"}),
        (TypeError, {"variable_names": [], "selection_criteria": []}),
        (TypeError, {"variable_names": [], "categorical_variables": {}}),
        (TypeError, {"variable_names": [],
                     "categorical_variables": [1, 2]}),
        (TypeError, {"variable_names": [], "special_codes": {1, 2, 3}}),
        (ValueError, {"variable_names": [], "split_digits": 9}),
        (TypeError, {"variable_names": [], "binning_fit_params": [1, 2]}),
        (TypeError, {"variable_names": [],
                     "binning_transform_params": [1, 2]}),
        (TypeError, {"variable_names": [], "verbose": 1}),
    ]

    for exception, params in invalid_cases:
        with raises(exception):
            BinningProcessSketch(**params)
80 |
81 |
def test_default():
    """Sketch binning process solves and approximates the batch IV."""
    sketch = BinningProcessSketch(variable_names)
    sketch.add(df, y)
    sketch.solve()

    binned = sketch.get_binned_variable("mean radius")
    assert binned.status == "OPTIMAL"

    binned.binning_table.build()
    assert binned.binning_table.iv == approx(5.04392547, rel=1e-2)
93 |
94 |
def test_default_merge():
    """Two sketches built on disjoint halves merge into one solution."""
    sketch_a = BinningProcessSketch(variable_names)
    sketch_b = BinningProcessSketch(variable_names)

    sketch_a.add(df.iloc[:200, :], y[:200])
    sketch_b.add(df.iloc[200:, :], y[200:])
    sketch_a.merge(sketch_b)

    sketch_a.solve()

    binned = sketch_a.get_binned_variable("mean radius")
    assert binned.status == "OPTIMAL"

    binned.binning_table.build()
    assert binned.binning_table.iv == approx(5.04392547, rel=1e-2)
114 |
115 |
def test_default_tdigest_merge():
    """Merging also works with the t-digest sketch backend."""
    fit_params = {v: {"sketch": "t-digest"} for v in variable_names}

    sketch_a = BinningProcessSketch(variable_names,
                                    binning_fit_params=fit_params)
    sketch_b = BinningProcessSketch(variable_names,
                                    binning_fit_params=fit_params)

    sketch_a.add(df.iloc[:200, :], y[:200])
    sketch_b.add(df.iloc[200:, :], y[200:])
    sketch_a.merge(sketch_b)

    sketch_a.solve()

    binned = sketch_a.get_binned_variable("mean radius")
    assert binned.status == "OPTIMAL"

    binned.binning_table.build()
    assert binned.binning_table.iv == approx(5.04392547, rel=1e-2)
139 |
140 |
def test_default_transform():
    """Transform is only available after solve and validates inputs."""
    sketch = BinningProcessSketch(variable_names)
    sketch.add(df, y)

    with raises(NotSolvedError):
        sketch.transform(df, metric="woe")

    sketch.solve()

    # A raw numpy array is rejected; a pandas DataFrame is required.
    with raises(TypeError):
        sketch.transform(df.values, metric="woe")

    with raises(ValueError):
        sketch.transform(df, metric="new_woe")

    X_transform = sketch.transform(df)

    # The single-variable sketch must agree with the process output.
    single = OptimalBinningSketch()
    radius = df["mean radius"]
    single.add(radius, y)
    single.solve()

    assert single.transform(radius, metric="woe") == approx(
        X_transform["mean radius"], rel=1e-6)
165 |
166 |
def test_information():
    """information() needs data added and the process solved first."""
    sketch = BinningProcessSketch(variable_names)

    with raises(NotDataAddedError):
        sketch.solve()

    sketch.add(df, y)

    with raises(NotSolvedError):
        sketch.information()

    sketch.solve()

    with raises(ValueError):
        sketch.information(print_level=-1)

    for level in (0, 1, 2):
        sketch.information(print_level=level)
186 |
--------------------------------------------------------------------------------
/tests/test_continuous_binning_piecewise.py:
--------------------------------------------------------------------------------
1 | """
2 | ContinuousOptimalPWBinning testing.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2022
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 | from pytest import approx
12 |
13 | from optbinning import ContinuousOptimalPWBinning
14 | from tests.datasets import load_boston
15 |
# Boston housing fixture: continuous target for piecewise binning tests.
data = load_boston()
df = pd.DataFrame(data.data, columns=data.feature_names)

# LSTAT is the default explanatory variable used by most tests below.
variable = "LSTAT"
x = df[variable].values
y = data.target
22 |
23 |
def test_default():
    """Default fit produces a binning table that builds and plots."""
    optb = ContinuousOptimalPWBinning(name=variable)
    optb.fit(x, y)

    table = optb.binning_table
    table.build()
    table.plot(
        savefig="tests/results/test_continuous_binning_piecewise.png")
31 |
32 |
def test_transform():
    """Transform of the training data matches known reference values."""
    optb = ContinuousOptimalPWBinning(name=variable)
    optb.fit(x, y)

    expected = [31.46014643, 23.87619986, 37.31237732]
    assert optb.transform(x)[:3] == approx(expected, rel=1e-6)
40 |
41 |
def test_fit_transform():
    """fit_transform is equivalent to fit followed by transform."""
    optb = ContinuousOptimalPWBinning(name=variable)

    expected = [31.46014643, 23.87619986, 37.31237732]
    assert optb.fit_transform(x, y)[:3] == approx(expected, rel=1e-6)
48 |
49 |
def test_special_codes():
    """Special codes map to the requested metric on transform.

    Fix: ``df[variable].values`` can return a NumPy view into the shared
    module-level DataFrame, so the in-place edits below (``x[:50] = -9``,
    ``x[45:50] = np.nan``) could leak into other tests in this module.
    Copy the column before mutating it.
    """
    variable = "INDUS"
    x = df[variable].values.copy()

    x[:50] = -9
    x[50:100] = -8
    special_codes = {'special_-9': -9, 'special_-8': -8}

    optb = ContinuousOptimalPWBinning(
        name=variable, monotonic_trend="convex", special_codes=special_codes)
    optb.fit(x, y)

    # Fixed metric: every special value maps to the given constant.
    x_transform = optb.transform([-9, -8], metric_special=1000)
    assert x_transform == approx([1000, 1000], rel=1e-6)

    # Empirical metric with a dict: each special code gets its own value.
    x_transform = optb.transform([-9, -8], metric_special='empirical')
    assert x_transform == approx([20.502000, 24.116000], rel=1e-6)

    # With a flat list of special codes they share one special bin.
    optb = ContinuousOptimalPWBinning(
        name=variable, monotonic_trend="convex", special_codes=[-9, -8])
    optb.fit(x, y)

    x_transform = optb.transform([-9, -8], metric_special=1000)
    assert x_transform == approx([1000, 1000], rel=1e-6)

    x_transform = optb.transform([-9, -8], metric_special='empirical')
    assert x_transform == approx([22.309, 22.309], rel=1e-6)

    # Missing values are handled via metric_missing.
    x[45:50] = np.nan
    optb = ContinuousOptimalPWBinning(
        name=variable, monotonic_trend="convex", special_codes=special_codes)
    optb.fit(x, y)

    x_transform = optb.transform([np.nan], metric_missing='empirical')
    assert x_transform == approx([17.94], rel=1e-6)
85 |
86 |
def test_verbose():
    """Verbose mode still reaches an optimal solution."""
    optb = ContinuousOptimalPWBinning(verbose=True)
    optb.fit(x, y)
    assert optb.status == "OPTIMAL"
92 |
--------------------------------------------------------------------------------
/tests/test_mdlp.py:
--------------------------------------------------------------------------------
1 | """
2 | MDLP testing.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 | from pytest import approx, raises
12 |
13 | from optbinning import MDLP
14 | from sklearn.datasets import load_breast_cancer
15 | from sklearn.exceptions import NotFittedError
16 |
17 |
# Breast cancer fixture: binary target for MDLP discretization tests.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)

# "mean radius" is the discretized variable in all tests below.
variable = "mean radius"
x = df[variable].values
y = data.target
24 |
25 |
def test_params():
    """Invalid constructor parameters raise ValueError at fit time."""
    bad_params = (
        dict(min_samples_split=-1),
        dict(min_samples_leaf=-1),
        dict(max_candidates=-1),
    )

    for params in bad_params:
        with raises(ValueError):
            MDLP(**params).fit(x, y)
38 |
39 |
40 | # def test_numerical_default():
41 | # mdlp = MDLP()
42 | # mdlp.fit(x, y)
43 |
44 | # assert mdlp.splits == approx([10.945, 13.08729032, 15.00163870,
45 | # 15.10030322, 16.925, 17.88], rel=1e-6)
46 |
47 |
48 | # def test_numerical_practical():
49 | # min_samples_leaf = int(np.ceil(len(x) * 0.05))
50 | # mdlp = MDLP(max_candidates=128, min_samples_leaf=min_samples_leaf)
51 | # mdlp.fit(x, y)
52 |
53 | # assert mdlp.splits == approx([10.945, 12.995, 13.71, 15.045, 16.325,
54 | # 17.88], rel=1e-6)
55 |
56 |
def test_splits():
    """Accessing splits before fitting raises NotFittedError."""
    estimator = MDLP()

    with raises(NotFittedError):
        estimator.splits
62 |
--------------------------------------------------------------------------------
/tests/test_multiclass_binning.py:
--------------------------------------------------------------------------------
1 | """
2 | MulticlassOptimalBinning testing.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | import pandas as pd
9 |
10 | from pytest import approx, raises
11 |
12 | from optbinning import MulticlassOptimalBinning
13 | from sklearn.datasets import load_wine
14 | from sklearn.exceptions import NotFittedError
15 |
16 |
# Wine fixture: three-class target for multiclass optimal binning tests.
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)

# "ash" is the binned variable in all tests below.
variable = "ash"
x = df[variable].values
y = data.target
23 |
24 |
def test_params():
    """Each invalid constructor argument raises the documented error."""
    # (expected exception, constructor kwargs) pairs.
    bad_params = [
        (TypeError, dict(name=1)),
        (ValueError, dict(prebinning_method="new_method")),
        (ValueError, dict(solver="new_solver")),
        (ValueError, dict(max_n_prebins=-2)),
        (ValueError, dict(min_prebin_size=0.6)),
        (ValueError, dict(min_n_bins=-2)),
        (ValueError, dict(max_n_bins=-2.2)),
        (ValueError, dict(min_n_bins=3, max_n_bins=2)),
        (ValueError, dict(min_bin_size=0.6)),
        (ValueError, dict(max_bin_size=-0.6)),
        (ValueError, dict(min_bin_size=0.5, max_bin_size=0.3)),
        (ValueError, dict(monotonic_trend=["new_trend", "auto"])),
        (ValueError, dict(monotonic_trend="new_trend")),
        (ValueError, dict(max_pvalue=1.1)),
        (ValueError, dict(max_pvalue_policy="new_policy")),
        (TypeError, dict(user_splits={"a": [1, 2]})),
        (TypeError, dict(special_codes={1, 2, 3})),
        (ValueError, dict(split_digits=9)),
        (ValueError, dict(mip_solver="new_solver")),
        (ValueError, dict(time_limit=-2)),
        (TypeError, dict(verbose=1)),
    ]

    for exc, params in bad_params:
        with raises(exc):
            MulticlassOptimalBinning(**params).fit(x, y)
109 |
110 |
def test_numerical_default():
    """Default settings reach optimality with known splits and metrics."""
    optb = MulticlassOptimalBinning()
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
    assert optb.splits == approx(
        [2.1450001, 2.245, 2.31499994, 2.6049999, 2.6450001], rel=1e-6)

    table = optb.binning_table
    table.build()
    table.analysis()
    assert table.js == approx(0.10989515, rel=1e-6)
    assert table.quality_score == approx(0.05279822, rel=1e-6)

    # Exercise plotting with and without the special/missing bars.
    prefix = "tests/results/test_multiclass_binning"
    table.plot(savefig=prefix + ".png")
    table.plot(add_special=False, savefig=prefix + "_no_special.png")
    table.plot(add_missing=False, savefig=prefix + "_no_missing.png")
131 |
132 |
def test_numerical_default_solvers():
    """MIP (bop) and CP solvers find the same optimal splits."""
    expected = [2.1450001, 2.245, 2.31499994, 2.6049999, 2.6450001]

    solvers = (
        MulticlassOptimalBinning(solver="mip", mip_solver="bop"),
        MulticlassOptimalBinning(solver="cp"),
    )

    for optb in solvers:
        optb.fit(x, y)
        assert optb.status == "OPTIMAL"
        assert optb.splits == approx(expected, rel=1e-6)
144 |
145 |
def test_numerical_user_splits_fixed():
    """user_splits_fixed is validated and fixed splits are honored."""
    user_splits = [2.1, 2.2, 2.3, 2.6, 2.9]

    # user_splits_fixed without user_splits is invalid.
    with raises(ValueError):
        optb = MulticlassOptimalBinning(
            user_splits_fixed=[False, False, False, True, False])
        optb.fit(x, y)

    # Must be a list/array, not a tuple.
    with raises(TypeError):
        optb = MulticlassOptimalBinning(
            user_splits=user_splits,
            user_splits_fixed=(False, False, False, True, False))
        optb.fit(x, y)

    # Entries must be booleans, not ints.
    with raises(ValueError):
        optb = MulticlassOptimalBinning(
            user_splits=user_splits, user_splits_fixed=[0, 0, 0, 1, 0])
        optb.fit(x, y)

    # Length must match user_splits.
    with raises(ValueError):
        optb = MulticlassOptimalBinning(
            user_splits=user_splits,
            user_splits_fixed=[False, False, False, False])
        optb.fit(x, y)

    user_splits_fixed = [False, False, False, True, True]

    with raises(ValueError):
        # pure pre-bins
        optb = MulticlassOptimalBinning(user_splits=user_splits,
                                        user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    # With feasible splits the fixed split 2.7 must survive optimization.
    user_splits = [2.1, 2.2, 2.3, 2.6, 2.7]
    optb = MulticlassOptimalBinning(user_splits=user_splits,
                                    user_splits_fixed=user_splits_fixed)
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
    assert 2.7 in optb.splits
187 |
188 |
def test_numerical_user_splits_non_unique():
    """Duplicate user splits are rejected at fit time."""
    optb = MulticlassOptimalBinning(user_splits=[2.1, 2.2, 2.2, 2.6, 2.9])

    with raises(ValueError):
        optb.fit(x, y)
195 |
196 |
def test_numerical_default_transform():
    """Transform requires a fitted estimator and returns mean WoE.

    Fix: the ``x_transform =`` binding inside the ``raises`` block was
    dead code (the call is expected to raise) — drop the assignment.
    """
    optb = MulticlassOptimalBinning()

    with raises(NotFittedError):
        optb.transform(x)

    optb.fit(x, y)

    x_transform = optb.transform([0.3, 2.1, 2.5, 3], metric="mean_woe")
    assert x_transform == approx([0.48973998, 0.48973998, -0.00074357,
                                  0.02189459], rel=1e-5)
207 |
208 |
def test_numerical_default_fit_transform():
    """fit_transform returns mean WoE values for the training data."""
    optb = MulticlassOptimalBinning()

    expected = [-0.00074357, 0.48973998, 0.02189459,
                -0.00074357, 0.02189459]
    assert optb.fit_transform(x, y, metric="mean_woe")[:5] == approx(
        expected, rel=1e-5)
215 |
216 |
def test_classes():
    """The fitted estimator exposes the three wine classes."""
    optb = MulticlassOptimalBinning()
    optb.fit(x, y)
    assert optb.classes == approx([0, 1, 2])
222 |
223 |
def test_verbose():
    """Verbose mode still reaches an optimal solution."""
    optb = MulticlassOptimalBinning(verbose=True)
    optb.fit(x, y)
    assert optb.status == "OPTIMAL"
229 |
--------------------------------------------------------------------------------
/tests/test_outlier.py:
--------------------------------------------------------------------------------
1 | """
2 | Outlier classes testing.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2022
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 | from pytest import approx, raises
12 |
13 | from optbinning.binning.outlier import ModifiedZScoreDetector
14 | from optbinning.binning.outlier import RangeDetector
15 | from optbinning.binning.outlier import YQuantileDetector
16 | from tests.datasets import load_boston
17 |
# Boston housing fixture: x/y pair for the outlier detector tests.
data = load_boston()
df = pd.DataFrame(data.data, columns=data.feature_names)

# LSTAT vs. target (median house value) is the tested relation.
variable = "LSTAT"
x = df[variable].values
y = data.target
24 |
25 |
def test_range_params():
    """RangeDetector validates method and interval_length."""
    bad_params = (
        dict(method="new"),
        dict(interval_length=1.5),
    )

    for params in bad_params:
        with raises(ValueError):
            RangeDetector(**params).fit(x)
34 |
35 |
def test_zscore_params():
    """A negative z-score threshold is rejected."""
    with raises(ValueError):
        ModifiedZScoreDetector(threshold=-1.5).fit(x)
40 |
41 |
def test_yquantile_params():
    """YQuantileDetector rejects invalid constructor parameters at fit."""
    # Unknown inner outlier_detector name.
    with raises(ValueError):
        detector = YQuantileDetector(outlier_detector="new")
        detector.fit(x, y)

    # outlier_params must be a dict, not a list.
    with raises(TypeError):
        detector = YQuantileDetector(outlier_params=[])
        detector.fit(x, y)

    # n_bins must be positive.
    with raises(ValueError):
        detector = YQuantileDetector(n_bins=-1)
        detector.fit(x, y)

    # Params not supported by the chosen detector ("threshold" is a
    # z-score option, not a range option) are rejected when fitting.
    with raises(ValueError):
        detector = YQuantileDetector(
            outlier_detector="range",
            outlier_params={"threshold": 3.7})

        detector.fit(x, y)
61 |
62 |
def test_range_default():
    """ETI and HDI intervals flag known numbers of outliers."""
    expected = {"ETI": 7, "HDI": 31}

    for method, n_outliers in expected.items():
        detector = RangeDetector(method=method)
        detector.fit(x)
        assert np.count_nonzero(detector.get_support()) == n_outliers
71 |
72 |
def test_zscore_default():
    """The modified z-score flags exactly the two extreme LSTAT values."""
    detector = ModifiedZScoreDetector()
    detector.fit(x)

    mask = detector.get_support()
    assert np.count_nonzero(mask) == 2
    assert x[mask] == approx([37.97, 36.98])
81 |
82 |
def test_yquantile_default():
    """Default y-quantile detection flags known (x, y) outlier pairs."""
    detector = YQuantileDetector()
    detector.fit(x, y)

    mask = detector.get_support()
    assert x[mask] == approx(
        [7.56, 9.59, 7.26, 11.25, 14.79, 7.44, 9.53, 8.88])
    assert y[mask] == approx([39.8, 33.8, 43.1, 31, 30.7, 50, 50, 50])
92 |
93 |
def test_yquantile_outlier_params():
    """Custom inner-detector parameters are forwarded to the detector."""
    detector = YQuantileDetector(n_bins=10, outlier_detector="range",
                                 outlier_params={'method': 'HDI'})
    detector.fit(x, y)

    assert np.count_nonzero(detector.get_support()) == 39
100 |
--------------------------------------------------------------------------------
/tests/test_scorecard_plots.py:
--------------------------------------------------------------------------------
1 | """
2 | Scorecard plots testing.
3 | """
4 |
5 | # Guillermo Navas-Palencia
6 | # Copyright (C) 2020
7 |
8 | import numpy as np
9 |
10 | from pytest import raises
11 |
12 | from optbinning.scorecard import plot_auc_roc
13 | from optbinning.scorecard import plot_cap
14 | from optbinning.scorecard import plot_ks
15 |
16 |
# Tiny hand-crafted binary target and predicted probabilities used by
# all plot tests below.
y = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0])
y_pred = np.array([0.2, 0.1, 0.6, 0.3, 0.7, 0.2, 0.8, 0.1, 0.9, 0.7, 0.3])
19 |
20 |
def test_params():
    """All three plot helpers share the same argument validation."""
    for plot in (plot_auc_roc, plot_cap, plot_ks):
        # Mismatched y / y_pred lengths.
        with raises(ValueError):
            plot(y, y_pred[:-1])

        # Labels and flags must be strings/booleans, not ints.
        for kwargs in (dict(title=1), dict(xlabel=1), dict(ylabel=1),
                       dict(savefig=1), dict(fname=1)):
            with raises(TypeError):
                plot(y, y_pred, **kwargs)

        # savefig=True requires a file name.
        with raises(ValueError):
            plot(y, y_pred, savefig=True, fname=None)
44 |
45 |
def test_savefig():
    """Each plot helper writes its figure to tests/results."""
    for plot in (plot_auc_roc, plot_cap, plot_ks):
        out = "tests/results/{}.png".format(plot.__name__)
        plot(y, y_pred, savefig=True, fname=out)
50 |
--------------------------------------------------------------------------------