├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.rst ├── doc ├── Makefile ├── make.bat └── source │ ├── _images │ ├── binning_2d_readme.png │ ├── binning_2d_readme_example.png │ ├── binning_2d_readme_woe.png │ ├── binning_binary.png │ ├── binning_data_stream.gif │ ├── binning_readme_example_split_woe.png │ ├── binning_readme_example_woe.png │ ├── logo.svg │ ├── logo_optbinning.ico │ └── logo_optbinning.svg │ ├── _static │ └── css │ │ └── custom.css │ ├── binning_2d_binary.rst │ ├── binning_2d_continuous.rst │ ├── binning_2d_tables.rst │ ├── binning_binary.rst │ ├── binning_continuous.rst │ ├── binning_multiclass.rst │ ├── binning_process.rst │ ├── binning_process_sketch.rst │ ├── binning_scenarios.rst │ ├── binning_sketch.rst │ ├── binning_tables.rst │ ├── binning_utilities.rst │ ├── conf.py │ ├── counterfactual.rst │ ├── index.rst │ ├── installation.rst │ ├── mdlp.rst │ ├── outlier.rst │ ├── piecewise_binary.rst │ ├── piecewise_continuous.rst │ ├── release_notes.rst │ ├── scorecard.rst │ ├── tutorials.rst │ └── tutorials │ ├── tutorial_binary.ipynb │ ├── tutorial_binary_large_scale.ipynb │ ├── tutorial_binary_localsolver.ipynb │ ├── tutorial_binary_under_uncertainty.ipynb │ ├── tutorial_binning_2d.ipynb │ ├── tutorial_binning_process_FICO_update_binning.ipynb │ ├── tutorial_binning_process_FICO_xAI.ipynb │ ├── tutorial_binning_process_sklearn_pipeline.ipynb │ ├── tutorial_binning_process_telco_churn.ipynb │ ├── tutorial_continuous.ipynb │ ├── tutorial_continuous_2d.ipynb │ ├── tutorial_counterfactual_binary_target.ipynb │ ├── tutorial_counterfactual_continuous_target.ipynb │ ├── tutorial_multiclass.ipynb │ ├── tutorial_piecewise_binary.ipynb │ ├── tutorial_piecewise_continuous.ipynb │ ├── tutorial_scorecard_binary_target.ipynb │ ├── tutorial_scorecard_continuous_target.ipynb │ ├── tutorial_scorecard_monitoring.ipynb │ ├── tutorial_sketch_binary.ipynb │ └── tutorial_sketch_binary_pyspark.rst ├── optbinning ├── 
__init__.py ├── _version.py ├── binning │ ├── __init__.py │ ├── auto_monotonic.py │ ├── base.py │ ├── binning.py │ ├── binning_information.py │ ├── binning_process.py │ ├── binning_process_information.py │ ├── binning_statistics.py │ ├── continuous_binning.py │ ├── continuous_cp.py │ ├── cp.py │ ├── distributed │ │ ├── __init__.py │ │ ├── base.py │ │ ├── binning_process_sketch.py │ │ ├── binning_process_sketch_information.py │ │ ├── binning_sketch.py │ │ ├── bsketch.py │ │ ├── bsketch_information.py │ │ ├── gk.py │ │ └── plots.py │ ├── ls.py │ ├── mdlp.py │ ├── metrics.py │ ├── mip.py │ ├── model_data.py │ ├── multiclass_binning.py │ ├── multiclass_cp.py │ ├── multiclass_mip.py │ ├── multidimensional │ │ ├── __init__.py │ │ ├── binning_2d.py │ │ ├── binning_statistics_2d.py │ │ ├── continuous_binning_2d.py │ │ ├── cp_2d.py │ │ ├── mip_2d.py │ │ ├── model_data_2d.py │ │ ├── model_data_cart_2d.py │ │ ├── preprocessing_2d.py │ │ └── transformations_2d.py │ ├── outlier.py │ ├── piecewise │ │ ├── __init__.py │ │ ├── base.py │ │ ├── binning.py │ │ ├── binning_information.py │ │ ├── binning_statistics.py │ │ ├── continuous_binning.py │ │ ├── metrics.py │ │ └── transformations.py │ ├── prebinning.py │ ├── preprocessing.py │ ├── transformations.py │ └── uncertainty │ │ ├── __init__.py │ │ └── binning_scenarios.py ├── exceptions.py ├── formatting.py ├── information.py ├── logging.py ├── metrics │ ├── __init__.py │ ├── classification.py │ └── regression.py ├── options.py └── scorecard │ ├── __init__.py │ ├── counterfactual │ ├── __init__.py │ ├── base.py │ ├── counterfactual.py │ ├── counterfactual_information.py │ ├── mip.py │ ├── model_data.py │ ├── multi_mip.py │ ├── problem_data.py │ └── utils.py │ ├── monitoring.py │ ├── monitoring_information.py │ ├── plots.py │ ├── rounding.py │ ├── scorecard.py │ └── scorecard_information.py ├── requirements.txt ├── setup.py ├── test_requirements.txt └── tests ├── __init__.py ├── data ├── breast_cancer.csv └── breast_cancer.parquet 
├── datasets ├── __init__.py └── datasets.py ├── results ├── breast_cancer_woe.csv ├── breast_cancer_woe_2.csv ├── plot_auc_roc.png ├── plot_cap.png ├── plot_ks.png ├── psi_plot_binary.png ├── psi_plot_continuous.png ├── test_binning.png ├── test_binning_2d_event_rate.png ├── test_binning_2d_woe.png ├── test_binning_no_missing.png ├── test_binning_no_special.png ├── test_binning_process_information.txt ├── test_binning_process_verbose.txt ├── test_continuous_binning.png ├── test_continuous_binning_2d.png ├── test_continuous_binning_no_missing.png ├── test_continuous_binning_no_special.png ├── test_multiclass_binning.png ├── test_multiclass_binning_no_missing.png ├── test_multiclass_binning_no_special.png ├── test_scorecard_information.txt ├── test_scorecard_monitoring_default.txt ├── test_scorecard_monitoring_default_continuous.txt ├── test_scorecard_monitoring_information.txt ├── test_scorecard_monitoring_verbose.txt └── test_scorecard_verbose.txt ├── test_binning.py ├── test_binning_2d.py ├── test_binning_piecewise.py ├── test_binning_process.py ├── test_binning_process_sketch.py ├── test_binning_scenarios.py ├── test_binning_sketch.py ├── test_continuous_binning.py ├── test_continuous_binning_2d.py ├── test_continuous_binning_piecewise.py ├── test_counterfactual.py ├── test_mdlp.py ├── test_multiclass_binning.py ├── test_outlier.py ├── test_scorecard.py ├── test_scorecard_monitoring.py └── test_scorecard_plots.py /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: CI 5 | 6 | on: 7 | push: 8 | branches: [master, develop] 9 | pull_request: 10 | branches: [master, develop] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | 
matrix: 18 | 19 | python-version: ['3.9', '3.10', "3.11", "3.12"] 20 | os: [ubuntu-latest, windows-latest, macos-latest] 21 | 22 | steps: 23 | - uses: actions/checkout@v2 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v2 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install -r test_requirements.txt 32 | pip install -r requirements.txt 33 | - name: Install package 34 | run: | 35 | pip install -e .[distributed,test,ecos] 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 42 | - name: Test with pytest 43 | run: | 44 | pytest 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | # OS: Linux 3 | dist: xenial 4 | python: 5 | - "3.6" 6 | - "3.7" 7 | 8 | matrix: 9 | include: 10 | # OS: Windows 11 | - os: windows 12 | language: shell 13 | before_install: 14 | - choco install python --version 3.6.8 15 | - python --version 16 | - python -m pip install --upgrade pip 17 | - pip3 install --upgrade pytest 18 | # - pip3 install codecov 19 | env: PATH=/c/Python36:/c/Python36/Scripts:$PATH 20 | 21 | - os: windows 22 | language: shell 23 | before_install: 24 | - choco install python --version 3.7.4 25 | - python --version 26 | - python -m pip install --upgrade pip 27 | - pip3 install --upgrade pytest 28 | # - pip3 install codecov 29 | env: PATH=/c/Python37:/c/Python37/Scripts:$PATH 30 | 31 | # command to install dependencies 32 | install: 33 | - pip install codecov 34 | # - pip install coveralls 35 | - pip install -r requirements.txt 36 | - pip install -e . 
37 | # command to run tests 38 | script: 39 | - coverage run --source optbinning -m pytest tests/ 40 | 41 | after_success: 42 | - codecov 43 | # - coveralls -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/source/_images/binning_2d_readme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_2d_readme.png -------------------------------------------------------------------------------- /doc/source/_images/binning_2d_readme_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_2d_readme_example.png -------------------------------------------------------------------------------- /doc/source/_images/binning_2d_readme_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_2d_readme_woe.png -------------------------------------------------------------------------------- /doc/source/_images/binning_binary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_binary.png -------------------------------------------------------------------------------- /doc/source/_images/binning_data_stream.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_data_stream.gif -------------------------------------------------------------------------------- /doc/source/_images/binning_readme_example_split_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_readme_example_split_woe.png -------------------------------------------------------------------------------- /doc/source/_images/binning_readme_example_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_readme_example_woe.png -------------------------------------------------------------------------------- /doc/source/_images/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 |
OptBinning
OptBinning
-------------------------------------------------------------------------------- /doc/source/_images/logo_optbinning.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/logo_optbinning.ico -------------------------------------------------------------------------------- /doc/source/_images/logo_optbinning.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /doc/source/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Navigator and sidebar colors */ 2 | .wy-side-nav-search, .wy-nav-top { 3 | background: #326d62; 4 | } 5 | 6 | .wy-menu-vertical a:active { 7 | background: #30bba3; 8 | } 9 | 10 | .wy-side-nav-search>div.version { 11 | color: #d8d8d8; 12 | } 13 | 14 | .wy-side-nav-search>a img.logo, .wy-side-nav-search .wy-dropdown>a img.logo { 15 | display: block; 16 | margin: 0 auto; 17 | height: 20%; 18 | width: 20%; 19 | border-radius: 0; 20 | max-width: 100%; 21 | background: transparent; 22 | } 23 | 24 | .wy-menu-vertical header, .wy-menu-vertical p.caption { 25 | color: #30bba3; 26 | } 27 | 28 | /* Class/function declaration colors */ 29 | .rst-content dl:not(.docutils) dt { 30 | background: #daf2ee; 31 | color: #326d62; 32 | border-top: solid 3px #30bba3; 33 | } 34 | 35 | 36 | /* Links colors */ 37 | a { 38 | color: #30bba3; 39 | } 40 | 41 | .wy-nav-content a:hover { 42 | color: #326d62; 43 | } 44 | 45 | .wy-nav-content a:visited:hover { 46 | color: #9B59B6; 47 | } 48 | -------------------------------------------------------------------------------- /doc/source/binning_2d_binary.rst: -------------------------------------------------------------------------------- 1 | Optimal binning 2D with binary target 2 | 
===================================== 3 | 4 | .. autoclass:: optbinning.OptimalBinning2D 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_2d_continuous.rst: -------------------------------------------------------------------------------- 1 | Optimal binning 2D with continuous target 2 | ========================================= 3 | 4 | .. autoclass:: optbinning.ContinuousOptimalBinning2D 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_2d_tables.rst: -------------------------------------------------------------------------------- 1 | Binning 2D tables 2 | ================= 3 | 4 | Binning table 2D: binary target 5 | ------------------------------- 6 | 7 | .. autoclass:: optbinning.binning.multidimensional.binning_statistics_2d.BinningTable2D 8 | :members: 9 | :inherited-members: 10 | :show-inheritance: 11 | 12 | Binning table 2D: continuous target 13 | ----------------------------------- 14 | 15 | .. autoclass:: optbinning.binning.multidimensional.binning_statistics_2d.ContinuousBinningTable2D 16 | :members: 17 | :inherited-members: 18 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_binary.rst: -------------------------------------------------------------------------------- 1 | Optimal binning with binary target 2 | ================================== 3 | 4 | .. autoclass:: optbinning.OptimalBinning 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/binning_continuous.rst: -------------------------------------------------------------------------------- 1 | Optimal binning with continuous target 2 | ====================================== 3 | 4 | .. 
autoclass:: optbinning.ContinuousOptimalBinning 5 |    :members: 6 |    :inherited-members: 7 |    :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/binning_multiclass.rst: -------------------------------------------------------------------------------- 1 | Optimal binning with multiclass target 2 | ====================================== 3 | 4 | 5 | .. autoclass:: optbinning.MulticlassOptimalBinning 6 |    :members: 7 |    :inherited-members: 8 |    :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_process.rst: -------------------------------------------------------------------------------- 1 | Binning process 2 | =============== 3 | 4 | .. autoclass:: optbinning.BinningProcess 5 |    :members: 6 |    :inherited-members: 7 |    :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/binning_process_sketch.rst: -------------------------------------------------------------------------------- 1 | Binning process sketch with binary target 2 | ========================================= 3 | 4 | .. autoclass:: optbinning.BinningProcessSketch 5 |    :members: 6 |    :inherited-members: 7 |    :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/binning_scenarios.rst: -------------------------------------------------------------------------------- 1 | Stochastic optimal binning 2 | ========================== 3 | 4 | Introduction 5 | ------------ 6 | The data used when performing optimal binning is generally assumed to be known accurately and to be fully representative of past, present, and future data. This confidence might produce misleading results, especially with data representing future events such as product demand, churn rate, or probability of default. 7 | 8 | Stochastic programming is a framework for explicitly incorporating uncertainty.
Stochastic programming uses random variables to account for data variability and optimizes the expected value of the objective function. OptBinning implements the stochastic programming approach using the two-stage scenario-based formulation (also known as extensive form or deterministic equivalent), obtaining a deterministic mixed-integer linear programming formulation. The scenario-based formulation guarantees the nonanticipativity constraint and a solution that must be feasible for each scenario, leading to a more **robust** solution. 9 | 10 | 11 | Scenario-based optimal binning 12 | ------------------------------ 13 | 14 | .. autoclass:: optbinning.binning.uncertainty.SBOptimalBinning 15 |    :members: 16 |    :inherited-members: 17 |    :show-inheritance: 18 | 19 | -------------------------------------------------------------------------------- /doc/source/binning_sketch.rst: -------------------------------------------------------------------------------- 1 | Optimal binning sketch with binary target 2 | ========================================= 3 | 4 | Introduction 5 | ------------ 6 | 7 | The optimal binning is the constrained discretization of a numerical feature into bins given a binary target, maximizing a statistic such as Jeffrey's divergence or Gini. Binning is a data preprocessing technique commonly used in binary classification, but the current list of existing binning algorithms supporting constraints lacks a method to handle streaming data. The new class OptimalBinningSketch implements a scalable, memory-efficient and robust algorithm for performing optimal binning in a streaming setting. Algorithmic details are discussed in http://gnpalencia.org/blog/2020/binning_data_streams/. 8 | 9 | 10 | Algorithms 11 | ---------- 12 | 13 | OptimalBinningSketch 14 | """""""""""""""""""" 15 | 16 | ..
autoclass:: optbinning.binning.distributed.OptimalBinningSketch 17 | :members: 18 | :inherited-members: 19 | :show-inheritance: 20 | 21 | 22 | GK: Greenwald-Khanna's algorithm 23 | """""""""""""""""""""""""""""""" 24 | 25 | .. autoclass:: optbinning.binning.distributed.GK 26 | :members: 27 | :inherited-members: 28 | :show-inheritance: 29 | 30 | 31 | Binning sketch: numerical variable - binary target 32 | """""""""""""""""""""""""""""""""""""""""""""""""" 33 | 34 | .. autoclass:: optbinning.binning.distributed.BSketch 35 | :members: 36 | :inherited-members: 37 | :show-inheritance: 38 | 39 | 40 | Binning sketch: categorical variable - binary target 41 | """""""""""""""""""""""""""""""""""""""""""""""""""" 42 | 43 | .. autoclass:: optbinning.binning.distributed.BCatSketch 44 | :members: 45 | :inherited-members: 46 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_tables.rst: -------------------------------------------------------------------------------- 1 | Binning tables 2 | ============== 3 | 4 | Binning table: binary target 5 | ---------------------------- 6 | 7 | .. autoclass:: optbinning.binning.binning_statistics.BinningTable 8 | :members: 9 | :inherited-members: 10 | :show-inheritance: 11 | 12 | Binning table: continuous target 13 | -------------------------------- 14 | 15 | .. autoclass:: optbinning.binning.binning_statistics.ContinuousBinningTable 16 | :members: 17 | :inherited-members: 18 | :show-inheritance: 19 | 20 | Binning table: multiclass target 21 | -------------------------------- 22 | 23 | .. 
autoclass:: optbinning.binning.binning_statistics.MulticlassBinningTable 24 | :members: 25 | :inherited-members: 26 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_utilities.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ========= 3 | 4 | 5 | Pre-binning 6 | ----------- 7 | 8 | .. autoclass:: optbinning.binning.prebinning.PreBinning 9 | :members: 10 | :inherited-members: 11 | :show-inheritance: 12 | 13 | 14 | Transformations 15 | --------------- 16 | 17 | The Weight of Evidence :math:`\text{WoE}_i` and event rate :math:`D_i` for each bin are related by means of the functional equations 18 | 19 | .. math:: 20 | 21 | \begin{align} 22 | \text{WoE}_i &= \log\left(\frac{1 - D_i}{D_i}\right) + \log\left(\frac{N_T^{E}}{N_T^{NE}}\right) = 23 | \log\left(\frac{N_T^{E}}{N_T^{NE}}\right) - \text{logit}(D_i)\\ 24 | D_i &= \left(1 + \frac{N_T^{NE}}{N_T^{E}} e^{\text{WoE}_i}\right)^{-1} = \left(1 + e^{\text{WoE}_i - \log\left(\frac{N_T^{E}}{N_T^{NE}}\right)}\right)^{-1}, 25 | \end{align} 26 | 27 | where :math:`D_i` can be characterized as a logistic function of :math:`\text{WoE}_i`, and :math:`\text{WoE}_i` can be expressed in terms of the logit function of :math:`D_i`. 28 | The constant term :math:`\log(N_T^{E} / N_T^{NE})` is the log ratio of the total 29 | number of event :math:`N_T^{E}` and the total number of non-events :math:`N_T^{NE}`. This shows that WoE is inversely related to the event rate. 30 | 31 | .. autofunction:: optbinning.binning.transformations.transform_event_rate_to_woe 32 | 33 | .. autofunction:: optbinning.binning.transformations.transform_woe_to_event_rate 34 | 35 | 36 | Metrics 37 | ------- 38 | 39 | Gini coefficient 40 | """""""""""""""" 41 | 42 | The Gini coefficient or Accuracy Ratio is a quantitative measure of discriminatory and predictive power given a distribution of events and non-events. 
The Gini coefficient 43 | ranges from 0 to 1, and is defined by 44 | 45 | .. math:: 46 | 47 |    Gini = 1 - \frac{2 \sum_{i=2}^n \left(N_i^{E} \sum_{j=1}^{i-1} N_j^{NE}\right) + \sum_{k=1}^n N_k^{E} N_k^{NE}}{N_T^{E} N_T^{NE}}, 48 | 49 | where :math:`N_i^{E}` and :math:`N_i^{NE}` are the number of events and non-events per 50 | bin, respectively, and :math:`N_T^{E}` and :math:`N_T^{NE}` are the total number of 51 | events and non-events, respectively. 52 | 53 | .. autofunction:: optbinning.binning.metrics.gini 54 | 55 | Divergence measures 56 | """"""""""""""""""" 57 | 58 | Given two discrete probability distributions :math:`P` and :math:`Q`, the Shannon entropy 59 | is defined as 60 | 61 | .. math:: 62 | 63 |    S(P) = - \sum_{i=1}^n p_i \log(p_i). 64 | 65 | The Kullback-Leibler divergence, denoted as :math:`D_{KL}(P||Q)`, is given by 66 | 67 | .. math:: 68 | 69 |    D_{KL}(P || Q) = \sum_{i=1}^n p_i \log \left(\frac{p_i}{q_i}\right). 70 | 71 | The Jeffrey's divergence, or Information Value (IV), is a symmetric measure expressible in terms of the Kullback-Leibler divergence defined by 72 | 73 | .. math:: 74 | 75 |    \begin{align*} 76 |    J(P|| Q) &= D_{KL}(P || Q) + D_{KL}(Q || P) = \sum_{i=1}^n p_i \log \left(\frac{p_i}{q_i}\right) + \sum_{i=1}^n q_i \log \left(\frac{q_i}{p_i}\right)\\ 77 |    &= \sum_{i=1}^n (p_i - q_i) \log \left(\frac{p_i}{q_i}\right). 78 |    \end{align*} 79 | 80 | The Jensen-Shannon divergence is a bounded symmetric measure also expressible in 81 | terms of the Kullback-Leibler divergence 82 | 83 | .. math:: 84 | 85 |    \begin{equation} 86 |    JSD(P || Q) = \frac{1}{2}\left(D(P || M) + D(Q || M)\right), \quad M = \frac{1}{2}(P + Q), 87 |    \end{equation} 88 | 89 | and bounded by :math:`JSD(P||Q) \in [0, \log(2)]`. We note that these measures cannot be directly used whenever :math:`p_i = 0` and/or :math:`q_i = 0`. 90 | 91 | .. autofunction:: optbinning.binning.metrics.entropy 92 | 93 | .. autofunction:: optbinning.binning.metrics.kullback_leibler 94 | 95 | ..
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('../..'))


# -- Project information -----------------------------------------------------

project = 'optbinning'
copyright = '2019 - 2024, Guillermo Navas-Palencia'
author = 'Guillermo Navas-Palencia'

# The short X.Y version. Kept in sync with optbinning/_version.py
# (__version__ = "0.20.1"); a mismatch makes the rendered docs report a
# stale version number.
version = '0.20.1'
# The full version, including alpha/beta/rc tags
release = '0.20.1'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.doctest',
    'sphinx.ext.mathjax',
    'sphinx.ext.napoleon',
    'sphinxcontrib.bibtex',
    'nbsphinx',
    'sphinx.ext.autosectionlabel'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
# NOTE: Sphinx >= 5 no longer accepts None here (it warns and coerces to
# 'en'), so set the language code explicitly.
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme_options = {
    'logo_only': True
}

html_show_sourcelink = False

html_theme = 'sphinx_rtd_theme'
html_logo = '_images/logo_optbinning.svg'
html_favicon = '_images/logo_optbinning.ico'


# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']


# These paths are either relative to html_static_path
# or fully qualified paths (eg. https://...)
html_css_files = [
    'css/custom.css',
]
toctree:: 47 | :maxdepth: 1 48 | :caption: Batch and stream optimal binning 49 | 50 | binning_sketch 51 | binning_process_sketch 52 | 53 | .. toctree:: 54 | :maxdepth: 1 55 | :caption: Binning under uncertainty 56 | 57 | binning_scenarios 58 | 59 | .. toctree:: 60 | :maxdepth: 1 61 | :caption: Optimal binning 2D 62 | 63 | binning_2d_binary 64 | binning_2d_continuous 65 | binning_2d_tables 66 | 67 | .. toctree:: 68 | :maxdepth: 1 69 | :caption: Other binning algorithms 70 | 71 | mdlp 72 | 73 | .. toctree:: 74 | :maxdepth: 1 75 | :caption: Utilities 76 | 77 | outlier -------------------------------------------------------------------------------- /doc/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Install release 5 | --------------- 6 | 7 | To install the current release of OptBinning: 8 | 9 | .. code-block:: text 10 | 11 | pip install optbinning 12 | 13 | Optionally, download a different release 14 | from https://github.com/guillermo-navas-palencia/optbinning/releases and install 15 | using 16 | 17 | .. code-block:: text 18 | 19 | python setup.py install 20 | 21 | Install from source 22 | ------------------- 23 | 24 | To install from source, download or clone the git repository https://github.com/guillermo-navas-palencia/optbinning 25 | 26 | .. code-block:: text 27 | 28 | cd optbinning 29 | python setup.py install -------------------------------------------------------------------------------- /doc/source/mdlp.rst: -------------------------------------------------------------------------------- 1 | MDLP discretization algorithm 2 | ============================= 3 | 4 | .. 
autoclass:: optbinning.MDLP 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/outlier.rst: -------------------------------------------------------------------------------- 1 | Outlier detection 2 | ================= 3 | 4 | .. autoclass:: optbinning.binning.outlier.OutlierDetector 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | 9 | 10 | .. autoclass:: optbinning.binning.outlier.RangeDetector 11 | :members: 12 | :inherited-members: 13 | :show-inheritance: 14 | 15 | 16 | .. autoclass:: optbinning.binning.outlier.ModifiedZScoreDetector 17 | :members: 18 | :inherited-members: 19 | :show-inheritance: 20 | 21 | 22 | .. autoclass:: optbinning.binning.outlier.YQuantileDetector 23 | :members: 24 | :inherited-members: 25 | :show-inheritance: 26 | -------------------------------------------------------------------------------- /doc/source/piecewise_binary.rst: -------------------------------------------------------------------------------- 1 | Optimal piecewise binning with binary target 2 | ============================================ 3 | 4 | .. autoclass:: optbinning.OptimalPWBinning 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/piecewise_continuous.rst: -------------------------------------------------------------------------------- 1 | Optimal piecewise binning with continuous target 2 | ================================================ 3 | 4 | .. autoclass:: optbinning.ContinuousOptimalPWBinning 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/scorecard.rst: -------------------------------------------------------------------------------- 1 | Scorecard 2 | ========= 3 | 4 | .. 
autoclass:: optbinning.scorecard.Scorecard 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | 9 | 10 | Monitoring 11 | ---------- 12 | 13 | .. autoclass:: optbinning.scorecard.ScorecardMonitoring 14 | :members: 15 | :inherited-members: 16 | :show-inheritance: 17 | 18 | 19 | Plot functions 20 | -------------- 21 | 22 | .. autofunction:: optbinning.scorecard.plot_auc_roc 23 | 24 | .. autofunction:: optbinning.scorecard.plot_cap 25 | 26 | .. autofunction:: optbinning.scorecard.plot_ks -------------------------------------------------------------------------------- /doc/source/tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | This is a guide for optbinning new users with tutorials ranging from basic 5 | to advanced level for each supported target type. 6 | 7 | Optimal binning tutorials 8 | ------------------------- 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | 13 | tutorials/tutorial_binary 14 | tutorials/tutorial_binary_localsolver 15 | tutorials/tutorial_binary_large_scale 16 | tutorials/tutorial_continuous 17 | tutorials/tutorial_multiclass 18 | 19 | 20 | Binning process tutorials 21 | ------------------------- 22 | 23 | .. toctree:: 24 | :maxdepth: 1 25 | 26 | tutorials/tutorial_binning_process_sklearn_pipeline 27 | tutorials/tutorial_binning_process_FICO_xAI 28 | tutorials/tutorial_binning_process_FICO_update_binning 29 | tutorials/tutorial_binning_process_telco_churn 30 | 31 | 32 | Scorecard tutorials 33 | ------------------- 34 | 35 | .. toctree:: 36 | :maxdepth: 1 37 | 38 | tutorials/tutorial_scorecard_binary_target 39 | tutorials/tutorial_scorecard_continuous_target 40 | tutorials/tutorial_scorecard_monitoring 41 | tutorials/tutorial_counterfactual_binary_target 42 | tutorials/tutorial_counterfactual_continuous_target 43 | 44 | 45 | Optimal piecewise binning tutorials 46 | ----------------------------------- 47 | 48 | .. 
toctree:: 49 | :maxdepth: 1 50 | 51 | tutorials/tutorial_piecewise_binary 52 | tutorials/tutorial_piecewise_continuous 53 | 54 | 55 | Optimal binning for batch and streaming data processing 56 | ------------------------------------------------------- 57 | 58 | .. toctree:: 59 | :maxdepth: 1 60 | 61 | tutorials/tutorial_sketch_binary 62 | tutorials/tutorial_sketch_binary_pyspark 63 | 64 | 65 | Optimal binning under uncertainty 66 | --------------------------------- 67 | 68 | .. toctree:: 69 | :maxdepth: 1 70 | 71 | tutorials/tutorial_binary_under_uncertainty 72 | 73 | 74 | Optimal binning 2D 75 | ------------------ 76 | 77 | .. toctree:: 78 | :maxdepth: 1 79 | 80 | tutorials/tutorial_binning_2d 81 | tutorials/tutorial_continuous_2d 82 | -------------------------------------------------------------------------------- /doc/source/tutorials/tutorial_sketch_binary_pyspark.rst: -------------------------------------------------------------------------------- 1 | Tutorial: optimal binning sketch with binary target using PySpark 2 | ================================================================= 3 | 4 | In this example, we use PySpark mapPartitions function to compute the optimal 5 | binning of a single variable from a large dataset in a distributed fashion. The dataset is split into 4 partitions. 6 | 7 | .. code:: 8 | 9 | from pyspark.sql import SparkSession 10 | 11 | spark.conf.set("spark.sql.execution.arrow.enabled", "true") 12 | 13 | df = spark.read.csv("data/kaggle/HomeCreditDefaultRisk/application_train.csv", 14 | sep=",", header=True, inferSchema=True) 15 | 16 | n_partitions = 4 17 | df = df.repartition(n_partitions) 18 | 19 | 20 | We prepare the MapReduce structure 21 | 22 | .. 
code :: 23 | 24 | import pandas as pd 25 | from optbinning import OptimalBinningSketch 26 | 27 | variable = "EXT_SOURCE_3" 28 | target = "TARGET" 29 | columns = [variable, target] 30 | 31 | 32 | def add(partition): 33 | df_pandas = pd.DataFrame.from_records(partition, columns=columns) 34 | x = df_pandas[variable] 35 | y = df_pandas[target] 36 | optbsketch = OptimalBinningSketch(eps=0.001) 37 | optbsketch.add(x, y) 38 | 39 | return [optbsketch] 40 | 41 | def merge(optbsketch, other_optbsketch): 42 | optbsketch.merge(other_optbsketch) 43 | 44 | return optbsketch 45 | 46 | Finally, with the required columns, we use mapPartitions and method 47 | treeReduce to aggregate the ``OptimalBinningSketch`` instance of each partition. 48 | 49 | .. code :: 50 | 51 | optbsketch = df.select(columns).rdd.mapPartitions(lambda partition: add(partition) 52 | ).treeReduce(merge) -------------------------------------------------------------------------------- /optbinning/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | from .binning import BinningProcess 3 | from .binning import ContinuousOptimalBinning 4 | from .binning import MDLP 5 | from .binning import MulticlassOptimalBinning 6 | from .binning import OptimalBinning 7 | from .binning.distributed import BinningProcessSketch 8 | from .binning.distributed import OptimalBinningSketch 9 | from .binning.multidimensional import ContinuousOptimalBinning2D 10 | from .binning.multidimensional import OptimalBinning2D 11 | from .binning.piecewise import ContinuousOptimalPWBinning 12 | from .binning.piecewise import OptimalPWBinning 13 | from .binning.uncertainty import SBOptimalBinning 14 | from .scorecard import Scorecard 15 | 16 | 17 | __all__ = ['__version__', 18 | 'BinningProcess', 19 | 'BinningProcessSketch', 20 | 'ContinuousOptimalBinning', 21 | 'ContinuousOptimalBinning2D', 22 | 'ContinuousOptimalPWBinning', 23 | 'MDLP', 24 | 
'MulticlassOptimalBinning', 25 | 'OptimalBinning', 26 | 'OptimalBinningSketch', 27 | 'OptimalBinning2D', 28 | 'OptimalPWBinning', 29 | 'SBOptimalBinning', 30 | 'Scorecard'] 31 | -------------------------------------------------------------------------------- /optbinning/_version.py: -------------------------------------------------------------------------------- 1 | """Version information.""" 2 | 3 | __version__ = "0.20.1" 4 | -------------------------------------------------------------------------------- /optbinning/binning/__init__.py: -------------------------------------------------------------------------------- 1 | from .binning import OptimalBinning 2 | from .binning_process import BinningProcess 3 | from .continuous_binning import ContinuousOptimalBinning 4 | from .mdlp import MDLP 5 | from .multiclass_binning import MulticlassOptimalBinning 6 | 7 | 8 | __all__ = ['BinningProcess', 9 | 'ContinuousOptimalBinning', 10 | 'MDLP', 11 | 'MulticlassOptimalBinning', 12 | 'OptimalBinning'] 13 | -------------------------------------------------------------------------------- /optbinning/binning/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base optimal binning algorithm class. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | from abc import ABCMeta 9 | from abc import abstractmethod 10 | 11 | from sklearn.base import BaseEstimator 12 | from sklearn.exceptions import NotFittedError 13 | 14 | 15 | class Base: 16 | def _check_is_fitted(self): 17 | if not self._is_fitted: 18 | raise NotFittedError("This {} instance is not fitted yet. Call " 19 | "'fit' with appropriate arguments." 
20 | .format(self.__class__.__name__)) 21 | 22 | 23 | class BaseOptimalBinning(Base, BaseEstimator, metaclass=ABCMeta): 24 | @abstractmethod 25 | def fit(self): 26 | """Fit the optimal binning according to the given training data.""" 27 | 28 | @abstractmethod 29 | def fit_transform(self): 30 | """Fit the optimal binning according to the given training data, then 31 | transform it.""" 32 | 33 | @abstractmethod 34 | def transform(self): 35 | """Transform given data using bins from the fitted optimal binning.""" 36 | 37 | @abstractmethod 38 | def information(self): 39 | """Print overview information about the options settings, problem 40 | statistics, and the solution of the computation.""" 41 | 42 | @property 43 | @abstractmethod 44 | def binning_table(self): 45 | """Return an instantiated binning table.""" 46 | 47 | @property 48 | @abstractmethod 49 | def splits(self): 50 | """List of optimal split points.""" 51 | 52 | @property 53 | @abstractmethod 54 | def status(self): 55 | """The status of the underlying optimization solver.""" 56 | -------------------------------------------------------------------------------- /optbinning/binning/binning_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Optimal binning information. 
"""
Optimal binning information.
"""

# Guillermo Navas-Palencia
# Copyright (C) 2019

from ..information import print_header
from ..information import print_optional_parameters
from ..information import print_solver_statistics
from ..options import continuous_optimal_binning_default_options
from ..options import multiclass_optimal_binning_default_options
from ..options import optimal_binning_default_options
from ..options import sboptimal_binning_default_options
from ..options import continuous_optimal_binning_2d_default_options
from ..options import optimal_binning_2d_default_options


def _ratio(part, total):
    """Return part / total, or 0.0 when total is zero.

    Guards the timing report against ZeroDivisionError when a stage (or the
    whole run) completes below the timer resolution.
    """
    return part / total if total else 0.0


def print_prebinning_statistics(n_prebins, n_refinement):
    """Print the number of pre-bins and refinement passes performed."""
    prebinning_stats = (
        " Pre-binning statistics\n"
        " Number of pre-bins {:>10}\n"
        " Number of refinements {:>10}\n"
        ).format(n_prebins, n_refinement)

    print(prebinning_stats)


def print_timing(solver_type, solver, time_total, time_preprocessing,
                 time_prebinning, time_solver, time_optimizer,
                 time_postprocessing):
    """Print a per-stage timing breakdown with percentages of total time.

    For the CP solver a finer split of the solver stage (model generation
    vs. optimizer) is reported. All ratios are computed with ``_ratio`` so a
    zero total (possible with very fast runs) does not raise
    ZeroDivisionError.
    """
    p_preprocessing = _ratio(time_preprocessing, time_total)
    p_prebinning = _ratio(time_prebinning, time_total)
    p_solver = _ratio(time_solver, time_total)
    p_postprocessing = _ratio(time_postprocessing, time_total)

    if solver_type == "cp" and solver is not None:
        # Model generation time is the solver stage minus pure optimizer time.
        time_model_generation = time_solver - time_optimizer
        p_model_generation = _ratio(time_model_generation, time_solver)
        p_optimizer = _ratio(time_optimizer, time_solver)

        time_stats = (
            " Timing\n"
            " Total time {:>18.2f} sec\n"
            " Pre-processing {:>18.2f} sec ({:>7.2%})\n"
            " Pre-binning {:>18.2f} sec ({:>7.2%})\n"
            " Solver {:>18.2f} sec ({:>7.2%})\n"
            " model generation {:>18.2f} sec ({:>7.2%})\n"
            " optimizer {:>18.2f} sec ({:>7.2%})\n"
            " Post-processing {:>18.2f} sec ({:>7.2%})\n"
            ).format(time_total, time_preprocessing, p_preprocessing,
                     time_prebinning, p_prebinning, time_solver, p_solver,
                     time_model_generation, p_model_generation, time_optimizer,
                     p_optimizer, time_postprocessing, p_postprocessing)
    else:
        time_stats = (
            " Timing\n"
            " Total time {:>18.2f} sec\n"
            " Pre-processing {:>18.2f} sec ({:>7.2%})\n"
            " Pre-binning {:>18.2f} sec ({:>7.2%})\n"
            " Solver {:>18.2f} sec ({:>7.2%})\n"
            " Post-processing {:>18.2f} sec ({:>7.2%})\n"
            ).format(time_total, time_preprocessing, p_preprocessing,
                     time_prebinning, p_prebinning, time_solver, p_solver,
                     time_postprocessing, p_postprocessing)

    print(time_stats)


def print_name_status(name, status):
    """Print variable name and solver status; empty names show as UNKNOWN."""
    if not name:
        name = "UNKNOWN"

    print(" Name : {:<32}\n"
          " Status : {:<32}\n".format(name, status))


def print_main_info(name, status, time_total):
    """Print the minimal summary: name, status and total time."""
    print_name_status(name, status)

    print(" Time : {:<7.4f} sec\n".format(time_total))


def print_binning_information(binning_type, print_level, name, status,
                              solver_type, solver, time_total,
                              time_preprocessing, time_prebinning, time_solver,
                              time_optimizer, time_postprocessing, n_prebins,
                              n_refinements, dict_user_options):
    """Print the full information report for a fitted optimal binning.

    ``print_level`` controls verbosity: 0 = summary only, 1 = statistics,
    2 = statistics plus the non-default options the user supplied.
    ``binning_type`` selects which default-options table to diff against.
    """
    print_header()

    if print_level == 2:
        if binning_type == "optimalbinning":
            d_default_options = optimal_binning_default_options
        elif binning_type == "multiclassoptimalbinning":
            d_default_options = multiclass_optimal_binning_default_options
        elif binning_type == "continuousoptimalbinning":
            d_default_options = continuous_optimal_binning_default_options
        elif binning_type == "sboptimalbinning":
            d_default_options = sboptimal_binning_default_options
        elif binning_type == "optimalbinning2d":
            d_default_options = optimal_binning_2d_default_options
        elif binning_type == "continuousoptimalbinning2d":
            d_default_options = continuous_optimal_binning_2d_default_options

        print_optional_parameters(d_default_options, dict_user_options)

    if print_level == 0:
        print_main_info(name, status, time_total)
    elif print_level >= 1:
        print_name_status(name, status)

        print_prebinning_statistics(n_prebins, n_refinements)

        if status in ("OPTIMAL", "FEASIBLE"):
            if solver is not None:
                print_solver_statistics(solver_type, solver)

            print_timing(solver_type, solver, time_total, time_preprocessing,
                         time_prebinning, time_solver, time_optimizer,
                         time_postprocessing)
dict_user_options): 41 | print_header() 42 | 43 | if print_level == 2: 44 | dict_default_options = binning_process_default_options 45 | print_optional_parameters(dict_default_options, dict_user_options) 46 | 47 | if print_level == 0: 48 | print_main_info(n_records, n_variables, time_total) 49 | elif print_level >= 1: 50 | print_binning_process_statistics(n_records, n_variables, target_dtype, 51 | n_numerical, n_categorical, 52 | n_selected, time_total) 53 | -------------------------------------------------------------------------------- /optbinning/binning/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .gk import GK 2 | from .bsketch import BSketch 3 | from .bsketch import BCatSketch 4 | from .binning_process_sketch import BinningProcessSketch 5 | from .binning_sketch import OptimalBinningSketch 6 | 7 | 8 | __all__ = ['BSketch', 9 | 'BCatSketch', 10 | 'GK', 11 | 'OptimalBinningSketch', 12 | 'BinningProcessSketch'] 13 | -------------------------------------------------------------------------------- /optbinning/binning/distributed/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base optimal binning sketch algorithm class. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | from ...exceptions import NotSolvedError 9 | 10 | 11 | class BaseSketch: 12 | def _check_is_solved(self): 13 | if not self._is_solved: 14 | raise NotSolvedError("This {} instance is not solved yet. Call " 15 | "'solve' with appropriate arguments." 16 | .format(self.__class__.__name__)) 17 | -------------------------------------------------------------------------------- /optbinning/binning/distributed/binning_process_sketch_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Binning process sketch information. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | from ...information import print_header 9 | from ...information import print_optional_parameters 10 | from ...options import binning_process_sketch_default_options 11 | 12 | 13 | def print_main_info(n_records, n_variables, time_add, time_solve): 14 | print(" Number of records : {}".format(n_records)) 15 | print(" Number of variables : {}".format(n_variables)) 16 | print(" Time add : {:<10.4f} sec".format(time_add)) 17 | print(" Time solve : {:<10.4f} sec\n".format(time_solve)) 18 | 19 | 20 | def print_binning_process_sketch_statistics( 21 | n_records, n_variables, target_dtype, n_numerical, n_categorical, 22 | n_selected, n_add, time_add, n_solve, time_solve): 23 | 24 | r_add = time_add / n_add 25 | r_solve = time_solve / n_solve 26 | 27 | stats = ( 28 | " Statistics\n" 29 | " Number of records {:>10}\n" 30 | " Number of variables {:>10}\n" 31 | " Target type {:>10}\n\n" 32 | " Number of numerical {:>10}\n" 33 | " Number of categorical {:>10}\n" 34 | " Number of selected {:>10}\n" 35 | ).format(n_records, n_variables, target_dtype, n_numerical, 36 | n_categorical, n_selected) 37 | 38 | records_stats = ( 39 | " Streaming statistics\n" 40 | " Add operations {:>18}\n" 41 | " Solve operations {:>18}\n" 42 | ).format(n_add, n_solve) 43 | 44 | time_stats = ( 45 | " Streaming timing\n" 46 | " Time add {:>18.2f} sec ({:6.4f} sec / add)\n" 47 | " Time solve {:>18.2f} sec ({:6.4f} sec / solve)\n" 48 | ).format(time_add, r_add, time_solve, r_solve) 49 | 50 | print(stats) 51 | print(records_stats) 52 | print(time_stats) 53 | 54 | 55 | def print_binning_process_sketch_information( 56 | print_level, n_records, n_variables, target_dtype, n_numerical, 57 | n_categorical, n_selected, n_add, time_add, n_solve, time_solve, 58 | dict_user_options): 59 | 60 | print_header() 61 | 62 | if print_level == 2: 63 | dict_default_options = binning_process_sketch_default_options 64 | 
print_optional_parameters(dict_default_options, dict_user_options) 65 | 66 | if print_level == 0: 67 | print_main_info(n_records, n_variables, time_add, time_solve) 68 | elif print_level >= 1: 69 | print_binning_process_sketch_statistics( 70 | n_records, n_variables, target_dtype, n_numerical, n_categorical, 71 | n_selected, n_add, time_add, n_solve, time_solve) 72 | -------------------------------------------------------------------------------- /optbinning/binning/distributed/bsketch_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Binning sketch information. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | from ...binning.binning_information import print_main_info 9 | from ...binning.binning_information import print_name_status 10 | from ...binning.binning_information import print_prebinning_statistics 11 | from ...information import print_header 12 | from ...information import print_optional_parameters 13 | from ...information import print_solver_statistics 14 | from ...options import optimal_binning_sketch_options 15 | 16 | 17 | def print_timing(solver_type, solver, time_total, time_prebinning, time_solver, 18 | time_optimizer, time_postprocessing): 19 | 20 | p_prebinning = time_prebinning / time_total 21 | p_solver = time_solver / time_total 22 | p_postprocessing = time_postprocessing / time_total 23 | 24 | if solver_type == "cp" and solver is not None: 25 | time_model_generation = time_solver - time_optimizer 26 | p_model_generation = time_model_generation / time_solver 27 | p_optimizer = time_optimizer / time_solver 28 | 29 | time_stats = ( 30 | " Timing\n" 31 | " Total time {:>18.2f} sec\n" 32 | " Pre-binning {:>18.2f} sec ({:>7.2%})\n" 33 | " Solver {:>18.2f} sec ({:>7.2%})\n" 34 | " model generation {:>18.2f} sec ({:>7.2%})\n" 35 | " optimizer {:>18.2f} sec ({:>7.2%})\n" 36 | " Post-processing {:>18.2f} sec ({:>7.2%})\n" 37 | ).format(time_total, time_prebinning, 
p_prebinning, time_solver, 38 | p_solver, time_model_generation, p_model_generation, 39 | time_optimizer, p_optimizer, time_postprocessing, 40 | p_postprocessing) 41 | else: 42 | time_stats = ( 43 | " Timing\n" 44 | " Total time {:>18.2f} sec\n" 45 | " Pre-binning {:>18.2f} sec ({:>7.2%})\n" 46 | " Solver {:>18.2f} sec ({:>7.2%})\n" 47 | " Post-processing {:>18.2f} sec ({:>7.2%})\n" 48 | ).format(time_total, time_prebinning, p_prebinning, time_solver, 49 | p_solver, time_postprocessing, p_postprocessing) 50 | 51 | print(time_stats) 52 | 53 | 54 | def print_streaming_timing(memory_usage, n_records, n_add, time_add, n_solve, 55 | time_solve): 56 | r_add = time_add / n_add 57 | r_solve = time_solve / n_solve 58 | 59 | records_stats = ( 60 | " Streaming statistics\n" 61 | " Sketch memory usage {:>18.5f} MB\n" 62 | " Processed records {:>18}\n" 63 | " Add operations {:>18}\n" 64 | " Solve operations {:>18}\n" 65 | ).format(memory_usage, n_records, n_add, n_solve) 66 | 67 | time_stats = ( 68 | " Streaming timing\n" 69 | " Time add {:>18.2f} sec ({:6.4f} sec / add)\n" 70 | " Time solve {:>18.2f} sec ({:6.4f} sec / solve)\n" 71 | ).format(time_add, r_add, time_solve, r_solve) 72 | 73 | print(records_stats) 74 | print(time_stats) 75 | 76 | 77 | def print_binning_information(binning_type, print_level, name, status, 78 | solver_type, solver, time_total, time_prebinning, 79 | time_solver, time_optimizer, time_postprocessing, 80 | n_prebins, n_refinements, n_records, n_add, 81 | time_add, n_solve, time_solve, memory_usage, 82 | dict_user_options): 83 | 84 | print_header() 85 | 86 | if print_level == 2: 87 | if binning_type == "optimalbinningsketch": 88 | dict_default_options = optimal_binning_sketch_options 89 | 90 | print_optional_parameters(dict_default_options, dict_user_options) 91 | 92 | if print_level == 0: 93 | print_main_info(name, status, time_total) 94 | elif print_level >= 1: 95 | print_name_status(name, status) 96 | 97 | print_prebinning_statistics(n_prebins, 
"""
Greenwald-Khanna's streaming quantiles.

References:
    [1] M. Greenwald and S. Khanna, "Space-Efficient Online Computation of
        Quantile Summaries", (2001).

        Comment: + improvements (~ 30% faster for large arrays)

    [2] https://github.com/DataDog/sketches-py/tree/master/gkarray
"""

import numpy as np


class Entry:
    def __init__(self, value, g, delta):
        """
        Tuple t = (v, g, delta)

        Parameters
        ----------
        value : float
            value that corresponds to one of the elements of the sequence.

        g : float
            g = r_min(value_[i]) - r_min(value_[i-1])

        delta : float
            r_max - r_min
        """
        self.value = value
        self.g = g
        self.delta = delta


class GK:
    """Greenwald-Khanna's streaming quantiles.

    Parameters
    ----------
    eps : float (default=0.01)
        Relative error epsilon.
    """
    def __init__(self, eps=0.01):
        self.eps = eps

        self.entries = []
        self.incoming = []
        self._min = np.inf
        self._max = -np.inf
        self._count = 0
        self._sum = 0

        # Compress roughly every 1/eps additions to bound the summary size.
        self._compress_threshold = int(1.0 / self.eps) + 1

    def __len__(self):
        # Flush pending values first so the summary size is up to date.
        if len(self.incoming):
            self.merge_compress()
        return len(self.entries)

    def add(self, value):
        """Add value to sketch."""
        self.incoming.append(value)
        self._count += 1
        self._sum += value

        if value < self._min:
            self._min = value
        if value > self._max:
            self._max = value

        if self._count % self._compress_threshold == 0:
            self.merge_compress()

    def copy(self, gk):
        """Copy GK sketch."""
        self.entries = [Entry(e.value, e.g, e.delta) for e in gk.entries]
        self.incoming = gk.incoming[:]
        self._count = gk._count
        self._min = gk._min
        self._max = gk._max
        self._sum = gk._sum

    def merge(self, gk):
        """Merge sketch with another sketch gk.

        Raises
        ------
        ValueError
            If ``gk`` was built with a different ``eps`` and therefore cannot
            be merged. (Was a bare ``Exception``; ``ValueError`` is more
            precise and is still caught by existing ``except Exception``
            handlers.)
        """
        if not self.mergeable(gk):
            raise ValueError("gk does not share signature.")

        if gk._count == 0:
            return

        if self._count == 0:
            self.copy(gk)
            return

        entries = []
        spread = int(gk.eps * (gk.n - 1))
        gk.merge_compress()

        # upper bound elements(gk.v0, gk.v1) - spread
        g = gk.entries[0].g + gk.entries[0].delta - 1 - spread

        if g > 0:
            entries.append(Entry(gk._min, g, 0))

        n_gk = len(gk)
        for i in range(n_gk - 1):
            tp1 = gk.entries[i + 1]
            t = gk.entries[i]
            g = tp1.g + (tp1.delta - t.delta)
            if g > 0:
                entries.append(Entry(t.value, g, 0))

        last_t = gk.entries[n_gk - 1]
        g = spread + 1 - last_t.delta
        if g > 0:
            entries.append(Entry(last_t.value, g, 0))

        self._count += gk._count
        self._min = min(self._min, gk._min)
        self._max = max(self._max, gk._max)
        self._sum += gk._sum

        self.merge_compress(entries)

    def merge_compress(self, entries=None):
        """Compress sketch.

        Parameters
        ----------
        entries : list of Entry or None (default=None)
            Extra entries to fold into the summary; ``None`` means none.
            (BUG FIX: the default was a shared mutable list ``[]`` — the
            classic Python mutable-default-argument pitfall — replaced by
            the ``None`` sentinel.)
        """
        if entries is None:
            entries = []

        remove_threshold = float(2.0 * self.eps * (self._count - 1))

        incoming = [Entry(value, 1, 0) for value in self.incoming]

        if len(entries):
            incoming.extend(Entry(e.value, e.g, e.delta) for e in entries)

        incoming = sorted(incoming, key=lambda e: e.value)

        merged = []
        i = 0
        j = 0
        n_incoming = len(incoming)
        n_entries = len(self.entries)

        # Two-pointer merge of sorted `incoming` and existing `entries`;
        # adjacent tuples whose combined weight stays under the threshold
        # are collapsed to keep the summary small.
        while i < n_incoming or j < n_entries:
            if i == n_incoming:
                t = self.entries[j]
                j += 1
                if j < n_entries:
                    tn = self.entries[j]
                    if t.g + tn.g + tn.delta <= remove_threshold:
                        tn.g += t.g
                        continue
                merged.append(t)
            elif j == n_entries:
                t = incoming[i]
                i += 1
                if i < n_incoming:
                    tn = incoming[i]
                    if t.g + tn.g + tn.delta <= remove_threshold:
                        tn.g += t.g
                        continue
                merged.append(t)
            elif incoming[i].value < self.entries[j].value:
                ti = incoming[i]
                tj = self.entries[j]
                if ti.g + tj.g + tj.delta <= remove_threshold:
                    tj.g += ti.g
                else:
                    ti.delta = tj.g + tj.delta - ti.g
                    merged.append(ti)
                i += 1
            else:
                t = self.entries[j]
                j += 1
                if j < n_entries:
                    tn = self.entries[j]
                    if t.g + tn.g + tn.delta <= remove_threshold:
                        tn.g += t.g
                        continue
                merged.append(t)

        self.entries = merged
        self.incoming = []

    def mergeable(self, gk):
        """Check whether a sketch gk is mergeable."""
        return self.eps == gk.eps

    def quantile(self, q):
        """Calculate quantile q."""
        if not (0 <= q <= 1):
            raise ValueError("q must be a value in [0, 1].")

        if self._count == 0:
            raise ValueError("GK sketch does not contain values.")

        if len(self.incoming):
            self.merge_compress()

        rank = int(q * (self._count - 1) + 1)
        spread = int(self.eps * (self._count - 1))
        g_sum = 0.0
        i = 0

        n_entries = len(self.entries)
        while i < n_entries:
            g_sum += self.entries[i].g
            if g_sum + self.entries[i].delta > rank + spread:
                break
            i += 1
        if i == 0:
            return self._min

        return self.entries[i - 1].value

    @property
    def n(self):
        """Number of records in sketch."""
        return self._count
36 | format(int(n_add.values[-1]), int(n_records.values[-1])), 37 | fontsize=14) 38 | plt.xlabel("Processed records", fontsize=12) 39 | plt.ylabel("Divergence: {}".format(divergence), fontsize=12) 40 | plt.legend(fontsize=12) 41 | 42 | plt.show() 43 | -------------------------------------------------------------------------------- /optbinning/binning/mdlp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Minimum Description Length Principle (MDLP) 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | import numbers 9 | 10 | import numpy as np 11 | 12 | from scipy import special 13 | from sklearn.base import BaseEstimator 14 | from sklearn.exceptions import NotFittedError 15 | from sklearn.utils import check_array 16 | 17 | 18 | def _check_parameters(min_samples_split, min_samples_leaf, max_candidates): 19 | if (not isinstance(min_samples_split, numbers.Integral) or 20 | min_samples_split < 2): 21 | raise ValueError("min_samples_split must be a positive integer >= 2; " 22 | "got {}.".format(min_samples_split)) 23 | 24 | if (not isinstance(min_samples_leaf, numbers.Integral) or 25 | min_samples_leaf < 1): 26 | raise ValueError("min_samples_leaf must be a positive integer >= 1; " 27 | "got {}.".format(min_samples_leaf)) 28 | 29 | if not isinstance(max_candidates, numbers.Integral) or max_candidates < 1: 30 | raise ValueError("max_candidates must be a positive integer >= 1; " 31 | "got {}.".format(max_candidates)) 32 | 33 | 34 | class MDLP(BaseEstimator): 35 | """ 36 | Minimum Description Length Principle (MDLP) discretization algorithm. 37 | 38 | Parameters 39 | ---------- 40 | min_samples_split : int (default=2) 41 | The minimum number of samples required to split an internal node. 42 | 43 | min_samples_leaf : int (default=2) 44 | The minimum number of samples required to be at a leaf node. 
45 | 46 | max_candidates : int (default=32) 47 | The maximum number of split points to evaluate at each partition. 48 | 49 | Notes 50 | ----- 51 | Implementation of the discretization algorithm in [FI93]. A dynamic 52 | split strategy based on binning the number of candidate splits [CMR2001] 53 | is implemented to increase efficiency. For large size datasets, it is 54 | recommended to use a smaller ``max_candidates`` (e.g. 16) to get a 55 | significant speed up. 56 | 57 | References 58 | ---------- 59 | 60 | .. [FI93] U. M. Fayyad and K. B. Irani. "Multi-Interval Discretization of 61 | Continuous-Valued Attributes for Classification Learning". 62 | International Joint Conferences on Artificial Intelligence, 63 | 13:1022–1027, 1993. 64 | 65 | .. [CMR2001] D. M. Chickering, C. Meek and R. Rounthwaite. "Efficient 66 | Determination of Dynamic Split Points in a Decision Tree". In 67 | Proceedings of the 2001 IEEE International Conference on Data 68 | Mining, 91-98, 2001. 69 | """ 70 | def __init__(self, min_samples_split=2, min_samples_leaf=2, 71 | max_candidates=32): 72 | 73 | self.min_samples_split = min_samples_split 74 | self.min_samples_leaf = min_samples_leaf 75 | self.max_candidates = max_candidates 76 | 77 | # auxiliary 78 | self._splits = [] 79 | 80 | self._is_fitted = None 81 | 82 | def fit(self, x, y): 83 | """Fit MDLP discretization algorithm. 84 | 85 | Parameters 86 | ---------- 87 | x : array-like, shape = (n_samples) 88 | Data samples, where n_samples is the number of samples. 89 | 90 | y : array-like, shape = (n_samples) 91 | Target vector relative to x. 
92 | 93 | Returns 94 | ------- 95 | self : MDLP 96 | """ 97 | return self._fit(x, y) 98 | 99 | def _fit(self, x, y): 100 | _check_parameters(**self.get_params()) 101 | 102 | x = check_array(x, ensure_2d=False, force_all_finite=True) 103 | y = check_array(y, ensure_2d=False, force_all_finite=True) 104 | 105 | idx = np.argsort(x) 106 | x = x[idx] 107 | y = y[idx] 108 | 109 | self._recurse(x, y, 0) 110 | 111 | self._is_fitted = True 112 | 113 | return self 114 | 115 | def _recurse(self, x, y, id): 116 | u_x = np.unique(x) 117 | n_x = len(u_x) 118 | n_y = len(np.bincount(y)) 119 | 120 | split = self._find_split(u_x, x, y) 121 | 122 | if split is not None: 123 | self._splits.append(split) 124 | t = np.searchsorted(x, split, side="right") 125 | 126 | if not self._terminate(n_x, n_y, y, y[:t], y[t:]): 127 | self._recurse(x[:t], y[:t], id + 1) 128 | self._recurse(x[t:], y[t:], id + 2) 129 | 130 | def _find_split(self, u_x, x, y): 131 | n_x = len(x) 132 | u_x = np.unique(0.5 * (x[1:] + x[:-1])[(y[1:] - y[:-1]) != 0]) 133 | 134 | if len(u_x) > self.max_candidates: 135 | percentiles = np.linspace(1, 100, self.max_candidates) 136 | splits = np.percentile(u_x, percentiles) 137 | else: 138 | splits = u_x 139 | 140 | max_entropy_gain = 0 141 | best_split = None 142 | 143 | tt = np.searchsorted(x, splits, side="right") 144 | for i, t in enumerate(tt): 145 | samples_l = t >= self.min_samples_leaf 146 | samples_r = n_x - t >= self.min_samples_leaf 147 | 148 | if samples_l and samples_r: 149 | entropy_gain = self._entropy_gain(y, y[:t], y[t:]) 150 | if entropy_gain > max_entropy_gain: 151 | max_entropy_gain = entropy_gain 152 | best_split = splits[i] 153 | 154 | return best_split 155 | 156 | def _entropy(self, x): 157 | n = len(x) 158 | ns1 = np.sum(x) 159 | ns0 = n - ns1 160 | p = np.array([ns0, ns1]) / n 161 | return -special.xlogy(p, p).sum() 162 | 163 | def _entropy_gain(self, y, y1, y2): 164 | n = len(y) 165 | n1 = len(y1) 166 | n2 = n - n1 167 | ent_y = self._entropy(y) 168 | 
ent_y1 = self._entropy(y1) 169 | ent_y2 = self._entropy(y2) 170 | return ent_y - (n1 * ent_y1 + n2 * ent_y2) / n 171 | 172 | def _terminate(self, n_x, n_y, y, y1, y2): 173 | splittable = (n_x >= self.min_samples_split) and (n_y >= 2) 174 | 175 | n = len(y) 176 | n1 = len(y1) 177 | n2 = n - n1 178 | ent_y = self._entropy(y) 179 | ent_y1 = self._entropy(y1) 180 | ent_y2 = self._entropy(y2) 181 | gain = ent_y - (n1 * ent_y1 + n2 * ent_y2) / n 182 | 183 | k = len(np.bincount(y)) 184 | k1 = len(np.bincount(y1)) 185 | k2 = len(np.bincount(y2)) 186 | 187 | t0 = np.log(3**k - 2) 188 | t1 = k * ent_y 189 | t2 = k1 * ent_y1 190 | t3 = k2 * ent_y2 191 | delta = t0 - (t1 - t2 - t3) 192 | 193 | return gain <= (np.log(n - 1) + delta) / n or not splittable 194 | 195 | @property 196 | def splits(self): 197 | """List of split points 198 | 199 | Returns 200 | ------- 201 | splits : numpy.ndarray 202 | """ 203 | if not self._is_fitted: 204 | raise NotFittedError("This {} instance is not fitted yet. Call " 205 | "'fit' with appropriate arguments." 206 | .format(self.__class__.__name__)) 207 | 208 | return np.sort(self._splits) 209 | -------------------------------------------------------------------------------- /optbinning/binning/multiclass_cp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generalized assigment problem: solve constrained multiclass optimal binning 3 | problem. Constraint programming implementation. 
4 | """ 5 | 6 | # Guillermo Navas-Palencia 7 | # Copyright (C) 2019 8 | 9 | from ortools.sat.python import cp_model 10 | 11 | from .cp import BinningCP 12 | from .model_data import multiclass_model_data 13 | 14 | 15 | class MulticlassBinningCP(BinningCP): 16 | def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, 17 | max_bin_size, min_event_rate_diff, max_pvalue, 18 | max_pvalue_policy, user_splits_fixed, time_limit): 19 | 20 | self.monotonic_trend = monotonic_trend 21 | 22 | self.min_n_bins = min_n_bins 23 | self.max_n_bins = max_n_bins 24 | self.min_bin_size = min_bin_size 25 | self.max_bin_size = max_bin_size 26 | 27 | self.min_event_rate_diff = min_event_rate_diff 28 | self.max_pvalue = max_pvalue 29 | self.max_pvalue_policy = max_pvalue_policy 30 | self.user_splits_fixed = user_splits_fixed 31 | self.time_limit = time_limit 32 | 33 | self.solver_ = None 34 | 35 | # Auxiliary 36 | self._is_scenario_binning = False 37 | self._model = None 38 | self._n = None 39 | self._x = None 40 | 41 | def build_model(self, n_nonevent, n_event, trend_changes): 42 | # Parameters 43 | M = int(1e6) 44 | (D, V, pvalue_violation_indices, 45 | min_diff_violation_indices) = multiclass_model_data( 46 | n_nonevent, n_event, self.max_pvalue, self.max_pvalue_policy, 47 | self.min_event_rate_diff, M) 48 | 49 | n = len(n_nonevent) 50 | n_records = n_nonevent + n_event 51 | n_classes = len(self.monotonic_trend) 52 | 53 | # Initialize model 54 | model = cp_model.CpModel() 55 | 56 | # Decision variables 57 | x, y, t, d, u, bin_size_diff = self.decision_variables( 58 | model, n, n_classes) 59 | 60 | # Objective function 61 | model.Maximize(sum([sum([(V[c][i][i] * x[i, i]) + 62 | sum([(V[c][i][j] - V[c][i][j+1]) * x[i, j] 63 | for j in range(i)]) for i in range(n)]) 64 | for c in range(n_classes)])) 65 | 66 | # Constraint: unique assignment 67 | self.add_constraint_unique_assignment(model, n, x) 68 | 69 | # Constraint: continuity 70 | self.add_constraint_continuity(model, 
n, x) 71 | 72 | # Constraint: min / max bins 73 | self.add_constraint_min_max_bins(model, n, x, d) 74 | 75 | # Constraint: min / max bin size 76 | self.add_constraint_min_max_bin_size(model, n, x, u, n_records, 77 | bin_size_diff) 78 | 79 | # Constraints: monotonicity 80 | for c in range(n_classes): 81 | if self.monotonic_trend[c] == "ascending": 82 | self.add_constraint_monotonic_ascending(model, n, D[c], x, M) 83 | 84 | if self.monotonic_trend[c] == "descending": 85 | self.add_constraint_monotonic_descending(model, n, D[c], x, M) 86 | 87 | elif self.monotonic_trend[c] in ("peak", "valley"): 88 | for i in range(n): 89 | model.Add(t[c] >= i - n * (1 - y[c, i])) 90 | model.Add(t[c] <= i + n * y[c, i]) 91 | 92 | if self.monotonic_trend[c] == "peak": 93 | self.add_constraint_monotonic_peak( 94 | model, n, D[c], x, c, y, M) 95 | else: 96 | self.add_constraint_monotonic_valley( 97 | model, n, D[c], x, c, y, M) 98 | 99 | elif self.monotonic_trend == "peak_heuristic": 100 | self.add_constraint_monotonic_peak_heuristic( 101 | model, n, D[c], x, trend_changes[c], M) 102 | 103 | elif self.monotonic_trend == "valley_heuristic": 104 | self.add_constraint_monotonic_valley_heuristic( 105 | model, n, D[c], x, trend_changes[c], M) 106 | 107 | # Constraint: max-pvalue 108 | for c in range(n_classes): 109 | self.add_constraint_violation(model, x, 110 | pvalue_violation_indices[c]) 111 | 112 | # Constraint: min diff 113 | for c in range(n_classes): 114 | self.add_constraint_violation(model, x, 115 | min_diff_violation_indices[c]) 116 | 117 | # Constraint: fixed splits 118 | self.add_constraint_fixed_splits(model, n, x) 119 | 120 | self._model = model 121 | self._x = x 122 | self._n = n 123 | 124 | def decision_variables(self, model, n, n_classes): 125 | x = {} 126 | for i in range(n): 127 | for j in range(i + 1): 128 | x[i, j] = model.NewBoolVar("x[{}, {}]".format(i, j)) 129 | 130 | y = None 131 | t = None 132 | d = None 133 | u = None 134 | bin_size_diff = None 135 | 136 | if "peak" 
in self.monotonic_trend or "valley" in self.monotonic_trend: 137 | # Auxiliary binary variables 138 | y = {} 139 | t = {} 140 | for c in range(n_classes): 141 | if self.monotonic_trend[c] in ("peak", "valley"): 142 | for i in range(n): 143 | y[c, i] = model.NewBoolVar("y[{}]".format(i)) 144 | 145 | # Change points 146 | t[c] = model.NewIntVar(0, n, "t[{}]".format(c)) 147 | 148 | if self.min_n_bins is not None and self.max_n_bins is not None: 149 | n_bin_diff = self.max_n_bins - self.min_n_bins 150 | 151 | # Range constraints auxiliary variables 152 | d = model.NewIntVar(0, n_bin_diff, "n_bin_diff") 153 | 154 | if self.min_bin_size is not None and self.max_bin_size is not None: 155 | bin_size_diff = self.max_bin_size - self.min_bin_size 156 | 157 | # Range constraints auxiliary variables 158 | u = {} 159 | for i in range(n): 160 | u[i] = model.NewIntVar(0, bin_size_diff, "u[{}]".format(i)) 161 | 162 | return x, y, t, d, u, bin_size_diff 163 | 164 | def add_constraint_monotonic_peak(self, model, n, D, x, c, y, M): 165 | for i in range(1, n): 166 | for z in range(i): 167 | model.Add( 168 | M * (y[c, i] + y[c, z]) + M + (D[z][z] - M) * x[z, z] + 169 | sum([(D[z][j] - D[z][j+1]) * x[z, j] 170 | for j in range(z)]) - 171 | sum([(D[i][j] - D[i][j + 1]) * x[i, j] 172 | for j in range(i)]) - 173 | D[i][i] * x[i, i] >= 0) 174 | 175 | model.Add( 176 | M * (2 - y[c, i] - y[c, z]) + M + (D[i][i] - M) * x[i, i] + 177 | sum([(D[i][j] - D[i][j + 1]) * x[i, j] 178 | for j in range(i)]) - 179 | sum([(D[z][j] - D[z][j+1]) * x[z, j] 180 | for j in range(z)]) - 181 | D[z][z] * x[z, z] >= 0) 182 | 183 | def add_constraint_monotonic_valley(self, model, n, D, x, c, y, M): 184 | for i in range(1, n): 185 | for z in range(i): 186 | model.Add( 187 | M * (y[c, i] + y[c, z]) + M + (D[i][i] - M) * x[i, i] + 188 | sum([(D[i][j] - D[i][j + 1]) * x[i, j] 189 | for j in range(i)]) - 190 | sum([(D[z][j] - D[z][j+1]) * x[z, j] 191 | for j in range(z)]) - 192 | D[z][z] * x[z, z] >= 0) 193 | 194 | 
model.Add( 195 | M * (2 - y[c, i] - y[c, z]) + M + (D[z][z] - M) * x[z, z] + 196 | sum([(D[z][j] - D[z][j+1]) * x[z, j] 197 | for j in range(z)]) - 198 | sum([(D[i][j] - D[i][j + 1]) * x[i, j] 199 | for j in range(i)]) - 200 | D[i][i] * x[i, i] >= 0) 201 | -------------------------------------------------------------------------------- /optbinning/binning/multidimensional/__init__.py: -------------------------------------------------------------------------------- 1 | from .binning_2d import OptimalBinning2D 2 | from .continuous_binning_2d import ContinuousOptimalBinning2D 3 | 4 | __all__ = ['ContinuousOptimalBinning2D', 5 | 'OptimalBinning2D'] 6 | -------------------------------------------------------------------------------- /optbinning/binning/multidimensional/cp_2d.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generalized assigment problem: solve constrained optimal 2D binning problem. 3 | Constraint programming implementation. 4 | """ 5 | 6 | # Guillermo Navas-Palencia 7 | # Copyright (C) 2021 8 | 9 | import numpy as np 10 | 11 | from ortools.sat.python import cp_model 12 | 13 | 14 | class Binning2DCP: 15 | def __init__(self, monotonic_trend_x, monotonic_trend_y, min_n_bins, 16 | max_n_bins, min_diff_x, min_diff_y, gamma, n_jobs, 17 | time_limit): 18 | 19 | self.monotonic_trend_x = monotonic_trend_x 20 | self.monotonic_trend_y = monotonic_trend_y 21 | self.min_n_bins = min_n_bins 22 | self.max_n_bins = max_n_bins 23 | self.min_diff_x = min_diff_x 24 | self.min_diff_y = min_diff_y 25 | self.gamma = gamma 26 | 27 | self.n_jobs = n_jobs 28 | self.time_limit = time_limit 29 | 30 | self.solver_ = None 31 | self.event_rate_ = None 32 | self.iv_ = None 33 | 34 | self._model = None 35 | self._x = None 36 | self._n_rectangles = None 37 | 38 | def build_model(self, n_grid, n_rectangles, cols, c, d_connected_x, 39 | d_connected_y, er, n_records): 40 | # Parameters 41 | scale = int(1e6) 42 | 43 | # Initialize model 44 | 
model = cp_model.CpModel() 45 | 46 | # Decision variables 47 | x, d = self.decision_variables(model, n_rectangles) 48 | 49 | # Objective function 50 | if self.gamma: 51 | total_records = int(n_records.sum()) 52 | regularization = int(np.ceil(scale * self.gamma / total_records)) 53 | pmax = model.NewIntVar(0, total_records, "pmax") 54 | pmin = model.NewIntVar(0, total_records, "pmin") 55 | 56 | model.Maximize(sum([c[i] * x[i] for i in range(n_rectangles)]) - 57 | regularization * (pmax - pmin)) 58 | else: 59 | model.Maximize(sum([c[i] * x[i] for i in range(n_rectangles)])) 60 | 61 | # Constraint: unique assignment 62 | self.add_constraint_unique_assignment(model, x, n_grid, cols) 63 | 64 | # Constraint: min / max bins 65 | self.add_constraint_min_max_bins(model, n_rectangles, x, d) 66 | 67 | # Constraint: monotonicity 68 | self.add_constraint_monotonic( 69 | model, n_rectangles, x, er, d_connected_x, d_connected_y, 70 | self.min_diff_x, self.min_diff_y) 71 | 72 | # Constraint: reduction of dominating bins 73 | if self.gamma: 74 | for i in range(n_rectangles): 75 | bin_size = n_records[i] * x[i] 76 | 77 | model.Add(pmin <= total_records * (1 - x[i]) + bin_size) 78 | model.Add(pmax >= bin_size) 79 | model.Add(pmin <= pmax) 80 | 81 | # Save data for post-processing 82 | self._model = model 83 | self._x = x 84 | self._n_rectangles = n_rectangles 85 | 86 | def solve(self): 87 | # Solve 88 | self.solver_ = cp_model.CpSolver() 89 | if self.n_jobs > 1: 90 | self.solver_.parameters.num_search_workers = self.n_jobs 91 | else: 92 | self.solver_.parameters.linearization_level = 2 93 | 94 | self.solver_.parameters.max_time_in_seconds = self.time_limit 95 | 96 | status = self.solver_.Solve(self._model) 97 | status_name = self.solver_.StatusName(status) 98 | 99 | if status in (cp_model.OPTIMAL, cp_model.FEASIBLE): 100 | solution = np.array([self.solver_.BooleanValue(self._x[i]) 101 | for i in range(self._n_rectangles)]) 102 | else: 103 | solution = 
np.zeros(self._n_rectangles).astype(np.bool) 104 | 105 | return status_name, solution 106 | 107 | def decision_variables(self, model, n_rectangles): 108 | x = {} 109 | for i in range(n_rectangles): 110 | x[i] = model.NewBoolVar("x[{}]".format(i)) 111 | 112 | d = None 113 | 114 | if self.min_n_bins is not None and self.max_n_bins is not None: 115 | n_bin_diff = self.max_n_bins - self.min_n_bins 116 | 117 | # Range constraints auxiliary variables 118 | d = model.NewIntVar(0, n_bin_diff, "n_bin_diff") 119 | 120 | return x, d 121 | 122 | def add_constraint_unique_assignment(self, model, x, n_grid, cols): 123 | for j in range(n_grid): 124 | model.Add(sum([x[i] for i in cols[j]]) == 1) 125 | 126 | def add_constraint_min_max_bins(self, model, n_rectangles, x, d): 127 | if self.min_n_bins is not None or self.max_n_bins is not None: 128 | n_bins = sum([x[i] for i in range(n_rectangles)]) 129 | 130 | if self.min_n_bins is not None and self.max_n_bins is not None: 131 | model.Add(d + n_bins - self.max_n_bins == 0) 132 | elif self.min_n_bins is not None: 133 | model.Add(n_bins >= self.min_n_bins) 134 | elif self.max_n_bins is not None: 135 | model.Add(n_bins <= self.max_n_bins) 136 | 137 | def add_constraint_monotonic(self, model, n_rectangles, x, 138 | er, d_connected_x, d_connected_y, min_diff_x, 139 | min_diff_y): 140 | 141 | if (self.monotonic_trend_x is not None and 142 | self.monotonic_trend_y is not None): 143 | for i in range(n_rectangles): 144 | ind_x = [] 145 | ind_y = [] 146 | for j in d_connected_x[i]: 147 | if self.monotonic_trend_x == "ascending": 148 | if er[i] + min_diff_x >= er[j]: 149 | ind_x.append(j) 150 | elif self.monotonic_trend_x == "descending": 151 | if er[i] <= er[j] + min_diff_x: 152 | ind_x.append(j) 153 | 154 | if ind_x: 155 | model.Add(sum([x[j] for j in ind_x]) <= 156 | len(ind_x) * (1 - x[i])) 157 | 158 | for j in d_connected_y[i]: 159 | if self.monotonic_trend_y == "ascending": 160 | if er[i] + min_diff_y >= er[j]: 161 | ind_y.append(j) 162 | 
elif self.monotonic_trend_y == "descending": 163 | if er[i] <= er[j] + min_diff_y: 164 | ind_y.append(j) 165 | 166 | if ind_y: 167 | model.Add(sum([x[j] for j in ind_y]) <= 168 | len(ind_y) * (1 - x[i])) 169 | 170 | elif self.monotonic_trend_x is not None: 171 | for i in range(n_rectangles): 172 | ind_x = [] 173 | for j in d_connected_x[i]: 174 | if self.monotonic_trend_x == "ascending": 175 | if er[i] + min_diff_x >= er[j]: 176 | ind_x.append(j) 177 | elif self.monotonic_trend_x == "descending": 178 | if er[i] <= er[j] + min_diff_x: 179 | ind_x.append(j) 180 | 181 | if ind_x: 182 | model.Add(sum([x[j] for j in ind_x]) <= 183 | len(ind_x) * (1 - x[i])) 184 | 185 | elif self.monotonic_trend_y is not None: 186 | for i in range(n_rectangles): 187 | ind_y = [] 188 | for j in d_connected_y[i]: 189 | if self.monotonic_trend_y == "ascending": 190 | if er[i] + min_diff_y >= er[j]: 191 | ind_y.append(j) 192 | elif self.monotonic_trend_y == "descending": 193 | if er[i] <= er[j] + min_diff_y: 194 | ind_y.append(j) 195 | 196 | if ind_y: 197 | model.Add(sum([x[j] for j in ind_y]) <= 198 | len(ind_y) * (1 - x[i])) 199 | -------------------------------------------------------------------------------- /optbinning/binning/multidimensional/mip_2d.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generalized assigment problem: solve constrained optimal 2D binning problem. 3 | Mixed-Integer programming implementation. 
4 | """ 5 | 6 | # Guillermo Navas-Palencia 7 | # Copyright (C) 2021 8 | 9 | import numpy as np 10 | 11 | from ortools.linear_solver import pywraplp 12 | 13 | 14 | class Binning2DMIP: 15 | def __init__(self, monotonic_trend_x, monotonic_trend_y, min_n_bins, 16 | max_n_bins, min_diff_x, min_diff_y, gamma, n_jobs, 17 | time_limit): 18 | 19 | self.monotonic_trend_x = monotonic_trend_x 20 | self.monotonic_trend_y = monotonic_trend_y 21 | self.min_n_bins = min_n_bins 22 | self.max_n_bins = max_n_bins 23 | self.min_diff_x = min_diff_x 24 | self.min_diff_y = min_diff_y 25 | self.gamma = gamma 26 | 27 | self.n_jobs = n_jobs 28 | self.time_limit = time_limit 29 | 30 | self.solver_ = None 31 | self.event_rate_ = None 32 | self.iv_ = None 33 | 34 | self._model = None 35 | self._x = None 36 | self._n_rectangles = None 37 | 38 | def build_model(self, n_grid, n_rectangles, cols, c, d_connected_x, 39 | d_connected_y, er, n_records): 40 | # Initialize solver 41 | solver = pywraplp.Solver( 42 | 'BinningMIP', pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING) 43 | 44 | # Decision variables 45 | x, d = self.decision_variables(solver, n_rectangles) 46 | 47 | # Objective function 48 | if self.gamma: 49 | total_records = int(n_records.sum()) 50 | regularization = self.gamma / total_records 51 | pmax = solver.NumVar(0, total_records, "pmax") 52 | pmin = solver.NumVar(0, total_records, "pmin") 53 | 54 | solver.Maximize( 55 | solver.Sum([c[i] * x[i] for i in range(n_rectangles)]) - 56 | regularization * (pmax - pmin)) 57 | else: 58 | solver.Maximize( 59 | solver.Sum([c[i] * x[i] for i in range(n_rectangles)])) 60 | 61 | # Constraint: unique assignment 62 | self.add_constraint_unique_assignment(solver, x, n_grid, cols) 63 | 64 | # Constraint: min / max bins 65 | self.add_constraint_min_max_bins(solver, n_rectangles, x, d) 66 | 67 | # Constraint: monotonicity 68 | self.add_constraint_monotonic( 69 | solver, n_rectangles, x, er, d_connected_x, d_connected_y, 70 | self.min_diff_x, self.min_diff_y) 
71 | 72 | # Constraint: reduction of dominating bins 73 | if self.gamma: 74 | for i in range(n_rectangles): 75 | bin_size = n_records[i] * x[i] 76 | 77 | solver.Add(pmin <= total_records * (1 - x[i]) + bin_size) 78 | solver.Add(pmax >= bin_size) 79 | solver.Add(pmin <= pmax) 80 | 81 | # Save data for post-processing 82 | self.solver_ = solver 83 | self._x = x 84 | self._n_rectangles = n_rectangles 85 | 86 | def solve(self): 87 | # Solve 88 | self.solver_.SetTimeLimit(self.time_limit * 1000) 89 | self.solver_.SetNumThreads(self.n_jobs) 90 | status = self.solver_.Solve() 91 | 92 | if status in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE): 93 | if status == pywraplp.Solver.OPTIMAL: 94 | status_name = "OPTIMAL" 95 | else: 96 | status_name = "FEASIBLE" 97 | 98 | solution = np.array([self._x[i].solution_value() 99 | for i in range(self._n_rectangles)]) 100 | 101 | solution = solution.astype(bool) 102 | else: 103 | if status == pywraplp.Solver.ABNORMAL: 104 | status_name = "ABNORMAL" 105 | elif status == pywraplp.Solver.INFEASIBLE: 106 | status_name = "INFEASIBLE" 107 | elif status == pywraplp.Solver.UNBOUNDED: 108 | status_name = "UNBOUNDED" 109 | else: 110 | status_name = "UNKNOWN" 111 | 112 | solution = np.zeros(self._n_rectangles, dtype=bool) 113 | 114 | return status_name, solution 115 | 116 | def decision_variables(self, solver, n_rectangles): 117 | x = {} 118 | 119 | for i in range(n_rectangles): 120 | x[i] = solver.BoolVar("x[{}]".format(i)) 121 | 122 | d = None 123 | 124 | if self.min_n_bins is not None and self.max_n_bins is not None: 125 | n_bin_diff = self.max_n_bins - self.min_n_bins 126 | 127 | # Range constraints auxiliary variables 128 | d = solver.NumVar(0, n_bin_diff, "n_bin_diff") 129 | 130 | return x, d 131 | 132 | def add_constraint_unique_assignment(self, solver, x, n_grid, cols): 133 | for j in range(n_grid): 134 | solver.Add(solver.Sum([x[i] for i in cols[j]]) == 1) 135 | 136 | def add_constraint_min_max_bins(self, solver, n_rectangles, x, 
d): 137 | if self.min_n_bins is not None or self.max_n_bins is not None: 138 | n_bins = solver.Sum([x[i] for i in range(n_rectangles)]) 139 | 140 | if self.min_n_bins is not None and self.max_n_bins is not None: 141 | solver.Add(d + n_bins - self.max_n_bins == 0) 142 | elif self.min_n_bins is not None: 143 | solver.Add(n_bins >= self.min_n_bins) 144 | elif self.max_n_bins is not None: 145 | solver.Add(n_bins <= self.max_n_bins) 146 | 147 | def add_constraint_monotonic(self, solver, n_rectangles, x, er, 148 | d_connected_x, d_connected_y, min_diff_x, 149 | min_diff_y): 150 | 151 | if (self.monotonic_trend_x is not None and 152 | self.monotonic_trend_y is not None): 153 | for i in range(n_rectangles): 154 | ind_x = [] 155 | ind_y = [] 156 | for j in d_connected_x[i]: 157 | if self.monotonic_trend_x == "ascending": 158 | if er[i] + min_diff_x >= er[j]: 159 | ind_x.append(j) 160 | elif self.monotonic_trend_x == "descending": 161 | if er[i] <= er[j] + min_diff_x: 162 | ind_x.append(j) 163 | 164 | if ind_x: 165 | solver.Add(solver.Sum([x[j] for j in ind_x]) <= 166 | len(ind_x) * (1 - x[i])) 167 | 168 | for j in d_connected_y[i]: 169 | if self.monotonic_trend_y == "ascending": 170 | if er[i] + min_diff_y >= er[j]: 171 | ind_y.append(j) 172 | elif self.monotonic_trend_y == "descending": 173 | if er[i] <= er[j] + min_diff_y: 174 | ind_y.append(j) 175 | 176 | if ind_y: 177 | solver.Add(solver.Sum([x[j] for j in ind_y]) <= 178 | len(ind_y) * (1 - x[i])) 179 | 180 | elif self.monotonic_trend_x is not None: 181 | for i in range(n_rectangles): 182 | ind_x = [] 183 | for j in d_connected_x[i]: 184 | if self.monotonic_trend_x == "ascending": 185 | if er[i] + min_diff_x >= er[j]: 186 | ind_x.append(j) 187 | elif self.monotonic_trend_x == "descending": 188 | if er[i] <= er[j] + min_diff_x: 189 | ind_x.append(j) 190 | 191 | if ind_x: 192 | solver.Add(solver.Sum([x[j] for j in ind_x]) <= 193 | len(ind_x) * (1 - x[i])) 194 | 195 | elif self.monotonic_trend_y is not None: 196 | for i in 
range(n_rectangles): 197 | ind_y = [] 198 | for j in d_connected_y[i]: 199 | if self.monotonic_trend_y == "ascending": 200 | if er[i] + min_diff_y >= er[j]: 201 | ind_y.append(j) 202 | elif self.monotonic_trend_y == "descending": 203 | if er[i] <= er[j] + min_diff_y: 204 | ind_y.append(j) 205 | 206 | if ind_y: 207 | solver.Add(solver.Sum([x[j] for j in ind_y]) <= 208 | len(ind_y) * (1 - x[i])) 209 | -------------------------------------------------------------------------------- /optbinning/binning/multidimensional/preprocessing_2d.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocessing 2D functions. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from sklearn.utils import check_array 12 | from sklearn.utils import check_consistent_length 13 | 14 | from ..preprocessing import categorical_transform 15 | 16 | 17 | def split_data_2d(dtype_x, dtype_y, x, y, z, special_codes_x=None, 18 | special_codes_y=None, check_input=True): 19 | """Split 2d data into clean, missing and special values data. 20 | 21 | Parameters 22 | ---------- 23 | dtype_x : str, optional (default="numerical") 24 | The data type of variable x. Supported data type is "numerical" for 25 | continuous and ordinal variables. 26 | 27 | dtype_y : str, optional (default="numerical") 28 | The data type of variable y. Supported data type is "numerical" for 29 | continuous and ordinal variables. 30 | 31 | x : array-like, shape = (n_samples,) 32 | Training vector x, where n_samples is the number of samples. 33 | 34 | y : array-like, shape = (n_samples,) 35 | Training vector y, where n_samples is the number of samples. 36 | 37 | z : array-like, shape = (n_samples,) 38 | Target vector relative to x and y. 39 | 40 | special_codes_x : array-like or None, optional (default=None) 41 | List of special codes for the variable x. 
Use special codes to specify 42 | the data values that must be treated separately. 43 | 44 | special_codes_y : array-like or None, optional (default=None) 45 | List of special codes for the variable y. Use special codes to specify 46 | the data values that must be treated separately. 47 | 48 | check_input : bool, (default=True) 49 | If False, the input arrays x and y will not be checked. 50 | 51 | Returns 52 | ------- 53 | """ 54 | if check_input: 55 | x = check_array(x, ensure_2d=False, dtype=None, 56 | force_all_finite='allow-nan') 57 | 58 | y = check_array(y, ensure_2d=False, dtype=None, 59 | force_all_finite='allow-nan') 60 | 61 | z = check_array(z, ensure_2d=False, dtype=None, 62 | force_all_finite=True) 63 | 64 | check_consistent_length(x, y, z) 65 | 66 | x = np.asarray(x) 67 | y = np.asarray(y) 68 | z = np.asarray(z) 69 | 70 | if np.issubdtype(x.dtype, np.number) and np.issubdtype(z.dtype, np.number): 71 | missing_mask_x = np.isnan(x) | np.isnan(z) 72 | else: 73 | missing_mask_x = pd.isnull(x) | pd.isnull(z) 74 | 75 | if np.issubdtype(y.dtype, np.number) and np.issubdtype(z.dtype, np.number): 76 | missing_mask_y = np.isnan(y) | np.isnan(z) 77 | else: 78 | missing_mask_y = pd.isnull(y) | pd.isnull(z) 79 | 80 | if special_codes_x is not None: 81 | special_mask_x = pd.Series(x).isin(special_codes_x).values 82 | else: 83 | special_mask_x = np.zeros(len(x), dtype=bool) 84 | 85 | if special_codes_y is not None: 86 | special_mask_y = pd.Series(y).isin(special_codes_y).values 87 | else: 88 | special_mask_y = np.zeros(len(y), dtype=bool) 89 | 90 | missing_mask = missing_mask_x | missing_mask_y 91 | special_mask = special_mask_x | special_mask_y 92 | 93 | clean_mask = ~missing_mask & ~special_mask 94 | 95 | x_clean = x[clean_mask] 96 | y_clean = y[clean_mask] 97 | z_clean = z[clean_mask] 98 | 99 | x_missing = x[missing_mask] 100 | y_missing = y[missing_mask] 101 | z_missing = z[missing_mask] 102 | 103 | x_special = x[special_mask] 104 | y_special = y[special_mask] 105 
"""
Univariate outlier detection methods.
"""

# Guillermo Navas-Palencia
# Copyright (C) 2020

import numbers

import numpy as np

from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError


class OutlierDetector:
    """Base class for all outlier detectors."""
    def __init__(self):
        # Boolean mask over the fitted samples; True marks an outlier.
        self._support = None

        # flag
        self._is_fitted = False

    def fit(self, x, y=None):
        """Fit outlier detector.

        Parameters
        ----------
        x : array-like, shape = (n_samples)

        y : array-like, shape = (n_samples) or None (default=None)

        Returns
        -------
        self : OutlierDetector
        """
        self._fit(x, y)

        return self

    def get_support(self, indices=False):
        """Get a mask, or integer index, of the samples excluded, i.e,
        samples detected as outliers.

        Parameters
        ----------
        indices : boolean (default False)
            If True, the return value will be an array of integers, rather
            than a boolean mask.

        Returns
        -------
        support : array, shape = (n_samples)
            An index that selects the excluded samples from a vector.
            If `indices` is False, this is a boolean array, in which an
            element is True iff its corresponding sample is excluded. If
            `indices` is True, this is an integer array whose values are
            indices into the input vector.
        """
        if not self._is_fitted:
            raise NotFittedError("This {} instance is not fitted yet. Call "
                                 "'fit' with appropriate arguments."
                                 .format(self.__class__.__name__))

        mask = self._support
        return mask if not indices else np.where(mask)[0]


class RangeDetector(BaseEstimator, OutlierDetector):
    r"""Interquartile range or interval based outlier detection method.

    The default settings compute the usual interquartile range method.

    Parameters
    ----------
    interval_length : float (default=0.5)
        Compute ``interval_length``\% credible interval. This is a value in
        [0, 1].

    k : float (default=1.5)
        Tukey's factor.

    method : str (default="ETI")
        Method to compute credible intervals. Supported methods are Highest
        Density interval (``method="HDI"``) and Equal-tailed interval
        (``method="ETI"``).
    """
    def __init__(self, interval_length=0.5, k=1.5, method="ETI"):
        self.interval_length = interval_length
        self.k = k
        self.method = method

    def _fit(self, x, y=None):
        if self.method not in ("ETI", "HDI"):
            raise ValueError('Invalid value for method. Allowed string '
                             'values are "ETI" and "HDI".')

        if (not isinstance(self.interval_length, numbers.Number) or
                not 0 <= self.interval_length <= 1):
            # Fixed grammar of the original message ("must a value").
            raise ValueError("Interval length must be a value in [0, 1]; "
                             "got {}.".format(self.interval_length))

        if self.method == "ETI":
            # Equal-tailed interval: cut (1 - interval_length) mass
            # symmetrically from both tails.
            lower = 100 * (1 - self.interval_length) / 2
            upper = 100 * (1 + self.interval_length) / 2

            lb, ub = np.percentile(x, [lower, upper])
        else:
            # Highest density interval: narrowest window that contains
            # ceil(interval_length * n) of the sorted samples.
            # NOTE(review): assumes interval_length < 1 so the candidate
            # array `ci` is non-empty -- confirm for edge values.
            n = len(x)
            xsorted = np.sort(x)
            n_included = int(np.ceil(self.interval_length * n))
            n_ci = n - n_included
            ci = xsorted[n_included:] - xsorted[:n_ci]
            j = np.argmin(ci)
            hdi_min = xsorted[j]
            hdi_max = xsorted[j + n_included]

            lb = hdi_min
            ub = hdi_max

        # Tukey's fences around the credible interval.
        iqr = ub - lb
        lower_bound = lb - self.k * iqr
        upper_bound = ub + self.k * iqr

        self._support = (x > upper_bound) | (x < lower_bound)

        self._is_fitted = True


class ModifiedZScoreDetector(BaseEstimator, OutlierDetector):
    """Modified Z-score method.

    Parameters
    ----------
    threshold : float (default=3.5)
        Modified Z-scores with an absolute value of greater than the
        threshold are labeled as outliers.

    References
    ----------

    .. [IH93] B. Iglewicz and D. Hoaglin. "Volume 16: How to Detect and
              Handle Outliers", The ASQC Basic References in Quality
              Control: Statistical Techniques, Edward F. Mykytka, Ph.D.,
              Editor, 1993.
    """
    def __init__(self, threshold=3.5):
        self.threshold = threshold

    def _fit(self, x, y=None):
        if (not isinstance(self.threshold, numbers.Number) or
                self.threshold < 0):
            raise ValueError("threshold must be a value >= 0; got {}".
                             format(self.threshold))

        x = np.asarray(x)
        median = np.median(x)
        # Median absolute deviation; 0.6745 rescales it to be consistent
        # with the standard deviation under normality (Iglewicz-Hoaglin).
        # NOTE(review): mad == 0 (constant data) yields a zero division and
        # NaN/inf scores -- confirm upstream data is non-degenerate.
        mad = np.median(np.abs(x - median))
        m_z_score = 0.6745 * (x - median) / mad

        self._support = np.abs(m_z_score) > self.threshold

        self._is_fitted = True


class YQuantileDetector(BaseEstimator, OutlierDetector):
    """Outlier detector on the y-axis over quantiles.

    Parameters
    ----------
    outlier_detector : str, optional (default="zscore")
        The outlier detection method. Supported methods are "range" to use
        the interquartile range based method or "zscore" to use the
        modified Z-score method.

    outlier_params : dict or None, optional (default=None)
        Dictionary of parameters to pass to the outlier detection method.

    n_bins : int (default=5)
        The maximum number of bins to consider.
    """
    def __init__(self, outlier_detector="zscore", outlier_params=None,
                 n_bins=5):
        self.outlier_detector = outlier_detector
        self.outlier_params = outlier_params
        self.n_bins = n_bins

    def _fit(self, x, y):
        if self.outlier_detector not in ("range", "zscore"):
            raise ValueError('Invalid value for outlier_detector. Allowed '
                             'string values are "range" and "zscore".')

        if self.outlier_params is not None:
            if not isinstance(self.outlier_params, dict):
                raise TypeError("outlier_params must be a dict or None; "
                                "got {}.".format(self.outlier_params))

        if not isinstance(self.n_bins, numbers.Integral) or self.n_bins <= 0:
            # Message now names the actual parameter (was "bins").
            raise ValueError("n_bins must be a positive integer; got {}."
                             .format(self.n_bins))

        x = np.asarray(x)
        y = np.asarray(y)

        # Split x into at most n_bins quantile bins (duplicate edges are
        # removed, so fewer bins may result).
        q = np.linspace(0, 1, self.n_bins + 1)
        splits = np.unique(np.quantile(x, q))[1:-1]
        n_bins = len(splits) + 1
        indices = np.digitize(x, splits, right=False)

        self._support = np.zeros(x.size, dtype=bool)
        idx_support = np.arange(x.size)

        if self.outlier_detector == "zscore":
            detector = ModifiedZScoreDetector()
        elif self.outlier_detector == "range":
            detector = RangeDetector()

        if self.outlier_params is not None:
            detector.set_params(**self.outlier_params)

        # Run the univariate detector on y within each x-bin and flag the
        # corresponding global positions as outliers.
        for i in range(n_bins):
            mask_x = indices == i
            detector.fit(y[mask_x])
            mask_out = detector.get_support()
            idx_out = idx_support[mask_x][mask_out]
            self._support[idx_out] = True

        self._is_fitted = True
def print_prebinning_statistics(n_prebins):
    """Print the number of bins produced by the pre-binning stage."""
    prebinning_stats = (
        "  Pre-binning statistics\n"
        "    Number of bins            {:>10}\n"
        ).format(n_prebins)

    print(prebinning_stats)


def print_solver_statistics(solver_type, solver):
    """Print aggregated solver statistics.

    ``solver.stats`` is a dict for a single problem, or a list of dicts
    when several subproblems were solved; counts are summed in that case.
    """
    if isinstance(solver.stats, list):
        n_constraints = sum(info["n_constraints"] for info in solver.stats)
        n_variables = sum(info["n_variables"] for info in solver.stats)
    else:
        n_constraints = solver.stats["n_constraints"]
        n_variables = solver.stats["n_variables"]

    solver_stats = (
        "  Solver statistics\n"
        "    Type                        {:>10}\n"
        "    Number of variables         {:>10}\n"
        "    Number of constraints       {:>10}\n"
        ).format(solver_type, n_variables, n_constraints)

    print(solver_stats)


def print_timing(solver_type, solver, time_total, time_preprocessing,
                 time_estimator, time_prebinning, time_solver,
                 time_postprocessing):
    """Print the wall-time breakdown of the piecewise binning pipeline."""
    p_preprocessing = time_preprocessing / time_total
    p_estimator = time_estimator / time_total
    p_prebinning = time_prebinning / time_total
    p_solver = time_solver / time_total
    p_postprocessing = time_postprocessing / time_total

    time_stats = (
        "  Timing\n"
        "    Total time          {:>18.2f} sec\n"
        "    Pre-processing      {:>18.2f} sec   ({:>7.2%})\n"
        "    Estimator           {:>18.2f} sec   ({:>7.2%})\n"
        "    Pre-binning         {:>18.2f} sec   ({:>7.2%})\n"
        "    Solver              {:>18.2f} sec   ({:>7.2%})\n"
        "    Post-processing     {:>18.2f} sec   ({:>7.2%})\n"
        ).format(time_total, time_preprocessing, p_preprocessing,
                 time_estimator, p_estimator, time_prebinning, p_prebinning,
                 time_solver, p_solver, time_postprocessing,
                 p_postprocessing)

    print(time_stats)


def retrieve_status(status):
    """Aggregate one or several solver status strings into a single label.

    Parameters
    ----------
    status : str or list of str
        Raw solver status(es), expected to contain one of the substrings
        "optimal", "feasible" or "unbounded".

    Returns
    -------
    status : str or None
        "OPTIMAL", "FEASIBLE" or "UNBOUNDED" when unanimous (or a single
        string); otherwise each label with its frequency. None if no known
        substring matches a single string status.
    """
    if isinstance(status, list):
        n_status = len(status)
        counts = {"OPTIMAL": 0, "FEASIBLE": 0, "UNBOUNDED": 0}
        for s in status:
            if "optimal" in s:
                counts["OPTIMAL"] += 1
            elif "feasible" in s:
                counts["FEASIBLE"] += 1
            elif "unbounded" in s:
                counts["UNBOUNDED"] += 1

        for label, count in counts.items():
            if count == n_status:
                return label

        # Mixed statuses: report each with its frequency. The original code
        # concatenated the parts with no separator; join with a space.
        parts = ["{} ({}/{})".format(label, count, n_status)
                 for label, count in counts.items() if count > 0]
        return " ".join(parts)
    else:
        if "optimal" in status:
            return "OPTIMAL"
        elif "feasible" in status:
            return "FEASIBLE"
        elif "unbounded" in status:
            return "UNBOUNDED"


def print_binning_information(print_level, name, status, solver_type, solver,
                              time_total, time_preprocessing, time_estimator,
                              time_prebinning, time_solver,
                              time_postprocessing, n_prebins,
                              dict_user_options):
    """Print the full report for an optimal piecewise binning run.

    ``print_level`` controls verbosity: 0 prints a one-line summary, 1 adds
    pre-binning/solver/timing sections, 2 also dumps all options.
    """
    print_header()

    if print_level == 2:
        dict_default_options = optimal_pw_binning_options

        print_optional_parameters(dict_default_options, dict_user_options)

    if print_level == 0:
        print_main_info(name, status, time_total)
    elif print_level >= 1:
        print_name_status(name, status)

        print_prebinning_statistics(n_prebins)

        # Solver/timing details are only meaningful for solved problems.
        if status in ("OPTIMAL", "FEASIBLE"):
            if solver is not None:
                print_solver_statistics(solver_type, solver)

            print_timing(solver_type, solver, time_total, time_preprocessing,
                         time_estimator, time_prebinning, time_solver,
                         time_postprocessing)
time_preprocessing, 129 | time_estimator, time_prebinning, time_solver, 130 | time_postprocessing) 131 | -------------------------------------------------------------------------------- /optbinning/binning/piecewise/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Optimal piecewise binning metrics. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | import numpy as np 9 | 10 | from sklearn.metrics import average_precision_score 11 | from sklearn.metrics import brier_score_loss 12 | 13 | from ...binning.metrics import jeffrey 14 | from ...binning.metrics import jensen_shannon 15 | from ...binning.metrics import hellinger 16 | from ...binning.metrics import triangular 17 | from ...metrics.classification import gini 18 | from ...metrics.classification import ks 19 | from ...metrics.regression import regression_metrics 20 | from .transformations import transform_binary_target 21 | from .transformations import transform_continuous_target 22 | 23 | 24 | def _fun_divergence(fun, n, pi, qi, pi_special, qi_special, pi_missing, 25 | qi_missing, flag_special, flag_missing, n_special): 26 | 27 | div_value = fun(pi, qi, return_sum=True) / n 28 | 29 | if flag_special: 30 | div_value += fun(pi_special, qi_special, return_sum=True) / n_special 31 | 32 | if flag_missing: 33 | div_value += fun([pi_missing], [qi_missing]) 34 | 35 | return float(div_value) 36 | 37 | 38 | def divergences_asymptotic(event_rate, n_nonevent_special, n_event_special, 39 | n_nonevent_missing, n_event_missing, t_n_nonevent, 40 | t_n_event): 41 | 42 | n = t_n_nonevent + t_n_event 43 | p = t_n_event / n 44 | 45 | pi = (1.0 - event_rate) / (1.0 - p) 46 | qi = event_rate / p 47 | 48 | if isinstance(n_event_special, (np.ndarray, list)): 49 | n_special = n_event_special.size 50 | mask = (n_event_special > 0) & (n_nonevent_special > 0) 51 | flag_special = np.any(mask) 52 | 53 | pi_special = n_nonevent_special[mask] / t_n_nonevent 54 | 
qi_special = n_event_special[mask] / t_n_event 55 | else: 56 | n_special = 1 57 | flag_special = (n_event_special > 0 and n_nonevent_special > 0) 58 | pi_special = n_nonevent_special / t_n_nonevent 59 | qi_special = n_event_special / t_n_event 60 | 61 | flag_missing = (n_event_missing > 0 and n_nonevent_missing > 0) 62 | pi_missing = n_nonevent_missing / t_n_nonevent 63 | qi_missing = n_event_missing / t_n_event 64 | 65 | d_divergences = {} 66 | 67 | d_divergences["IV (Jeffrey)"] = _fun_divergence( 68 | jeffrey, n, pi, qi, pi_special, qi_special, pi_missing, qi_missing, 69 | flag_special, flag_missing, n_special) 70 | 71 | d_divergences["JS (Jensen-Shannon)"] = _fun_divergence( 72 | jensen_shannon, n, pi, qi, pi_special, qi_special, pi_missing, 73 | qi_missing, flag_special, flag_missing, n_special) 74 | 75 | d_divergences["Hellinger"] = _fun_divergence( 76 | hellinger, n, pi, qi, pi_special, qi_special, pi_missing, qi_missing, 77 | flag_special, flag_missing, n_special) 78 | 79 | d_divergences["Triangular"] = _fun_divergence( 80 | triangular, n, pi, qi, pi_special, qi_special, pi_missing, qi_missing, 81 | flag_special, flag_missing, n_special) 82 | 83 | return d_divergences 84 | 85 | 86 | def binary_metrics(x, y, splits, c, t_n_nonevent, t_n_event, 87 | n_nonevent_special, n_event_special, n_nonevent_missing, 88 | n_event_missing, special_codes): 89 | 90 | d_metrics = {} 91 | 92 | n_nonevent_special = np.asarray(n_nonevent_special) 93 | n_event_special = np.asarray(n_event_special) 94 | 95 | # Metrics using predicted probability of Y=1. 
96 | min_pred = 1e-8 97 | max_pred = 1 - min_pred 98 | 99 | event_rate = transform_binary_target( 100 | splits, x, c, min_pred, max_pred, t_n_nonevent, t_n_event, 101 | n_nonevent_special, n_event_special, n_nonevent_missing, 102 | n_event_missing, special_codes, "event_rate", "empirical", "empirical") 103 | 104 | d_metrics["Gini index"] = gini(y, event_rate) 105 | 106 | # Divergence metrics 107 | d_divergences = divergences_asymptotic( 108 | event_rate, n_nonevent_special, n_event_special, n_nonevent_missing, 109 | n_event_missing, t_n_nonevent, t_n_event) 110 | 111 | for dk, dv in d_divergences.items(): 112 | d_metrics[dk] = dv 113 | 114 | d_metrics["KS"] = ks(y, event_rate)[0] 115 | d_metrics["Avg precision"] = average_precision_score(y, event_rate) 116 | d_metrics["Brier score"] = brier_score_loss(y, event_rate) 117 | 118 | return d_metrics 119 | 120 | 121 | def continuous_metrics(x, y, splits, c, lb, ub, n_records_special, sum_special, 122 | n_records_missing, sum_missing, special_codes): 123 | 124 | y_pred = transform_continuous_target( 125 | splits, x, c, lb, ub, n_records_special, sum_special, 126 | n_records_missing, sum_missing, special_codes, "empirical", 127 | "empirical") 128 | 129 | d_metrics = regression_metrics(y, y_pred) 130 | 131 | return d_metrics 132 | -------------------------------------------------------------------------------- /optbinning/binning/piecewise/transformations.py: -------------------------------------------------------------------------------- 1 | """ 2 | Piecewise binning transformations. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from sklearn.utils import check_array 12 | 13 | from ...binning.transformations import transform_event_rate_to_woe 14 | from ...binning.transformations import _check_metric_special_missing 15 | from ...binning.transformations import _mask_special_missing 16 | 17 | 18 | def _apply_transform(x, c, lb, ub, special_codes, metric_special, 19 | metric_missing, clean_mask, special_mask, missing_mask, 20 | indices, x_clean, n_bins, n_special, event_rate_special, 21 | event_rate_missing): 22 | 23 | x_transform = np.zeros(x.shape) 24 | x_clean_transform = np.zeros(x_clean.shape) 25 | 26 | for i in range(n_bins): 27 | mask = (indices == i) 28 | x_clean_transform[mask] = np.polyval(c[i, :][::-1], x_clean[mask]) 29 | 30 | # Clip values using LB/UB 31 | bounded = (lb is not None or ub is not None) 32 | if bounded: 33 | x_clean_transform = np.clip(x_clean_transform, lb, ub) 34 | 35 | x_transform[clean_mask] = x_clean_transform 36 | 37 | if special_codes: 38 | if isinstance(special_codes, dict): 39 | xt = pd.Series(x) 40 | for i, (k, s) in enumerate(special_codes.items()): 41 | sl = s if isinstance(s, (list, np.ndarray)) else [s] 42 | mask = xt.isin(sl).values 43 | if metric_special == "empirical": 44 | x_transform[mask] = event_rate_special[i] 45 | else: 46 | x_transform[mask] = metric_special 47 | else: 48 | if metric_special == "empirical": 49 | x_transform[special_mask] = event_rate_special 50 | else: 51 | x_transform[special_mask] = metric_special 52 | 53 | if metric_missing == "empirical": 54 | x_transform[missing_mask] = event_rate_missing 55 | else: 56 | x_transform[missing_mask] = metric_missing 57 | 58 | return x_transform 59 | 60 | 61 | def transform_binary_target(splits, x, c, lb, ub, n_nonevent, n_event, 62 | n_event_special, n_nonevent_special, 63 | n_event_missing, n_nonevent_missing, 64 | special_codes, metric, metric_special, 65 | 
def transform_binary_target(splits, x, c, lb, ub, n_nonevent, n_event,
                            n_event_special, n_nonevent_special,
                            n_event_missing, n_nonevent_missing,
                            special_codes, metric, metric_special,
                            metric_missing, check_input=False):
    """Transform data using a fitted piecewise binning for a binary target.

    Clean values are mapped to the piecewise-polynomial event rate (or its
    WoE when ``metric="woe"``); special and missing values are replaced by
    ``metric_special`` / ``metric_missing``, or by their empirical event
    rate when those are set to "empirical".

    Parameters
    ----------
    splits : array-like
        Split points delimiting the bins.

    x : array-like, shape = (n_samples)
        Data to transform.

    c : numpy.ndarray
        Per-bin polynomial coefficients, lowest order first.

    lb, ub : float or None
        Optional clipping bounds applied by ``_apply_transform``.

    n_nonevent, n_event : int
        Total non-event / event counts, used for the WoE conversion.

    n_event_special, n_nonevent_special : array-like or int
        Event / non-event counts of the special bin(s).

    n_event_missing, n_nonevent_missing : int
        Event / non-event counts of the missing bin.

    special_codes : array-like, dict or None
        Codes identifying special values.

    metric : str
        "event_rate" or "woe".

    metric_special, metric_missing : str or float
        "empirical" or a fixed numeric value.

    check_input : bool (default=False)
        If True, validate x with sklearn's ``check_array``.

    Returns
    -------
    x_transform : numpy.ndarray
    """
    if metric not in ("event_rate", "woe"):
        raise ValueError('Invalid value for metric. Allowed string '
                         'values are "event_rate" and "woe".')

    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    special_mask, missing_mask, clean_mask, n_special = _mask_special_missing(
        x, special_codes)

    x_clean = x[clean_mask]

    # Assign each clean sample to its bin.
    if len(splits):
        indices = np.digitize(x_clean, splits, right=False)
    else:
        indices = np.zeros(x_clean.shape)

    n_bins = len(splits) + 1

    # Compute event rate for special and missing bin
    event_rate_special = metric_special
    event_rate_missing = metric_missing

    if metric_special == "empirical":
        n_event_special = np.asarray(n_event_special)
        n_nonevent_special = np.asarray(n_nonevent_special)

        event_rate_special = np.zeros(n_special)
        n_records_special = n_event_special + n_nonevent_special

        # Only bins holding both events and non-events get a nonzero rate.
        mask = (n_event_special > 0) & (n_nonevent_special > 0)

        # mask is an array when there are several special bins, otherwise a
        # scalar/0-d boolean, hence the two branches below.
        if n_special > 1:
            event_rate_special[mask] = (
                n_event_special[mask] / n_records_special[mask])
        elif mask:
            event_rate_special = n_event_special / n_records_special

        if metric == "woe":
            event_rate_special = transform_event_rate_to_woe(
                event_rate_special, n_nonevent, n_event)

    if metric_missing == "empirical":
        n_records_missing = n_event_missing + n_nonevent_missing

        if n_records_missing > 0:
            event_rate_missing = n_event_missing / n_records_missing
        else:
            # No missing records observed during fit: default the rate to 0.
            event_rate_missing = 0

        if metric == "woe":
            event_rate_missing = transform_event_rate_to_woe(
                event_rate_missing, n_nonevent, n_event)

    x_transform = _apply_transform(
        x, c, lb, ub, special_codes, metric_special, metric_missing,
        clean_mask, special_mask, missing_mask, indices, x_clean, n_bins,
        n_special, event_rate_special, event_rate_missing)

    # Clean samples were filled as event rates; convert them to WoE last.
    if metric == "woe":
        x_transform[clean_mask] = transform_event_rate_to_woe(
            x_transform[clean_mask], n_nonevent, n_event)

    return x_transform


def transform_continuous_target(splits, x, c, lb, ub, n_records_special,
                                sum_special, n_records_missing, sum_missing,
                                special_codes, metric_special, metric_missing,
                                check_input=False):
    """Transform data using a fitted piecewise binning for a continuous
    target.

    Mirrors :func:`transform_binary_target` with per-bin means (built from
    record counts and sums) instead of event rates; no WoE conversion.
    """
    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    special_mask, missing_mask, clean_mask, n_special = _mask_special_missing(
        x, special_codes)

    x_clean = x[clean_mask]

    # Assign each clean sample to its bin.
    if len(splits):
        indices = np.digitize(x_clean, splits, right=False)
    else:
        indices = np.zeros(x_clean.shape)

    n_bins = len(splits) + 1

    # Compute mean for special and missing bin
    mean_special = metric_special
    mean_missing = metric_missing

    if metric_special == "empirical":
        sum_special = np.asarray(sum_special)
        n_records_special = np.asarray(n_records_special)

        mean_special = np.zeros(n_special)

        mask = (n_records_special > 0)

        # Same scalar/array duality as in the binary transform.
        if n_special > 1:
            mean_special[mask] = sum_special[mask] / n_records_special[mask]
        elif mask:
            mean_special = sum_special / n_records_special

    if metric_missing == "empirical":
        if n_records_missing > 0:
            mean_missing = sum_missing / n_records_missing
        else:
            # No missing records observed during fit: default the mean to 0.
            mean_missing = 0

    x_transform = _apply_transform(
        x, c, lb, ub, special_codes, metric_special, metric_missing,
        clean_mask, special_mask, missing_mask, indices, x_clean, n_bins,
        n_special, mean_special, mean_missing)

    return x_transform
class PreBinning:
    """Prebinning algorithms.

    Parameters
    ----------
    problem_type : str
        The problem type depending on the target type: "classification"
        or "regression".

    method : str
        Available methods are 'uniform', 'quantile', 'cart' and 'mdlp'.

    n_bins : int
        The number of bins to produce.

    min_bin_size : int, float
        The minimum bin size.

    class_weight : dict, "balanced" or None, optional (default=None)
        Weights associated with classes. Only used when ``method="cart"``
        and ``problem_type="classification"``.

    **kwargs : keyword arguments
        Keyword arguments for prebinning method. See notes.

    Notes
    -----
    Keyword arguments are those available in the following classes:

    * ``method="uniform"``: `sklearn.preprocessing.KBinsDiscretizer`.

    * ``method="quantile"``: `sklearn.preprocessing.KBinsDiscretizer`.

    * ``method="cart"``: `sklearn.tree.DecisionTreeClassifier`.

    * ``method="mdlp"``: `optbinning.binning.mdlp.MDLP`.

    """
    def __init__(self, problem_type, method, n_bins, min_bin_size,
                 class_weight=None, **kwargs):

        self.problem_type = problem_type
        self.method = method
        self.n_bins = n_bins
        self.min_bin_size = min_bin_size
        self.class_weight = class_weight
        self.kwargs = kwargs

        # Computed split points; populated by fit().
        self._splits = None

    def fit(self, x, y, sample_weight=None):
        """Fit PreBinning algorithm.

        Parameters
        ----------
        x : array-like, shape = (n_samples)
            Data samples, where n_samples is the number of samples.

        y : array-like, shape = (n_samples)
            Target vector relative to x.

        sample_weight : array-like of shape (n_samples,) (default=None)
            Array of weights that are assigned to individual samples.
            Only used when ``method="cart"``.

        Returns
        -------
        self : PreBinning
        """
        if self.method not in ("uniform", "quantile", "cart", "mdlp"):
            raise ValueError('Invalid value for prebinning method. Allowed '
                             'string values are "cart", "mdlp", "quantile" '
                             'and "uniform".')

        if self.problem_type not in ("classification", "regression"):
            raise ValueError('Invalid value for problem_type. Allowed '
                             'string values are "classification" and '
                             '"regression".')

        if self.problem_type == "regression" and self.method == "mdlp":
            raise ValueError("mdlp method can only handle binary "
                             "classification problems.")

        if self.method in ("uniform", "quantile"):
            # Unsupervised equal-width / equal-frequency binning.
            unsup_kwargs = {"n_bins": self.n_bins, "strategy": self.method}
            unsup_kwargs.update(**self.kwargs)

            est = KBinsDiscretizer(**unsup_kwargs)
            est.fit(x.reshape(-1, 1), y)
            # Interior bin edges only (drop the outermost edges).
            self._splits = est.bin_edges_[0][1:-1]

        elif self.method == "cart":
            cart_kwargs = {
                "min_samples_leaf": self.min_bin_size,
                "max_leaf_nodes": self.n_bins}

            if self.problem_type == "classification":
                cart_kwargs["class_weight"] = self.class_weight
                cart_kwargs.update(**self.kwargs)

                est = DecisionTreeClassifier(**cart_kwargs)
            else:
                cart_kwargs.update(**self.kwargs)
                est = DecisionTreeRegressor(**cart_kwargs)

            est.fit(x.reshape(-1, 1), y, sample_weight=sample_weight)
            # Internal node thresholds become the splits; leaves carry the
            # sentinel TREE_UNDEFINED and are filtered out.
            splits = np.unique(est.tree_.threshold)
            self._splits = splits[splits != _tree.TREE_UNDEFINED]

        elif self.method == "mdlp":
            mdlp_kwargs = {"min_samples_leaf": self.min_bin_size}
            mdlp_kwargs.update(**self.kwargs)

            est = MDLP(**mdlp_kwargs)
            est.fit(x, y)
            self._splits = est.splits

        return self

    @property
    def splits(self):
        """List of split points.

        Returns
        -------
        splits : numpy.ndarray
        """
        return self._splits
def dataframe_to_string(df, tab=None):
    """Render a DataFrame as plain text without the index, optionally
    indenting every line by ``tab`` spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to render.

    tab : int or None, optional (default=None)
        Number of leading spaces to prepend to each line; None disables
        indentation.

    Returns
    -------
    str
        The rendered table.

    Raises
    ------
    TypeError
        If ``df`` is not a DataFrame.

    ValueError
        If ``tab`` is negative or not an integer.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas.DataFrame.")

    if tab is not None and (not isinstance(tab, numbers.Integral) or tab < 0):
        raise ValueError("tab must be a positive integer; got {}."
                         .format(tab))

    rendered = textwrap.dedent(df.to_string(index=False))

    if tab is None:
        return rendered
    return textwrap.indent(rendered, " " * tab)
def print_header():
    """Print the optbinning version and copyright banner."""
    header = (
        "optbinning (Version {})\n"
        "Copyright (c) 2019-2024 Guillermo Navas-Palencia, Apache License 2.0"
        "\n".format(__version__))

    print(header)


def print_optional_parameters(dict_default_options, dict_user_options):
    """Print the options table, flagging user-modified values.

    Parameters
    ----------
    dict_default_options : dict
        Default value of each option.

    dict_user_options : dict
        Value actually supplied for each option.
    """
    option_format = "    {:<24} {:>15}   * {}\n"
    str_options = "  Begin options\n"
    for key, value in dict_default_options.items():
        user_value = dict_user_options[key]

        # Containers/estimators cannot be compared reliably with !=, so
        # they are always flagged as user-provided ("U" vs default "d").
        if (isinstance(user_value, (list, np.ndarray, dict)) or
                value != user_value):
            user_flag = "U"
        else:
            user_flag = "d"

        if user_value is None:
            user_value = "no"
        elif isinstance(user_value, (list, np.ndarray, dict)):
            user_value = "yes"
        elif isinstance(user_value, BaseEstimator):
            user_value = "yes"

        str_options += option_format.format(key, str(user_value), user_flag)
    str_options += "  End options\n"
    print(str_options)


def solver_statistics(solver_type, solver):
    """Collect statistics from a solved optimization model.

    Parameters
    ----------
    solver_type : str
        One of "cp", "mip", "ls" or "lp".

    solver : object
        The solver instance after solving.

    Returns
    -------
    (d_solver, time_optimizer) : tuple of (dict, float or None)
        Statistics keyed by name, and the solver wall time when available
        (only for "cp").
    """
    time_optimizer = None
    d_solver = {}

    if solver_type == "cp":
        d_solver["n_booleans"] = solver.NumBooleans()
        d_solver["n_branches"] = solver.NumBranches()
        d_solver["n_conflicts"] = solver.NumConflicts()
        d_solver["objective"] = int(solver.ObjectiveValue())
        d_solver["best_objective_bound"] = int(solver.BestObjectiveBound())

        time_optimizer = solver.WallTime()

    elif solver_type == "mip":
        d_solver["n_constraints"] = solver.NumConstraints()
        d_solver["n_variables"] = solver.NumVariables()
        d_solver["objective"] = solver.Objective().Value()
        d_solver["best_bound"] = solver.Objective().BestBound()

    elif solver_type == "ls":
        if not LOCALSOLVER_AVAILABLE:
            raise ImportError('Cannot import localsolver. Install LocalSolver '
                              'or choose another solver, options are "cp" and '
                              '"mip".')

        d_solver["n_iterations"] = LSStatistics.get_nb_iterations(
            solver.statistics)

    elif solver_type == "lp":
        d_solver["n_variables"] = solver.n_variables
        d_solver["n_constraints"] = solver.n_constraints
        d_solver["n_iterations"] = solver.n_iterations
        d_solver["objective"] = solver.objective

    return d_solver, time_optimizer


def print_solver_statistics(solver_type, d_solver):
    """Print solver statistics gathered by ``solver_statistics``.

    Values are pulled from ``d_solver`` by key instead of relying on dict
    insertion order: the original order-based formatting mislabeled the
    "mip" variables/constraints lines, because solver_statistics inserts
    n_constraints before n_variables while the template prints variables
    first.
    """
    if solver_type == "cp":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of booleans            {:>10}\n"
            "    Number of branches            {:>10}\n"
            "    Number of conflicts           {:>10}\n"
            "    Objective value               {:>10}\n"
            "    Best objective bound          {:>10}\n"
            ).format(solver_type, d_solver["n_booleans"],
                     d_solver["n_branches"], d_solver["n_conflicts"],
                     d_solver["objective"], d_solver["best_objective_bound"])

    elif solver_type == "mip":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of variables           {:>10}\n"
            "    Number of constraints         {:>10}\n"
            "    Objective value               {:>10.4f}\n"
            "    Best objective bound          {:>10.4f}\n"
            ).format(solver_type, d_solver["n_variables"],
                     d_solver["n_constraints"], d_solver["objective"],
                     d_solver["best_bound"])

    elif solver_type == "ls":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of iterations          {:>10}\n"
            ).format(solver_type, d_solver["n_iterations"])

    elif solver_type == "lp":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of variables           {:>10}\n"
            "    Number of constraints         {:>10}\n"
            "    Number of iterations          {:>10}\n"
            "    Objective value               {:>10.4f}\n"
            ).format(solver_type, d_solver["n_variables"],
                     d_solver["n_constraints"], d_solver["n_iterations"],
                     d_solver["objective"])

    print(solver_stats)
/optbinning/logging.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging class. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2019 7 | 8 | import logging 9 | import sys 10 | 11 | 12 | class Logger: 13 | def __init__(self, logger_name=None, filename=None): 14 | self.logger = logging.getLogger(logger_name) 15 | self.logger.setLevel(logging.INFO) 16 | self.logger.propagate = False 17 | 18 | formatter = logging.Formatter( 19 | '%(asctime)s | %(levelname)s : %(message)s') 20 | 21 | handler = logging.StreamHandler(sys.stdout) 22 | handler.setFormatter(formatter) 23 | self.logger.addHandler(handler) 24 | 25 | if filename is not None: 26 | fhandler = logging.FileHandler(filename) 27 | fhandler.setFormatter(formatter) 28 | self.logger.addHandler(fhandler) 29 | 30 | def close(self): 31 | for handler in self.logger.handlers: 32 | handler.close() 33 | self.logger.removeHandler(handler) 34 | -------------------------------------------------------------------------------- /optbinning/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/optbinning/metrics/__init__.py -------------------------------------------------------------------------------- /optbinning/metrics/classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metrics to asses performance of classification models. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import numpy as np 9 | 10 | from sklearn.metrics import auc 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.metrics import roc_curve 13 | 14 | 15 | def gini(y_true, y_pred_proba): 16 | """Compute the Gini Index or Accuracy Ration (AR). 
17 | 18 | Parameters 19 | ---------- 20 | y_true : array-like, shape (n_samples,) 21 | Ground truth (correct) target values. 22 | 23 | y_pred_proba : array-like, shape (n_samples,) 24 | Probability estimates of the positive class. 25 | 26 | Returns 27 | ------- 28 | gini : float 29 | """ 30 | fpr, tpr, _ = roc_curve(y_true, y_pred_proba) 31 | return 2 * auc(fpr, tpr) - 1 32 | 33 | 34 | def ks(y_true, y_pred_proba): 35 | """Compute the Kolmogorov-Smirnov (KS). 36 | 37 | Parameters 38 | ---------- 39 | y_true : array-like, shape (n_samples,) 40 | Ground truth (correct) target values. 41 | 42 | y_pred_proba : array-like, shape (n_samples,) 43 | Probability estimates of the positive class. 44 | 45 | Returns 46 | ------- 47 | ks : tuple(ks_score, ks_position) 48 | """ 49 | n_samples = y_true.shape[0] 50 | n_event = np.sum(y_true) 51 | n_nonevent = n_samples - n_event 52 | 53 | idx = np.argsort(y_pred_proba) 54 | yy = y_true[idx] 55 | 56 | cum_event = np.cumsum(yy) 57 | cum_population = np.arange(0, n_samples) 58 | cum_nonevent = cum_population - cum_event 59 | 60 | p_event = cum_event / n_event 61 | p_nonevent = cum_nonevent / n_nonevent 62 | 63 | p_diff = p_nonevent - p_event 64 | ks_max_idx = np.argmax(p_diff) 65 | ks_score = p_diff[ks_max_idx] 66 | 67 | return ks_score, ks_max_idx 68 | 69 | 70 | def imbalanced_classification_metrics(y_true, y_pred): 71 | """Compute imbalanced binary classification metrics. 72 | 73 | Parameters 74 | ---------- 75 | y_true : array-like, shape (n_samples,) 76 | Ground truth (correct) target values. 77 | 78 | y_pred : array-like, shape (n_samples,) 79 | Estimated target values. 80 | 81 | Returns 82 | ------- 83 | metrics : dict 84 | Dictionary of metrics. 
85 | """ 86 | tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() 87 | 88 | # Sensitivity - True positive rate (TPR) 89 | tpr = tp / (tp + fn) 90 | 91 | # Specificity - True negative rate (TNR) 92 | tnr = tn / (fp + tn) 93 | 94 | # False positive rate (FPR) 95 | fpr = 1.0 - tnr 96 | 97 | # False negative rate (FNR) 98 | fnr = 1.0 - tpr 99 | 100 | # Balanced accuracy 101 | balanced_accuracy = 0.5 * (tpr + tnr) 102 | 103 | # Discriminant power 104 | dp = np.sqrt(3) / np.pi * (np.log(tpr / (1-tnr)) + np.log(tnr / (1-tpr))) 105 | 106 | d_metrics = { 107 | "True positive rate": tpr, 108 | "True negative rate": tnr, 109 | "False positive rate": fpr, 110 | "False negative rate": fnr, 111 | "Balanced accuracy": balanced_accuracy, 112 | "Discriminant power": dp 113 | } 114 | 115 | return d_metrics 116 | -------------------------------------------------------------------------------- /optbinning/metrics/regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metrics to asses performance of regression models. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import numpy as np 9 | 10 | from sklearn.metrics import explained_variance_score 11 | from sklearn.metrics import mean_absolute_error 12 | from sklearn.metrics import mean_squared_error 13 | from sklearn.metrics import median_absolute_error 14 | from sklearn.metrics import r2_score 15 | 16 | 17 | def mean_absolute_percentage_error(y_true, y_pred): 18 | """Compute the mean absolute percentage error (MAPE). 19 | 20 | Parameters 21 | ---------- 22 | y_true : array-like, shape (n_samples,) 23 | Ground truth (correct) target values. 24 | 25 | y_pred : array-like, shape (n_samples,) 26 | Estimated target values. 
27 | 28 | Returns 29 | ------- 30 | mape : float 31 | """ 32 | return np.abs((y_true - y_pred) / y_true).mean() 33 | 34 | 35 | def median_absolute_percentage_error(y_true, y_pred): 36 | """Compute the median absolute percentage error (MdAPE). 37 | 38 | Parameters 39 | ---------- 40 | y_true : array-like, shape (n_samples,) 41 | Ground truth (correct) target values. 42 | 43 | y_pred : array-like, shape (n_samples,) 44 | Estimated target values. 45 | 46 | Returns 47 | ------- 48 | mdape : float 49 | """ 50 | return np.median(np.abs((y_true - y_pred) / y_true)) 51 | 52 | 53 | def mean_percentage_error(y_true, y_pred): 54 | """Compute the mean percentage error (MPE). 55 | 56 | Parameters 57 | ---------- 58 | y_true : array-like, shape (n_samples,) 59 | Ground truth (correct) target values. 60 | 61 | y_pred : array-like, shape (n_samples,) 62 | Estimated target values. 63 | 64 | Returns 65 | ------- 66 | mpe : float 67 | """ 68 | return ((y_true - y_pred) / y_true).mean() 69 | 70 | 71 | def symmetric_mean_absolute_percentage_error(y_true, y_pred): 72 | """Compute the symmetric mean absolute percentage error (SMAPE). 73 | 74 | Parameters 75 | ---------- 76 | y_true : array-like, shape (n_samples,) 77 | Ground truth (correct) target values. 78 | 79 | y_pred : array-like, shape (n_samples,) 80 | Estimated target values. 81 | 82 | Returns 83 | ------- 84 | smape : float 85 | """ 86 | e = np.abs(y_true - y_pred) 87 | return (e / (np.abs(y_true) + np.abs(y_pred))).mean() 88 | 89 | 90 | def symmetric_median_absolute_percentage_error(y_true, y_pred): 91 | """Compute the symmetric median absolute percentage error (SMdAPE). 92 | 93 | Parameters 94 | ---------- 95 | y_true : array-like, shape (n_samples,) 96 | Ground truth (correct) target values. 97 | 98 | y_pred : array-like, shape (n_samples,) 99 | Estimated target values. 
100 | 101 | Returns 102 | ------- 103 | smdape : float 104 | """ 105 | e = np.abs(y_true - y_pred) 106 | return np.median(e / (np.abs(y_true) + np.abs(y_pred))) 107 | 108 | 109 | def regression_metrics(y_true, y_pred): 110 | """Compute regression metrics. 111 | 112 | Parameters 113 | ---------- 114 | y_true : array-like, shape (n_samples,) 115 | Ground truth (correct) target values. 116 | 117 | y_pred : array-like, shape (n_samples,) 118 | Estimated target values. 119 | 120 | Returns 121 | ------- 122 | metrics : dict 123 | Dictionary of metrics. 124 | """ 125 | 126 | # Explained variance 127 | variance = explained_variance_score(y_true, y_pred) 128 | 129 | # Mean absolute error 130 | mae = mean_absolute_error(y_true, y_pred) 131 | 132 | # Mean squared error 133 | mse = mean_squared_error(y_true, y_pred) 134 | 135 | # Median absolute error 136 | median_ae = median_absolute_error(y_true, y_pred) 137 | 138 | # R^2 score 139 | r2 = r2_score(y_true, y_pred) 140 | 141 | # Mean absolute percentage error 142 | mape = mean_absolute_percentage_error(y_true, y_pred) 143 | 144 | # Mean percentage error 145 | mpe = mean_percentage_error(y_true, y_pred) 146 | 147 | # Symmetric mean absolute percentage error 148 | smape = symmetric_mean_absolute_percentage_error(y_true, y_pred) 149 | 150 | # Median absolute percentage error 151 | mdape = median_absolute_percentage_error(y_true, y_pred) 152 | 153 | # Symmetric meadian absolute percentage error 154 | smdape = symmetric_median_absolute_percentage_error(y_true, y_pred) 155 | 156 | d_metrics = { 157 | "Mean absolute error": mae, 158 | "Mean squared error": mse, 159 | "Median absolute error": median_ae, 160 | "Explained variance": variance, 161 | "R^2": r2, 162 | "MPE": mpe, 163 | "MAPE": mape, 164 | "SMAPE": smape, 165 | "MdAPE": mdape, 166 | "SMdAPE": smdape 167 | } 168 | 169 | return d_metrics 170 | -------------------------------------------------------------------------------- /optbinning/scorecard/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .counterfactual import Counterfactual 2 | from .monitoring import ScorecardMonitoring 3 | from .plots import plot_auc_roc, plot_cap, plot_ks 4 | from .scorecard import Scorecard 5 | 6 | 7 | __all__ = ["Scorecard", 8 | "ScorecardMonitoring", 9 | "plot_auc_roc", 10 | "plot_cap", 11 | "plot_ks", 12 | "Counterfactual"] 13 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/__init__.py: -------------------------------------------------------------------------------- 1 | from .counterfactual import Counterfactual 2 | 3 | 4 | __all__ = ['Counterfactual'] 5 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base counterfactual algorithm class. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | from abc import ABCMeta 9 | from abc import abstractmethod 10 | 11 | from sklearn.base import BaseEstimator 12 | 13 | from ...binning.base import Base 14 | from ...exceptions import CounterfactualsFoundWarning 15 | from ...exceptions import NotGeneratedError 16 | 17 | 18 | class BaseCounterfactual(Base, BaseEstimator, metaclass=ABCMeta): 19 | @abstractmethod 20 | def fit(self): 21 | """Fit counterfactual with training data.""" 22 | 23 | @abstractmethod 24 | def generate(self): 25 | """Generate counterfactual explanations.""" 26 | 27 | @abstractmethod 28 | def display(self): 29 | """Display counterfactual explanations.""" 30 | 31 | @property 32 | @abstractmethod 33 | def status(self): 34 | """The status of the underlying optimization solver.""" 35 | 36 | def _check_is_generated(self): 37 | if not self._is_generated: 38 | raise NotGeneratedError("This {} instance has not generated " 39 | "counterfactuals yet. 
Call " 40 | "'generate' with appropriate arguments." 41 | .format(self.__class__.__name__)) 42 | 43 | def _check_counterfactual_is_found(self): 44 | if not self._cfs: 45 | raise CounterfactualsFoundWarning( 46 | "Neither optimal or feasible counterfactuals were found.") 47 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/counterfactual_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Counterfactual information. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | from ...information import print_header 9 | from ...information import print_optional_parameters 10 | from ...information import print_solver_statistics 11 | from ...options import counterfactual_default_options 12 | 13 | 14 | def print_status(status): 15 | print(" Status : {:<32}\n".format(status)) 16 | 17 | 18 | def print_main_info(status, time_total): 19 | print_status(status) 20 | 21 | print(" Time : {:<7.4f} sec\n".format(time_total)) 22 | 23 | 24 | def print_objectives(objectives): 25 | str_objectives = " Objectives\n" 26 | 27 | for objname, objexp in objectives.items(): 28 | objval = objexp.solution_value() 29 | if objname in ("diversity_features", "diversity_values"): 30 | objval = abs(objval) 31 | 32 | str_objectives += " {:<18} {:>10.4f}\n".format( 33 | objname, objval) 34 | 35 | print(str_objectives) 36 | 37 | 38 | def print_timing(time_total, time_fit, time_solver, time_postprocessing): 39 | p_fit = time_fit / time_total 40 | p_solver = time_solver / time_total 41 | p_postprocessing = time_postprocessing / time_solver 42 | 43 | time_stats = ( 44 | " Timing\n" 45 | " Total time {:>18.2f} sec\n" 46 | " Fit {:>18.2f} sec ({:>7.2%})\n" 47 | " Solver {:>18.2f} sec ({:>7.2%})\n" 48 | " Post-processing {:>18.2f} sec ({:>7.2%})\n" 49 | ).format(time_total, time_fit, p_fit, time_solver, p_solver, 50 | time_postprocessing, p_postprocessing) 51 | 52 | 
print(time_stats) 53 | 54 | 55 | def print_counterfactual_information(print_level, status, solver, objectives, 56 | time_total, time_fit, time_solver, 57 | time_postprocessing, dict_user_options): 58 | 59 | print_header() 60 | 61 | if print_level == 2: 62 | dict_default_options = counterfactual_default_options 63 | print_optional_parameters(dict_default_options, dict_user_options) 64 | 65 | if print_level == 0: 66 | print_main_info(status, time_total) 67 | elif print_level >= 1: 68 | print_status(status) 69 | 70 | if status in ("OPTIMAL", "FEASIBLE"): 71 | if solver is not None: 72 | print_solver_statistics("mip", solver) 73 | print_objectives(objectives) 74 | 75 | print_timing(time_total, time_fit, time_solver, time_postprocessing) 76 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/model_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Counterfactual model data. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | 9 | def model_data(scorecard, x, special_missing): 10 | s_vars = scorecard.binning_process_.get_support(names=True) 11 | 12 | sc = scorecard.table(style="detailed") 13 | metric_name = "WoE" if scorecard._target_dtype == "binary" else "Mean" 14 | 15 | # Number of bins, metric and indices 16 | nbins = [] 17 | metric = [] 18 | indices = [] 19 | for i, v in enumerate(s_vars): 20 | metric_i = sc[sc.Variable == v][metric_name].values 21 | 22 | if not special_missing: 23 | metric_i = metric_i[:-2] 24 | 25 | _metric = [] 26 | _indices = [] 27 | for j, m in enumerate(metric_i): 28 | if m != x[i]: 29 | _indices.append(j) 30 | _metric.append(m) 31 | 32 | metric.append(_metric) 33 | nbins.append(len(_metric)) 34 | indices.append(_indices) 35 | 36 | return nbins, metric, indices 37 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/problem_data.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Counterfactual problem data. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import numpy as np 9 | 10 | 11 | def problem_data(scorecard, X): 12 | s_vars = X.columns 13 | n_vars = X.shape[1] 14 | 15 | # Scorecard table 16 | sc = scorecard.table(style="detailed") 17 | 18 | if scorecard._target_dtype == "binary": 19 | sc["Points"] = sc["WoE"] * sc["Coefficient"] 20 | else: 21 | sc["Points"] = sc["Mean"] * sc["Coefficient"] 22 | 23 | # Linear model coefficients 24 | 25 | # Only index into the intercept if it is an array, it is a scalar otherwise 26 | if isinstance(scorecard.estimator_.intercept_, np.ndarray): 27 | intercept = float(scorecard.estimator_.intercept_[0]) 28 | else: 29 | intercept = float(scorecard.estimator_.intercept_) 30 | 31 | coef = scorecard.estimator_.coef_.ravel() 32 | 33 | # Big-M parameters (min, max) points. 34 | # Proximity weights. Inverse value range for each feature 35 | min_p = 0 36 | max_p = 0 37 | wrange = np.empty(n_vars) 38 | 39 | for i, v in enumerate(s_vars): 40 | v_points = sc[sc["Variable"] == v]["Points"] 41 | _min = np.min(v_points) 42 | _max = np.max(v_points) 43 | min_p += _min 44 | max_p += _max 45 | 46 | wrange[i] = 1.0 / (_max - _min) 47 | 48 | min_p += intercept 49 | max_p += intercept 50 | 51 | # Mahalanobis distance 52 | Xt = scorecard.binning_process_.transform(X).values 53 | F = np.linalg.cholesky(np.linalg.inv(np.cov(Xt.T))) 54 | mu = Xt.mean(axis=0) 55 | 56 | return intercept, coef, min_p, max_p, wrange, F, mu 57 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Piecewise linear approximation of logistic function. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import numpy as np 9 | 10 | from ropwr import RobustPWRegression 11 | 12 | 13 | def logistic_pw(min_p, max_p, n_bins): 14 | xl = np.linspace(min_p, max_p, 100) 15 | yl = (1.0 / (1 + np.exp(-xl))) 16 | 17 | splits = np.linspace(min_p, max_p, n_bins+1)[1:-1] 18 | 19 | pw = RobustPWRegression(objective="l1", degree=1, monotonic_trend=None) 20 | pw.fit(xl, yl, splits) 21 | 22 | splits = np.array([min_p] + list(splits) + [max_p]) 23 | b_pw = [(splits[i], splits[i+1]) for i in range(len(splits) - 1)] 24 | c_pw = pw.coef_ 25 | 26 | return b_pw, c_pw 27 | -------------------------------------------------------------------------------- /optbinning/scorecard/monitoring_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Monitoring information. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | from ..binning.binning_information import print_header 9 | from ..binning.binning_information import print_optional_parameters 10 | from ..options import scorecard_monitoring_default_options 11 | 12 | 13 | def print_main_info(n_records_a, n_records_e, n_variables, time_total): 14 | print(" Number of records A : {}".format(n_records_a)) 15 | print(" Number of records E : {}".format(n_records_e)) 16 | print(" Number of variables : {}".format(n_variables)) 17 | print(" Time : {:<7.4f} sec\n".format(time_total)) 18 | 19 | 20 | def print_monitoring_statistics(n_records_a, n_records_e, n_variables, 21 | target_dtype, time_total, time_system, 22 | time_variables): 23 | 24 | stats = ( 25 | " Statistics\n" 26 | " Number of records Actual {:>10}\n" 27 | " Number of records Expected {:>10}\n" 28 | " Number of scorecard variables {:>10}\n" 29 | " Target type {:>10}\n" 30 | ).format(n_records_a, n_records_e, n_variables, target_dtype) 31 | 32 | print(stats) 33 | 34 | p_system = time_system / time_total 35 | p_variables = time_variables / 
time_total 36 | 37 | time_stats = ( 38 | " Timing\n" 39 | " Total time {:>18.2f} sec\n" 40 | " System stability {:>18.2f} sec ({:>7.2%})\n" 41 | " Variables stability {:>18.2f} sec ({:>7.2%})\n" 42 | ).format(time_total, time_system, p_system, time_variables, 43 | p_variables) 44 | 45 | print(time_stats) 46 | 47 | 48 | def print_monitoring_information(print_level, n_records_a, n_records_e, 49 | n_variables, target_dtype, time_total, 50 | time_system, time_variables, 51 | dict_user_options): 52 | 53 | print_header() 54 | 55 | if print_level == 2: 56 | dict_default_options = scorecard_monitoring_default_options 57 | print_optional_parameters(dict_default_options, dict_user_options) 58 | 59 | if print_level == 0: 60 | print_main_info(n_records_a, n_records_e, n_variables, time_total) 61 | elif print_level >= 1: 62 | print_monitoring_statistics(n_records_a, n_records_e, n_variables, 63 | target_dtype, time_total, time_system, 64 | time_variables) 65 | -------------------------------------------------------------------------------- /optbinning/scorecard/rounding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Rounding strategy. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | import numpy as np 9 | 10 | from ortools.linear_solver import pywraplp 11 | 12 | 13 | class RoundingMIP: 14 | def __init__(self): 15 | self.solver_ = None 16 | 17 | self._nb = None 18 | self._nn = None 19 | self._p = None 20 | 21 | def build_model(self, df_scorecard): 22 | # Parameters 23 | points = [] 24 | mins = [] 25 | maxs = [] 26 | for variable in df_scorecard.Variable.unique(): 27 | mask = df_scorecard.Variable == variable 28 | p = df_scorecard[mask].Points.values 29 | mins.append(p.min()) 30 | maxs.append(p.max()) 31 | points.append(p) 32 | 33 | nb = len(points) 34 | nn = [len(p) for p in points] 35 | 36 | min_point = np.rint(np.sum(mins)) 37 | max_point = np.rint(np.sum(maxs)) 38 | 39 | min_p = np.min(mins) 40 | max_p = np.max(maxs) 41 | 42 | # Initialize solver 43 | solver = pywraplp.Solver( 44 | 'RoundingMIP', pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING) 45 | 46 | # Decision variables 47 | p = {} 48 | tp = {} 49 | tm = {} 50 | min_b = {} 51 | max_b = {} 52 | for i in range(nb): 53 | min_b[i] = solver.IntVar(min_p, max_p, "min_b[{}]".format(i)) 54 | max_b[i] = solver.IntVar(min_p, max_p, "max_b[{}]".format(i)) 55 | for j in range(nn[i]): 56 | p[i, j] = solver.IntVar(min_p, max_p, "p[{}, {}]".format(i, j)) 57 | tp[i, j] = solver.NumVar(0, np.inf, "tp[{}, {}]".format(i, j)) 58 | tm[i, j] = solver.NumVar(0, np.inf, "tm[{}, {}]".format(i, j)) 59 | 60 | # Objective function 61 | solver.Minimize(solver.Sum([solver.Sum([tp[i, j] + tm[i, j] 62 | for j in range(nn[i])]) for i in range(nb)])) 63 | 64 | # Constraints 65 | for i in range(nb): 66 | for j in range(nn[i]): 67 | solver.Add(tp[i, j] - tm[i, j] == points[i][j] - p[i, j]) 68 | 69 | # Max score constraint for each variable 70 | solver.Add(max_b[i] >= p[i, j]) 71 | 72 | # Min score constraints for each variable 73 | solver.Add(min_b[i] <= p[i, j]) 74 | 75 | # Sum of minimum/maximum point by variable must be min_point/max_point 76 | 
solver.Add(solver.Sum([min_b[i] for i in range(nb)]) == min_point) 77 | solver.Add(solver.Sum([max_b[i] for i in range(nb)]) == max_point) 78 | 79 | self.solver_ = solver 80 | self._nb = nb 81 | self._nn = nn 82 | self._p = p 83 | 84 | def solve(self): 85 | status = self.solver_.Solve() 86 | 87 | if status in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE): 88 | if status == pywraplp.Solver.OPTIMAL: 89 | status_name = "OPTIMAL" 90 | else: 91 | status_name = "FEASIBLE" 92 | 93 | # compute solution 94 | solution = [] 95 | for i in range(self._nb): 96 | for j in range(self._nn[i]): 97 | solution.append(self._p[i, j].solution_value()) 98 | else: 99 | if status == pywraplp.Solver.ABNORMAL: 100 | status_name = "ABNORMAL" 101 | elif status == pywraplp.Solver.INFEASIBLE: 102 | status_name = "INFEASIBLE" 103 | elif status == pywraplp.Solver.UNBOUNDED: 104 | status_name = "UNBOUNDED" 105 | else: 106 | status_name = "UNKNOWN" 107 | 108 | solution = None 109 | 110 | return status_name, solution 111 | -------------------------------------------------------------------------------- /optbinning/scorecard/scorecard_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scorecard information. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | from ..information import print_header 9 | from ..information import print_optional_parameters 10 | from ..options import scorecard_default_options 11 | 12 | 13 | def print_main_info(n_records, n_variables, time_total): 14 | print(" Number of records : {}".format(n_records)) 15 | print(" Number of variables : {}".format(n_variables)) 16 | print(" Time : {:<7.4f} sec\n".format(time_total)) 17 | 18 | 19 | def print_scorecard_statistics(n_records, n_variables, target_dtype, 20 | n_numerical, n_categorical, n_selected, 21 | time_total, time_binning_process, 22 | time_estimator, time_build_scorecard, 23 | time_rounding): 24 | 25 | stats = ( 26 | " Statistics\n" 27 | " Number of records {:>10}\n" 28 | " Number of variables {:>10}\n" 29 | " Target type {:>10}\n\n" 30 | " Number of numerical {:>10}\n" 31 | " Number of categorical {:>10}\n" 32 | " Number of selected {:>10}\n" 33 | ).format(n_records, n_variables, target_dtype, n_numerical, 34 | n_categorical, n_selected) 35 | 36 | print(stats) 37 | 38 | p_binning_process = time_binning_process / time_total 39 | p_estimator = time_estimator / time_total 40 | p_build_scorecard = time_build_scorecard / time_total 41 | p_rounding = time_rounding / time_build_scorecard 42 | 43 | time_stats = ( 44 | " Timing\n" 45 | " Total time {:>18.2f} sec\n" 46 | " Binning process {:>18.2f} sec ({:>7.2%})\n" 47 | " Estimator {:>18.2f} sec ({:>7.2%})\n" 48 | " Build scorecard {:>18.2f} sec ({:>7.2%})\n" 49 | " rounding {:>18.2f} sec ({:>7.2%})\n" 50 | ).format(time_total, time_binning_process, p_binning_process, 51 | time_estimator, p_estimator, time_build_scorecard, 52 | p_build_scorecard, time_rounding, p_rounding) 53 | 54 | print(time_stats) 55 | 56 | 57 | def print_scorecard_information(print_level, n_records, n_variables, 58 | target_dtype, n_numerical, n_categorical, 59 | n_selected, time_total, time_binning_process, 60 | time_estimator, time_build_scorecard, 61 
| time_rounding, dict_user_options): 62 | print_header() 63 | 64 | if print_level == 2: 65 | dict_default_options = scorecard_default_options 66 | print_optional_parameters(dict_default_options, dict_user_options) 67 | 68 | if print_level == 0: 69 | print_main_info(n_records, n_variables, time_total) 70 | elif print_level >= 1: 71 | print_scorecard_statistics(n_records, n_variables, target_dtype, 72 | n_numerical, n_categorical, n_selected, 73 | time_total, time_binning_process, 74 | time_estimator, time_build_scorecard, 75 | time_rounding) 76 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy>=1.16.1 3 | ortools>=9.4,<9.12 4 | pandas 5 | ropwr>=1.0.0 6 | scikit-learn>=1.0.2 7 | scipy>=1.6.0 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | from setuptools import find_packages, setup, Command 6 | 7 | long_description = ''' 8 | The optimal binning is the optimal discretization of a variable into bins 9 | given a discrete or continuous numeric target. OptBinning is a library 10 | written in Python implementing a rigorous and flexible mathematical 11 | programming formulation to solving the optimal binning problem for a binary, 12 | continuous and multiclass target type, incorporating constraints not 13 | previously addressed. 14 | 15 | Read the documentation at: http://gnpalencia.org/optbinning/ 16 | 17 | OptBinning is distributed under the Apache Software License (Apache 2.0). 
18 | ''' 19 | 20 | 21 | class CleanCommand(Command): 22 | user_options = [] 23 | 24 | def initialize_options(self): 25 | pass 26 | 27 | def finalize_options(self): 28 | pass 29 | 30 | def run(self): 31 | os.system('rm -vrf ./build ./dist ./*.pyc ./*.tgz ./*.egg-info') 32 | 33 | 34 | # install requirements 35 | install_requires = [ 36 | 'matplotlib', 37 | 'numpy>=1.16.1', 38 | 'ortools>=9.4,<9.12', 39 | 'pandas', 40 | 'ropwr>=1.0.0', 41 | 'scikit-learn>=1.0.2', 42 | 'scipy>=1.6.0', 43 | ] 44 | 45 | # extra requirements 46 | extras_require = { 47 | 'distributed': ['pympler', 'tdigest'], 48 | 'test': [ 49 | 'coverage', 50 | 'flake8', 51 | 'pytest', 52 | 'pyarrow', 53 | 'pympler', 54 | 'tdigest', 55 | ], 56 | # For ecos support: https://github.com/embotech/ecos 57 | 'ecos': ['ecos'] 58 | } 59 | 60 | 61 | # Read version file 62 | version_info = {} 63 | with open("optbinning/_version.py") as f: 64 | exec(f.read(), version_info) 65 | 66 | 67 | setup( 68 | name="optbinning", 69 | version=version_info['__version__'], 70 | description="OptBinning: The Python Optimal Binning library", 71 | long_description=long_description, 72 | author="Guillermo Navas-Palencia", 73 | author_email="g.navas.palencia@gmail.com", 74 | packages=find_packages(exclude=['tests', 'tests.*']), 75 | platforms="any", 76 | include_package_data=True, 77 | license="Apache Licence 2.0", 78 | url="https://github.com/guillermo-navas-palencia/optbinning", 79 | cmdclass={'clean': CleanCommand}, 80 | python_requires='>=3.7', 81 | install_requires=install_requires, 82 | extras_require=extras_require, 83 | classifiers=[ 84 | 'Topic :: Scientific/Engineering :: Mathematics', 85 | 'Topic :: Software Development :: Libraries', 86 | 'Topic :: Software Development :: Libraries :: Python Modules', 87 | 'Intended Audience :: Developers', 88 | 'Intended Audience :: Education', 89 | 'Intended Audience :: Science/Research', 90 | 'License :: OSI Approved :: Apache Software License', 91 | 'Programming Language :: Python :: 
3', 92 | 'Programming Language :: Python :: 3.9', 93 | 'Programming Language :: Python :: 3.10', 94 | 'Programming Language :: Python :: 3.11', 95 | 'Programming Language :: Python :: 3.12', 96 | ] 97 | ) 98 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | coverage 2 | flake8 3 | pytest 4 | pyarrow 5 | pympler 6 | tdigest 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/breast_cancer.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/data/breast_cancer.parquet -------------------------------------------------------------------------------- /tests/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import load_boston 2 | 3 | 4 | __all__ = ['load_boston'] 5 | -------------------------------------------------------------------------------- /tests/datasets/datasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class Data: 6 | def __init__(self, data, target, feature_names): 7 | self.data = data 8 | self.target = target 9 | self.feature_names = feature_names 10 | 11 | 12 | def load_boston(): 13 | data_url = "http://lib.stat.cmu.edu/datasets/boston" 14 | raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None) 15 | raw_data = 
np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]) 16 | target = raw_df.values[1::2, 2] 17 | feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 18 | 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] 19 | 20 | return Data(raw_data, target, feature_names) 21 | -------------------------------------------------------------------------------- /tests/results/plot_auc_roc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/plot_auc_roc.png -------------------------------------------------------------------------------- /tests/results/plot_cap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/plot_cap.png -------------------------------------------------------------------------------- /tests/results/plot_ks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/plot_ks.png -------------------------------------------------------------------------------- /tests/results/psi_plot_binary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/psi_plot_binary.png -------------------------------------------------------------------------------- /tests/results/psi_plot_continuous.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/psi_plot_continuous.png 
-------------------------------------------------------------------------------- /tests/results/test_binning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning.png -------------------------------------------------------------------------------- /tests/results/test_binning_2d_event_rate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_2d_event_rate.png -------------------------------------------------------------------------------- /tests/results/test_binning_2d_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_2d_woe.png -------------------------------------------------------------------------------- /tests/results/test_binning_no_missing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_no_missing.png -------------------------------------------------------------------------------- /tests/results/test_binning_no_special.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_no_special.png -------------------------------------------------------------------------------- /tests/results/test_binning_process_information.txt: -------------------------------------------------------------------------------- 1 | 
optbinning (Version 0.14.0) 2 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 3 | 4 | Number of records : 569 5 | Number of variables : 30 6 | Time : 4.2282 sec 7 | 8 | optbinning (Version 0.14.0) 9 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 10 | 11 | Statistics 12 | Number of records 569 13 | Number of variables 30 14 | Target type binary 15 | 16 | Number of numerical 30 17 | Number of categorical 0 18 | Number of selected 30 19 | 20 | Time 4.2282 sec 21 | 22 | optbinning (Version 0.14.0) 23 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 24 | 25 | Begin options 26 | max_n_prebins 20 * d 27 | min_prebin_size 0.05 * d 28 | min_n_bins no * d 29 | max_n_bins no * d 30 | min_bin_size no * d 31 | max_bin_size no * d 32 | max_pvalue no * d 33 | max_pvalue_policy consecutive * d 34 | selection_criteria no * d 35 | fixed_variables no * d 36 | categorical_variables no * d 37 | special_codes no * d 38 | split_digits no * d 39 | binning_fit_params no * d 40 | binning_transform_params no * d 41 | verbose False * d 42 | End options 43 | 44 | Statistics 45 | Number of records 569 46 | Number of variables 30 47 | Target type binary 48 | 49 | Number of numerical 30 50 | Number of categorical 0 51 | Number of selected 30 52 | 53 | Time 4.2282 sec 54 | 55 | -------------------------------------------------------------------------------- /tests/results/test_binning_process_verbose.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_process_verbose.txt -------------------------------------------------------------------------------- /tests/results/test_continuous_binning.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning.png -------------------------------------------------------------------------------- /tests/results/test_continuous_binning_2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning_2d.png -------------------------------------------------------------------------------- /tests/results/test_continuous_binning_no_missing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning_no_missing.png -------------------------------------------------------------------------------- /tests/results/test_continuous_binning_no_special.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning_no_special.png -------------------------------------------------------------------------------- /tests/results/test_multiclass_binning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_multiclass_binning.png -------------------------------------------------------------------------------- /tests/results/test_multiclass_binning_no_missing.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_multiclass_binning_no_missing.png -------------------------------------------------------------------------------- /tests/results/test_multiclass_binning_no_special.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_multiclass_binning_no_special.png -------------------------------------------------------------------------------- /tests/results/test_scorecard_information.txt: -------------------------------------------------------------------------------- 1 | optbinning (Version 0.14.0) 2 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 3 | 4 | Number of records : 569 5 | Number of variables : 30 6 | Time : 4.5420 sec 7 | 8 | optbinning (Version 0.14.0) 9 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 10 | 11 | Statistics 12 | Number of records 569 13 | Number of variables 30 14 | Target type binary 15 | 16 | Number of numerical 30 17 | Number of categorical 0 18 | Number of selected 30 19 | 20 | Timing 21 | Total time 4.54 sec 22 | Binning process 4.18 sec ( 92.04%) 23 | Estimator 0.04 sec ( 0.94%) 24 | Build scorecard 0.32 sec ( 7.01%) 25 | rounding 0.00 sec ( 0.00%) 26 | 27 | optbinning (Version 0.14.0) 28 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 29 | 30 | Begin options 31 | binning_process yes * U 32 | estimator yes * U 33 | scaling_method no * d 34 | scaling_method_params no * d 35 | intercept_based False * d 36 | reverse_scorecard False * d 37 | rounding False * d 38 | verbose False * d 39 | End options 40 | 41 | Statistics 42 | Number of records 569 43 | Number of variables 30 44 | Target type binary 45 | 46 | Number of numerical 30 47 | Number of categorical 0 48 | Number of selected 30 49 | 50 
| Timing 51 | Total time 4.54 sec 52 | Binning process 4.18 sec ( 92.04%) 53 | Estimator 0.04 sec ( 0.94%) 54 | Build scorecard 0.32 sec ( 7.01%) 55 | rounding 0.00 sec ( 0.00%) 56 | 57 | -------------------------------------------------------------------------------- /tests/results/test_scorecard_monitoring_default.txt: -------------------------------------------------------------------------------- 1 | ----------------------------------- 2 | Monitoring: System Stability Report 3 | ----------------------------------- 4 | 5 | Population Stability Index (PSI) 6 | 7 | 8 | PSI total: 0.0018 (No significant change) 9 | 10 | PSI bin Count Count (%) 11 | [0.00, 0.10) 3 1.0 12 | [0.10, 0.25) 0 0.0 13 | [0.25, Inf+) 0 0.0 14 | 15 | Significance tests (H0: actual == expected) 16 | 17 | p-value bin Count Count (%) 18 | [0.00, 0.05) 1 0.333333 19 | [0.05, 0.10) 0 0.000000 20 | [0.10, 0.50) 1 0.333333 21 | [0.50, 1.00) 1 0.333333 22 | 23 | Target analysis 24 | 25 | Metric Actual Actual (%) Expected Expected (%) 26 | Number of records 171 - 398 - 27 | Event records 108 0.631579 249 0.625628 28 | Non-event records 63 0.368421 149 0.374372 29 | 30 | Performance metrics 31 | 32 | Metric Actual Expected Diff A - E 33 | True positive rate 0.990741 1.000000 -0.009259 34 | True negative rate 0.968254 0.979866 -0.011612 35 | False positive rate 0.031746 0.020134 0.011612 36 | False negative rate 0.009259 0.000000 0.009259 37 | Balanced accuracy 0.979497 0.989933 -0.010436 38 | Discriminant power 4.460557 inf -inf 39 | Gini 0.986185 0.999838 -0.013654 40 | 41 | -------------------------------------------------------------------------------- /tests/results/test_scorecard_monitoring_default_continuous.txt: -------------------------------------------------------------------------------- 1 | ----------------------------------- 2 | Monitoring: System Stability Report 3 | ----------------------------------- 4 | 5 | Population Stability Index (PSI) 6 | 7 | 8 | PSI total: 0.1630 (Requires 
investigation) 9 | 10 | PSI bin Count Count (%) 11 | [0.00, 0.10) 14 0.933333 12 | [0.10, 0.25) 1 0.066667 13 | [0.25, Inf+) 0 0.000000 14 | 15 | Significance tests (H0: actual == expected) 16 | 17 | p-value bin Count Count (%) 18 | [0.00, 0.05) 1 0.066667 19 | [0.05, 0.10) 0 0.000000 20 | [0.10, 0.50) 5 0.333333 21 | [0.50, 1.00) 9 0.600000 22 | 23 | Target analysis 24 | 25 | Metric Actual Expected 26 | Mean 21.407895 23.015819 27 | Std 8.632097 9.375315 28 | p25 16.325000 17.400000 29 | Median 20.000000 21.750000 30 | p75 24.125000 26.600000 31 | 32 | Performance metrics 33 | 34 | Metric Actual Expected Diff A - E 35 | Mean absolute error 2.482286 2.546775 -0.064488 36 | Mean squared error 12.583966 12.187764 0.396202 37 | Median absolute error 2.059913 1.947342 0.112571 38 | Explained variance 0.831908 0.861340 -0.029432 39 | R^2 0.831117 0.861340 -0.030222 40 | MPE -0.032197 -0.024922 -0.007275 41 | MAPE 0.125897 0.125992 -0.000095 42 | SMAPE 0.061339 0.060410 0.000929 43 | MdAPE 0.097021 0.091783 0.005238 44 | SMdAPE 0.049889 0.046868 0.003021 45 | 46 | -------------------------------------------------------------------------------- /tests/results/test_scorecard_monitoring_information.txt: -------------------------------------------------------------------------------- 1 | optbinning (Version 0.14.0) 2 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 3 | 4 | Number of records A : 152 5 | Number of records E : 354 6 | Number of variables : 13 7 | Time : 0.1124 sec 8 | 9 | optbinning (Version 0.14.0) 10 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 11 | 12 | Statistics 13 | Number of records Actual 152 14 | Number of records Expected 354 15 | Number of scorecard variables 13 16 | Target type continuous 17 | 18 | Timing 19 | Total time 0.11 sec 20 | System stability 0.07 sec ( 60.67%) 21 | Variables stability 0.04 sec ( 38.99%) 22 | 23 | optbinning (Version 0.14.0) 24 | Copyright (c) 2019-2022 Guillermo 
Navas-Palencia, Apache License 2.0 25 | 26 | Begin options 27 | scorecard yes * U 28 | psi_method cart * d 29 | psi_n_bins 20 * d 30 | psi_min_bin_size 0.05 * d 31 | show_digits 2 * d 32 | verbose False * d 33 | End options 34 | 35 | Statistics 36 | Number of records Actual 152 37 | Number of records Expected 354 38 | Number of scorecard variables 13 39 | Target type continuous 40 | 41 | Timing 42 | Total time 0.11 sec 43 | System stability 0.07 sec ( 60.67%) 44 | Variables stability 0.04 sec ( 38.99%) 45 | 46 | -------------------------------------------------------------------------------- /tests/results/test_scorecard_monitoring_verbose.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_scorecard_monitoring_verbose.txt -------------------------------------------------------------------------------- /tests/results/test_scorecard_verbose.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_scorecard_verbose.txt -------------------------------------------------------------------------------- /tests/test_binning_piecewise.py: -------------------------------------------------------------------------------- 1 | """ 2 | OptimalPWBinning testing. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2022 7 | 8 | import pandas as pd 9 | 10 | from pytest import approx, raises 11 | 12 | from optbinning import OptimalPWBinning 13 | from sklearn.datasets import load_breast_cancer 14 | from sklearn.exceptions import NotFittedError 15 | 16 | 17 | data = load_breast_cancer() 18 | df = pd.DataFrame(data.data, columns=data.feature_names) 19 | 20 | variable = "mean radius" 21 | x = df[variable].values 22 | y = data.target 23 | 24 | 25 | def test_params(): 26 | with raises(TypeError): 27 | optb = OptimalPWBinning(name=1) 28 | optb.fit(x, y) 29 | 30 | with raises(TypeError): 31 | optb = OptimalPWBinning(estimator=2) 32 | optb.fit(x, y) 33 | 34 | with raises(ValueError): 35 | optb = OptimalPWBinning(objective="new") 36 | optb.fit(x, y) 37 | 38 | with raises(ValueError): 39 | optb = OptimalPWBinning(degree=0.2) 40 | optb.fit(x, y) 41 | 42 | with raises(TypeError): 43 | optb = OptimalPWBinning(continuous=1) 44 | optb.fit(x, y) 45 | 46 | with raises(ValueError): 47 | optb = OptimalPWBinning(prebinning_method="new") 48 | optb.fit(x, y) 49 | 50 | with raises(ValueError): 51 | optb = OptimalPWBinning(min_prebin_size=0.9) 52 | optb.fit(x, y) 53 | 54 | with raises(ValueError): 55 | optb = OptimalPWBinning(min_n_bins=1.2) 56 | optb.fit(x, y) 57 | 58 | with raises(ValueError): 59 | optb = OptimalPWBinning(max_n_bins=1.2) 60 | optb.fit(x, y) 61 | 62 | with raises(ValueError): 63 | optb = OptimalPWBinning(min_n_bins=10, max_n_bins=5) 64 | optb.fit(x, y) 65 | 66 | with raises(ValueError): 67 | optb = OptimalPWBinning(min_bin_size=0.6) 68 | optb.fit(x, y) 69 | 70 | with raises(ValueError): 71 | optb = OptimalPWBinning(max_bin_size=1.1) 72 | optb.fit(x, y) 73 | 74 | with raises(ValueError): 75 | optb = OptimalPWBinning(min_bin_size=0.3, max_bin_size=0.2) 76 | optb.fit(x, y) 77 | 78 | with raises(ValueError): 79 | optb = OptimalPWBinning(monotonic_trend="new") 80 | optb.fit(x, y) 81 | 82 | with raises(ValueError): 83 | optb = 
OptimalPWBinning(monotonic_trend="convex", degree=2) 84 | optb.fit(x, y) 85 | 86 | with raises(ValueError): 87 | optb = OptimalPWBinning(n_subsamples=1001.2) 88 | optb.fit(x, y) 89 | 90 | with raises(ValueError): 91 | optb = OptimalPWBinning(max_pvalue=1.1) 92 | optb.fit(x, y) 93 | 94 | with raises(ValueError): 95 | optb = OptimalPWBinning(max_pvalue_policy="new_policy") 96 | optb.fit(x, y) 97 | 98 | with raises(ValueError): 99 | optb = OptimalPWBinning(outlier_detector="new_method") 100 | optb.fit(x, y) 101 | 102 | with raises(TypeError): 103 | optb = OptimalPWBinning(outlier_detector="range", 104 | outlier_params="pass") 105 | optb.fit(x, y) 106 | 107 | with raises(TypeError): 108 | optb = OptimalPWBinning(user_splits={"a": [1, 2]}) 109 | optb.fit(x, y) 110 | 111 | with raises(ValueError): 112 | optb = OptimalPWBinning(user_splits=None, 113 | user_splits_fixed=[True, True]) 114 | optb.fit(x, y) 115 | 116 | with raises(TypeError): 117 | optb = OptimalPWBinning(user_splits=[1, 2], 118 | user_splits_fixed=(True, True)) 119 | optb.fit(x, y) 120 | 121 | with raises(ValueError): 122 | optb = OptimalPWBinning(user_splits=[1, 2], 123 | user_splits_fixed=[True, 1]) 124 | optb.fit(x, y) 125 | 126 | with raises(ValueError): 127 | optb = OptimalPWBinning(user_splits=[1, 2], 128 | user_splits_fixed=[True]) 129 | optb.fit(x, y) 130 | 131 | with raises(TypeError): 132 | optb = OptimalPWBinning(special_codes={1, 2, 3}) 133 | optb.fit(x, y) 134 | 135 | with raises(ValueError): 136 | optb = OptimalPWBinning(split_digits=9) 137 | optb.fit(x, y) 138 | 139 | with raises(ValueError): 140 | optb = OptimalPWBinning(solver=None) 141 | optb.fit(x, y) 142 | 143 | with raises(ValueError): 144 | optb = OptimalPWBinning(h_epsilon=0.9) 145 | optb.fit(x, y) 146 | 147 | with raises(ValueError): 148 | optb = OptimalPWBinning(quantile=0) 149 | optb.fit(x, y) 150 | 151 | with raises(ValueError): 152 | optb = OptimalPWBinning(regularization='l3') 153 | optb.fit(x, y) 154 | 155 | with 
raises(ValueError): 156 | optb = OptimalPWBinning(reg_l1=-0.5) 157 | optb.fit(x, y) 158 | 159 | with raises(ValueError): 160 | optb = OptimalPWBinning(reg_l2=-0.5) 161 | optb.fit(x, y) 162 | 163 | with raises(TypeError): 164 | optb = OptimalPWBinning(random_state='None') 165 | optb.fit(x, y) 166 | 167 | with raises(TypeError): 168 | optb = OptimalPWBinning(verbose=1) 169 | optb.fit(x, y) 170 | 171 | 172 | def test_default(): 173 | optb = OptimalPWBinning(name=variable) 174 | optb.fit(x, y) 175 | 176 | optb.binning_table.build() 177 | assert optb.binning_table.iv == approx(5.87474602, rel=1e-6) 178 | 179 | with raises(ValueError): 180 | optb.binning_table.plot(metric="new_metric") 181 | 182 | optb.binning_table.plot( 183 | metric="woe", savefig="tests/results/test_binning_piecewise.png") 184 | 185 | 186 | def test_default_discontinuous(): 187 | optb = OptimalPWBinning(name=variable, continuous=False) 188 | optb.fit(x, y) 189 | 190 | optb.binning_table.build() 191 | assert optb.binning_table.iv == approx(5.84465825, rel=1e-6) 192 | 193 | 194 | def test_bounds_transform(): 195 | optb = OptimalPWBinning(name=variable) 196 | optb.fit(x, y, lb=0.001, ub=0.999) 197 | 198 | x_transform_woe = optb.transform(x, metric="woe") 199 | assert x_transform_woe[:4] == approx( 200 | [3.99180564, 4.28245092, 4.17407503, -3.2565373], rel=1e-6) 201 | 202 | x_transform_event_rate = optb.transform(x, metric="event_rate") 203 | assert x_transform_event_rate[:4] == approx( 204 | [0.03015878, 0.02272502, 0.02526056, 0.97763604], rel=1e-6) 205 | 206 | 207 | def test_bounds_fit_transform(): 208 | optb = OptimalPWBinning(name=variable) 209 | 210 | x_transform_woe = optb.fit_transform( 211 | x, y, lb=0.001, ub=0.999, metric="woe") 212 | 213 | assert x_transform_woe[:4] == approx( 214 | [3.9918056, 4.2824509, 4.17407503, -3.25653732], rel=1e-6) 215 | x_transform_event_rate = optb.fit_transform( 216 | x, y, lb=0.001, ub=0.999, metric="event_rate") 217 | assert x_transform_event_rate[:4] == approx( 
218 | [0.03015878, 0.02272502, 0.02526056, 0.97763604], rel=1e-6) 219 | 220 | 221 | def test_solvers(): 222 | for solver in ("auto", "ecos", "osqp"): 223 | optb = OptimalPWBinning(name=variable, solver=solver) 224 | optb.fit(x, y) 225 | 226 | optb.binning_table.build() 227 | assert optb.binning_table.iv == approx(5.87474602, rel=1e-6) 228 | 229 | 230 | def test_user_splits(): 231 | variable = "mean texture" 232 | x = df[variable].values 233 | 234 | user_splits = [14, 15, 16, 17, 20, 21, 22, 27] 235 | user_splits_fixed = [False, True, True, False, False, False, False, False] 236 | 237 | optb = OptimalPWBinning(name=variable, user_splits=user_splits, 238 | user_splits_fixed=user_splits_fixed) 239 | 240 | optb.fit(x, y) 241 | 242 | 243 | def test_information(): 244 | optb = OptimalPWBinning() 245 | 246 | with raises(NotFittedError): 247 | optb.information() 248 | 249 | optb.fit(x, y) 250 | 251 | with raises(ValueError): 252 | optb.information(print_level=-1) 253 | 254 | optb.information(print_level=0) 255 | optb.information(print_level=1) 256 | optb.information(print_level=2) 257 | 258 | optb = OptimalPWBinning() 259 | optb.fit(x, y) 260 | optb.information(print_level=2) 261 | 262 | 263 | def test_verbose(): 264 | optb = OptimalPWBinning(verbose=True) 265 | optb.fit(x, y) 266 | 267 | assert optb.status == "OPTIMAL" 268 | -------------------------------------------------------------------------------- /tests/test_binning_process_sketch.py: -------------------------------------------------------------------------------- 1 | """ 2 | BinningProcessSketch testing. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import pandas as pd 9 | 10 | from pytest import approx, raises 11 | 12 | from optbinning import BinningProcessSketch 13 | from optbinning import OptimalBinningSketch 14 | from optbinning.exceptions import NotSolvedError 15 | from optbinning.exceptions import NotDataAddedError 16 | from sklearn.datasets import load_breast_cancer 17 | 18 | data = load_breast_cancer() 19 | variable_names = data.feature_names 20 | df = pd.DataFrame(data.data, columns=variable_names) 21 | y = data.target 22 | 23 | 24 | def test_params(): 25 | with raises(TypeError): 26 | BinningProcessSketch(variable_names=1) 27 | 28 | with raises(ValueError): 29 | BinningProcessSketch(variable_names=[], max_n_prebins=-2) 30 | 31 | with raises(ValueError): 32 | BinningProcessSketch(variable_names=[], min_n_bins=-2) 33 | 34 | with raises(ValueError): 35 | BinningProcessSketch(variable_names=[], max_n_bins=-2.2) 36 | 37 | with raises(ValueError): 38 | BinningProcessSketch(variable_names=[], min_n_bins=3, max_n_bins=2) 39 | 40 | with raises(ValueError): 41 | BinningProcessSketch(variable_names=[], min_bin_size=0.6) 42 | 43 | with raises(ValueError): 44 | BinningProcessSketch(variable_names=[], max_bin_size=-0.6) 45 | 46 | with raises(ValueError): 47 | BinningProcessSketch(variable_names=[], min_bin_size=0.5, 48 | max_bin_size=0.3) 49 | 50 | with raises(ValueError): 51 | BinningProcessSketch(variable_names=[], max_pvalue=1.1) 52 | 53 | with raises(ValueError): 54 | BinningProcessSketch(variable_names=[], max_pvalue_policy="new_policy") 55 | 56 | with raises(TypeError): 57 | BinningProcessSketch(variable_names=[], selection_criteria=[]) 58 | 59 | with raises(TypeError): 60 | BinningProcessSketch(variable_names=[], categorical_variables={}) 61 | 62 | with raises(TypeError): 63 | BinningProcessSketch(variable_names=[], categorical_variables=[1, 2]) 64 | 65 | with raises(TypeError): 66 | BinningProcessSketch(variable_names=[], 
special_codes={1, 2, 3}) 67 | 68 | with raises(ValueError): 69 | BinningProcessSketch(variable_names=[], split_digits=9) 70 | 71 | with raises(TypeError): 72 | BinningProcessSketch(variable_names=[], binning_fit_params=[1, 2]) 73 | 74 | with raises(TypeError): 75 | BinningProcessSketch(variable_names=[], 76 | binning_transform_params=[1, 2]) 77 | 78 | with raises(TypeError): 79 | BinningProcessSketch(variable_names=[], verbose=1) 80 | 81 | 82 | def test_default(): 83 | bpsketch = BinningProcessSketch(variable_names) 84 | bpsketch.add(df, y) 85 | bpsketch.solve() 86 | 87 | optb = bpsketch.get_binned_variable("mean radius") 88 | 89 | assert optb.status == "OPTIMAL" 90 | 91 | optb.binning_table.build() 92 | assert optb.binning_table.iv == approx(5.04392547, rel=1e-2) 93 | 94 | 95 | def test_default_merge(): 96 | bpsketch_1 = BinningProcessSketch(variable_names) 97 | bpsketch_2 = BinningProcessSketch(variable_names) 98 | 99 | df_1, y_1 = df.iloc[:200, :], y[:200] 100 | df_2, y_2 = df.iloc[200:, :], y[200:] 101 | 102 | bpsketch_1.add(df_1, y_1) 103 | bpsketch_2.add(df_2, y_2) 104 | bpsketch_1.merge(bpsketch_2) 105 | 106 | bpsketch_1.solve() 107 | 108 | optb = bpsketch_1.get_binned_variable("mean radius") 109 | 110 | assert optb.status == "OPTIMAL" 111 | 112 | optb.binning_table.build() 113 | assert optb.binning_table.iv == approx(5.04392547, rel=1e-2) 114 | 115 | 116 | def test_default_tdigest_merge(): 117 | binning_fit_params = {v: {"sketch": "t-digest"} for v in variable_names} 118 | 119 | bpsketch_1 = BinningProcessSketch(variable_names, 120 | binning_fit_params=binning_fit_params) 121 | bpsketch_2 = BinningProcessSketch(variable_names, 122 | binning_fit_params=binning_fit_params) 123 | 124 | df_1, y_1 = df.iloc[:200, :], y[:200] 125 | df_2, y_2 = df.iloc[200:, :], y[200:] 126 | 127 | bpsketch_1.add(df_1, y_1) 128 | bpsketch_2.add(df_2, y_2) 129 | bpsketch_1.merge(bpsketch_2) 130 | 131 | bpsketch_1.solve() 132 | 133 | optb = bpsketch_1.get_binned_variable("mean 
radius") 134 | 135 | assert optb.status == "OPTIMAL" 136 | 137 | optb.binning_table.build() 138 | assert optb.binning_table.iv == approx(5.04392547, rel=1e-2) 139 | 140 | 141 | def test_default_transform(): 142 | bpsketch = BinningProcessSketch(variable_names) 143 | bpsketch.add(df, y) 144 | 145 | with raises(NotSolvedError): 146 | bpsketch.transform(df, metric="woe") 147 | 148 | bpsketch.solve() 149 | 150 | with raises(TypeError): 151 | X_transform = bpsketch.transform(df.values, metric="woe") 152 | 153 | with raises(ValueError): 154 | X_transform = bpsketch.transform(df, metric="new_woe") 155 | 156 | X_transform = bpsketch.transform(df) 157 | 158 | optb = OptimalBinningSketch() 159 | x = df["mean radius"] 160 | optb.add(x, y) 161 | optb.solve() 162 | 163 | assert optb.transform(x, metric="woe") == approx( 164 | X_transform["mean radius"], rel=1e-6) 165 | 166 | 167 | def test_information(): 168 | bpsketch = BinningProcessSketch(variable_names) 169 | 170 | with raises(NotDataAddedError): 171 | bpsketch.solve() 172 | 173 | bpsketch.add(df, y) 174 | 175 | with raises(NotSolvedError): 176 | bpsketch.information() 177 | 178 | bpsketch.solve() 179 | 180 | with raises(ValueError): 181 | bpsketch.information(print_level=-1) 182 | 183 | bpsketch.information(print_level=0) 184 | bpsketch.information(print_level=1) 185 | bpsketch.information(print_level=2) 186 | -------------------------------------------------------------------------------- /tests/test_continuous_binning_piecewise.py: -------------------------------------------------------------------------------- 1 | """ 2 | ContinuousOptimalPWBinning testing. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2022 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from pytest import approx 12 | 13 | from optbinning import ContinuousOptimalPWBinning 14 | from tests.datasets import load_boston 15 | 16 | data = load_boston() 17 | df = pd.DataFrame(data.data, columns=data.feature_names) 18 | 19 | variable = "LSTAT" 20 | x = df[variable].values 21 | y = data.target 22 | 23 | 24 | def test_default(): 25 | optb = ContinuousOptimalPWBinning(name=variable) 26 | optb.fit(x, y) 27 | 28 | optb.binning_table.build() 29 | optb.binning_table.plot( 30 | savefig="tests/results/test_continuous_binning_piecewise.png") 31 | 32 | 33 | def test_transform(): 34 | optb = ContinuousOptimalPWBinning(name=variable) 35 | optb.fit(x, y) 36 | 37 | x_transform = optb.transform(x) 38 | assert x_transform[:3] == approx( 39 | [31.46014643, 23.87619986, 37.31237732], rel=1e-6) 40 | 41 | 42 | def test_fit_transform(): 43 | optb = ContinuousOptimalPWBinning(name=variable) 44 | 45 | x_transform = optb.fit_transform(x, y) 46 | assert x_transform[:3] == approx( 47 | [31.46014643, 23.87619986, 37.31237732], rel=1e-6) 48 | 49 | 50 | def test_special_codes(): 51 | variable = "INDUS" 52 | x = df[variable].values 53 | 54 | x[:50] = -9 55 | x[50:100] = -8 56 | special_codes = {'special_-9': -9, 'special_-8': -8} 57 | 58 | optb = ContinuousOptimalPWBinning( 59 | name=variable, monotonic_trend="convex", special_codes=special_codes) 60 | optb.fit(x, y) 61 | 62 | x_transform = optb.transform([-9, -8], metric_special=1000) 63 | assert x_transform == approx([1000, 1000], rel=1e-6) 64 | 65 | x_transform = optb.transform([-9, -8], metric_special='empirical') 66 | assert x_transform == approx([20.502000, 24.116000], rel=1e-6) 67 | 68 | optb = ContinuousOptimalPWBinning( 69 | name=variable, monotonic_trend="convex", special_codes=[-9, -8]) 70 | optb.fit(x, y) 71 | 72 | x_transform = optb.transform([-9, -8], metric_special=1000) 73 | assert x_transform 
== approx([1000, 1000], rel=1e-6) 74 | 75 | x_transform = optb.transform([-9, -8], metric_special='empirical') 76 | assert x_transform == approx([22.309, 22.309], rel=1e-6) 77 | 78 | x[45:50] = np.nan 79 | optb = ContinuousOptimalPWBinning( 80 | name=variable, monotonic_trend="convex", special_codes=special_codes) 81 | optb.fit(x, y) 82 | 83 | x_transform = optb.transform([np.nan], metric_missing='empirical') 84 | assert x_transform == approx([17.94], rel=1e-6) 85 | 86 | 87 | def test_verbose(): 88 | optb = ContinuousOptimalPWBinning(verbose=True) 89 | optb.fit(x, y) 90 | 91 | assert optb.status == "OPTIMAL" 92 | -------------------------------------------------------------------------------- /tests/test_mdlp.py: -------------------------------------------------------------------------------- 1 | """ 2 | MDLP testing. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from pytest import approx, raises 12 | 13 | from optbinning import MDLP 14 | from sklearn.datasets import load_breast_cancer 15 | from sklearn.exceptions import NotFittedError 16 | 17 | 18 | data = load_breast_cancer() 19 | df = pd.DataFrame(data.data, columns=data.feature_names) 20 | 21 | variable = "mean radius" 22 | x = df[variable].values 23 | y = data.target 24 | 25 | 26 | def test_params(): 27 | with raises(ValueError): 28 | mdlp = MDLP(min_samples_split=-1) 29 | mdlp.fit(x, y) 30 | 31 | with raises(ValueError): 32 | mdlp = MDLP(min_samples_leaf=-1) 33 | mdlp.fit(x, y) 34 | 35 | with raises(ValueError): 36 | mdlp = MDLP(max_candidates=-1) 37 | mdlp.fit(x, y) 38 | 39 | 40 | # def test_numerical_default(): 41 | # mdlp = MDLP() 42 | # mdlp.fit(x, y) 43 | 44 | # assert mdlp.splits == approx([10.945, 13.08729032, 15.00163870, 45 | # 15.10030322, 16.925, 17.88], rel=1e-6) 46 | 47 | 48 | # def test_numerical_practical(): 49 | # min_samples_leaf = int(np.ceil(len(x) * 0.05)) 50 | # mdlp = MDLP(max_candidates=128, 
#                 min_samples_leaf=min_samples_leaf)
#     mdlp.fit(x, y)

#     assert mdlp.splits == approx([10.945, 12.995, 13.71, 15.045, 16.325,
#                                   17.88], rel=1e-6)


def test_splits():
    # Accessing ``splits`` before ``fit`` must raise NotFittedError.
    mdlp = MDLP()

    with raises(NotFittedError):
        mdlp.splits
--------------------------------------------------------------------------------
/tests/test_multiclass_binning.py:
--------------------------------------------------------------------------------
"""
MulticlassOptimalBinning testing.
"""

# Guillermo Navas-Palencia
# Copyright (C) 2020

import pandas as pd

from pytest import approx, raises

from optbinning import MulticlassOptimalBinning
from sklearn.datasets import load_wine
from sklearn.exceptions import NotFittedError


# Shared fixtures: wine dataset, "ash" as the binning variable, the
# three wine classes as the multiclass target.
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)

variable = "ash"
x = df[variable].values
y = data.target


def test_params():
    # Each invalid constructor parameter must raise the documented
    # exception type (TypeError for wrong types, ValueError for
    # out-of-range or inconsistent values) when fit is called.
    with raises(TypeError):
        optb = MulticlassOptimalBinning(name=1)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(prebinning_method="new_method")
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(solver="new_solver")
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(max_n_prebins=-2)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(min_prebin_size=0.6)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(min_n_bins=-2)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(max_n_bins=-2.2)
        optb.fit(x, y)

    # min_n_bins must not exceed max_n_bins.
    with raises(ValueError):
        optb = MulticlassOptimalBinning(min_n_bins=3, max_n_bins=2)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(min_bin_size=0.6)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(max_bin_size=-0.6)
        optb.fit(x, y)

    # min_bin_size must not exceed max_bin_size.
    with raises(ValueError):
        optb = MulticlassOptimalBinning(min_bin_size=0.5, max_bin_size=0.3)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(monotonic_trend=["new_trend", "auto"])
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(monotonic_trend="new_trend")
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(max_pvalue=1.1)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(max_pvalue_policy="new_policy")
        optb.fit(x, y)

    with raises(TypeError):
        optb = MulticlassOptimalBinning(user_splits={"a": [1, 2]})
        optb.fit(x, y)

    with raises(TypeError):
        optb = MulticlassOptimalBinning(special_codes={1, 2, 3})
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(split_digits=9)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(mip_solver="new_solver")
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(time_limit=-2)
        optb.fit(x, y)

    with raises(TypeError):
        optb = MulticlassOptimalBinning(verbose=1)
        optb.fit(x, y)


def test_numerical_default():
    # Default fit: pin the optimal splits, the binning table metrics and
    # exercise all plot variants (full / no special / no missing).
    optb = MulticlassOptimalBinning()
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
    assert optb.splits == approx([2.1450001, 2.245, 2.31499994, 2.6049999,
                                  2.6450001], rel=1e-6)

    optb.binning_table.build()
    optb.binning_table.analysis()
    assert optb.binning_table.js == approx(0.10989515, rel=1e-6)
    assert optb.binning_table.quality_score == approx(0.05279822, rel=1e-6)
    optb.binning_table.plot(
        savefig="tests/results/test_multiclass_binning.png")
    optb.binning_table.plot(
        add_special=False,
        savefig="tests/results/test_multiclass_binning_no_special.png")
    optb.binning_table.plot(
        add_missing=False,
        savefig="tests/results/test_multiclass_binning_no_missing.png")


def test_numerical_default_solvers():
    # Both solver backends (MIP with BOP, and CP) must reach the same
    # optimal solution.
    optb_mip_bop = MulticlassOptimalBinning(solver="mip", mip_solver="bop")
    optb_mip_bop.fit(x, y)

    optb_cp = MulticlassOptimalBinning(solver="cp")
    optb_cp.fit(x, y)

    for optb in [optb_mip_bop, optb_cp]:
        assert optb.status == "OPTIMAL"
        assert optb.splits == approx([2.1450001, 2.245, 2.31499994, 2.6049999,
                                      2.6450001], rel=1e-6)


def test_numerical_user_splits_fixed():
    # user_splits_fixed validation: requires user_splits, a list of
    # booleans of matching length; then fixed splits must survive the
    # optimization.
    user_splits = [2.1, 2.2, 2.3, 2.6, 2.9]

    # user_splits_fixed without user_splits is invalid.
    with raises(ValueError):
        user_splits_fixed = [False, False, False, True, False]
        optb = MulticlassOptimalBinning(user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    # Must be a list, not a tuple.
    with raises(TypeError):
        user_splits_fixed = (False, False, False, True, False)
        optb = MulticlassOptimalBinning(user_splits=user_splits,
                                        user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    # Elements must be booleans, not ints.
    with raises(ValueError):
        user_splits_fixed = [0, 0, 0, 1, 0]
        optb = MulticlassOptimalBinning(user_splits=user_splits,
                                        user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    # Length must match user_splits.
    with raises(ValueError):
        user_splits_fixed = [False, False, False, False]
        optb = MulticlassOptimalBinning(user_splits=user_splits,
                                        user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    user_splits_fixed = [False, False, False, True, True]

    with raises(ValueError):
        # pure pre-bins
        optb = MulticlassOptimalBinning(user_splits=user_splits,
                                        user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    # With a feasible fixed split (2.7), it must appear in the solution.
    user_splits = [2.1, 2.2, 2.3, 2.6, 2.7]
    optb = MulticlassOptimalBinning(user_splits=user_splits,
                                    user_splits_fixed=user_splits_fixed)
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
    assert 2.7 in optb.splits


def test_numerical_user_splits_non_unique():
    # Duplicate user splits are rejected at fit time.
    user_splits = [2.1, 2.2, 2.2, 2.6, 2.9]
    optb = MulticlassOptimalBinning(user_splits=user_splits)

    with raises(ValueError):
        optb.fit(x, y)


def test_numerical_default_transform():
    # transform before fit must raise NotFittedError; after fit, pin the
    # mean-WoE transformation of a few representative values.
    optb = MulticlassOptimalBinning()
    with raises(NotFittedError):
        x_transform = optb.transform(x)

    optb.fit(x, y)

    x_transform = optb.transform([0.3, 2.1, 2.5, 3], metric="mean_woe")
    assert x_transform == approx([0.48973998, 0.48973998, -0.00074357,
                                  0.02189459], rel=1e-5)


def test_numerical_default_fit_transform():
    # fit_transform must be equivalent to fit followed by transform.
    optb = MulticlassOptimalBinning()

    x_transform = optb.fit_transform(x, y, metric="mean_woe")
    assert x_transform[:5] == approx([-0.00074357, 0.48973998, 0.02189459,
                                      -0.00074357, 0.02189459], rel=1e-5)


def test_classes():
    # The three wine classes must be detected.
    optb = MulticlassOptimalBinning()
    optb.fit(x, y)

    assert optb.classes == approx([0, 1, 2])


def test_verbose():
    # Verbose mode must not affect the optimization outcome.
    optb = MulticlassOptimalBinning(verbose=True)
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
--------------------------------------------------------------------------------
/tests/test_outlier.py:
--------------------------------------------------------------------------------
"""
Outlier classes testing.
"""

# Guillermo Navas-Palencia
# Copyright (C) 2022

import numpy as np
import pandas as pd

from pytest import approx, raises

from optbinning.binning.outlier import ModifiedZScoreDetector
from optbinning.binning.outlier import RangeDetector
from optbinning.binning.outlier import YQuantileDetector
from tests.datasets import load_boston

# Shared fixtures: Boston housing data, LSTAT as x and house price as y.
data = load_boston()
df = pd.DataFrame(data.data, columns=data.feature_names)

variable = "LSTAT"
x = df[variable].values
y = data.target


def test_range_params():
    # Invalid RangeDetector parameters must raise ValueError on fit.
    with raises(ValueError):
        detector = RangeDetector(method="new")
        detector.fit(x)

    # interval_length must be a valid probability (<= 1).
    with raises(ValueError):
        detector = RangeDetector(interval_length=1.5)
        detector.fit(x)


def test_zscore_params():
    # Negative threshold is invalid.
    with raises(ValueError):
        detector = ModifiedZScoreDetector(threshold=-1.5)
        detector.fit(x)


def test_yquantile_params():
    # Invalid YQuantileDetector parameters must raise on fit.
    with raises(ValueError):
        detector = YQuantileDetector(outlier_detector="new")
        detector.fit(x, y)

    # outlier_params must be a dict.
    with raises(TypeError):
        detector = YQuantileDetector(outlier_params=[])
        detector.fit(x, y)

    with raises(ValueError):
        detector = YQuantileDetector(n_bins=-1)
        detector.fit(x, y)

    # 'threshold' is not a valid parameter for the range detector.
    with raises(ValueError):
        detector = YQuantileDetector(
            outlier_detector="range",
            outlier_params={"threshold": 3.7})

        detector.fit(x, y)


def test_range_default():
    # Pin the number of outliers flagged by each interval method.
    detector = RangeDetector(method="ETI")
    detector.fit(x)
    assert np.count_nonzero(detector.get_support()) == 7

    detector = RangeDetector(method="HDI")
    detector.fit(x)
    assert np.count_nonzero(detector.get_support()) == 31


def test_zscore_default():
    # Default modified z-score flags exactly two LSTAT values.
    detector = ModifiedZScoreDetector()
    detector.fit(x)

    mask = detector.get_support()
    assert np.count_nonzero(mask) == 2

    assert x[mask] == approx([37.97, 36.98])


def test_yquantile_default():
    # Pin both the x and y values of the flagged outliers.
    detector = YQuantileDetector()
    detector.fit(x, y)
    mask = detector.get_support()

    assert x[mask] == approx(
        [7.56, 9.59, 7.26, 11.25, 14.79, 7.44, 9.53, 8.88])

    assert y[mask] == approx([39.8, 33.8, 43.1, 31, 30.7, 50, 50, 50])


def test_yquantile_outlier_params():
    # Custom inner detector parameters are forwarded to the range
    # detector.
    detector = YQuantileDetector(n_bins=10, outlier_detector="range",
                                 outlier_params={'method': 'HDI'})

    detector.fit(x, y)
    assert np.count_nonzero(detector.get_support()) == 39
--------------------------------------------------------------------------------
/tests/test_scorecard_plots.py:
--------------------------------------------------------------------------------
"""
Scorecard plots testing.
"""

# Guillermo Navas-Palencia
# Copyright (C) 2020

import numpy as np

from pytest import raises

from optbinning.scorecard import plot_auc_roc
from optbinning.scorecard import plot_cap
from optbinning.scorecard import plot_ks


# Small hand-crafted binary target and predicted probabilities.
y = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0])
y_pred = np.array([0.2, 0.1, 0.6, 0.3, 0.7, 0.2, 0.8, 0.1, 0.9, 0.7, 0.3])


def test_params():
    # All three plot functions share the same parameter validation.
    for plot in (plot_auc_roc, plot_cap, plot_ks):
        # Mismatched y / y_pred lengths.
        with raises(ValueError):
            y_pred_wrong = y_pred[:-1]
            plot(y, y_pred_wrong)

        with raises(TypeError):
            plot(y, y_pred, title=1)

        with raises(TypeError):
            plot(y, y_pred, xlabel=1)

        with raises(TypeError):
            plot(y, y_pred, ylabel=1)

        with raises(TypeError):
            plot(y, y_pred, savefig=1)

        with raises(TypeError):
            plot(y, y_pred, fname=1)

        # savefig=True requires an fname.
        with raises(ValueError):
            plot(y, y_pred, savefig=True, fname=None)


def test_savefig():
    # Each plot must save a figure named after the plot function.
    for plot in (plot_auc_roc, plot_cap, plot_ks):
        plot(y, y_pred, savefig=True,
             fname="tests/results/{}.png".format(plot.__name__))
--------------------------------------------------------------------------------