├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.rst ├── doc ├── Makefile ├── make.bat └── source │ ├── _images │ ├── binning_2d_readme.png │ ├── binning_2d_readme_example.png │ ├── binning_2d_readme_woe.png │ ├── binning_binary.png │ ├── binning_data_stream.gif │ ├── binning_readme_example_split_woe.png │ ├── binning_readme_example_woe.png │ ├── logo.svg │ ├── logo_optbinning.ico │ └── logo_optbinning.svg │ ├── _static │ └── css │ │ └── custom.css │ ├── binning_2d_binary.rst │ ├── binning_2d_continuous.rst │ ├── binning_2d_tables.rst │ ├── binning_binary.rst │ ├── binning_continuous.rst │ ├── binning_multiclass.rst │ ├── binning_process.rst │ ├── binning_process_sketch.rst │ ├── binning_scenarios.rst │ ├── binning_sketch.rst │ ├── binning_tables.rst │ ├── binning_utilities.rst │ ├── conf.py │ ├── counterfactual.rst │ ├── index.rst │ ├── installation.rst │ ├── mdlp.rst │ ├── outlier.rst │ ├── piecewise_binary.rst │ ├── piecewise_continuous.rst │ ├── release_notes.rst │ ├── scorecard.rst │ ├── tutorials.rst │ └── tutorials │ ├── tutorial_binary.ipynb │ ├── tutorial_binary_large_scale.ipynb │ ├── tutorial_binary_localsolver.ipynb │ ├── tutorial_binary_under_uncertainty.ipynb │ ├── tutorial_binning_2d.ipynb │ ├── tutorial_binning_process_FICO_update_binning.ipynb │ ├── tutorial_binning_process_FICO_xAI.ipynb │ ├── tutorial_binning_process_sklearn_pipeline.ipynb │ ├── tutorial_binning_process_telco_churn.ipynb │ ├── tutorial_continuous.ipynb │ ├── tutorial_continuous_2d.ipynb │ ├── tutorial_counterfactual_binary_target.ipynb │ ├── tutorial_counterfactual_continuous_target.ipynb │ ├── tutorial_multiclass.ipynb │ ├── tutorial_piecewise_binary.ipynb │ ├── tutorial_piecewise_continuous.ipynb │ ├── tutorial_scorecard_binary_target.ipynb │ ├── tutorial_scorecard_continuous_target.ipynb │ ├── tutorial_scorecard_monitoring.ipynb │ ├── tutorial_sketch_binary.ipynb │ └── tutorial_sketch_binary_pyspark.rst ├── optbinning ├── 
__init__.py ├── _version.py ├── binning │ ├── __init__.py │ ├── auto_monotonic.py │ ├── base.py │ ├── binning.py │ ├── binning_information.py │ ├── binning_process.py │ ├── binning_process_information.py │ ├── binning_statistics.py │ ├── continuous_binning.py │ ├── continuous_cp.py │ ├── cp.py │ ├── distributed │ │ ├── __init__.py │ │ ├── base.py │ │ ├── binning_process_sketch.py │ │ ├── binning_process_sketch_information.py │ │ ├── binning_sketch.py │ │ ├── bsketch.py │ │ ├── bsketch_information.py │ │ ├── gk.py │ │ └── plots.py │ ├── ls.py │ ├── mdlp.py │ ├── metrics.py │ ├── mip.py │ ├── model_data.py │ ├── multiclass_binning.py │ ├── multiclass_cp.py │ ├── multiclass_mip.py │ ├── multidimensional │ │ ├── __init__.py │ │ ├── binning_2d.py │ │ ├── binning_statistics_2d.py │ │ ├── continuous_binning_2d.py │ │ ├── cp_2d.py │ │ ├── mip_2d.py │ │ ├── model_data_2d.py │ │ ├── model_data_cart_2d.py │ │ ├── preprocessing_2d.py │ │ └── transformations_2d.py │ ├── outlier.py │ ├── piecewise │ │ ├── __init__.py │ │ ├── base.py │ │ ├── binning.py │ │ ├── binning_information.py │ │ ├── binning_statistics.py │ │ ├── continuous_binning.py │ │ ├── metrics.py │ │ └── transformations.py │ ├── prebinning.py │ ├── preprocessing.py │ ├── transformations.py │ └── uncertainty │ │ ├── __init__.py │ │ └── binning_scenarios.py ├── exceptions.py ├── formatting.py ├── information.py ├── logging.py ├── metrics │ ├── __init__.py │ ├── classification.py │ └── regression.py ├── options.py └── scorecard │ ├── __init__.py │ ├── counterfactual │ ├── __init__.py │ ├── base.py │ ├── counterfactual.py │ ├── counterfactual_information.py │ ├── mip.py │ ├── model_data.py │ ├── multi_mip.py │ ├── problem_data.py │ └── utils.py │ ├── monitoring.py │ ├── monitoring_information.py │ ├── plots.py │ ├── rounding.py │ ├── scorecard.py │ └── scorecard_information.py ├── requirements.txt ├── setup.py ├── test_requirements.txt └── tests ├── __init__.py ├── data ├── breast_cancer.csv └── breast_cancer.parquet 
├── datasets ├── __init__.py └── datasets.py ├── results ├── breast_cancer_woe.csv ├── breast_cancer_woe_2.csv ├── plot_auc_roc.png ├── plot_cap.png ├── plot_ks.png ├── psi_plot_binary.png ├── psi_plot_continuous.png ├── test_binning.png ├── test_binning_2d_event_rate.png ├── test_binning_2d_woe.png ├── test_binning_no_missing.png ├── test_binning_no_special.png ├── test_binning_process_information.txt ├── test_binning_process_verbose.txt ├── test_continuous_binning.png ├── test_continuous_binning_2d.png ├── test_continuous_binning_no_missing.png ├── test_continuous_binning_no_special.png ├── test_multiclass_binning.png ├── test_multiclass_binning_no_missing.png ├── test_multiclass_binning_no_special.png ├── test_scorecard_information.txt ├── test_scorecard_monitoring_default.txt ├── test_scorecard_monitoring_default_continuous.txt ├── test_scorecard_monitoring_information.txt ├── test_scorecard_monitoring_verbose.txt └── test_scorecard_verbose.txt ├── test_binning.py ├── test_binning_2d.py ├── test_binning_piecewise.py ├── test_binning_process.py ├── test_binning_process_sketch.py ├── test_binning_scenarios.py ├── test_binning_sketch.py ├── test_continuous_binning.py ├── test_continuous_binning_2d.py ├── test_continuous_binning_piecewise.py ├── test_counterfactual.py ├── test_mdlp.py ├── test_multiclass_binning.py ├── test_outlier.py ├── test_scorecard.py ├── test_scorecard_monitoring.py └── test_scorecard_plots.py /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: CI 5 | 6 | on: 7 | push: 8 | branches: [master, develop] 9 | pull_request: 10 | branches: [master, develop] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | 
matrix: 18 | 19 | python-version: ['3.9', '3.10', "3.11", "3.12"] 20 | os: [ubuntu-latest, windows-latest, macos-latest] 21 | 22 | steps: 23 | - uses: actions/checkout@v2 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v2 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install -r test_requirements.txt 32 | pip install -r requirements.txt 33 | - name: Install package 34 | run: | 35 | pip install -e .[distributed,test,ecos] 36 | - name: Lint with flake8 37 | run: | 38 | # stop the build if there are Python syntax errors or undefined names 39 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 40 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 41 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 42 | - name: Test with pytest 43 | run: | 44 | pytest 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | # OS: Linux 3 | dist: xenial 4 | python: 5 | - "3.6" 6 | - "3.7" 7 | 8 | matrix: 9 | include: 10 | # OS: Windows 11 | - os: windows 12 | language: shell 13 | before_install: 14 | - choco install python --version 3.6.8 15 | - python --version 16 | - python -m pip install --upgrade pip 17 | - pip3 install --upgrade pytest 18 | # - pip3 install codecov 19 | env: PATH=/c/Python36:/c/Python36/Scripts:$PATH 20 | 21 | - os: windows 22 | language: shell 23 | before_install: 24 | - choco install python --version 3.7.4 25 | - python --version 26 | - python -m pip install --upgrade pip 27 | - pip3 install --upgrade pytest 28 | # - pip3 install codecov 29 | env: PATH=/c/Python37:/c/Python37/Scripts:$PATH 30 | 31 | # command to install dependencies 32 | install: 33 | - pip install codecov 34 | # - pip install coveralls 35 | - pip install -r requirements.txt 36 | - pip install -e . 
37 | # command to run tests 38 | script: 39 | - coverage run --source optbinning -m pytest tests/ 40 | 41 | after_success: 42 | - codecov 43 | # - coveralls -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/source/_images/binning_2d_readme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_2d_readme.png -------------------------------------------------------------------------------- /doc/source/_images/binning_2d_readme_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_2d_readme_example.png -------------------------------------------------------------------------------- /doc/source/_images/binning_2d_readme_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_2d_readme_woe.png -------------------------------------------------------------------------------- /doc/source/_images/binning_binary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_binary.png -------------------------------------------------------------------------------- /doc/source/_images/binning_data_stream.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_data_stream.gif -------------------------------------------------------------------------------- /doc/source/_images/binning_readme_example_split_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_readme_example_split_woe.png -------------------------------------------------------------------------------- /doc/source/_images/binning_readme_example_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/binning_readme_example_woe.png -------------------------------------------------------------------------------- /doc/source/_images/logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 |
OptBinning
OptBinning
-------------------------------------------------------------------------------- /doc/source/_images/logo_optbinning.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/doc/source/_images/logo_optbinning.ico -------------------------------------------------------------------------------- /doc/source/_images/logo_optbinning.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /doc/source/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Navigator and sidebar colors */ 2 | .wy-side-nav-search, .wy-nav-top { 3 | background: #326d62; 4 | } 5 | 6 | .wy-menu-vertical a:active { 7 | background: #30bba3; 8 | } 9 | 10 | .wy-side-nav-search>div.version { 11 | color: #d8d8d8; 12 | } 13 | 14 | .wy-side-nav-search>a img.logo, .wy-side-nav-search .wy-dropdown>a img.logo { 15 | display: block; 16 | margin: 0 auto; 17 | height: 20%; 18 | width: 20%; 19 | border-radius: 0; 20 | max-width: 100%; 21 | background: transparent; 22 | } 23 | 24 | .wy-menu-vertical header, .wy-menu-vertical p.caption { 25 | color: #30bba3; 26 | } 27 | 28 | /* Class/function declaration colors */ 29 | .rst-content dl:not(.docutils) dt { 30 | background: #daf2ee; 31 | color: #326d62; 32 | border-top: solid 3px #30bba3; 33 | } 34 | 35 | 36 | /* Links colors */ 37 | a { 38 | color: #30bba3; 39 | } 40 | 41 | .wy-nav-content a:hover { 42 | color: #326d62; 43 | } 44 | 45 | .wy-nav-content a:visited:hover { 46 | color: #9B59B6; 47 | } 48 | -------------------------------------------------------------------------------- /doc/source/binning_2d_binary.rst: -------------------------------------------------------------------------------- 1 | Optimal binning 2D with binary target 2 | 
===================================== 3 | 4 | .. autoclass:: optbinning.OptimalBinning2D 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_2d_continuous.rst: -------------------------------------------------------------------------------- 1 | Optimal binning 2D with continuous target 2 | ========================================= 3 | 4 | .. autoclass:: optbinning.ContinuousOptimalBinning2D 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_2d_tables.rst: -------------------------------------------------------------------------------- 1 | Binning 2D tables 2 | ================= 3 | 4 | Binning table 2D: binary target 5 | ------------------------------- 6 | 7 | .. autoclass:: optbinning.binning.multidimensional.binning_statistics_2d.BinningTable2D 8 | :members: 9 | :inherited-members: 10 | :show-inheritance: 11 | 12 | Binning table 2D: continuous target 13 | ----------------------------------- 14 | 15 | .. autoclass:: optbinning.binning.multidimensional.binning_statistics_2d.ContinuousBinningTable2D 16 | :members: 17 | :inherited-members: 18 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_binary.rst: -------------------------------------------------------------------------------- 1 | Optimal binning with binary target 2 | ================================== 3 | 4 | .. autoclass:: optbinning.OptimalBinning 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/binning_continuous.rst: -------------------------------------------------------------------------------- 1 | Optimal binning with continuous target 2 | ====================================== 3 | 4 | .. 
autoclass:: optbinning.ContinuousOptimalBinning 5 |    :members: 6 |    :inherited-members: 7 |    :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/binning_multiclass.rst: -------------------------------------------------------------------------------- 1 | Optimal binning with multiclass target 2 | ====================================== 3 | 4 | 5 | .. autoclass:: optbinning.MulticlassOptimalBinning 6 |    :members: 7 |    :inherited-members: 8 |    :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_process.rst: -------------------------------------------------------------------------------- 1 | Binning process 2 | =============== 3 | 4 | .. autoclass:: optbinning.BinningProcess 5 |    :members: 6 |    :inherited-members: 7 |    :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/binning_process_sketch.rst: -------------------------------------------------------------------------------- 1 | Binning process sketch with binary target 2 | ========================================= 3 | 4 | .. autoclass:: optbinning.BinningProcessSketch 5 |    :members: 6 |    :inherited-members: 7 |    :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/binning_scenarios.rst: -------------------------------------------------------------------------------- 1 | Stochastic optimal binning 2 | ========================== 3 | 4 | Introduction 5 | ------------ 6 | The data used when performing optimal binning is generally assumed to be known accurately and to be fully representative of past, present, and future data. This confidence might produce misleading results, especially with data representing future events such as product demand, churn rate, or probability of default. 7 | 8 | Stochastic programming is a framework for explicitly incorporating uncertainty.
Stochastic programming uses random variables to account for data variability and optimizes the expected value of the objective function. OptBinning implements the stochastic programming approach using the two-stage scenario-based formulation (also known as extensive form or deterministic equivalent), obtaining a deterministic mixed-integer linear programming formulation. The scenario-based formulation guarantees the nonanticipativity constraint and a solution that must be feasible for each scenario, leading to a more **robust** solution. 9 | 10 | 11 | Scenario-based optimal binning 12 | ------------------------------ 13 | 14 | .. autoclass:: optbinning.binning.uncertainty.SBOptimalBinning 15 |    :members: 16 |    :inherited-members: 17 |    :show-inheritance: 18 | 19 | -------------------------------------------------------------------------------- /doc/source/binning_sketch.rst: -------------------------------------------------------------------------------- 1 | Optimal binning sketch with binary target 2 | ========================================= 3 | 4 | Introduction 5 | ------------ 6 | 7 | The optimal binning is the constrained discretization of a numerical feature into bins given a binary target, maximizing a statistic such as Jeffrey's divergence or Gini. Binning is a data preprocessing technique commonly used in binary classification, but the current list of existing binning algorithms supporting constraints lacks a method to handle streaming data. The new class OptimalBinningSketch implements a scalable, memory-efficient and robust algorithm for performing optimal binning in a streaming setting. Algorithmic details are discussed in http://gnpalencia.org/blog/2020/binning_data_streams/. 8 | 9 | 10 | Algorithms 11 | ---------- 12 | 13 | OptimalBinningSketch 14 | """""""""""""""""""" 15 | 16 | ..
autoclass:: optbinning.binning.distributed.OptimalBinningSketch 17 | :members: 18 | :inherited-members: 19 | :show-inheritance: 20 | 21 | 22 | GK: Greenwald-Khanna's algorithm 23 | """""""""""""""""""""""""""""""" 24 | 25 | .. autoclass:: optbinning.binning.distributed.GK 26 | :members: 27 | :inherited-members: 28 | :show-inheritance: 29 | 30 | 31 | Binning sketch: numerical variable - binary target 32 | """""""""""""""""""""""""""""""""""""""""""""""""" 33 | 34 | .. autoclass:: optbinning.binning.distributed.BSketch 35 | :members: 36 | :inherited-members: 37 | :show-inheritance: 38 | 39 | 40 | Binning sketch: categorical variable - binary target 41 | """""""""""""""""""""""""""""""""""""""""""""""""""" 42 | 43 | .. autoclass:: optbinning.binning.distributed.BCatSketch 44 | :members: 45 | :inherited-members: 46 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_tables.rst: -------------------------------------------------------------------------------- 1 | Binning tables 2 | ============== 3 | 4 | Binning table: binary target 5 | ---------------------------- 6 | 7 | .. autoclass:: optbinning.binning.binning_statistics.BinningTable 8 | :members: 9 | :inherited-members: 10 | :show-inheritance: 11 | 12 | Binning table: continuous target 13 | -------------------------------- 14 | 15 | .. autoclass:: optbinning.binning.binning_statistics.ContinuousBinningTable 16 | :members: 17 | :inherited-members: 18 | :show-inheritance: 19 | 20 | Binning table: multiclass target 21 | -------------------------------- 22 | 23 | .. 
autoclass:: optbinning.binning.binning_statistics.MulticlassBinningTable 24 | :members: 25 | :inherited-members: 26 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/binning_utilities.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ========= 3 | 4 | 5 | Pre-binning 6 | ----------- 7 | 8 | .. autoclass:: optbinning.binning.prebinning.PreBinning 9 | :members: 10 | :inherited-members: 11 | :show-inheritance: 12 | 13 | 14 | Transformations 15 | --------------- 16 | 17 | The Weight of Evidence :math:`\text{WoE}_i` and event rate :math:`D_i` for each bin are related by means of the functional equations 18 | 19 | .. math:: 20 | 21 | \begin{align} 22 | \text{WoE}_i &= \log\left(\frac{1 - D_i}{D_i}\right) + \log\left(\frac{N_T^{E}}{N_T^{NE}}\right) = 23 | \log\left(\frac{N_T^{E}}{N_T^{NE}}\right) - \text{logit}(D_i)\\ 24 | D_i &= \left(1 + \frac{N_T^{NE}}{N_T^{E}} e^{\text{WoE}_i}\right)^{-1} = \left(1 + e^{\text{WoE}_i - \log\left(\frac{N_T^{E}}{N_T^{NE}}\right)}\right)^{-1}, 25 | \end{align} 26 | 27 | where :math:`D_i` can be characterized as a logistic function of :math:`\text{WoE}_i`, and :math:`\text{WoE}_i` can be expressed in terms of the logit function of :math:`D_i`. 28 | The constant term :math:`\log(N_T^{E} / N_T^{NE})` is the log ratio of the total 29 | number of event :math:`N_T^{E}` and the total number of non-events :math:`N_T^{NE}`. This shows that WoE is inversely related to the event rate. 30 | 31 | .. autofunction:: optbinning.binning.transformations.transform_event_rate_to_woe 32 | 33 | .. autofunction:: optbinning.binning.transformations.transform_woe_to_event_rate 34 | 35 | 36 | Metrics 37 | ------- 38 | 39 | Gini coefficient 40 | """""""""""""""" 41 | 42 | The Gini coefficient or Accuracy Ratio is a quantitative measure of discriminatory and predictive power given a distribution of events and non-events. 
The Gini coefficient 43 | ranges from 0 to 1, and is defined by 44 | 45 | .. math:: 46 | 47 |    Gini = 1 - \frac{2 \sum_{i=2}^n \left(N_i^{E} \sum_{j=1}^{i-1} N_j^{NE}\right) + \sum_{k=1}^n N_k^{E} N_k^{NE}}{N_T^{E} N_T^{NE}}, 48 | 49 | where :math:`N_i^{E}` and :math:`N_i^{NE}` are the number of events and non-events per 50 | bin, respectively, and :math:`N_T^{E}` and :math:`N_T^{NE}` are the total number of 51 | events and non-events, respectively. 52 | 53 | .. autofunction:: optbinning.binning.metrics.gini 54 | 55 | Divergence measures 56 | """"""""""""""""""" 57 | 58 | Given two discrete probability distributions :math:`P` and :math:`Q`, the Shannon entropy 59 | is defined as 60 | 61 | .. math:: 62 | 63 |    S(P) = - \sum_{i=1}^n p_i \log(p_i). 64 | 65 | The Kullback-Leibler divergence, denoted as :math:`D_{KL}(P||Q)`, is given by 66 | 67 | .. math:: 68 | 69 |    D_{KL}(P || Q) = \sum_{i=1}^n p_i \log \left(\frac{p_i}{q_i}\right). 70 | 71 | The Jeffrey's divergence, or Information Value (IV), is a symmetric measure expressible in terms of the Kullback-Leibler divergence defined by 72 | 73 | .. math:: 74 | 75 |    \begin{align*} 76 |    J(P|| Q) &= D_{KL}(P || Q) + D_{KL}(Q || P) = \sum_{i=1}^n p_i \log \left(\frac{p_i}{q_i}\right) + \sum_{i=1}^n q_i \log \left(\frac{q_i}{p_i}\right)\\ 77 |    &= \sum_{i=1}^n (p_i - q_i) \log \left(\frac{p_i}{q_i}\right). 78 |    \end{align*} 79 | 80 | The Jensen-Shannon divergence is a bounded symmetric measure also expressible in 81 | terms of the Kullback-Leibler divergence 82 | 83 | .. math:: 84 | 85 |    \begin{equation} 86 |    JSD(P || Q) = \frac{1}{2}\left(D(P || M) + D(Q || M)\right), \quad M = \frac{1}{2}(P + Q), 87 |    \end{equation} 88 | 89 | and bounded by :math:`JSD(P||Q) \in [0, \log(2)]`. We note that these measures cannot be directly used whenever :math:`p_i = 0` and/or :math:`q_i = 0`. 90 | 91 | .. autofunction:: optbinning.binning.metrics.entropy 92 | 93 | .. autofunction:: optbinning.binning.metrics.kullback_leibler 94 | 95 | ..
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
sys.path.insert(0, os.path.abspath('../..'))


# -- Project information -----------------------------------------------------

project = 'optbinning'
copyright = '2019 - 2024, Guillermo Navas-Palencia'
author = 'Guillermo Navas-Palencia'

# The short X.Y version. Kept in sync with optbinning/_version.py
# (__version__ = "0.20.1"); a mismatch makes the rendered docs report a
# stale version number.
version = '0.20.1'
# The full version, including alpha/beta/rc tags
release = '0.20.1'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.doctest',
    'sphinx.ext.mathjax',
    'sphinx.ext.napoleon',
    'sphinxcontrib.bibtex',
    'nbsphinx',
    'sphinx.ext.autosectionlabel'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
# NOTE: Sphinx >= 5 no longer accepts None here (it warns and coerces to
# 'en'), so set the language code explicitly.
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme_options = {
    'logo_only': True
}

html_show_sourcelink = False

html_theme = 'sphinx_rtd_theme'
html_logo = '_images/logo_optbinning.svg'
html_favicon = '_images/logo_optbinning.ico'


# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']


# These paths are either relative to html_static_path
# or fully qualified paths (eg. https://...)
html_css_files = [
    'css/custom.css',
]
toctree:: 47 | :maxdepth: 1 48 | :caption: Batch and stream optimal binning 49 | 50 | binning_sketch 51 | binning_process_sketch 52 | 53 | .. toctree:: 54 | :maxdepth: 1 55 | :caption: Binning under uncertainty 56 | 57 | binning_scenarios 58 | 59 | .. toctree:: 60 | :maxdepth: 1 61 | :caption: Optimal binning 2D 62 | 63 | binning_2d_binary 64 | binning_2d_continuous 65 | binning_2d_tables 66 | 67 | .. toctree:: 68 | :maxdepth: 1 69 | :caption: Other binning algorithms 70 | 71 | mdlp 72 | 73 | .. toctree:: 74 | :maxdepth: 1 75 | :caption: Utilities 76 | 77 | outlier -------------------------------------------------------------------------------- /doc/source/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Install release 5 | --------------- 6 | 7 | To install the current release of OptBinning: 8 | 9 | .. code-block:: text 10 | 11 | pip install optbinning 12 | 13 | Optionally, download a different release 14 | from https://github.com/guillermo-navas-palencia/optbinning/releases and install 15 | using 16 | 17 | .. code-block:: text 18 | 19 | python setup.py install 20 | 21 | Install from source 22 | ------------------- 23 | 24 | To install from source, download or clone the git repository https://github.com/guillermo-navas-palencia/optbinning 25 | 26 | .. code-block:: text 27 | 28 | cd optbinning 29 | python setup.py install -------------------------------------------------------------------------------- /doc/source/mdlp.rst: -------------------------------------------------------------------------------- 1 | MDLP discretization algorithm 2 | ============================= 3 | 4 | .. 
autoclass:: optbinning.MDLP 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /doc/source/outlier.rst: -------------------------------------------------------------------------------- 1 | Outlier detection 2 | ================= 3 | 4 | .. autoclass:: optbinning.binning.outlier.OutlierDetector 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | 9 | 10 | .. autoclass:: optbinning.binning.outlier.RangeDetector 11 | :members: 12 | :inherited-members: 13 | :show-inheritance: 14 | 15 | 16 | .. autoclass:: optbinning.binning.outlier.ModifiedZScoreDetector 17 | :members: 18 | :inherited-members: 19 | :show-inheritance: 20 | 21 | 22 | .. autoclass:: optbinning.binning.outlier.YQuantileDetector 23 | :members: 24 | :inherited-members: 25 | :show-inheritance: 26 | -------------------------------------------------------------------------------- /doc/source/piecewise_binary.rst: -------------------------------------------------------------------------------- 1 | Optimal piecewise binning with binary target 2 | ============================================ 3 | 4 | .. autoclass:: optbinning.OptimalPWBinning 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/piecewise_continuous.rst: -------------------------------------------------------------------------------- 1 | Optimal piecewise binning with continuous target 2 | ================================================ 3 | 4 | .. autoclass:: optbinning.ContinuousOptimalPWBinning 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: -------------------------------------------------------------------------------- /doc/source/scorecard.rst: -------------------------------------------------------------------------------- 1 | Scorecard 2 | ========= 3 | 4 | .. 
autoclass:: optbinning.scorecard.Scorecard 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | 9 | 10 | Monitoring 11 | ---------- 12 | 13 | .. autoclass:: optbinning.scorecard.ScorecardMonitoring 14 | :members: 15 | :inherited-members: 16 | :show-inheritance: 17 | 18 | 19 | Plot functions 20 | -------------- 21 | 22 | .. autofunction:: optbinning.scorecard.plot_auc_roc 23 | 24 | .. autofunction:: optbinning.scorecard.plot_cap 25 | 26 | .. autofunction:: optbinning.scorecard.plot_ks -------------------------------------------------------------------------------- /doc/source/tutorials.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | This is a guide for optbinning new users with tutorials ranging from basic 5 | to advanced level for each supported target type. 6 | 7 | Optimal binning tutorials 8 | ------------------------- 9 | 10 | .. toctree:: 11 | :maxdepth: 1 12 | 13 | tutorials/tutorial_binary 14 | tutorials/tutorial_binary_localsolver 15 | tutorials/tutorial_binary_large_scale 16 | tutorials/tutorial_continuous 17 | tutorials/tutorial_multiclass 18 | 19 | 20 | Binning process tutorials 21 | ------------------------- 22 | 23 | .. toctree:: 24 | :maxdepth: 1 25 | 26 | tutorials/tutorial_binning_process_sklearn_pipeline 27 | tutorials/tutorial_binning_process_FICO_xAI 28 | tutorials/tutorial_binning_process_FICO_update_binning 29 | tutorials/tutorial_binning_process_telco_churn 30 | 31 | 32 | Scorecard tutorials 33 | ------------------- 34 | 35 | .. toctree:: 36 | :maxdepth: 1 37 | 38 | tutorials/tutorial_scorecard_binary_target 39 | tutorials/tutorial_scorecard_continuous_target 40 | tutorials/tutorial_scorecard_monitoring 41 | tutorials/tutorial_counterfactual_binary_target 42 | tutorials/tutorial_counterfactual_continuous_target 43 | 44 | 45 | Optimal piecewise binning tutorials 46 | ----------------------------------- 47 | 48 | .. 
toctree:: 49 | :maxdepth: 1 50 | 51 | tutorials/tutorial_piecewise_binary 52 | tutorials/tutorial_piecewise_continuous 53 | 54 | 55 | Optimal binning for batch and streaming data processing 56 | ------------------------------------------------------- 57 | 58 | .. toctree:: 59 | :maxdepth: 1 60 | 61 | tutorials/tutorial_sketch_binary 62 | tutorials/tutorial_sketch_binary_pyspark 63 | 64 | 65 | Optimal binning under uncertainty 66 | --------------------------------- 67 | 68 | .. toctree:: 69 | :maxdepth: 1 70 | 71 | tutorials/tutorial_binary_under_uncertainty 72 | 73 | 74 | Optimal binning 2D 75 | ------------------ 76 | 77 | .. toctree:: 78 | :maxdepth: 1 79 | 80 | tutorials/tutorial_binning_2d 81 | tutorials/tutorial_continuous_2d 82 | -------------------------------------------------------------------------------- /doc/source/tutorials/tutorial_sketch_binary_pyspark.rst: -------------------------------------------------------------------------------- 1 | Tutorial: optimal binning sketch with binary target using PySpark 2 | ================================================================= 3 | 4 | In this example, we use PySpark mapPartitions function to compute the optimal 5 | binning of a single variable from a large dataset in a distributed fashion. The dataset is split into 4 partitions. 6 | 7 | .. code:: 8 | 9 | from pyspark.sql import SparkSession 10 | 11 | spark.conf.set("spark.sql.execution.arrow.enabled", "true") 12 | 13 | df = spark.read.csv("data/kaggle/HomeCreditDefaultRisk/application_train.csv", 14 | sep=",", header=True, inferSchema=True) 15 | 16 | n_partitions = 4 17 | df = df.repartition(n_partitions) 18 | 19 | 20 | We prepare the MapReduce structure 21 | 22 | .. 
code :: 23 | 24 | import pandas as pd 25 | from optbinning import OptimalBinningSketch 26 | 27 | variable = "EXT_SOURCE_3" 28 | target = "TARGET" 29 | columns = [variable, target] 30 | 31 | 32 | def add(partition): 33 | df_pandas = pd.DataFrame.from_records(partition, columns=columns) 34 | x = df_pandas[variable] 35 | y = df_pandas[target] 36 | optbsketch = OptimalBinningSketch(eps=0.001) 37 | optbsketch.add(x, y) 38 | 39 | return [optbsketch] 40 | 41 | def merge(optbsketch, other_optbsketch): 42 | optbsketch.merge(other_optbsketch) 43 | 44 | return optbsketch 45 | 46 | Finally, with the required columns, we use mapPartitions and method 47 | treeReduce to aggregate the ``OptimalBinningSketch`` instance of each partition. 48 | 49 | .. code :: 50 | 51 | optbsketch = df.select(columns).rdd.mapPartitions(lambda partition: add(partition) 52 | ).treeReduce(merge) -------------------------------------------------------------------------------- /optbinning/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | from .binning import BinningProcess 3 | from .binning import ContinuousOptimalBinning 4 | from .binning import MDLP 5 | from .binning import MulticlassOptimalBinning 6 | from .binning import OptimalBinning 7 | from .binning.distributed import BinningProcessSketch 8 | from .binning.distributed import OptimalBinningSketch 9 | from .binning.multidimensional import ContinuousOptimalBinning2D 10 | from .binning.multidimensional import OptimalBinning2D 11 | from .binning.piecewise import ContinuousOptimalPWBinning 12 | from .binning.piecewise import OptimalPWBinning 13 | from .binning.uncertainty import SBOptimalBinning 14 | from .scorecard import Scorecard 15 | 16 | 17 | __all__ = ['__version__', 18 | 'BinningProcess', 19 | 'BinningProcessSketch', 20 | 'ContinuousOptimalBinning', 21 | 'ContinuousOptimalBinning2D', 22 | 'ContinuousOptimalPWBinning', 23 | 'MDLP', 24 | 
'MulticlassOptimalBinning', 25 | 'OptimalBinning', 26 | 'OptimalBinningSketch', 27 | 'OptimalBinning2D', 28 | 'OptimalPWBinning', 29 | 'SBOptimalBinning', 30 | 'Scorecard'] 31 | -------------------------------------------------------------------------------- /optbinning/_version.py: -------------------------------------------------------------------------------- 1 | """Version information.""" 2 | 3 | __version__ = "0.20.1" 4 | -------------------------------------------------------------------------------- /optbinning/binning/__init__.py: -------------------------------------------------------------------------------- 1 | from .binning import OptimalBinning 2 | from .binning_process import BinningProcess 3 | from .continuous_binning import ContinuousOptimalBinning 4 | from .mdlp import MDLP 5 | from .multiclass_binning import MulticlassOptimalBinning 6 | 7 | 8 | __all__ = ['BinningProcess', 9 | 'ContinuousOptimalBinning', 10 | 'MDLP', 11 | 'MulticlassOptimalBinning', 12 | 'OptimalBinning'] 13 | -------------------------------------------------------------------------------- /optbinning/binning/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base optimal binning algorithm class. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | from abc import ABCMeta 9 | from abc import abstractmethod 10 | 11 | from sklearn.base import BaseEstimator 12 | from sklearn.exceptions import NotFittedError 13 | 14 | 15 | class Base: 16 | def _check_is_fitted(self): 17 | if not self._is_fitted: 18 | raise NotFittedError("This {} instance is not fitted yet. Call " 19 | "'fit' with appropriate arguments." 
20 | .format(self.__class__.__name__)) 21 | 22 | 23 | class BaseOptimalBinning(Base, BaseEstimator, metaclass=ABCMeta): 24 | @abstractmethod 25 | def fit(self): 26 | """Fit the optimal binning according to the given training data.""" 27 | 28 | @abstractmethod 29 | def fit_transform(self): 30 | """Fit the optimal binning according to the given training data, then 31 | transform it.""" 32 | 33 | @abstractmethod 34 | def transform(self): 35 | """Transform given data using bins from the fitted optimal binning.""" 36 | 37 | @abstractmethod 38 | def information(self): 39 | """Print overview information about the options settings, problem 40 | statistics, and the solution of the computation.""" 41 | 42 | @property 43 | @abstractmethod 44 | def binning_table(self): 45 | """Return an instantiated binning table.""" 46 | 47 | @property 48 | @abstractmethod 49 | def splits(self): 50 | """List of optimal split points.""" 51 | 52 | @property 53 | @abstractmethod 54 | def status(self): 55 | """The status of the underlying optimization solver.""" 56 | -------------------------------------------------------------------------------- /optbinning/binning/binning_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Optimal binning information. 
"""
Optimal binning information.
"""

# Guillermo Navas-Palencia
# Copyright (C) 2019

from ..information import print_header
from ..information import print_optional_parameters
from ..information import print_solver_statistics
from ..options import continuous_optimal_binning_default_options
from ..options import multiclass_optimal_binning_default_options
from ..options import optimal_binning_default_options
from ..options import sboptimal_binning_default_options
from ..options import continuous_optimal_binning_2d_default_options
from ..options import optimal_binning_2d_default_options


def _ratio(part, total):
    """Return part / total, or 0.0 when total is zero.

    Guards the timing report against ZeroDivisionError when a stage (or the
    whole run) completes below the timer resolution.
    """
    return part / total if total else 0.0


def print_prebinning_statistics(n_prebins, n_refinement):
    """Print the number of pre-bins and refinement passes performed."""
    prebinning_stats = (
        " Pre-binning statistics\n"
        " Number of pre-bins {:>10}\n"
        " Number of refinements {:>10}\n"
        ).format(n_prebins, n_refinement)

    print(prebinning_stats)


def print_timing(solver_type, solver, time_total, time_preprocessing,
                 time_prebinning, time_solver, time_optimizer,
                 time_postprocessing):
    """Print a per-stage timing breakdown with percentages of total time.

    For the CP solver a finer split of the solver stage (model generation
    vs. optimizer) is reported. All ratios are computed with ``_ratio`` so a
    zero total (possible with very fast runs) does not raise
    ZeroDivisionError.
    """
    p_preprocessing = _ratio(time_preprocessing, time_total)
    p_prebinning = _ratio(time_prebinning, time_total)
    p_solver = _ratio(time_solver, time_total)
    p_postprocessing = _ratio(time_postprocessing, time_total)

    if solver_type == "cp" and solver is not None:
        # Model generation time is the solver stage minus pure optimizer time.
        time_model_generation = time_solver - time_optimizer
        p_model_generation = _ratio(time_model_generation, time_solver)
        p_optimizer = _ratio(time_optimizer, time_solver)

        time_stats = (
            " Timing\n"
            " Total time {:>18.2f} sec\n"
            " Pre-processing {:>18.2f} sec ({:>7.2%})\n"
            " Pre-binning {:>18.2f} sec ({:>7.2%})\n"
            " Solver {:>18.2f} sec ({:>7.2%})\n"
            " model generation {:>18.2f} sec ({:>7.2%})\n"
            " optimizer {:>18.2f} sec ({:>7.2%})\n"
            " Post-processing {:>18.2f} sec ({:>7.2%})\n"
            ).format(time_total, time_preprocessing, p_preprocessing,
                     time_prebinning, p_prebinning, time_solver, p_solver,
                     time_model_generation, p_model_generation, time_optimizer,
                     p_optimizer, time_postprocessing, p_postprocessing)
    else:
        time_stats = (
            " Timing\n"
            " Total time {:>18.2f} sec\n"
            " Pre-processing {:>18.2f} sec ({:>7.2%})\n"
            " Pre-binning {:>18.2f} sec ({:>7.2%})\n"
            " Solver {:>18.2f} sec ({:>7.2%})\n"
            " Post-processing {:>18.2f} sec ({:>7.2%})\n"
            ).format(time_total, time_preprocessing, p_preprocessing,
                     time_prebinning, p_prebinning, time_solver, p_solver,
                     time_postprocessing, p_postprocessing)

    print(time_stats)


def print_name_status(name, status):
    """Print variable name and solver status; empty names show as UNKNOWN."""
    if not name:
        name = "UNKNOWN"

    print(" Name : {:<32}\n"
          " Status : {:<32}\n".format(name, status))


def print_main_info(name, status, time_total):
    """Print the minimal summary: name, status and total time."""
    print_name_status(name, status)

    print(" Time : {:<7.4f} sec\n".format(time_total))


def print_binning_information(binning_type, print_level, name, status,
                              solver_type, solver, time_total,
                              time_preprocessing, time_prebinning, time_solver,
                              time_optimizer, time_postprocessing, n_prebins,
                              n_refinements, dict_user_options):
    """Print the full information report for a fitted optimal binning.

    ``print_level`` controls verbosity: 0 = summary only, 1 = statistics,
    2 = statistics plus the non-default options the user supplied.
    ``binning_type`` selects which default-options table to diff against.
    """
    print_header()

    if print_level == 2:
        if binning_type == "optimalbinning":
            d_default_options = optimal_binning_default_options
        elif binning_type == "multiclassoptimalbinning":
            d_default_options = multiclass_optimal_binning_default_options
        elif binning_type == "continuousoptimalbinning":
            d_default_options = continuous_optimal_binning_default_options
        elif binning_type == "sboptimalbinning":
            d_default_options = sboptimal_binning_default_options
        elif binning_type == "optimalbinning2d":
            d_default_options = optimal_binning_2d_default_options
        elif binning_type == "continuousoptimalbinning2d":
            d_default_options = continuous_optimal_binning_2d_default_options

        print_optional_parameters(d_default_options, dict_user_options)

    if print_level == 0:
        print_main_info(name, status, time_total)
    elif print_level >= 1:
        print_name_status(name, status)

        print_prebinning_statistics(n_prebins, n_refinements)

        if status in ("OPTIMAL", "FEASIBLE"):
            if solver is not None:
                print_solver_statistics(solver_type, solver)

            print_timing(solver_type, solver, time_total, time_preprocessing,
                         time_prebinning, time_solver, time_optimizer,
                         time_postprocessing)
dict_user_options): 41 | print_header() 42 | 43 | if print_level == 2: 44 | dict_default_options = binning_process_default_options 45 | print_optional_parameters(dict_default_options, dict_user_options) 46 | 47 | if print_level == 0: 48 | print_main_info(n_records, n_variables, time_total) 49 | elif print_level >= 1: 50 | print_binning_process_statistics(n_records, n_variables, target_dtype, 51 | n_numerical, n_categorical, 52 | n_selected, time_total) 53 | -------------------------------------------------------------------------------- /optbinning/binning/distributed/__init__.py: -------------------------------------------------------------------------------- 1 | from .gk import GK 2 | from .bsketch import BSketch 3 | from .bsketch import BCatSketch 4 | from .binning_process_sketch import BinningProcessSketch 5 | from .binning_sketch import OptimalBinningSketch 6 | 7 | 8 | __all__ = ['BSketch', 9 | 'BCatSketch', 10 | 'GK', 11 | 'OptimalBinningSketch', 12 | 'BinningProcessSketch'] 13 | -------------------------------------------------------------------------------- /optbinning/binning/distributed/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base optimal binning sketch algorithm class. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | from ...exceptions import NotSolvedError 9 | 10 | 11 | class BaseSketch: 12 | def _check_is_solved(self): 13 | if not self._is_solved: 14 | raise NotSolvedError("This {} instance is not solved yet. Call " 15 | "'solve' with appropriate arguments." 16 | .format(self.__class__.__name__)) 17 | -------------------------------------------------------------------------------- /optbinning/binning/distributed/binning_process_sketch_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Binning process sketch information. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | from ...information import print_header 9 | from ...information import print_optional_parameters 10 | from ...options import binning_process_sketch_default_options 11 | 12 | 13 | def print_main_info(n_records, n_variables, time_add, time_solve): 14 | print(" Number of records : {}".format(n_records)) 15 | print(" Number of variables : {}".format(n_variables)) 16 | print(" Time add : {:<10.4f} sec".format(time_add)) 17 | print(" Time solve : {:<10.4f} sec\n".format(time_solve)) 18 | 19 | 20 | def print_binning_process_sketch_statistics( 21 | n_records, n_variables, target_dtype, n_numerical, n_categorical, 22 | n_selected, n_add, time_add, n_solve, time_solve): 23 | 24 | r_add = time_add / n_add 25 | r_solve = time_solve / n_solve 26 | 27 | stats = ( 28 | " Statistics\n" 29 | " Number of records {:>10}\n" 30 | " Number of variables {:>10}\n" 31 | " Target type {:>10}\n\n" 32 | " Number of numerical {:>10}\n" 33 | " Number of categorical {:>10}\n" 34 | " Number of selected {:>10}\n" 35 | ).format(n_records, n_variables, target_dtype, n_numerical, 36 | n_categorical, n_selected) 37 | 38 | records_stats = ( 39 | " Streaming statistics\n" 40 | " Add operations {:>18}\n" 41 | " Solve operations {:>18}\n" 42 | ).format(n_add, n_solve) 43 | 44 | time_stats = ( 45 | " Streaming timing\n" 46 | " Time add {:>18.2f} sec ({:6.4f} sec / add)\n" 47 | " Time solve {:>18.2f} sec ({:6.4f} sec / solve)\n" 48 | ).format(time_add, r_add, time_solve, r_solve) 49 | 50 | print(stats) 51 | print(records_stats) 52 | print(time_stats) 53 | 54 | 55 | def print_binning_process_sketch_information( 56 | print_level, n_records, n_variables, target_dtype, n_numerical, 57 | n_categorical, n_selected, n_add, time_add, n_solve, time_solve, 58 | dict_user_options): 59 | 60 | print_header() 61 | 62 | if print_level == 2: 63 | dict_default_options = binning_process_sketch_default_options 64 | 
print_optional_parameters(dict_default_options, dict_user_options) 65 | 66 | if print_level == 0: 67 | print_main_info(n_records, n_variables, time_add, time_solve) 68 | elif print_level >= 1: 69 | print_binning_process_sketch_statistics( 70 | n_records, n_variables, target_dtype, n_numerical, n_categorical, 71 | n_selected, n_add, time_add, n_solve, time_solve) 72 | -------------------------------------------------------------------------------- /optbinning/binning/distributed/bsketch_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Binning sketch information. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | from ...binning.binning_information import print_main_info 9 | from ...binning.binning_information import print_name_status 10 | from ...binning.binning_information import print_prebinning_statistics 11 | from ...information import print_header 12 | from ...information import print_optional_parameters 13 | from ...information import print_solver_statistics 14 | from ...options import optimal_binning_sketch_options 15 | 16 | 17 | def print_timing(solver_type, solver, time_total, time_prebinning, time_solver, 18 | time_optimizer, time_postprocessing): 19 | 20 | p_prebinning = time_prebinning / time_total 21 | p_solver = time_solver / time_total 22 | p_postprocessing = time_postprocessing / time_total 23 | 24 | if solver_type == "cp" and solver is not None: 25 | time_model_generation = time_solver - time_optimizer 26 | p_model_generation = time_model_generation / time_solver 27 | p_optimizer = time_optimizer / time_solver 28 | 29 | time_stats = ( 30 | " Timing\n" 31 | " Total time {:>18.2f} sec\n" 32 | " Pre-binning {:>18.2f} sec ({:>7.2%})\n" 33 | " Solver {:>18.2f} sec ({:>7.2%})\n" 34 | " model generation {:>18.2f} sec ({:>7.2%})\n" 35 | " optimizer {:>18.2f} sec ({:>7.2%})\n" 36 | " Post-processing {:>18.2f} sec ({:>7.2%})\n" 37 | ).format(time_total, time_prebinning, 
p_prebinning, time_solver, 38 | p_solver, time_model_generation, p_model_generation, 39 | time_optimizer, p_optimizer, time_postprocessing, 40 | p_postprocessing) 41 | else: 42 | time_stats = ( 43 | " Timing\n" 44 | " Total time {:>18.2f} sec\n" 45 | " Pre-binning {:>18.2f} sec ({:>7.2%})\n" 46 | " Solver {:>18.2f} sec ({:>7.2%})\n" 47 | " Post-processing {:>18.2f} sec ({:>7.2%})\n" 48 | ).format(time_total, time_prebinning, p_prebinning, time_solver, 49 | p_solver, time_postprocessing, p_postprocessing) 50 | 51 | print(time_stats) 52 | 53 | 54 | def print_streaming_timing(memory_usage, n_records, n_add, time_add, n_solve, 55 | time_solve): 56 | r_add = time_add / n_add 57 | r_solve = time_solve / n_solve 58 | 59 | records_stats = ( 60 | " Streaming statistics\n" 61 | " Sketch memory usage {:>18.5f} MB\n" 62 | " Processed records {:>18}\n" 63 | " Add operations {:>18}\n" 64 | " Solve operations {:>18}\n" 65 | ).format(memory_usage, n_records, n_add, n_solve) 66 | 67 | time_stats = ( 68 | " Streaming timing\n" 69 | " Time add {:>18.2f} sec ({:6.4f} sec / add)\n" 70 | " Time solve {:>18.2f} sec ({:6.4f} sec / solve)\n" 71 | ).format(time_add, r_add, time_solve, r_solve) 72 | 73 | print(records_stats) 74 | print(time_stats) 75 | 76 | 77 | def print_binning_information(binning_type, print_level, name, status, 78 | solver_type, solver, time_total, time_prebinning, 79 | time_solver, time_optimizer, time_postprocessing, 80 | n_prebins, n_refinements, n_records, n_add, 81 | time_add, n_solve, time_solve, memory_usage, 82 | dict_user_options): 83 | 84 | print_header() 85 | 86 | if print_level == 2: 87 | if binning_type == "optimalbinningsketch": 88 | dict_default_options = optimal_binning_sketch_options 89 | 90 | print_optional_parameters(dict_default_options, dict_user_options) 91 | 92 | if print_level == 0: 93 | print_main_info(name, status, time_total) 94 | elif print_level >= 1: 95 | print_name_status(name, status) 96 | 97 | print_prebinning_statistics(n_prebins, 
"""
Greenwald-Khanna's streaming quantiles.

References:
    [1] M. Greenwald and S. Khanna, "Space-Efficient Online Computation of
        Quantile Summaries", (2001).

        Comment: + improvements (~ 30% faster for large arrays)

    [2] https://github.com/DataDog/sketches-py/tree/master/gkarray
"""

import numpy as np


class Entry:
    def __init__(self, value, g, delta):
        """
        Tuple t = (v, g, delta)

        Parameters
        ----------
        value : float
            value that corresponds to one of the elements of the sequence.

        g : float
            g = r_min(value_[i]) - r_min(value_[i-1])

        delta : float
            r_max - r_min
        """
        self.value = value
        self.g = g
        self.delta = delta


class GK:
    """Greenwald-Khanna's streaming quantiles.

    Parameters
    ----------
    eps : float (default=0.01)
        Relative error epsilon.
    """
    def __init__(self, eps=0.01):
        self.eps = eps

        self.entries = []
        self.incoming = []
        self._min = np.inf
        self._max = -np.inf
        self._count = 0
        self._sum = 0

        # Compress roughly every 1/eps additions to bound the summary size.
        self._compress_threshold = int(1.0 / self.eps) + 1

    def __len__(self):
        # Flush pending values first so the summary size is up to date.
        if len(self.incoming):
            self.merge_compress()
        return len(self.entries)

    def add(self, value):
        """Add value to sketch."""
        self.incoming.append(value)
        self._count += 1
        self._sum += value

        if value < self._min:
            self._min = value
        if value > self._max:
            self._max = value

        if self._count % self._compress_threshold == 0:
            self.merge_compress()

    def copy(self, gk):
        """Copy GK sketch."""
        self.entries = [Entry(e.value, e.g, e.delta) for e in gk.entries]
        self.incoming = gk.incoming[:]
        self._count = gk._count
        self._min = gk._min
        self._max = gk._max
        self._sum = gk._sum

    def merge(self, gk):
        """Merge sketch with another sketch gk.

        Raises
        ------
        ValueError
            If ``gk`` was built with a different ``eps`` and therefore cannot
            be merged. (Was a bare ``Exception``; ``ValueError`` is more
            precise and is still caught by existing ``except Exception``
            handlers.)
        """
        if not self.mergeable(gk):
            raise ValueError("gk does not share signature.")

        if gk._count == 0:
            return

        if self._count == 0:
            self.copy(gk)
            return

        entries = []
        spread = int(gk.eps * (gk.n - 1))
        gk.merge_compress()

        # upper bound elements(gk.v0, gk.v1) - spread
        g = gk.entries[0].g + gk.entries[0].delta - 1 - spread

        if g > 0:
            entries.append(Entry(gk._min, g, 0))

        n_gk = len(gk)
        for i in range(n_gk - 1):
            tp1 = gk.entries[i + 1]
            t = gk.entries[i]
            g = tp1.g + (tp1.delta - t.delta)
            if g > 0:
                entries.append(Entry(t.value, g, 0))

        last_t = gk.entries[n_gk - 1]
        g = spread + 1 - last_t.delta
        if g > 0:
            entries.append(Entry(last_t.value, g, 0))

        self._count += gk._count
        self._min = min(self._min, gk._min)
        self._max = max(self._max, gk._max)
        self._sum += gk._sum

        self.merge_compress(entries)

    def merge_compress(self, entries=None):
        """Compress sketch.

        Parameters
        ----------
        entries : list of Entry or None (default=None)
            Extra entries to fold into the summary; ``None`` means none.
            (BUG FIX: the default was a shared mutable list ``[]`` — the
            classic Python mutable-default-argument pitfall — replaced by
            the ``None`` sentinel.)
        """
        if entries is None:
            entries = []

        remove_threshold = float(2.0 * self.eps * (self._count - 1))

        incoming = [Entry(value, 1, 0) for value in self.incoming]

        if len(entries):
            incoming.extend(Entry(e.value, e.g, e.delta) for e in entries)

        incoming = sorted(incoming, key=lambda e: e.value)

        merged = []
        i = 0
        j = 0
        n_incoming = len(incoming)
        n_entries = len(self.entries)

        # Two-pointer merge of sorted `incoming` and existing `entries`;
        # adjacent tuples whose combined weight stays under the threshold
        # are collapsed to keep the summary small.
        while i < n_incoming or j < n_entries:
            if i == n_incoming:
                t = self.entries[j]
                j += 1
                if j < n_entries:
                    tn = self.entries[j]
                    if t.g + tn.g + tn.delta <= remove_threshold:
                        tn.g += t.g
                        continue
                merged.append(t)
            elif j == n_entries:
                t = incoming[i]
                i += 1
                if i < n_incoming:
                    tn = incoming[i]
                    if t.g + tn.g + tn.delta <= remove_threshold:
                        tn.g += t.g
                        continue
                merged.append(t)
            elif incoming[i].value < self.entries[j].value:
                ti = incoming[i]
                tj = self.entries[j]
                if ti.g + tj.g + tj.delta <= remove_threshold:
                    tj.g += ti.g
                else:
                    ti.delta = tj.g + tj.delta - ti.g
                    merged.append(ti)
                i += 1
            else:
                t = self.entries[j]
                j += 1
                if j < n_entries:
                    tn = self.entries[j]
                    if t.g + tn.g + tn.delta <= remove_threshold:
                        tn.g += t.g
                        continue
                merged.append(t)

        self.entries = merged
        self.incoming = []

    def mergeable(self, gk):
        """Check whether a sketch gk is mergeable."""
        return self.eps == gk.eps

    def quantile(self, q):
        """Calculate quantile q."""
        if not (0 <= q <= 1):
            raise ValueError("q must be a value in [0, 1].")

        if self._count == 0:
            raise ValueError("GK sketch does not contain values.")

        if len(self.incoming):
            self.merge_compress()

        rank = int(q * (self._count - 1) + 1)
        spread = int(self.eps * (self._count - 1))
        g_sum = 0.0
        i = 0

        n_entries = len(self.entries)
        while i < n_entries:
            g_sum += self.entries[i].g
            if g_sum + self.entries[i].delta > rank + spread:
                break
            i += 1
        if i == 0:
            return self._min

        return self.entries[i - 1].value

    @property
    def n(self):
        """Number of records in sketch."""
        return self._count
36 | format(int(n_add.values[-1]), int(n_records.values[-1])), 37 | fontsize=14) 38 | plt.xlabel("Processed records", fontsize=12) 39 | plt.ylabel("Divergence: {}".format(divergence), fontsize=12) 40 | plt.legend(fontsize=12) 41 | 42 | plt.show() 43 | -------------------------------------------------------------------------------- /optbinning/binning/mdlp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Minimum Description Length Principle (MDLP) 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | import numbers 9 | 10 | import numpy as np 11 | 12 | from scipy import special 13 | from sklearn.base import BaseEstimator 14 | from sklearn.exceptions import NotFittedError 15 | from sklearn.utils import check_array 16 | 17 | 18 | def _check_parameters(min_samples_split, min_samples_leaf, max_candidates): 19 | if (not isinstance(min_samples_split, numbers.Integral) or 20 | min_samples_split < 2): 21 | raise ValueError("min_samples_split must be a positive integer >= 2; " 22 | "got {}.".format(min_samples_split)) 23 | 24 | if (not isinstance(min_samples_leaf, numbers.Integral) or 25 | min_samples_leaf < 1): 26 | raise ValueError("min_samples_leaf must be a positive integer >= 1; " 27 | "got {}.".format(min_samples_leaf)) 28 | 29 | if not isinstance(max_candidates, numbers.Integral) or max_candidates < 1: 30 | raise ValueError("max_candidates must be a positive integer >= 1; " 31 | "got {}.".format(max_candidates)) 32 | 33 | 34 | class MDLP(BaseEstimator): 35 | """ 36 | Minimum Description Length Principle (MDLP) discretization algorithm. 37 | 38 | Parameters 39 | ---------- 40 | min_samples_split : int (default=2) 41 | The minimum number of samples required to split an internal node. 42 | 43 | min_samples_leaf : int (default=2) 44 | The minimum number of samples required to be at a leaf node. 
45 | 46 | max_candidates : int (default=32) 47 | The maximum number of split points to evaluate at each partition. 48 | 49 | Notes 50 | ----- 51 | Implementation of the discretization algorithm in [FI93]. A dynamic 52 | split strategy based on binning the number of candidate splits [CMR2001] 53 | is implemented to increase efficiency. For large size datasets, it is 54 | recommended to use a smaller ``max_candidates`` (e.g. 16) to get a 55 | significant speed up. 56 | 57 | References 58 | ---------- 59 | 60 | .. [FI93] U. M. Fayyad and K. B. Irani. "Multi-Interval Discretization of 61 | Continuous-Valued Attributes for Classification Learning". 62 | International Joint Conferences on Artificial Intelligence, 63 | 13:1022–1027, 1993. 64 | 65 | .. [CMR2001] D. M. Chickering, C. Meek and R. Rounthwaite. "Efficient 66 | Determination of Dynamic Split Points in a Decision Tree". In 67 | Proceedings of the 2001 IEEE International Conference on Data 68 | Mining, 91-98, 2001. 69 | """ 70 | def __init__(self, min_samples_split=2, min_samples_leaf=2, 71 | max_candidates=32): 72 | 73 | self.min_samples_split = min_samples_split 74 | self.min_samples_leaf = min_samples_leaf 75 | self.max_candidates = max_candidates 76 | 77 | # auxiliary 78 | self._splits = [] 79 | 80 | self._is_fitted = None 81 | 82 | def fit(self, x, y): 83 | """Fit MDLP discretization algorithm. 84 | 85 | Parameters 86 | ---------- 87 | x : array-like, shape = (n_samples) 88 | Data samples, where n_samples is the number of samples. 89 | 90 | y : array-like, shape = (n_samples) 91 | Target vector relative to x. 
92 | 93 | Returns 94 | ------- 95 | self : MDLP 96 | """ 97 | return self._fit(x, y) 98 | 99 | def _fit(self, x, y): 100 | _check_parameters(**self.get_params()) 101 | 102 | x = check_array(x, ensure_2d=False, force_all_finite=True) 103 | y = check_array(y, ensure_2d=False, force_all_finite=True) 104 | 105 | idx = np.argsort(x) 106 | x = x[idx] 107 | y = y[idx] 108 | 109 | self._recurse(x, y, 0) 110 | 111 | self._is_fitted = True 112 | 113 | return self 114 | 115 | def _recurse(self, x, y, id): 116 | u_x = np.unique(x) 117 | n_x = len(u_x) 118 | n_y = len(np.bincount(y)) 119 | 120 | split = self._find_split(u_x, x, y) 121 | 122 | if split is not None: 123 | self._splits.append(split) 124 | t = np.searchsorted(x, split, side="right") 125 | 126 | if not self._terminate(n_x, n_y, y, y[:t], y[t:]): 127 | self._recurse(x[:t], y[:t], id + 1) 128 | self._recurse(x[t:], y[t:], id + 2) 129 | 130 | def _find_split(self, u_x, x, y): 131 | n_x = len(x) 132 | u_x = np.unique(0.5 * (x[1:] + x[:-1])[(y[1:] - y[:-1]) != 0]) 133 | 134 | if len(u_x) > self.max_candidates: 135 | percentiles = np.linspace(1, 100, self.max_candidates) 136 | splits = np.percentile(u_x, percentiles) 137 | else: 138 | splits = u_x 139 | 140 | max_entropy_gain = 0 141 | best_split = None 142 | 143 | tt = np.searchsorted(x, splits, side="right") 144 | for i, t in enumerate(tt): 145 | samples_l = t >= self.min_samples_leaf 146 | samples_r = n_x - t >= self.min_samples_leaf 147 | 148 | if samples_l and samples_r: 149 | entropy_gain = self._entropy_gain(y, y[:t], y[t:]) 150 | if entropy_gain > max_entropy_gain: 151 | max_entropy_gain = entropy_gain 152 | best_split = splits[i] 153 | 154 | return best_split 155 | 156 | def _entropy(self, x): 157 | n = len(x) 158 | ns1 = np.sum(x) 159 | ns0 = n - ns1 160 | p = np.array([ns0, ns1]) / n 161 | return -special.xlogy(p, p).sum() 162 | 163 | def _entropy_gain(self, y, y1, y2): 164 | n = len(y) 165 | n1 = len(y1) 166 | n2 = n - n1 167 | ent_y = self._entropy(y) 168 | 
ent_y1 = self._entropy(y1) 169 | ent_y2 = self._entropy(y2) 170 | return ent_y - (n1 * ent_y1 + n2 * ent_y2) / n 171 | 172 | def _terminate(self, n_x, n_y, y, y1, y2): 173 | splittable = (n_x >= self.min_samples_split) and (n_y >= 2) 174 | 175 | n = len(y) 176 | n1 = len(y1) 177 | n2 = n - n1 178 | ent_y = self._entropy(y) 179 | ent_y1 = self._entropy(y1) 180 | ent_y2 = self._entropy(y2) 181 | gain = ent_y - (n1 * ent_y1 + n2 * ent_y2) / n 182 | 183 | k = len(np.bincount(y)) 184 | k1 = len(np.bincount(y1)) 185 | k2 = len(np.bincount(y2)) 186 | 187 | t0 = np.log(3**k - 2) 188 | t1 = k * ent_y 189 | t2 = k1 * ent_y1 190 | t3 = k2 * ent_y2 191 | delta = t0 - (t1 - t2 - t3) 192 | 193 | return gain <= (np.log(n - 1) + delta) / n or not splittable 194 | 195 | @property 196 | def splits(self): 197 | """List of split points 198 | 199 | Returns 200 | ------- 201 | splits : numpy.ndarray 202 | """ 203 | if not self._is_fitted: 204 | raise NotFittedError("This {} instance is not fitted yet. Call " 205 | "'fit' with appropriate arguments." 206 | .format(self.__class__.__name__)) 207 | 208 | return np.sort(self._splits) 209 | -------------------------------------------------------------------------------- /optbinning/binning/multiclass_cp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generalized assigment problem: solve constrained multiclass optimal binning 3 | problem. Constraint programming implementation. 
4 | """ 5 | 6 | # Guillermo Navas-Palencia 7 | # Copyright (C) 2019 8 | 9 | from ortools.sat.python import cp_model 10 | 11 | from .cp import BinningCP 12 | from .model_data import multiclass_model_data 13 | 14 | 15 | class MulticlassBinningCP(BinningCP): 16 | def __init__(self, monotonic_trend, min_n_bins, max_n_bins, min_bin_size, 17 | max_bin_size, min_event_rate_diff, max_pvalue, 18 | max_pvalue_policy, user_splits_fixed, time_limit): 19 | 20 | self.monotonic_trend = monotonic_trend 21 | 22 | self.min_n_bins = min_n_bins 23 | self.max_n_bins = max_n_bins 24 | self.min_bin_size = min_bin_size 25 | self.max_bin_size = max_bin_size 26 | 27 | self.min_event_rate_diff = min_event_rate_diff 28 | self.max_pvalue = max_pvalue 29 | self.max_pvalue_policy = max_pvalue_policy 30 | self.user_splits_fixed = user_splits_fixed 31 | self.time_limit = time_limit 32 | 33 | self.solver_ = None 34 | 35 | # Auxiliary 36 | self._is_scenario_binning = False 37 | self._model = None 38 | self._n = None 39 | self._x = None 40 | 41 | def build_model(self, n_nonevent, n_event, trend_changes): 42 | # Parameters 43 | M = int(1e6) 44 | (D, V, pvalue_violation_indices, 45 | min_diff_violation_indices) = multiclass_model_data( 46 | n_nonevent, n_event, self.max_pvalue, self.max_pvalue_policy, 47 | self.min_event_rate_diff, M) 48 | 49 | n = len(n_nonevent) 50 | n_records = n_nonevent + n_event 51 | n_classes = len(self.monotonic_trend) 52 | 53 | # Initialize model 54 | model = cp_model.CpModel() 55 | 56 | # Decision variables 57 | x, y, t, d, u, bin_size_diff = self.decision_variables( 58 | model, n, n_classes) 59 | 60 | # Objective function 61 | model.Maximize(sum([sum([(V[c][i][i] * x[i, i]) + 62 | sum([(V[c][i][j] - V[c][i][j+1]) * x[i, j] 63 | for j in range(i)]) for i in range(n)]) 64 | for c in range(n_classes)])) 65 | 66 | # Constraint: unique assignment 67 | self.add_constraint_unique_assignment(model, n, x) 68 | 69 | # Constraint: continuity 70 | self.add_constraint_continuity(model, 
n, x) 71 | 72 | # Constraint: min / max bins 73 | self.add_constraint_min_max_bins(model, n, x, d) 74 | 75 | # Constraint: min / max bin size 76 | self.add_constraint_min_max_bin_size(model, n, x, u, n_records, 77 | bin_size_diff) 78 | 79 | # Constraints: monotonicity 80 | for c in range(n_classes): 81 | if self.monotonic_trend[c] == "ascending": 82 | self.add_constraint_monotonic_ascending(model, n, D[c], x, M) 83 | 84 | if self.monotonic_trend[c] == "descending": 85 | self.add_constraint_monotonic_descending(model, n, D[c], x, M) 86 | 87 | elif self.monotonic_trend[c] in ("peak", "valley"): 88 | for i in range(n): 89 | model.Add(t[c] >= i - n * (1 - y[c, i])) 90 | model.Add(t[c] <= i + n * y[c, i]) 91 | 92 | if self.monotonic_trend[c] == "peak": 93 | self.add_constraint_monotonic_peak( 94 | model, n, D[c], x, c, y, M) 95 | else: 96 | self.add_constraint_monotonic_valley( 97 | model, n, D[c], x, c, y, M) 98 | 99 | elif self.monotonic_trend == "peak_heuristic": 100 | self.add_constraint_monotonic_peak_heuristic( 101 | model, n, D[c], x, trend_changes[c], M) 102 | 103 | elif self.monotonic_trend == "valley_heuristic": 104 | self.add_constraint_monotonic_valley_heuristic( 105 | model, n, D[c], x, trend_changes[c], M) 106 | 107 | # Constraint: max-pvalue 108 | for c in range(n_classes): 109 | self.add_constraint_violation(model, x, 110 | pvalue_violation_indices[c]) 111 | 112 | # Constraint: min diff 113 | for c in range(n_classes): 114 | self.add_constraint_violation(model, x, 115 | min_diff_violation_indices[c]) 116 | 117 | # Constraint: fixed splits 118 | self.add_constraint_fixed_splits(model, n, x) 119 | 120 | self._model = model 121 | self._x = x 122 | self._n = n 123 | 124 | def decision_variables(self, model, n, n_classes): 125 | x = {} 126 | for i in range(n): 127 | for j in range(i + 1): 128 | x[i, j] = model.NewBoolVar("x[{}, {}]".format(i, j)) 129 | 130 | y = None 131 | t = None 132 | d = None 133 | u = None 134 | bin_size_diff = None 135 | 136 | if "peak" 
in self.monotonic_trend or "valley" in self.monotonic_trend: 137 | # Auxiliary binary variables 138 | y = {} 139 | t = {} 140 | for c in range(n_classes): 141 | if self.monotonic_trend[c] in ("peak", "valley"): 142 | for i in range(n): 143 | y[c, i] = model.NewBoolVar("y[{}]".format(i)) 144 | 145 | # Change points 146 | t[c] = model.NewIntVar(0, n, "t[{}]".format(c)) 147 | 148 | if self.min_n_bins is not None and self.max_n_bins is not None: 149 | n_bin_diff = self.max_n_bins - self.min_n_bins 150 | 151 | # Range constraints auxiliary variables 152 | d = model.NewIntVar(0, n_bin_diff, "n_bin_diff") 153 | 154 | if self.min_bin_size is not None and self.max_bin_size is not None: 155 | bin_size_diff = self.max_bin_size - self.min_bin_size 156 | 157 | # Range constraints auxiliary variables 158 | u = {} 159 | for i in range(n): 160 | u[i] = model.NewIntVar(0, bin_size_diff, "u[{}]".format(i)) 161 | 162 | return x, y, t, d, u, bin_size_diff 163 | 164 | def add_constraint_monotonic_peak(self, model, n, D, x, c, y, M): 165 | for i in range(1, n): 166 | for z in range(i): 167 | model.Add( 168 | M * (y[c, i] + y[c, z]) + M + (D[z][z] - M) * x[z, z] + 169 | sum([(D[z][j] - D[z][j+1]) * x[z, j] 170 | for j in range(z)]) - 171 | sum([(D[i][j] - D[i][j + 1]) * x[i, j] 172 | for j in range(i)]) - 173 | D[i][i] * x[i, i] >= 0) 174 | 175 | model.Add( 176 | M * (2 - y[c, i] - y[c, z]) + M + (D[i][i] - M) * x[i, i] + 177 | sum([(D[i][j] - D[i][j + 1]) * x[i, j] 178 | for j in range(i)]) - 179 | sum([(D[z][j] - D[z][j+1]) * x[z, j] 180 | for j in range(z)]) - 181 | D[z][z] * x[z, z] >= 0) 182 | 183 | def add_constraint_monotonic_valley(self, model, n, D, x, c, y, M): 184 | for i in range(1, n): 185 | for z in range(i): 186 | model.Add( 187 | M * (y[c, i] + y[c, z]) + M + (D[i][i] - M) * x[i, i] + 188 | sum([(D[i][j] - D[i][j + 1]) * x[i, j] 189 | for j in range(i)]) - 190 | sum([(D[z][j] - D[z][j+1]) * x[z, j] 191 | for j in range(z)]) - 192 | D[z][z] * x[z, z] >= 0) 193 | 194 | 
model.Add( 195 | M * (2 - y[c, i] - y[c, z]) + M + (D[z][z] - M) * x[z, z] + 196 | sum([(D[z][j] - D[z][j+1]) * x[z, j] 197 | for j in range(z)]) - 198 | sum([(D[i][j] - D[i][j + 1]) * x[i, j] 199 | for j in range(i)]) - 200 | D[i][i] * x[i, i] >= 0) 201 | -------------------------------------------------------------------------------- /optbinning/binning/multidimensional/__init__.py: -------------------------------------------------------------------------------- 1 | from .binning_2d import OptimalBinning2D 2 | from .continuous_binning_2d import ContinuousOptimalBinning2D 3 | 4 | __all__ = ['ContinuousOptimalBinning2D', 5 | 'OptimalBinning2D'] 6 | -------------------------------------------------------------------------------- /optbinning/binning/multidimensional/cp_2d.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generalized assigment problem: solve constrained optimal 2D binning problem. 3 | Constraint programming implementation. 4 | """ 5 | 6 | # Guillermo Navas-Palencia 7 | # Copyright (C) 2021 8 | 9 | import numpy as np 10 | 11 | from ortools.sat.python import cp_model 12 | 13 | 14 | class Binning2DCP: 15 | def __init__(self, monotonic_trend_x, monotonic_trend_y, min_n_bins, 16 | max_n_bins, min_diff_x, min_diff_y, gamma, n_jobs, 17 | time_limit): 18 | 19 | self.monotonic_trend_x = monotonic_trend_x 20 | self.monotonic_trend_y = monotonic_trend_y 21 | self.min_n_bins = min_n_bins 22 | self.max_n_bins = max_n_bins 23 | self.min_diff_x = min_diff_x 24 | self.min_diff_y = min_diff_y 25 | self.gamma = gamma 26 | 27 | self.n_jobs = n_jobs 28 | self.time_limit = time_limit 29 | 30 | self.solver_ = None 31 | self.event_rate_ = None 32 | self.iv_ = None 33 | 34 | self._model = None 35 | self._x = None 36 | self._n_rectangles = None 37 | 38 | def build_model(self, n_grid, n_rectangles, cols, c, d_connected_x, 39 | d_connected_y, er, n_records): 40 | # Parameters 41 | scale = int(1e6) 42 | 43 | # Initialize model 44 | 
model = cp_model.CpModel() 45 | 46 | # Decision variables 47 | x, d = self.decision_variables(model, n_rectangles) 48 | 49 | # Objective function 50 | if self.gamma: 51 | total_records = int(n_records.sum()) 52 | regularization = int(np.ceil(scale * self.gamma / total_records)) 53 | pmax = model.NewIntVar(0, total_records, "pmax") 54 | pmin = model.NewIntVar(0, total_records, "pmin") 55 | 56 | model.Maximize(sum([c[i] * x[i] for i in range(n_rectangles)]) - 57 | regularization * (pmax - pmin)) 58 | else: 59 | model.Maximize(sum([c[i] * x[i] for i in range(n_rectangles)])) 60 | 61 | # Constraint: unique assignment 62 | self.add_constraint_unique_assignment(model, x, n_grid, cols) 63 | 64 | # Constraint: min / max bins 65 | self.add_constraint_min_max_bins(model, n_rectangles, x, d) 66 | 67 | # Constraint: monotonicity 68 | self.add_constraint_monotonic( 69 | model, n_rectangles, x, er, d_connected_x, d_connected_y, 70 | self.min_diff_x, self.min_diff_y) 71 | 72 | # Constraint: reduction of dominating bins 73 | if self.gamma: 74 | for i in range(n_rectangles): 75 | bin_size = n_records[i] * x[i] 76 | 77 | model.Add(pmin <= total_records * (1 - x[i]) + bin_size) 78 | model.Add(pmax >= bin_size) 79 | model.Add(pmin <= pmax) 80 | 81 | # Save data for post-processing 82 | self._model = model 83 | self._x = x 84 | self._n_rectangles = n_rectangles 85 | 86 | def solve(self): 87 | # Solve 88 | self.solver_ = cp_model.CpSolver() 89 | if self.n_jobs > 1: 90 | self.solver_.parameters.num_search_workers = self.n_jobs 91 | else: 92 | self.solver_.parameters.linearization_level = 2 93 | 94 | self.solver_.parameters.max_time_in_seconds = self.time_limit 95 | 96 | status = self.solver_.Solve(self._model) 97 | status_name = self.solver_.StatusName(status) 98 | 99 | if status in (cp_model.OPTIMAL, cp_model.FEASIBLE): 100 | solution = np.array([self.solver_.BooleanValue(self._x[i]) 101 | for i in range(self._n_rectangles)]) 102 | else: 103 | solution = 
np.zeros(self._n_rectangles).astype(np.bool) 104 | 105 | return status_name, solution 106 | 107 | def decision_variables(self, model, n_rectangles): 108 | x = {} 109 | for i in range(n_rectangles): 110 | x[i] = model.NewBoolVar("x[{}]".format(i)) 111 | 112 | d = None 113 | 114 | if self.min_n_bins is not None and self.max_n_bins is not None: 115 | n_bin_diff = self.max_n_bins - self.min_n_bins 116 | 117 | # Range constraints auxiliary variables 118 | d = model.NewIntVar(0, n_bin_diff, "n_bin_diff") 119 | 120 | return x, d 121 | 122 | def add_constraint_unique_assignment(self, model, x, n_grid, cols): 123 | for j in range(n_grid): 124 | model.Add(sum([x[i] for i in cols[j]]) == 1) 125 | 126 | def add_constraint_min_max_bins(self, model, n_rectangles, x, d): 127 | if self.min_n_bins is not None or self.max_n_bins is not None: 128 | n_bins = sum([x[i] for i in range(n_rectangles)]) 129 | 130 | if self.min_n_bins is not None and self.max_n_bins is not None: 131 | model.Add(d + n_bins - self.max_n_bins == 0) 132 | elif self.min_n_bins is not None: 133 | model.Add(n_bins >= self.min_n_bins) 134 | elif self.max_n_bins is not None: 135 | model.Add(n_bins <= self.max_n_bins) 136 | 137 | def add_constraint_monotonic(self, model, n_rectangles, x, 138 | er, d_connected_x, d_connected_y, min_diff_x, 139 | min_diff_y): 140 | 141 | if (self.monotonic_trend_x is not None and 142 | self.monotonic_trend_y is not None): 143 | for i in range(n_rectangles): 144 | ind_x = [] 145 | ind_y = [] 146 | for j in d_connected_x[i]: 147 | if self.monotonic_trend_x == "ascending": 148 | if er[i] + min_diff_x >= er[j]: 149 | ind_x.append(j) 150 | elif self.monotonic_trend_x == "descending": 151 | if er[i] <= er[j] + min_diff_x: 152 | ind_x.append(j) 153 | 154 | if ind_x: 155 | model.Add(sum([x[j] for j in ind_x]) <= 156 | len(ind_x) * (1 - x[i])) 157 | 158 | for j in d_connected_y[i]: 159 | if self.monotonic_trend_y == "ascending": 160 | if er[i] + min_diff_y >= er[j]: 161 | ind_y.append(j) 162 | 
elif self.monotonic_trend_y == "descending": 163 | if er[i] <= er[j] + min_diff_y: 164 | ind_y.append(j) 165 | 166 | if ind_y: 167 | model.Add(sum([x[j] for j in ind_y]) <= 168 | len(ind_y) * (1 - x[i])) 169 | 170 | elif self.monotonic_trend_x is not None: 171 | for i in range(n_rectangles): 172 | ind_x = [] 173 | for j in d_connected_x[i]: 174 | if self.monotonic_trend_x == "ascending": 175 | if er[i] + min_diff_x >= er[j]: 176 | ind_x.append(j) 177 | elif self.monotonic_trend_x == "descending": 178 | if er[i] <= er[j] + min_diff_x: 179 | ind_x.append(j) 180 | 181 | if ind_x: 182 | model.Add(sum([x[j] for j in ind_x]) <= 183 | len(ind_x) * (1 - x[i])) 184 | 185 | elif self.monotonic_trend_y is not None: 186 | for i in range(n_rectangles): 187 | ind_y = [] 188 | for j in d_connected_y[i]: 189 | if self.monotonic_trend_y == "ascending": 190 | if er[i] + min_diff_y >= er[j]: 191 | ind_y.append(j) 192 | elif self.monotonic_trend_y == "descending": 193 | if er[i] <= er[j] + min_diff_y: 194 | ind_y.append(j) 195 | 196 | if ind_y: 197 | model.Add(sum([x[j] for j in ind_y]) <= 198 | len(ind_y) * (1 - x[i])) 199 | -------------------------------------------------------------------------------- /optbinning/binning/multidimensional/mip_2d.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generalized assigment problem: solve constrained optimal 2D binning problem. 3 | Mixed-Integer programming implementation. 
4 | """ 5 | 6 | # Guillermo Navas-Palencia 7 | # Copyright (C) 2021 8 | 9 | import numpy as np 10 | 11 | from ortools.linear_solver import pywraplp 12 | 13 | 14 | class Binning2DMIP: 15 | def __init__(self, monotonic_trend_x, monotonic_trend_y, min_n_bins, 16 | max_n_bins, min_diff_x, min_diff_y, gamma, n_jobs, 17 | time_limit): 18 | 19 | self.monotonic_trend_x = monotonic_trend_x 20 | self.monotonic_trend_y = monotonic_trend_y 21 | self.min_n_bins = min_n_bins 22 | self.max_n_bins = max_n_bins 23 | self.min_diff_x = min_diff_x 24 | self.min_diff_y = min_diff_y 25 | self.gamma = gamma 26 | 27 | self.n_jobs = n_jobs 28 | self.time_limit = time_limit 29 | 30 | self.solver_ = None 31 | self.event_rate_ = None 32 | self.iv_ = None 33 | 34 | self._model = None 35 | self._x = None 36 | self._n_rectangles = None 37 | 38 | def build_model(self, n_grid, n_rectangles, cols, c, d_connected_x, 39 | d_connected_y, er, n_records): 40 | # Initialize solver 41 | solver = pywraplp.Solver( 42 | 'BinningMIP', pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING) 43 | 44 | # Decision variables 45 | x, d = self.decision_variables(solver, n_rectangles) 46 | 47 | # Objective function 48 | if self.gamma: 49 | total_records = int(n_records.sum()) 50 | regularization = self.gamma / total_records 51 | pmax = solver.NumVar(0, total_records, "pmax") 52 | pmin = solver.NumVar(0, total_records, "pmin") 53 | 54 | solver.Maximize( 55 | solver.Sum([c[i] * x[i] for i in range(n_rectangles)]) - 56 | regularization * (pmax - pmin)) 57 | else: 58 | solver.Maximize( 59 | solver.Sum([c[i] * x[i] for i in range(n_rectangles)])) 60 | 61 | # Constraint: unique assignment 62 | self.add_constraint_unique_assignment(solver, x, n_grid, cols) 63 | 64 | # Constraint: min / max bins 65 | self.add_constraint_min_max_bins(solver, n_rectangles, x, d) 66 | 67 | # Constraint: monotonicity 68 | self.add_constraint_monotonic( 69 | solver, n_rectangles, x, er, d_connected_x, d_connected_y, 70 | self.min_diff_x, self.min_diff_y) 
71 | 72 | # Constraint: reduction of dominating bins 73 | if self.gamma: 74 | for i in range(n_rectangles): 75 | bin_size = n_records[i] * x[i] 76 | 77 | solver.Add(pmin <= total_records * (1 - x[i]) + bin_size) 78 | solver.Add(pmax >= bin_size) 79 | solver.Add(pmin <= pmax) 80 | 81 | # Save data for post-processing 82 | self.solver_ = solver 83 | self._x = x 84 | self._n_rectangles = n_rectangles 85 | 86 | def solve(self): 87 | # Solve 88 | self.solver_.SetTimeLimit(self.time_limit * 1000) 89 | self.solver_.SetNumThreads(self.n_jobs) 90 | status = self.solver_.Solve() 91 | 92 | if status in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE): 93 | if status == pywraplp.Solver.OPTIMAL: 94 | status_name = "OPTIMAL" 95 | else: 96 | status_name = "FEASIBLE" 97 | 98 | solution = np.array([self._x[i].solution_value() 99 | for i in range(self._n_rectangles)]) 100 | 101 | solution = solution.astype(bool) 102 | else: 103 | if status == pywraplp.Solver.ABNORMAL: 104 | status_name = "ABNORMAL" 105 | elif status == pywraplp.Solver.INFEASIBLE: 106 | status_name = "INFEASIBLE" 107 | elif status == pywraplp.Solver.UNBOUNDED: 108 | status_name = "UNBOUNDED" 109 | else: 110 | status_name = "UNKNOWN" 111 | 112 | solution = np.zeros(self._n_rectangles, dtype=bool) 113 | 114 | return status_name, solution 115 | 116 | def decision_variables(self, solver, n_rectangles): 117 | x = {} 118 | 119 | for i in range(n_rectangles): 120 | x[i] = solver.BoolVar("x[{}]".format(i)) 121 | 122 | d = None 123 | 124 | if self.min_n_bins is not None and self.max_n_bins is not None: 125 | n_bin_diff = self.max_n_bins - self.min_n_bins 126 | 127 | # Range constraints auxiliary variables 128 | d = solver.NumVar(0, n_bin_diff, "n_bin_diff") 129 | 130 | return x, d 131 | 132 | def add_constraint_unique_assignment(self, solver, x, n_grid, cols): 133 | for j in range(n_grid): 134 | solver.Add(solver.Sum([x[i] for i in cols[j]]) == 1) 135 | 136 | def add_constraint_min_max_bins(self, solver, n_rectangles, x, 
d): 137 | if self.min_n_bins is not None or self.max_n_bins is not None: 138 | n_bins = solver.Sum([x[i] for i in range(n_rectangles)]) 139 | 140 | if self.min_n_bins is not None and self.max_n_bins is not None: 141 | solver.Add(d + n_bins - self.max_n_bins == 0) 142 | elif self.min_n_bins is not None: 143 | solver.Add(n_bins >= self.min_n_bins) 144 | elif self.max_n_bins is not None: 145 | solver.Add(n_bins <= self.max_n_bins) 146 | 147 | def add_constraint_monotonic(self, solver, n_rectangles, x, er, 148 | d_connected_x, d_connected_y, min_diff_x, 149 | min_diff_y): 150 | 151 | if (self.monotonic_trend_x is not None and 152 | self.monotonic_trend_y is not None): 153 | for i in range(n_rectangles): 154 | ind_x = [] 155 | ind_y = [] 156 | for j in d_connected_x[i]: 157 | if self.monotonic_trend_x == "ascending": 158 | if er[i] + min_diff_x >= er[j]: 159 | ind_x.append(j) 160 | elif self.monotonic_trend_x == "descending": 161 | if er[i] <= er[j] + min_diff_x: 162 | ind_x.append(j) 163 | 164 | if ind_x: 165 | solver.Add(solver.Sum([x[j] for j in ind_x]) <= 166 | len(ind_x) * (1 - x[i])) 167 | 168 | for j in d_connected_y[i]: 169 | if self.monotonic_trend_y == "ascending": 170 | if er[i] + min_diff_y >= er[j]: 171 | ind_y.append(j) 172 | elif self.monotonic_trend_y == "descending": 173 | if er[i] <= er[j] + min_diff_y: 174 | ind_y.append(j) 175 | 176 | if ind_y: 177 | solver.Add(solver.Sum([x[j] for j in ind_y]) <= 178 | len(ind_y) * (1 - x[i])) 179 | 180 | elif self.monotonic_trend_x is not None: 181 | for i in range(n_rectangles): 182 | ind_x = [] 183 | for j in d_connected_x[i]: 184 | if self.monotonic_trend_x == "ascending": 185 | if er[i] + min_diff_x >= er[j]: 186 | ind_x.append(j) 187 | elif self.monotonic_trend_x == "descending": 188 | if er[i] <= er[j] + min_diff_x: 189 | ind_x.append(j) 190 | 191 | if ind_x: 192 | solver.Add(solver.Sum([x[j] for j in ind_x]) <= 193 | len(ind_x) * (1 - x[i])) 194 | 195 | elif self.monotonic_trend_y is not None: 196 | for i in 
range(n_rectangles): 197 | ind_y = [] 198 | for j in d_connected_y[i]: 199 | if self.monotonic_trend_y == "ascending": 200 | if er[i] + min_diff_y >= er[j]: 201 | ind_y.append(j) 202 | elif self.monotonic_trend_y == "descending": 203 | if er[i] <= er[j] + min_diff_y: 204 | ind_y.append(j) 205 | 206 | if ind_y: 207 | solver.Add(solver.Sum([x[j] for j in ind_y]) <= 208 | len(ind_y) * (1 - x[i])) 209 | -------------------------------------------------------------------------------- /optbinning/binning/multidimensional/preprocessing_2d.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocessing 2D functions. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from sklearn.utils import check_array 12 | from sklearn.utils import check_consistent_length 13 | 14 | from ..preprocessing import categorical_transform 15 | 16 | 17 | def split_data_2d(dtype_x, dtype_y, x, y, z, special_codes_x=None, 18 | special_codes_y=None, check_input=True): 19 | """Split 2d data into clean, missing and special values data. 20 | 21 | Parameters 22 | ---------- 23 | dtype_x : str, optional (default="numerical") 24 | The data type of variable x. Supported data type is "numerical" for 25 | continuous and ordinal variables. 26 | 27 | dtype_y : str, optional (default="numerical") 28 | The data type of variable y. Supported data type is "numerical" for 29 | continuous and ordinal variables. 30 | 31 | x : array-like, shape = (n_samples,) 32 | Training vector x, where n_samples is the number of samples. 33 | 34 | y : array-like, shape = (n_samples,) 35 | Training vector y, where n_samples is the number of samples. 36 | 37 | z : array-like, shape = (n_samples,) 38 | Target vector relative to x and y. 39 | 40 | special_codes_x : array-like or None, optional (default=None) 41 | List of special codes for the variable x. 
Use special codes to specify 42 | the data values that must be treated separately. 43 | 44 | special_codes_y : array-like or None, optional (default=None) 45 | List of special codes for the variable y. Use special codes to specify 46 | the data values that must be treated separately. 47 | 48 | check_input : bool, (default=True) 49 | If False, the input arrays x and y will not be checked. 50 | 51 | Returns 52 | ------- 53 | """ 54 | if check_input: 55 | x = check_array(x, ensure_2d=False, dtype=None, 56 | force_all_finite='allow-nan') 57 | 58 | y = check_array(y, ensure_2d=False, dtype=None, 59 | force_all_finite='allow-nan') 60 | 61 | z = check_array(z, ensure_2d=False, dtype=None, 62 | force_all_finite=True) 63 | 64 | check_consistent_length(x, y, z) 65 | 66 | x = np.asarray(x) 67 | y = np.asarray(y) 68 | z = np.asarray(z) 69 | 70 | if np.issubdtype(x.dtype, np.number) and np.issubdtype(z.dtype, np.number): 71 | missing_mask_x = np.isnan(x) | np.isnan(z) 72 | else: 73 | missing_mask_x = pd.isnull(x) | pd.isnull(z) 74 | 75 | if np.issubdtype(y.dtype, np.number) and np.issubdtype(z.dtype, np.number): 76 | missing_mask_y = np.isnan(y) | np.isnan(z) 77 | else: 78 | missing_mask_y = pd.isnull(y) | pd.isnull(z) 79 | 80 | if special_codes_x is not None: 81 | special_mask_x = pd.Series(x).isin(special_codes_x).values 82 | else: 83 | special_mask_x = np.zeros(len(x), dtype=bool) 84 | 85 | if special_codes_y is not None: 86 | special_mask_y = pd.Series(y).isin(special_codes_y).values 87 | else: 88 | special_mask_y = np.zeros(len(y), dtype=bool) 89 | 90 | missing_mask = missing_mask_x | missing_mask_y 91 | special_mask = special_mask_x | special_mask_y 92 | 93 | clean_mask = ~missing_mask & ~special_mask 94 | 95 | x_clean = x[clean_mask] 96 | y_clean = y[clean_mask] 97 | z_clean = z[clean_mask] 98 | 99 | x_missing = x[missing_mask] 100 | y_missing = y[missing_mask] 101 | z_missing = z[missing_mask] 102 | 103 | x_special = x[special_mask] 104 | y_special = y[special_mask] 105 
"""
Univariate outlier detection methods.
"""

# Guillermo Navas-Palencia
# Copyright (C) 2020

import numbers

import numpy as np

from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError


class OutlierDetector:
    """Base class for all outlier detectors."""
    def __init__(self):
        # Boolean mask over the fitted samples; True marks an outlier.
        self._support = None

        # flag
        self._is_fitted = False

    def fit(self, x, y=None):
        """Fit outlier detector.

        Parameters
        ----------
        x : array-like, shape = (n_samples)

        y : array-like, shape = (n_samples) or None (default=None)

        Returns
        -------
        self : OutlierDetector
        """
        self._fit(x, y)

        return self

    def get_support(self, indices=False):
        """Get a mask, or integer index, of the samples excluded, i.e,
        samples detected as outliers.

        Parameters
        ----------
        indices : boolean (default False)
            If True, the return value will be an array of integers, rather
            than a boolean mask.

        Returns
        -------
        support : array, shape = (n_samples)
            An index that selects the excluded samples from a vector.
            If `indices` is False, this is a boolean array, in which an
            element is True iff its corresponding sample is excluded. If
            `indices` is True, this is an integer array whose values are
            indices into the input vector.
        """
        if not self._is_fitted:
            raise NotFittedError("This {} instance is not fitted yet. Call "
                                 "'fit' with appropriate arguments."
                                 .format(self.__class__.__name__))

        mask = self._support
        return mask if not indices else np.where(mask)[0]


class RangeDetector(BaseEstimator, OutlierDetector):
    r"""Interquartile range or interval based outlier detection method.

    The default settings compute the usual interquartile range method.

    Parameters
    ----------
    interval_length : float (default=0.5)
        Compute ``interval_length``\% credible interval. This is a value in
        [0, 1].

    k : float (default=1.5)
        Tukey's factor.

    method : str (default="ETI")
        Method to compute credible intervals. Supported methods are Highest
        Density interval (``method="HDI"``) and Equal-tailed interval
        (``method="ETI"``).
    """
    def __init__(self, interval_length=0.5, k=1.5, method="ETI"):
        self.interval_length = interval_length
        self.k = k
        self.method = method

    def _fit(self, x, y=None):
        if self.method not in ("ETI", "HDI"):
            raise ValueError('Invalid value for method. Allowed string '
                             'values are "ETI" and "HDI".')

        if (not isinstance(self.interval_length, numbers.Number) or
                not 0 <= self.interval_length <= 1):
            # Fixed grammar of the original message ("must a value").
            raise ValueError("Interval length must be a value in [0, 1]; "
                             "got {}.".format(self.interval_length))

        if self.method == "ETI":
            # Equal-tailed interval: cut (1 - interval_length) mass
            # symmetrically from both tails.
            lower = 100 * (1 - self.interval_length) / 2
            upper = 100 * (1 + self.interval_length) / 2

            lb, ub = np.percentile(x, [lower, upper])
        else:
            # Highest density interval: narrowest window that contains
            # ceil(interval_length * n) of the sorted samples.
            # NOTE(review): assumes interval_length < 1 so the candidate
            # array `ci` is non-empty -- confirm for edge values.
            n = len(x)
            xsorted = np.sort(x)
            n_included = int(np.ceil(self.interval_length * n))
            n_ci = n - n_included
            ci = xsorted[n_included:] - xsorted[:n_ci]
            j = np.argmin(ci)
            hdi_min = xsorted[j]
            hdi_max = xsorted[j + n_included]

            lb = hdi_min
            ub = hdi_max

        # Tukey's fences around the credible interval.
        iqr = ub - lb
        lower_bound = lb - self.k * iqr
        upper_bound = ub + self.k * iqr

        self._support = (x > upper_bound) | (x < lower_bound)

        self._is_fitted = True


class ModifiedZScoreDetector(BaseEstimator, OutlierDetector):
    """Modified Z-score method.

    Parameters
    ----------
    threshold : float (default=3.5)
        Modified Z-scores with an absolute value of greater than the
        threshold are labeled as outliers.

    References
    ----------

    .. [IH93] B. Iglewicz and D. Hoaglin. "Volume 16: How to Detect and
              Handle Outliers", The ASQC Basic References in Quality
              Control: Statistical Techniques, Edward F. Mykytka, Ph.D.,
              Editor, 1993.
    """
    def __init__(self, threshold=3.5):
        self.threshold = threshold

    def _fit(self, x, y=None):
        if (not isinstance(self.threshold, numbers.Number) or
                self.threshold < 0):
            raise ValueError("threshold must be a value >= 0; got {}".
                             format(self.threshold))

        x = np.asarray(x)
        median = np.median(x)
        # Median absolute deviation; 0.6745 rescales it to be consistent
        # with the standard deviation under normality (Iglewicz-Hoaglin).
        # NOTE(review): mad == 0 (constant data) yields a zero division and
        # NaN/inf scores -- confirm upstream data is non-degenerate.
        mad = np.median(np.abs(x - median))
        m_z_score = 0.6745 * (x - median) / mad

        self._support = np.abs(m_z_score) > self.threshold

        self._is_fitted = True


class YQuantileDetector(BaseEstimator, OutlierDetector):
    """Outlier detector on the y-axis over quantiles.

    Parameters
    ----------
    outlier_detector : str, optional (default="zscore")
        The outlier detection method. Supported methods are "range" to use
        the interquartile range based method or "zscore" to use the
        modified Z-score method.

    outlier_params : dict or None, optional (default=None)
        Dictionary of parameters to pass to the outlier detection method.

    n_bins : int (default=5)
        The maximum number of bins to consider.
    """
    def __init__(self, outlier_detector="zscore", outlier_params=None,
                 n_bins=5):
        self.outlier_detector = outlier_detector
        self.outlier_params = outlier_params
        self.n_bins = n_bins

    def _fit(self, x, y):
        if self.outlier_detector not in ("range", "zscore"):
            raise ValueError('Invalid value for outlier_detector. Allowed '
                             'string values are "range" and "zscore".')

        if self.outlier_params is not None:
            if not isinstance(self.outlier_params, dict):
                raise TypeError("outlier_params must be a dict or None; "
                                "got {}.".format(self.outlier_params))

        if not isinstance(self.n_bins, numbers.Integral) or self.n_bins <= 0:
            # Message now names the actual parameter (was "bins").
            raise ValueError("n_bins must be a positive integer; got {}."
                             .format(self.n_bins))

        x = np.asarray(x)
        y = np.asarray(y)

        # Split x into at most n_bins quantile bins (duplicate edges are
        # removed, so fewer bins may result).
        q = np.linspace(0, 1, self.n_bins + 1)
        splits = np.unique(np.quantile(x, q))[1:-1]
        n_bins = len(splits) + 1
        indices = np.digitize(x, splits, right=False)

        self._support = np.zeros(x.size, dtype=bool)
        idx_support = np.arange(x.size)

        if self.outlier_detector == "zscore":
            detector = ModifiedZScoreDetector()
        elif self.outlier_detector == "range":
            detector = RangeDetector()

        if self.outlier_params is not None:
            detector.set_params(**self.outlier_params)

        # Run the univariate detector on y within each x-bin and flag the
        # corresponding global positions as outliers.
        for i in range(n_bins):
            mask_x = indices == i
            detector.fit(y[mask_x])
            mask_out = detector.get_support()
            idx_out = idx_support[mask_x][mask_out]
            self._support[idx_out] = True

        self._is_fitted = True
def print_prebinning_statistics(n_prebins):
    """Print the number of bins produced by the pre-binning stage."""
    prebinning_stats = (
        "  Pre-binning statistics\n"
        "    Number of bins            {:>10}\n"
        ).format(n_prebins)

    print(prebinning_stats)


def print_solver_statistics(solver_type, solver):
    """Print aggregated solver statistics.

    ``solver.stats`` is a dict for a single problem, or a list of dicts
    when several subproblems were solved; counts are summed in that case.
    """
    if isinstance(solver.stats, list):
        n_constraints = sum(info["n_constraints"] for info in solver.stats)
        n_variables = sum(info["n_variables"] for info in solver.stats)
    else:
        n_constraints = solver.stats["n_constraints"]
        n_variables = solver.stats["n_variables"]

    solver_stats = (
        "  Solver statistics\n"
        "    Type                        {:>10}\n"
        "    Number of variables         {:>10}\n"
        "    Number of constraints       {:>10}\n"
        ).format(solver_type, n_variables, n_constraints)

    print(solver_stats)


def print_timing(solver_type, solver, time_total, time_preprocessing,
                 time_estimator, time_prebinning, time_solver,
                 time_postprocessing):
    """Print the wall-time breakdown of the piecewise binning pipeline."""
    p_preprocessing = time_preprocessing / time_total
    p_estimator = time_estimator / time_total
    p_prebinning = time_prebinning / time_total
    p_solver = time_solver / time_total
    p_postprocessing = time_postprocessing / time_total

    time_stats = (
        "  Timing\n"
        "    Total time          {:>18.2f} sec\n"
        "    Pre-processing      {:>18.2f} sec   ({:>7.2%})\n"
        "    Estimator           {:>18.2f} sec   ({:>7.2%})\n"
        "    Pre-binning         {:>18.2f} sec   ({:>7.2%})\n"
        "    Solver              {:>18.2f} sec   ({:>7.2%})\n"
        "    Post-processing     {:>18.2f} sec   ({:>7.2%})\n"
        ).format(time_total, time_preprocessing, p_preprocessing,
                 time_estimator, p_estimator, time_prebinning, p_prebinning,
                 time_solver, p_solver, time_postprocessing,
                 p_postprocessing)

    print(time_stats)


def retrieve_status(status):
    """Aggregate one or several solver status strings into a single label.

    Parameters
    ----------
    status : str or list of str
        Raw solver status(es), expected to contain one of the substrings
        "optimal", "feasible" or "unbounded".

    Returns
    -------
    status : str or None
        "OPTIMAL", "FEASIBLE" or "UNBOUNDED" when unanimous (or a single
        string); otherwise each label with its frequency. None if no known
        substring matches a single string status.
    """
    if isinstance(status, list):
        n_status = len(status)
        counts = {"OPTIMAL": 0, "FEASIBLE": 0, "UNBOUNDED": 0}
        for s in status:
            if "optimal" in s:
                counts["OPTIMAL"] += 1
            elif "feasible" in s:
                counts["FEASIBLE"] += 1
            elif "unbounded" in s:
                counts["UNBOUNDED"] += 1

        for label, count in counts.items():
            if count == n_status:
                return label

        # Mixed statuses: report each with its frequency. The original code
        # concatenated the parts with no separator; join with a space.
        parts = ["{} ({}/{})".format(label, count, n_status)
                 for label, count in counts.items() if count > 0]
        return " ".join(parts)
    else:
        if "optimal" in status:
            return "OPTIMAL"
        elif "feasible" in status:
            return "FEASIBLE"
        elif "unbounded" in status:
            return "UNBOUNDED"


def print_binning_information(print_level, name, status, solver_type, solver,
                              time_total, time_preprocessing, time_estimator,
                              time_prebinning, time_solver,
                              time_postprocessing, n_prebins,
                              dict_user_options):
    """Print the full report for an optimal piecewise binning run.

    ``print_level`` controls verbosity: 0 prints a one-line summary, 1 adds
    pre-binning/solver/timing sections, 2 also dumps all options.
    """
    print_header()

    if print_level == 2:
        dict_default_options = optimal_pw_binning_options

        print_optional_parameters(dict_default_options, dict_user_options)

    if print_level == 0:
        print_main_info(name, status, time_total)
    elif print_level >= 1:
        print_name_status(name, status)

        print_prebinning_statistics(n_prebins)

        # Solver/timing details are only meaningful for solved problems.
        if status in ("OPTIMAL", "FEASIBLE"):
            if solver is not None:
                print_solver_statistics(solver_type, solver)

            print_timing(solver_type, solver, time_total, time_preprocessing,
                         time_estimator, time_prebinning, time_solver,
                         time_postprocessing)
time_preprocessing, 129 | time_estimator, time_prebinning, time_solver, 130 | time_postprocessing) 131 | -------------------------------------------------------------------------------- /optbinning/binning/piecewise/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Optimal piecewise binning metrics. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | import numpy as np 9 | 10 | from sklearn.metrics import average_precision_score 11 | from sklearn.metrics import brier_score_loss 12 | 13 | from ...binning.metrics import jeffrey 14 | from ...binning.metrics import jensen_shannon 15 | from ...binning.metrics import hellinger 16 | from ...binning.metrics import triangular 17 | from ...metrics.classification import gini 18 | from ...metrics.classification import ks 19 | from ...metrics.regression import regression_metrics 20 | from .transformations import transform_binary_target 21 | from .transformations import transform_continuous_target 22 | 23 | 24 | def _fun_divergence(fun, n, pi, qi, pi_special, qi_special, pi_missing, 25 | qi_missing, flag_special, flag_missing, n_special): 26 | 27 | div_value = fun(pi, qi, return_sum=True) / n 28 | 29 | if flag_special: 30 | div_value += fun(pi_special, qi_special, return_sum=True) / n_special 31 | 32 | if flag_missing: 33 | div_value += fun([pi_missing], [qi_missing]) 34 | 35 | return float(div_value) 36 | 37 | 38 | def divergences_asymptotic(event_rate, n_nonevent_special, n_event_special, 39 | n_nonevent_missing, n_event_missing, t_n_nonevent, 40 | t_n_event): 41 | 42 | n = t_n_nonevent + t_n_event 43 | p = t_n_event / n 44 | 45 | pi = (1.0 - event_rate) / (1.0 - p) 46 | qi = event_rate / p 47 | 48 | if isinstance(n_event_special, (np.ndarray, list)): 49 | n_special = n_event_special.size 50 | mask = (n_event_special > 0) & (n_nonevent_special > 0) 51 | flag_special = np.any(mask) 52 | 53 | pi_special = n_nonevent_special[mask] / t_n_nonevent 54 | 
qi_special = n_event_special[mask] / t_n_event 55 | else: 56 | n_special = 1 57 | flag_special = (n_event_special > 0 and n_nonevent_special > 0) 58 | pi_special = n_nonevent_special / t_n_nonevent 59 | qi_special = n_event_special / t_n_event 60 | 61 | flag_missing = (n_event_missing > 0 and n_nonevent_missing > 0) 62 | pi_missing = n_nonevent_missing / t_n_nonevent 63 | qi_missing = n_event_missing / t_n_event 64 | 65 | d_divergences = {} 66 | 67 | d_divergences["IV (Jeffrey)"] = _fun_divergence( 68 | jeffrey, n, pi, qi, pi_special, qi_special, pi_missing, qi_missing, 69 | flag_special, flag_missing, n_special) 70 | 71 | d_divergences["JS (Jensen-Shannon)"] = _fun_divergence( 72 | jensen_shannon, n, pi, qi, pi_special, qi_special, pi_missing, 73 | qi_missing, flag_special, flag_missing, n_special) 74 | 75 | d_divergences["Hellinger"] = _fun_divergence( 76 | hellinger, n, pi, qi, pi_special, qi_special, pi_missing, qi_missing, 77 | flag_special, flag_missing, n_special) 78 | 79 | d_divergences["Triangular"] = _fun_divergence( 80 | triangular, n, pi, qi, pi_special, qi_special, pi_missing, qi_missing, 81 | flag_special, flag_missing, n_special) 82 | 83 | return d_divergences 84 | 85 | 86 | def binary_metrics(x, y, splits, c, t_n_nonevent, t_n_event, 87 | n_nonevent_special, n_event_special, n_nonevent_missing, 88 | n_event_missing, special_codes): 89 | 90 | d_metrics = {} 91 | 92 | n_nonevent_special = np.asarray(n_nonevent_special) 93 | n_event_special = np.asarray(n_event_special) 94 | 95 | # Metrics using predicted probability of Y=1. 
96 | min_pred = 1e-8 97 | max_pred = 1 - min_pred 98 | 99 | event_rate = transform_binary_target( 100 | splits, x, c, min_pred, max_pred, t_n_nonevent, t_n_event, 101 | n_nonevent_special, n_event_special, n_nonevent_missing, 102 | n_event_missing, special_codes, "event_rate", "empirical", "empirical") 103 | 104 | d_metrics["Gini index"] = gini(y, event_rate) 105 | 106 | # Divergence metrics 107 | d_divergences = divergences_asymptotic( 108 | event_rate, n_nonevent_special, n_event_special, n_nonevent_missing, 109 | n_event_missing, t_n_nonevent, t_n_event) 110 | 111 | for dk, dv in d_divergences.items(): 112 | d_metrics[dk] = dv 113 | 114 | d_metrics["KS"] = ks(y, event_rate)[0] 115 | d_metrics["Avg precision"] = average_precision_score(y, event_rate) 116 | d_metrics["Brier score"] = brier_score_loss(y, event_rate) 117 | 118 | return d_metrics 119 | 120 | 121 | def continuous_metrics(x, y, splits, c, lb, ub, n_records_special, sum_special, 122 | n_records_missing, sum_missing, special_codes): 123 | 124 | y_pred = transform_continuous_target( 125 | splits, x, c, lb, ub, n_records_special, sum_special, 126 | n_records_missing, sum_missing, special_codes, "empirical", 127 | "empirical") 128 | 129 | d_metrics = regression_metrics(y, y_pred) 130 | 131 | return d_metrics 132 | -------------------------------------------------------------------------------- /optbinning/binning/piecewise/transformations.py: -------------------------------------------------------------------------------- 1 | """ 2 | Piecewise binning transformations. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from sklearn.utils import check_array 12 | 13 | from ...binning.transformations import transform_event_rate_to_woe 14 | from ...binning.transformations import _check_metric_special_missing 15 | from ...binning.transformations import _mask_special_missing 16 | 17 | 18 | def _apply_transform(x, c, lb, ub, special_codes, metric_special, 19 | metric_missing, clean_mask, special_mask, missing_mask, 20 | indices, x_clean, n_bins, n_special, event_rate_special, 21 | event_rate_missing): 22 | 23 | x_transform = np.zeros(x.shape) 24 | x_clean_transform = np.zeros(x_clean.shape) 25 | 26 | for i in range(n_bins): 27 | mask = (indices == i) 28 | x_clean_transform[mask] = np.polyval(c[i, :][::-1], x_clean[mask]) 29 | 30 | # Clip values using LB/UB 31 | bounded = (lb is not None or ub is not None) 32 | if bounded: 33 | x_clean_transform = np.clip(x_clean_transform, lb, ub) 34 | 35 | x_transform[clean_mask] = x_clean_transform 36 | 37 | if special_codes: 38 | if isinstance(special_codes, dict): 39 | xt = pd.Series(x) 40 | for i, (k, s) in enumerate(special_codes.items()): 41 | sl = s if isinstance(s, (list, np.ndarray)) else [s] 42 | mask = xt.isin(sl).values 43 | if metric_special == "empirical": 44 | x_transform[mask] = event_rate_special[i] 45 | else: 46 | x_transform[mask] = metric_special 47 | else: 48 | if metric_special == "empirical": 49 | x_transform[special_mask] = event_rate_special 50 | else: 51 | x_transform[special_mask] = metric_special 52 | 53 | if metric_missing == "empirical": 54 | x_transform[missing_mask] = event_rate_missing 55 | else: 56 | x_transform[missing_mask] = metric_missing 57 | 58 | return x_transform 59 | 60 | 61 | def transform_binary_target(splits, x, c, lb, ub, n_nonevent, n_event, 62 | n_event_special, n_nonevent_special, 63 | n_event_missing, n_nonevent_missing, 64 | special_codes, metric, metric_special, 65 | 
def transform_binary_target(splits, x, c, lb, ub, n_nonevent, n_event,
                            n_event_special, n_nonevent_special,
                            n_event_missing, n_nonevent_missing,
                            special_codes, metric, metric_special,
                            metric_missing, check_input=False):
    """Transform data using a fitted piecewise binning for a binary target.

    Clean values are mapped to the piecewise-polynomial event rate (or its
    WoE when ``metric="woe"``); special and missing values are replaced by
    ``metric_special`` / ``metric_missing``, or by their empirical event
    rate when those are set to "empirical".

    Parameters
    ----------
    splits : array-like
        Split points delimiting the bins.

    x : array-like, shape = (n_samples)
        Data to transform.

    c : numpy.ndarray
        Per-bin polynomial coefficients, lowest order first.

    lb, ub : float or None
        Optional clipping bounds applied by ``_apply_transform``.

    n_nonevent, n_event : int
        Total non-event / event counts, used for the WoE conversion.

    n_event_special, n_nonevent_special : array-like or int
        Event / non-event counts of the special bin(s).

    n_event_missing, n_nonevent_missing : int
        Event / non-event counts of the missing bin.

    special_codes : array-like, dict or None
        Codes identifying special values.

    metric : str
        "event_rate" or "woe".

    metric_special, metric_missing : str or float
        "empirical" or a fixed numeric value.

    check_input : bool (default=False)
        If True, validate x with sklearn's ``check_array``.

    Returns
    -------
    x_transform : numpy.ndarray
    """
    if metric not in ("event_rate", "woe"):
        raise ValueError('Invalid value for metric. Allowed string '
                         'values are "event_rate" and "woe".')

    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    special_mask, missing_mask, clean_mask, n_special = _mask_special_missing(
        x, special_codes)

    x_clean = x[clean_mask]

    # Assign each clean sample to its bin.
    if len(splits):
        indices = np.digitize(x_clean, splits, right=False)
    else:
        indices = np.zeros(x_clean.shape)

    n_bins = len(splits) + 1

    # Compute event rate for special and missing bin
    event_rate_special = metric_special
    event_rate_missing = metric_missing

    if metric_special == "empirical":
        n_event_special = np.asarray(n_event_special)
        n_nonevent_special = np.asarray(n_nonevent_special)

        event_rate_special = np.zeros(n_special)
        n_records_special = n_event_special + n_nonevent_special

        # Only bins holding both events and non-events get a nonzero rate.
        mask = (n_event_special > 0) & (n_nonevent_special > 0)

        # mask is an array when there are several special bins, otherwise a
        # scalar/0-d boolean, hence the two branches below.
        if n_special > 1:
            event_rate_special[mask] = (
                n_event_special[mask] / n_records_special[mask])
        elif mask:
            event_rate_special = n_event_special / n_records_special

        if metric == "woe":
            event_rate_special = transform_event_rate_to_woe(
                event_rate_special, n_nonevent, n_event)

    if metric_missing == "empirical":
        n_records_missing = n_event_missing + n_nonevent_missing

        if n_records_missing > 0:
            event_rate_missing = n_event_missing / n_records_missing
        else:
            # No missing records observed during fit: default the rate to 0.
            event_rate_missing = 0

        if metric == "woe":
            event_rate_missing = transform_event_rate_to_woe(
                event_rate_missing, n_nonevent, n_event)

    x_transform = _apply_transform(
        x, c, lb, ub, special_codes, metric_special, metric_missing,
        clean_mask, special_mask, missing_mask, indices, x_clean, n_bins,
        n_special, event_rate_special, event_rate_missing)

    # Clean samples were filled as event rates; convert them to WoE last.
    if metric == "woe":
        x_transform[clean_mask] = transform_event_rate_to_woe(
            x_transform[clean_mask], n_nonevent, n_event)

    return x_transform


def transform_continuous_target(splits, x, c, lb, ub, n_records_special,
                                sum_special, n_records_missing, sum_missing,
                                special_codes, metric_special, metric_missing,
                                check_input=False):
    """Transform data using a fitted piecewise binning for a continuous
    target.

    Mirrors :func:`transform_binary_target` with per-bin means (built from
    record counts and sums) instead of event rates; no WoE conversion.
    """
    _check_metric_special_missing(metric_special, metric_missing)

    if check_input:
        x = check_array(x, ensure_2d=False, dtype=None,
                        force_all_finite='allow-nan')

    x = np.asarray(x)

    special_mask, missing_mask, clean_mask, n_special = _mask_special_missing(
        x, special_codes)

    x_clean = x[clean_mask]

    # Assign each clean sample to its bin.
    if len(splits):
        indices = np.digitize(x_clean, splits, right=False)
    else:
        indices = np.zeros(x_clean.shape)

    n_bins = len(splits) + 1

    # Compute mean for special and missing bin
    mean_special = metric_special
    mean_missing = metric_missing

    if metric_special == "empirical":
        sum_special = np.asarray(sum_special)
        n_records_special = np.asarray(n_records_special)

        mean_special = np.zeros(n_special)

        mask = (n_records_special > 0)

        # Same scalar/array duality as in the binary transform.
        if n_special > 1:
            mean_special[mask] = sum_special[mask] / n_records_special[mask]
        elif mask:
            mean_special = sum_special / n_records_special

    if metric_missing == "empirical":
        if n_records_missing > 0:
            mean_missing = sum_missing / n_records_missing
        else:
            # No missing records observed during fit: default the mean to 0.
            mean_missing = 0

    x_transform = _apply_transform(
        x, c, lb, ub, special_codes, metric_special, metric_missing,
        clean_mask, special_mask, missing_mask, indices, x_clean, n_bins,
        n_special, mean_special, mean_missing)

    return x_transform
class PreBinning:
    """Prebinning algorithms.

    Parameters
    ----------
    problem_type : str
        The problem type depending on the target type: "classification"
        or "regression".

    method : str
        Available methods are 'uniform', 'quantile', 'cart' and 'mdlp'.

    n_bins : int
        The number of bins to produce.

    min_bin_size : int, float
        The minimum bin size.

    class_weight : dict, "balanced" or None, optional (default=None)
        Weights associated with classes. Only used when ``method="cart"``
        and ``problem_type="classification"``.

    **kwargs : keyword arguments
        Keyword arguments for prebinning method. See notes.

    Notes
    -----
    Keyword arguments are those available in the following classes:

    * ``method="uniform"``: `sklearn.preprocessing.KBinsDiscretizer`.

    * ``method="quantile"``: `sklearn.preprocessing.KBinsDiscretizer`.

    * ``method="cart"``: `sklearn.tree.DecisionTreeClassifier`.

    * ``method="mdlp"``: `optbinning.binning.mdlp.MDLP`.

    """
    def __init__(self, problem_type, method, n_bins, min_bin_size,
                 class_weight=None, **kwargs):

        self.problem_type = problem_type
        self.method = method
        self.n_bins = n_bins
        self.min_bin_size = min_bin_size
        self.class_weight = class_weight
        self.kwargs = kwargs

        # Computed split points; populated by fit().
        self._splits = None

    def fit(self, x, y, sample_weight=None):
        """Fit PreBinning algorithm.

        Parameters
        ----------
        x : array-like, shape = (n_samples)
            Data samples, where n_samples is the number of samples.

        y : array-like, shape = (n_samples)
            Target vector relative to x.

        sample_weight : array-like of shape (n_samples,) (default=None)
            Array of weights that are assigned to individual samples.
            Only used when ``method="cart"``.

        Returns
        -------
        self : PreBinning
        """
        if self.method not in ("uniform", "quantile", "cart", "mdlp"):
            raise ValueError('Invalid value for prebinning method. Allowed '
                             'string values are "cart", "mdlp", "quantile" '
                             'and "uniform".')

        if self.problem_type not in ("classification", "regression"):
            raise ValueError('Invalid value for problem_type. Allowed '
                             'string values are "classification" and '
                             '"regression".')

        if self.problem_type == "regression" and self.method == "mdlp":
            raise ValueError("mdlp method can only handle binary "
                             "classification problems.")

        if self.method in ("uniform", "quantile"):
            # Unsupervised equal-width / equal-frequency binning.
            unsup_kwargs = {"n_bins": self.n_bins, "strategy": self.method}
            unsup_kwargs.update(**self.kwargs)

            est = KBinsDiscretizer(**unsup_kwargs)
            est.fit(x.reshape(-1, 1), y)
            # Interior bin edges only (drop the outermost edges).
            self._splits = est.bin_edges_[0][1:-1]

        elif self.method == "cart":
            cart_kwargs = {
                "min_samples_leaf": self.min_bin_size,
                "max_leaf_nodes": self.n_bins}

            if self.problem_type == "classification":
                cart_kwargs["class_weight"] = self.class_weight
                cart_kwargs.update(**self.kwargs)

                est = DecisionTreeClassifier(**cart_kwargs)
            else:
                cart_kwargs.update(**self.kwargs)
                est = DecisionTreeRegressor(**cart_kwargs)

            est.fit(x.reshape(-1, 1), y, sample_weight=sample_weight)
            # Internal node thresholds become the splits; leaves carry the
            # sentinel TREE_UNDEFINED and are filtered out.
            splits = np.unique(est.tree_.threshold)
            self._splits = splits[splits != _tree.TREE_UNDEFINED]

        elif self.method == "mdlp":
            mdlp_kwargs = {"min_samples_leaf": self.min_bin_size}
            mdlp_kwargs.update(**self.kwargs)

            est = MDLP(**mdlp_kwargs)
            est.fit(x, y)
            self._splits = est.splits

        return self

    @property
    def splits(self):
        """List of split points.

        Returns
        -------
        splits : numpy.ndarray
        """
        return self._splits
def dataframe_to_string(df, tab=None):
    """Render a DataFrame as plain text without the index, optionally
    indenting every line by ``tab`` spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to render.

    tab : int or None, optional (default=None)
        Number of leading spaces to prepend to each line; None disables
        indentation.

    Returns
    -------
    str
        The rendered table.

    Raises
    ------
    TypeError
        If ``df`` is not a DataFrame.

    ValueError
        If ``tab`` is negative or not an integer.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas.DataFrame.")

    if tab is not None and (not isinstance(tab, numbers.Integral) or tab < 0):
        raise ValueError("tab must be a positive integer; got {}."
                         .format(tab))

    rendered = textwrap.dedent(df.to_string(index=False))

    if tab is None:
        return rendered
    return textwrap.indent(rendered, " " * tab)
def print_header():
    """Print the optbinning version and copyright banner."""
    header = (
        "optbinning (Version {})\n"
        "Copyright (c) 2019-2024 Guillermo Navas-Palencia, Apache License 2.0"
        "\n".format(__version__))

    print(header)


def print_optional_parameters(dict_default_options, dict_user_options):
    """Print the options table, flagging user-modified values.

    Parameters
    ----------
    dict_default_options : dict
        Default value of each option.

    dict_user_options : dict
        Value actually supplied for each option.
    """
    option_format = "    {:<24} {:>15}   * {}\n"
    str_options = "  Begin options\n"
    for key, value in dict_default_options.items():
        user_value = dict_user_options[key]

        # Containers/estimators cannot be compared reliably with !=, so
        # they are always flagged as user-provided ("U" vs default "d").
        if (isinstance(user_value, (list, np.ndarray, dict)) or
                value != user_value):
            user_flag = "U"
        else:
            user_flag = "d"

        if user_value is None:
            user_value = "no"
        elif isinstance(user_value, (list, np.ndarray, dict)):
            user_value = "yes"
        elif isinstance(user_value, BaseEstimator):
            user_value = "yes"

        str_options += option_format.format(key, str(user_value), user_flag)
    str_options += "  End options\n"
    print(str_options)


def solver_statistics(solver_type, solver):
    """Collect statistics from a solved optimization model.

    Parameters
    ----------
    solver_type : str
        One of "cp", "mip", "ls" or "lp".

    solver : object
        The solver instance after solving.

    Returns
    -------
    (d_solver, time_optimizer) : tuple of (dict, float or None)
        Statistics keyed by name, and the solver wall time when available
        (only for "cp").
    """
    time_optimizer = None
    d_solver = {}

    if solver_type == "cp":
        d_solver["n_booleans"] = solver.NumBooleans()
        d_solver["n_branches"] = solver.NumBranches()
        d_solver["n_conflicts"] = solver.NumConflicts()
        d_solver["objective"] = int(solver.ObjectiveValue())
        d_solver["best_objective_bound"] = int(solver.BestObjectiveBound())

        time_optimizer = solver.WallTime()

    elif solver_type == "mip":
        d_solver["n_constraints"] = solver.NumConstraints()
        d_solver["n_variables"] = solver.NumVariables()
        d_solver["objective"] = solver.Objective().Value()
        d_solver["best_bound"] = solver.Objective().BestBound()

    elif solver_type == "ls":
        if not LOCALSOLVER_AVAILABLE:
            raise ImportError('Cannot import localsolver. Install LocalSolver '
                              'or choose another solver, options are "cp" and '
                              '"mip".')

        d_solver["n_iterations"] = LSStatistics.get_nb_iterations(
            solver.statistics)

    elif solver_type == "lp":
        d_solver["n_variables"] = solver.n_variables
        d_solver["n_constraints"] = solver.n_constraints
        d_solver["n_iterations"] = solver.n_iterations
        d_solver["objective"] = solver.objective

    return d_solver, time_optimizer


def print_solver_statistics(solver_type, d_solver):
    """Print solver statistics gathered by ``solver_statistics``.

    Values are pulled from ``d_solver`` by key instead of relying on dict
    insertion order: the original order-based formatting mislabeled the
    "mip" variables/constraints lines, because solver_statistics inserts
    n_constraints before n_variables while the template prints variables
    first.
    """
    if solver_type == "cp":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of booleans            {:>10}\n"
            "    Number of branches            {:>10}\n"
            "    Number of conflicts           {:>10}\n"
            "    Objective value               {:>10}\n"
            "    Best objective bound          {:>10}\n"
            ).format(solver_type, d_solver["n_booleans"],
                     d_solver["n_branches"], d_solver["n_conflicts"],
                     d_solver["objective"], d_solver["best_objective_bound"])

    elif solver_type == "mip":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of variables           {:>10}\n"
            "    Number of constraints         {:>10}\n"
            "    Objective value               {:>10.4f}\n"
            "    Best objective bound          {:>10.4f}\n"
            ).format(solver_type, d_solver["n_variables"],
                     d_solver["n_constraints"], d_solver["objective"],
                     d_solver["best_bound"])

    elif solver_type == "ls":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of iterations          {:>10}\n"
            ).format(solver_type, d_solver["n_iterations"])

    elif solver_type == "lp":
        solver_stats = (
            "  Solver statistics\n"
            "    Type                          {:>10}\n"
            "    Number of variables           {:>10}\n"
            "    Number of constraints         {:>10}\n"
            "    Number of iterations          {:>10}\n"
            "    Objective value               {:>10.4f}\n"
            ).format(solver_type, d_solver["n_variables"],
                     d_solver["n_constraints"], d_solver["n_iterations"],
                     d_solver["objective"])

    print(solver_stats)
/optbinning/logging.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging class. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2019 7 | 8 | import logging 9 | import sys 10 | 11 | 12 | class Logger: 13 | def __init__(self, logger_name=None, filename=None): 14 | self.logger = logging.getLogger(logger_name) 15 | self.logger.setLevel(logging.INFO) 16 | self.logger.propagate = False 17 | 18 | formatter = logging.Formatter( 19 | '%(asctime)s | %(levelname)s : %(message)s') 20 | 21 | handler = logging.StreamHandler(sys.stdout) 22 | handler.setFormatter(formatter) 23 | self.logger.addHandler(handler) 24 | 25 | if filename is not None: 26 | fhandler = logging.FileHandler(filename) 27 | fhandler.setFormatter(formatter) 28 | self.logger.addHandler(fhandler) 29 | 30 | def close(self): 31 | for handler in self.logger.handlers: 32 | handler.close() 33 | self.logger.removeHandler(handler) 34 | -------------------------------------------------------------------------------- /optbinning/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/optbinning/metrics/__init__.py -------------------------------------------------------------------------------- /optbinning/metrics/classification.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metrics to asses performance of classification models. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import numpy as np 9 | 10 | from sklearn.metrics import auc 11 | from sklearn.metrics import confusion_matrix 12 | from sklearn.metrics import roc_curve 13 | 14 | 15 | def gini(y_true, y_pred_proba): 16 | """Compute the Gini Index or Accuracy Ration (AR). 
17 | 18 | Parameters 19 | ---------- 20 | y_true : array-like, shape (n_samples,) 21 | Ground truth (correct) target values. 22 | 23 | y_pred_proba : array-like, shape (n_samples,) 24 | Probability estimates of the positive class. 25 | 26 | Returns 27 | ------- 28 | gini : float 29 | """ 30 | fpr, tpr, _ = roc_curve(y_true, y_pred_proba) 31 | return 2 * auc(fpr, tpr) - 1 32 | 33 | 34 | def ks(y_true, y_pred_proba): 35 | """Compute the Kolmogorov-Smirnov (KS). 36 | 37 | Parameters 38 | ---------- 39 | y_true : array-like, shape (n_samples,) 40 | Ground truth (correct) target values. 41 | 42 | y_pred_proba : array-like, shape (n_samples,) 43 | Probability estimates of the positive class. 44 | 45 | Returns 46 | ------- 47 | ks : tuple(ks_score, ks_position) 48 | """ 49 | n_samples = y_true.shape[0] 50 | n_event = np.sum(y_true) 51 | n_nonevent = n_samples - n_event 52 | 53 | idx = np.argsort(y_pred_proba) 54 | yy = y_true[idx] 55 | 56 | cum_event = np.cumsum(yy) 57 | cum_population = np.arange(0, n_samples) 58 | cum_nonevent = cum_population - cum_event 59 | 60 | p_event = cum_event / n_event 61 | p_nonevent = cum_nonevent / n_nonevent 62 | 63 | p_diff = p_nonevent - p_event 64 | ks_max_idx = np.argmax(p_diff) 65 | ks_score = p_diff[ks_max_idx] 66 | 67 | return ks_score, ks_max_idx 68 | 69 | 70 | def imbalanced_classification_metrics(y_true, y_pred): 71 | """Compute imbalanced binary classification metrics. 72 | 73 | Parameters 74 | ---------- 75 | y_true : array-like, shape (n_samples,) 76 | Ground truth (correct) target values. 77 | 78 | y_pred : array-like, shape (n_samples,) 79 | Estimated target values. 80 | 81 | Returns 82 | ------- 83 | metrics : dict 84 | Dictionary of metrics. 
85 | """ 86 | tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() 87 | 88 | # Sensitivity - True positive rate (TPR) 89 | tpr = tp / (tp + fn) 90 | 91 | # Specificity - True negative rate (TNR) 92 | tnr = tn / (fp + tn) 93 | 94 | # False positive rate (FPR) 95 | fpr = 1.0 - tnr 96 | 97 | # False negative rate (FNR) 98 | fnr = 1.0 - tpr 99 | 100 | # Balanced accuracy 101 | balanced_accuracy = 0.5 * (tpr + tnr) 102 | 103 | # Discriminant power 104 | dp = np.sqrt(3) / np.pi * (np.log(tpr / (1-tnr)) + np.log(tnr / (1-tpr))) 105 | 106 | d_metrics = { 107 | "True positive rate": tpr, 108 | "True negative rate": tnr, 109 | "False positive rate": fpr, 110 | "False negative rate": fnr, 111 | "Balanced accuracy": balanced_accuracy, 112 | "Discriminant power": dp 113 | } 114 | 115 | return d_metrics 116 | -------------------------------------------------------------------------------- /optbinning/metrics/regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | Metrics to asses performance of regression models. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import numpy as np 9 | 10 | from sklearn.metrics import explained_variance_score 11 | from sklearn.metrics import mean_absolute_error 12 | from sklearn.metrics import mean_squared_error 13 | from sklearn.metrics import median_absolute_error 14 | from sklearn.metrics import r2_score 15 | 16 | 17 | def mean_absolute_percentage_error(y_true, y_pred): 18 | """Compute the mean absolute percentage error (MAPE). 19 | 20 | Parameters 21 | ---------- 22 | y_true : array-like, shape (n_samples,) 23 | Ground truth (correct) target values. 24 | 25 | y_pred : array-like, shape (n_samples,) 26 | Estimated target values. 
27 | 28 | Returns 29 | ------- 30 | mape : float 31 | """ 32 | return np.abs((y_true - y_pred) / y_true).mean() 33 | 34 | 35 | def median_absolute_percentage_error(y_true, y_pred): 36 | """Compute the median absolute percentage error (MdAPE). 37 | 38 | Parameters 39 | ---------- 40 | y_true : array-like, shape (n_samples,) 41 | Ground truth (correct) target values. 42 | 43 | y_pred : array-like, shape (n_samples,) 44 | Estimated target values. 45 | 46 | Returns 47 | ------- 48 | mdape : float 49 | """ 50 | return np.median(np.abs((y_true - y_pred) / y_true)) 51 | 52 | 53 | def mean_percentage_error(y_true, y_pred): 54 | """Compute the mean percentage error (MPE). 55 | 56 | Parameters 57 | ---------- 58 | y_true : array-like, shape (n_samples,) 59 | Ground truth (correct) target values. 60 | 61 | y_pred : array-like, shape (n_samples,) 62 | Estimated target values. 63 | 64 | Returns 65 | ------- 66 | mpe : float 67 | """ 68 | return ((y_true - y_pred) / y_true).mean() 69 | 70 | 71 | def symmetric_mean_absolute_percentage_error(y_true, y_pred): 72 | """Compute the symmetric mean absolute percentage error (SMAPE). 73 | 74 | Parameters 75 | ---------- 76 | y_true : array-like, shape (n_samples,) 77 | Ground truth (correct) target values. 78 | 79 | y_pred : array-like, shape (n_samples,) 80 | Estimated target values. 81 | 82 | Returns 83 | ------- 84 | smape : float 85 | """ 86 | e = np.abs(y_true - y_pred) 87 | return (e / (np.abs(y_true) + np.abs(y_pred))).mean() 88 | 89 | 90 | def symmetric_median_absolute_percentage_error(y_true, y_pred): 91 | """Compute the symmetric median absolute percentage error (SMdAPE). 92 | 93 | Parameters 94 | ---------- 95 | y_true : array-like, shape (n_samples,) 96 | Ground truth (correct) target values. 97 | 98 | y_pred : array-like, shape (n_samples,) 99 | Estimated target values. 
100 | 101 | Returns 102 | ------- 103 | smdape : float 104 | """ 105 | e = np.abs(y_true - y_pred) 106 | return np.median(e / (np.abs(y_true) + np.abs(y_pred))) 107 | 108 | 109 | def regression_metrics(y_true, y_pred): 110 | """Compute regression metrics. 111 | 112 | Parameters 113 | ---------- 114 | y_true : array-like, shape (n_samples,) 115 | Ground truth (correct) target values. 116 | 117 | y_pred : array-like, shape (n_samples,) 118 | Estimated target values. 119 | 120 | Returns 121 | ------- 122 | metrics : dict 123 | Dictionary of metrics. 124 | """ 125 | 126 | # Explained variance 127 | variance = explained_variance_score(y_true, y_pred) 128 | 129 | # Mean absolute error 130 | mae = mean_absolute_error(y_true, y_pred) 131 | 132 | # Mean squared error 133 | mse = mean_squared_error(y_true, y_pred) 134 | 135 | # Median absolute error 136 | median_ae = median_absolute_error(y_true, y_pred) 137 | 138 | # R^2 score 139 | r2 = r2_score(y_true, y_pred) 140 | 141 | # Mean absolute percentage error 142 | mape = mean_absolute_percentage_error(y_true, y_pred) 143 | 144 | # Mean percentage error 145 | mpe = mean_percentage_error(y_true, y_pred) 146 | 147 | # Symmetric mean absolute percentage error 148 | smape = symmetric_mean_absolute_percentage_error(y_true, y_pred) 149 | 150 | # Median absolute percentage error 151 | mdape = median_absolute_percentage_error(y_true, y_pred) 152 | 153 | # Symmetric meadian absolute percentage error 154 | smdape = symmetric_median_absolute_percentage_error(y_true, y_pred) 155 | 156 | d_metrics = { 157 | "Mean absolute error": mae, 158 | "Mean squared error": mse, 159 | "Median absolute error": median_ae, 160 | "Explained variance": variance, 161 | "R^2": r2, 162 | "MPE": mpe, 163 | "MAPE": mape, 164 | "SMAPE": smape, 165 | "MdAPE": mdape, 166 | "SMdAPE": smdape 167 | } 168 | 169 | return d_metrics 170 | -------------------------------------------------------------------------------- /optbinning/scorecard/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .counterfactual import Counterfactual 2 | from .monitoring import ScorecardMonitoring 3 | from .plots import plot_auc_roc, plot_cap, plot_ks 4 | from .scorecard import Scorecard 5 | 6 | 7 | __all__ = ["Scorecard", 8 | "ScorecardMonitoring", 9 | "plot_auc_roc", 10 | "plot_cap", 11 | "plot_ks", 12 | "Counterfactual"] 13 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/__init__.py: -------------------------------------------------------------------------------- 1 | from .counterfactual import Counterfactual 2 | 3 | 4 | __all__ = ['Counterfactual'] 5 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base counterfactual algorithm class. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | from abc import ABCMeta 9 | from abc import abstractmethod 10 | 11 | from sklearn.base import BaseEstimator 12 | 13 | from ...binning.base import Base 14 | from ...exceptions import CounterfactualsFoundWarning 15 | from ...exceptions import NotGeneratedError 16 | 17 | 18 | class BaseCounterfactual(Base, BaseEstimator, metaclass=ABCMeta): 19 | @abstractmethod 20 | def fit(self): 21 | """Fit counterfactual with training data.""" 22 | 23 | @abstractmethod 24 | def generate(self): 25 | """Generate counterfactual explanations.""" 26 | 27 | @abstractmethod 28 | def display(self): 29 | """Display counterfactual explanations.""" 30 | 31 | @property 32 | @abstractmethod 33 | def status(self): 34 | """The status of the underlying optimization solver.""" 35 | 36 | def _check_is_generated(self): 37 | if not self._is_generated: 38 | raise NotGeneratedError("This {} instance has not generated " 39 | "counterfactuals yet. 
Call " 40 | "'generate' with appropriate arguments." 41 | .format(self.__class__.__name__)) 42 | 43 | def _check_counterfactual_is_found(self): 44 | if not self._cfs: 45 | raise CounterfactualsFoundWarning( 46 | "Neither optimal or feasible counterfactuals were found.") 47 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/counterfactual_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Counterfactual information. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | from ...information import print_header 9 | from ...information import print_optional_parameters 10 | from ...information import print_solver_statistics 11 | from ...options import counterfactual_default_options 12 | 13 | 14 | def print_status(status): 15 | print(" Status : {:<32}\n".format(status)) 16 | 17 | 18 | def print_main_info(status, time_total): 19 | print_status(status) 20 | 21 | print(" Time : {:<7.4f} sec\n".format(time_total)) 22 | 23 | 24 | def print_objectives(objectives): 25 | str_objectives = " Objectives\n" 26 | 27 | for objname, objexp in objectives.items(): 28 | objval = objexp.solution_value() 29 | if objname in ("diversity_features", "diversity_values"): 30 | objval = abs(objval) 31 | 32 | str_objectives += " {:<18} {:>10.4f}\n".format( 33 | objname, objval) 34 | 35 | print(str_objectives) 36 | 37 | 38 | def print_timing(time_total, time_fit, time_solver, time_postprocessing): 39 | p_fit = time_fit / time_total 40 | p_solver = time_solver / time_total 41 | p_postprocessing = time_postprocessing / time_solver 42 | 43 | time_stats = ( 44 | " Timing\n" 45 | " Total time {:>18.2f} sec\n" 46 | " Fit {:>18.2f} sec ({:>7.2%})\n" 47 | " Solver {:>18.2f} sec ({:>7.2%})\n" 48 | " Post-processing {:>18.2f} sec ({:>7.2%})\n" 49 | ).format(time_total, time_fit, p_fit, time_solver, p_solver, 50 | time_postprocessing, p_postprocessing) 51 | 52 | 
print(time_stats) 53 | 54 | 55 | def print_counterfactual_information(print_level, status, solver, objectives, 56 | time_total, time_fit, time_solver, 57 | time_postprocessing, dict_user_options): 58 | 59 | print_header() 60 | 61 | if print_level == 2: 62 | dict_default_options = counterfactual_default_options 63 | print_optional_parameters(dict_default_options, dict_user_options) 64 | 65 | if print_level == 0: 66 | print_main_info(status, time_total) 67 | elif print_level >= 1: 68 | print_status(status) 69 | 70 | if status in ("OPTIMAL", "FEASIBLE"): 71 | if solver is not None: 72 | print_solver_statistics("mip", solver) 73 | print_objectives(objectives) 74 | 75 | print_timing(time_total, time_fit, time_solver, time_postprocessing) 76 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/model_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Counterfactual model data. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | 9 | def model_data(scorecard, x, special_missing): 10 | s_vars = scorecard.binning_process_.get_support(names=True) 11 | 12 | sc = scorecard.table(style="detailed") 13 | metric_name = "WoE" if scorecard._target_dtype == "binary" else "Mean" 14 | 15 | # Number of bins, metric and indices 16 | nbins = [] 17 | metric = [] 18 | indices = [] 19 | for i, v in enumerate(s_vars): 20 | metric_i = sc[sc.Variable == v][metric_name].values 21 | 22 | if not special_missing: 23 | metric_i = metric_i[:-2] 24 | 25 | _metric = [] 26 | _indices = [] 27 | for j, m in enumerate(metric_i): 28 | if m != x[i]: 29 | _indices.append(j) 30 | _metric.append(m) 31 | 32 | metric.append(_metric) 33 | nbins.append(len(_metric)) 34 | indices.append(_indices) 35 | 36 | return nbins, metric, indices 37 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/problem_data.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Counterfactual problem data. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import numpy as np 9 | 10 | 11 | def problem_data(scorecard, X): 12 | s_vars = X.columns 13 | n_vars = X.shape[1] 14 | 15 | # Scorecard table 16 | sc = scorecard.table(style="detailed") 17 | 18 | if scorecard._target_dtype == "binary": 19 | sc["Points"] = sc["WoE"] * sc["Coefficient"] 20 | else: 21 | sc["Points"] = sc["Mean"] * sc["Coefficient"] 22 | 23 | # Linear model coefficients 24 | 25 | # Only index into the intercept if it is an array, it is a scalar otherwise 26 | if isinstance(scorecard.estimator_.intercept_, np.ndarray): 27 | intercept = float(scorecard.estimator_.intercept_[0]) 28 | else: 29 | intercept = float(scorecard.estimator_.intercept_) 30 | 31 | coef = scorecard.estimator_.coef_.ravel() 32 | 33 | # Big-M parameters (min, max) points. 34 | # Proximity weights. Inverse value range for each feature 35 | min_p = 0 36 | max_p = 0 37 | wrange = np.empty(n_vars) 38 | 39 | for i, v in enumerate(s_vars): 40 | v_points = sc[sc["Variable"] == v]["Points"] 41 | _min = np.min(v_points) 42 | _max = np.max(v_points) 43 | min_p += _min 44 | max_p += _max 45 | 46 | wrange[i] = 1.0 / (_max - _min) 47 | 48 | min_p += intercept 49 | max_p += intercept 50 | 51 | # Mahalanobis distance 52 | Xt = scorecard.binning_process_.transform(X).values 53 | F = np.linalg.cholesky(np.linalg.inv(np.cov(Xt.T))) 54 | mu = Xt.mean(axis=0) 55 | 56 | return intercept, coef, min_p, max_p, wrange, F, mu 57 | -------------------------------------------------------------------------------- /optbinning/scorecard/counterfactual/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Piecewise linear approximation of logistic function. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import numpy as np 9 | 10 | from ropwr import RobustPWRegression 11 | 12 | 13 | def logistic_pw(min_p, max_p, n_bins): 14 | xl = np.linspace(min_p, max_p, 100) 15 | yl = (1.0 / (1 + np.exp(-xl))) 16 | 17 | splits = np.linspace(min_p, max_p, n_bins+1)[1:-1] 18 | 19 | pw = RobustPWRegression(objective="l1", degree=1, monotonic_trend=None) 20 | pw.fit(xl, yl, splits) 21 | 22 | splits = np.array([min_p] + list(splits) + [max_p]) 23 | b_pw = [(splits[i], splits[i+1]) for i in range(len(splits) - 1)] 24 | c_pw = pw.coef_ 25 | 26 | return b_pw, c_pw 27 | -------------------------------------------------------------------------------- /optbinning/scorecard/monitoring_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Monitoring information. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | from ..binning.binning_information import print_header 9 | from ..binning.binning_information import print_optional_parameters 10 | from ..options import scorecard_monitoring_default_options 11 | 12 | 13 | def print_main_info(n_records_a, n_records_e, n_variables, time_total): 14 | print(" Number of records A : {}".format(n_records_a)) 15 | print(" Number of records E : {}".format(n_records_e)) 16 | print(" Number of variables : {}".format(n_variables)) 17 | print(" Time : {:<7.4f} sec\n".format(time_total)) 18 | 19 | 20 | def print_monitoring_statistics(n_records_a, n_records_e, n_variables, 21 | target_dtype, time_total, time_system, 22 | time_variables): 23 | 24 | stats = ( 25 | " Statistics\n" 26 | " Number of records Actual {:>10}\n" 27 | " Number of records Expected {:>10}\n" 28 | " Number of scorecard variables {:>10}\n" 29 | " Target type {:>10}\n" 30 | ).format(n_records_a, n_records_e, n_variables, target_dtype) 31 | 32 | print(stats) 33 | 34 | p_system = time_system / time_total 35 | p_variables = time_variables / 
time_total 36 | 37 | time_stats = ( 38 | " Timing\n" 39 | " Total time {:>18.2f} sec\n" 40 | " System stability {:>18.2f} sec ({:>7.2%})\n" 41 | " Variables stability {:>18.2f} sec ({:>7.2%})\n" 42 | ).format(time_total, time_system, p_system, time_variables, 43 | p_variables) 44 | 45 | print(time_stats) 46 | 47 | 48 | def print_monitoring_information(print_level, n_records_a, n_records_e, 49 | n_variables, target_dtype, time_total, 50 | time_system, time_variables, 51 | dict_user_options): 52 | 53 | print_header() 54 | 55 | if print_level == 2: 56 | dict_default_options = scorecard_monitoring_default_options 57 | print_optional_parameters(dict_default_options, dict_user_options) 58 | 59 | if print_level == 0: 60 | print_main_info(n_records_a, n_records_e, n_variables, time_total) 61 | elif print_level >= 1: 62 | print_monitoring_statistics(n_records_a, n_records_e, n_variables, 63 | target_dtype, time_total, time_system, 64 | time_variables) 65 | -------------------------------------------------------------------------------- /optbinning/scorecard/rounding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Rounding strategy. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | import numpy as np 9 | 10 | from ortools.linear_solver import pywraplp 11 | 12 | 13 | class RoundingMIP: 14 | def __init__(self): 15 | self.solver_ = None 16 | 17 | self._nb = None 18 | self._nn = None 19 | self._p = None 20 | 21 | def build_model(self, df_scorecard): 22 | # Parameters 23 | points = [] 24 | mins = [] 25 | maxs = [] 26 | for variable in df_scorecard.Variable.unique(): 27 | mask = df_scorecard.Variable == variable 28 | p = df_scorecard[mask].Points.values 29 | mins.append(p.min()) 30 | maxs.append(p.max()) 31 | points.append(p) 32 | 33 | nb = len(points) 34 | nn = [len(p) for p in points] 35 | 36 | min_point = np.rint(np.sum(mins)) 37 | max_point = np.rint(np.sum(maxs)) 38 | 39 | min_p = np.min(mins) 40 | max_p = np.max(maxs) 41 | 42 | # Initialize solver 43 | solver = pywraplp.Solver( 44 | 'RoundingMIP', pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING) 45 | 46 | # Decision variables 47 | p = {} 48 | tp = {} 49 | tm = {} 50 | min_b = {} 51 | max_b = {} 52 | for i in range(nb): 53 | min_b[i] = solver.IntVar(min_p, max_p, "min_b[{}]".format(i)) 54 | max_b[i] = solver.IntVar(min_p, max_p, "max_b[{}]".format(i)) 55 | for j in range(nn[i]): 56 | p[i, j] = solver.IntVar(min_p, max_p, "p[{}, {}]".format(i, j)) 57 | tp[i, j] = solver.NumVar(0, np.inf, "tp[{}, {}]".format(i, j)) 58 | tm[i, j] = solver.NumVar(0, np.inf, "tm[{}, {}]".format(i, j)) 59 | 60 | # Objective function 61 | solver.Minimize(solver.Sum([solver.Sum([tp[i, j] + tm[i, j] 62 | for j in range(nn[i])]) for i in range(nb)])) 63 | 64 | # Constraints 65 | for i in range(nb): 66 | for j in range(nn[i]): 67 | solver.Add(tp[i, j] - tm[i, j] == points[i][j] - p[i, j]) 68 | 69 | # Max score constraint for each variable 70 | solver.Add(max_b[i] >= p[i, j]) 71 | 72 | # Min score constraints for each variable 73 | solver.Add(min_b[i] <= p[i, j]) 74 | 75 | # Sum of minimum/maximum point by variable must be min_point/max_point 76 | 
solver.Add(solver.Sum([min_b[i] for i in range(nb)]) == min_point) 77 | solver.Add(solver.Sum([max_b[i] for i in range(nb)]) == max_point) 78 | 79 | self.solver_ = solver 80 | self._nb = nb 81 | self._nn = nn 82 | self._p = p 83 | 84 | def solve(self): 85 | status = self.solver_.Solve() 86 | 87 | if status in (pywraplp.Solver.OPTIMAL, pywraplp.Solver.FEASIBLE): 88 | if status == pywraplp.Solver.OPTIMAL: 89 | status_name = "OPTIMAL" 90 | else: 91 | status_name = "FEASIBLE" 92 | 93 | # compute solution 94 | solution = [] 95 | for i in range(self._nb): 96 | for j in range(self._nn[i]): 97 | solution.append(self._p[i, j].solution_value()) 98 | else: 99 | if status == pywraplp.Solver.ABNORMAL: 100 | status_name = "ABNORMAL" 101 | elif status == pywraplp.Solver.INFEASIBLE: 102 | status_name = "INFEASIBLE" 103 | elif status == pywraplp.Solver.UNBOUNDED: 104 | status_name = "UNBOUNDED" 105 | else: 106 | status_name = "UNKNOWN" 107 | 108 | solution = None 109 | 110 | return status_name, solution 111 | -------------------------------------------------------------------------------- /optbinning/scorecard/scorecard_information.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scorecard information. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | from ..information import print_header 9 | from ..information import print_optional_parameters 10 | from ..options import scorecard_default_options 11 | 12 | 13 | def print_main_info(n_records, n_variables, time_total): 14 | print(" Number of records : {}".format(n_records)) 15 | print(" Number of variables : {}".format(n_variables)) 16 | print(" Time : {:<7.4f} sec\n".format(time_total)) 17 | 18 | 19 | def print_scorecard_statistics(n_records, n_variables, target_dtype, 20 | n_numerical, n_categorical, n_selected, 21 | time_total, time_binning_process, 22 | time_estimator, time_build_scorecard, 23 | time_rounding): 24 | 25 | stats = ( 26 | " Statistics\n" 27 | " Number of records {:>10}\n" 28 | " Number of variables {:>10}\n" 29 | " Target type {:>10}\n\n" 30 | " Number of numerical {:>10}\n" 31 | " Number of categorical {:>10}\n" 32 | " Number of selected {:>10}\n" 33 | ).format(n_records, n_variables, target_dtype, n_numerical, 34 | n_categorical, n_selected) 35 | 36 | print(stats) 37 | 38 | p_binning_process = time_binning_process / time_total 39 | p_estimator = time_estimator / time_total 40 | p_build_scorecard = time_build_scorecard / time_total 41 | p_rounding = time_rounding / time_build_scorecard 42 | 43 | time_stats = ( 44 | " Timing\n" 45 | " Total time {:>18.2f} sec\n" 46 | " Binning process {:>18.2f} sec ({:>7.2%})\n" 47 | " Estimator {:>18.2f} sec ({:>7.2%})\n" 48 | " Build scorecard {:>18.2f} sec ({:>7.2%})\n" 49 | " rounding {:>18.2f} sec ({:>7.2%})\n" 50 | ).format(time_total, time_binning_process, p_binning_process, 51 | time_estimator, p_estimator, time_build_scorecard, 52 | p_build_scorecard, time_rounding, p_rounding) 53 | 54 | print(time_stats) 55 | 56 | 57 | def print_scorecard_information(print_level, n_records, n_variables, 58 | target_dtype, n_numerical, n_categorical, 59 | n_selected, time_total, time_binning_process, 60 | time_estimator, time_build_scorecard, 61 
| time_rounding, dict_user_options): 62 | print_header() 63 | 64 | if print_level == 2: 65 | dict_default_options = scorecard_default_options 66 | print_optional_parameters(dict_default_options, dict_user_options) 67 | 68 | if print_level == 0: 69 | print_main_info(n_records, n_variables, time_total) 70 | elif print_level >= 1: 71 | print_scorecard_statistics(n_records, n_variables, target_dtype, 72 | n_numerical, n_categorical, n_selected, 73 | time_total, time_binning_process, 74 | time_estimator, time_build_scorecard, 75 | time_rounding) 76 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy>=1.16.1 3 | ortools>=9.4,<9.12 4 | pandas 5 | ropwr>=1.0.0 6 | scikit-learn>=1.0.2 7 | scipy>=1.6.0 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | 5 | from setuptools import find_packages, setup, Command 6 | 7 | long_description = ''' 8 | The optimal binning is the optimal discretization of a variable into bins 9 | given a discrete or continuous numeric target. OptBinning is a library 10 | written in Python implementing a rigorous and flexible mathematical 11 | programming formulation to solving the optimal binning problem for a binary, 12 | continuous and multiclass target type, incorporating constraints not 13 | previously addressed. 14 | 15 | Read the documentation at: http://gnpalencia.org/optbinning/ 16 | 17 | OptBinning is distributed under the Apache Software License (Apache 2.0). 
18 | ''' 19 | 20 | 21 | class CleanCommand(Command): 22 | user_options = [] 23 | 24 | def initialize_options(self): 25 | pass 26 | 27 | def finalize_options(self): 28 | pass 29 | 30 | def run(self): 31 | os.system('rm -vrf ./build ./dist ./*.pyc ./*.tgz ./*.egg-info') 32 | 33 | 34 | # install requirements 35 | install_requires = [ 36 | 'matplotlib', 37 | 'numpy>=1.16.1', 38 | 'ortools>=9.4,<9.12', 39 | 'pandas', 40 | 'ropwr>=1.0.0', 41 | 'scikit-learn>=1.0.2', 42 | 'scipy>=1.6.0', 43 | ] 44 | 45 | # extra requirements 46 | extras_require = { 47 | 'distributed': ['pympler', 'tdigest'], 48 | 'test': [ 49 | 'coverage', 50 | 'flake8', 51 | 'pytest', 52 | 'pyarrow', 53 | 'pympler', 54 | 'tdigest', 55 | ], 56 | # For ecos support: https://github.com/embotech/ecos 57 | 'ecos': ['ecos'] 58 | } 59 | 60 | 61 | # Read version file 62 | version_info = {} 63 | with open("optbinning/_version.py") as f: 64 | exec(f.read(), version_info) 65 | 66 | 67 | setup( 68 | name="optbinning", 69 | version=version_info['__version__'], 70 | description="OptBinning: The Python Optimal Binning library", 71 | long_description=long_description, 72 | author="Guillermo Navas-Palencia", 73 | author_email="g.navas.palencia@gmail.com", 74 | packages=find_packages(exclude=['tests', 'tests.*']), 75 | platforms="any", 76 | include_package_data=True, 77 | license="Apache Licence 2.0", 78 | url="https://github.com/guillermo-navas-palencia/optbinning", 79 | cmdclass={'clean': CleanCommand}, 80 | python_requires='>=3.7', 81 | install_requires=install_requires, 82 | extras_require=extras_require, 83 | classifiers=[ 84 | 'Topic :: Scientific/Engineering :: Mathematics', 85 | 'Topic :: Software Development :: Libraries', 86 | 'Topic :: Software Development :: Libraries :: Python Modules', 87 | 'Intended Audience :: Developers', 88 | 'Intended Audience :: Education', 89 | 'Intended Audience :: Science/Research', 90 | 'License :: OSI Approved :: Apache Software License', 91 | 'Programming Language :: Python :: 
3', 92 | 'Programming Language :: Python :: 3.9', 93 | 'Programming Language :: Python :: 3.10', 94 | 'Programming Language :: Python :: 3.11', 95 | 'Programming Language :: Python :: 3.12', 96 | ] 97 | ) 98 | -------------------------------------------------------------------------------- /test_requirements.txt: -------------------------------------------------------------------------------- 1 | coverage 2 | flake8 3 | pytest 4 | pyarrow 5 | pympler 6 | tdigest 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/breast_cancer.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/data/breast_cancer.parquet -------------------------------------------------------------------------------- /tests/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import load_boston 2 | 3 | 4 | __all__ = ['load_boston'] 5 | -------------------------------------------------------------------------------- /tests/datasets/datasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class Data: 6 | def __init__(self, data, target, feature_names): 7 | self.data = data 8 | self.target = target 9 | self.feature_names = feature_names 10 | 11 | 12 | def load_boston(): 13 | data_url = "http://lib.stat.cmu.edu/datasets/boston" 14 | raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None) 15 | raw_data = 
np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]) 16 | target = raw_df.values[1::2, 2] 17 | feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 18 | 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] 19 | 20 | return Data(raw_data, target, feature_names) 21 | -------------------------------------------------------------------------------- /tests/results/plot_auc_roc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/plot_auc_roc.png -------------------------------------------------------------------------------- /tests/results/plot_cap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/plot_cap.png -------------------------------------------------------------------------------- /tests/results/plot_ks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/plot_ks.png -------------------------------------------------------------------------------- /tests/results/psi_plot_binary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/psi_plot_binary.png -------------------------------------------------------------------------------- /tests/results/psi_plot_continuous.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/psi_plot_continuous.png 
-------------------------------------------------------------------------------- /tests/results/test_binning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning.png -------------------------------------------------------------------------------- /tests/results/test_binning_2d_event_rate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_2d_event_rate.png -------------------------------------------------------------------------------- /tests/results/test_binning_2d_woe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_2d_woe.png -------------------------------------------------------------------------------- /tests/results/test_binning_no_missing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_no_missing.png -------------------------------------------------------------------------------- /tests/results/test_binning_no_special.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_no_special.png -------------------------------------------------------------------------------- /tests/results/test_binning_process_information.txt: -------------------------------------------------------------------------------- 1 | 
optbinning (Version 0.14.0) 2 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 3 | 4 | Number of records : 569 5 | Number of variables : 30 6 | Time : 4.2282 sec 7 | 8 | optbinning (Version 0.14.0) 9 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 10 | 11 | Statistics 12 | Number of records 569 13 | Number of variables 30 14 | Target type binary 15 | 16 | Number of numerical 30 17 | Number of categorical 0 18 | Number of selected 30 19 | 20 | Time 4.2282 sec 21 | 22 | optbinning (Version 0.14.0) 23 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 24 | 25 | Begin options 26 | max_n_prebins 20 * d 27 | min_prebin_size 0.05 * d 28 | min_n_bins no * d 29 | max_n_bins no * d 30 | min_bin_size no * d 31 | max_bin_size no * d 32 | max_pvalue no * d 33 | max_pvalue_policy consecutive * d 34 | selection_criteria no * d 35 | fixed_variables no * d 36 | categorical_variables no * d 37 | special_codes no * d 38 | split_digits no * d 39 | binning_fit_params no * d 40 | binning_transform_params no * d 41 | verbose False * d 42 | End options 43 | 44 | Statistics 45 | Number of records 569 46 | Number of variables 30 47 | Target type binary 48 | 49 | Number of numerical 30 50 | Number of categorical 0 51 | Number of selected 30 52 | 53 | Time 4.2282 sec 54 | 55 | -------------------------------------------------------------------------------- /tests/results/test_binning_process_verbose.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_binning_process_verbose.txt -------------------------------------------------------------------------------- /tests/results/test_continuous_binning.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning.png -------------------------------------------------------------------------------- /tests/results/test_continuous_binning_2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning_2d.png -------------------------------------------------------------------------------- /tests/results/test_continuous_binning_no_missing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning_no_missing.png -------------------------------------------------------------------------------- /tests/results/test_continuous_binning_no_special.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_continuous_binning_no_special.png -------------------------------------------------------------------------------- /tests/results/test_multiclass_binning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_multiclass_binning.png -------------------------------------------------------------------------------- /tests/results/test_multiclass_binning_no_missing.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_multiclass_binning_no_missing.png -------------------------------------------------------------------------------- /tests/results/test_multiclass_binning_no_special.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_multiclass_binning_no_special.png -------------------------------------------------------------------------------- /tests/results/test_scorecard_information.txt: -------------------------------------------------------------------------------- 1 | optbinning (Version 0.14.0) 2 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 3 | 4 | Number of records : 569 5 | Number of variables : 30 6 | Time : 4.5420 sec 7 | 8 | optbinning (Version 0.14.0) 9 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 10 | 11 | Statistics 12 | Number of records 569 13 | Number of variables 30 14 | Target type binary 15 | 16 | Number of numerical 30 17 | Number of categorical 0 18 | Number of selected 30 19 | 20 | Timing 21 | Total time 4.54 sec 22 | Binning process 4.18 sec ( 92.04%) 23 | Estimator 0.04 sec ( 0.94%) 24 | Build scorecard 0.32 sec ( 7.01%) 25 | rounding 0.00 sec ( 0.00%) 26 | 27 | optbinning (Version 0.14.0) 28 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 29 | 30 | Begin options 31 | binning_process yes * U 32 | estimator yes * U 33 | scaling_method no * d 34 | scaling_method_params no * d 35 | intercept_based False * d 36 | reverse_scorecard False * d 37 | rounding False * d 38 | verbose False * d 39 | End options 40 | 41 | Statistics 42 | Number of records 569 43 | Number of variables 30 44 | Target type binary 45 | 46 | Number of numerical 30 47 | Number of categorical 0 48 | Number of selected 30 49 | 50 
| Timing 51 | Total time 4.54 sec 52 | Binning process 4.18 sec ( 92.04%) 53 | Estimator 0.04 sec ( 0.94%) 54 | Build scorecard 0.32 sec ( 7.01%) 55 | rounding 0.00 sec ( 0.00%) 56 | 57 | -------------------------------------------------------------------------------- /tests/results/test_scorecard_monitoring_default.txt: -------------------------------------------------------------------------------- 1 | ----------------------------------- 2 | Monitoring: System Stability Report 3 | ----------------------------------- 4 | 5 | Population Stability Index (PSI) 6 | 7 | 8 | PSI total: 0.0018 (No significant change) 9 | 10 | PSI bin Count Count (%) 11 | [0.00, 0.10) 3 1.0 12 | [0.10, 0.25) 0 0.0 13 | [0.25, Inf+) 0 0.0 14 | 15 | Significance tests (H0: actual == expected) 16 | 17 | p-value bin Count Count (%) 18 | [0.00, 0.05) 1 0.333333 19 | [0.05, 0.10) 0 0.000000 20 | [0.10, 0.50) 1 0.333333 21 | [0.50, 1.00) 1 0.333333 22 | 23 | Target analysis 24 | 25 | Metric Actual Actual (%) Expected Expected (%) 26 | Number of records 171 - 398 - 27 | Event records 108 0.631579 249 0.625628 28 | Non-event records 63 0.368421 149 0.374372 29 | 30 | Performance metrics 31 | 32 | Metric Actual Expected Diff A - E 33 | True positive rate 0.990741 1.000000 -0.009259 34 | True negative rate 0.968254 0.979866 -0.011612 35 | False positive rate 0.031746 0.020134 0.011612 36 | False negative rate 0.009259 0.000000 0.009259 37 | Balanced accuracy 0.979497 0.989933 -0.010436 38 | Discriminant power 4.460557 inf -inf 39 | Gini 0.986185 0.999838 -0.013654 40 | 41 | -------------------------------------------------------------------------------- /tests/results/test_scorecard_monitoring_default_continuous.txt: -------------------------------------------------------------------------------- 1 | ----------------------------------- 2 | Monitoring: System Stability Report 3 | ----------------------------------- 4 | 5 | Population Stability Index (PSI) 6 | 7 | 8 | PSI total: 0.1630 (Requires 
investigation) 9 | 10 | PSI bin Count Count (%) 11 | [0.00, 0.10) 14 0.933333 12 | [0.10, 0.25) 1 0.066667 13 | [0.25, Inf+) 0 0.000000 14 | 15 | Significance tests (H0: actual == expected) 16 | 17 | p-value bin Count Count (%) 18 | [0.00, 0.05) 1 0.066667 19 | [0.05, 0.10) 0 0.000000 20 | [0.10, 0.50) 5 0.333333 21 | [0.50, 1.00) 9 0.600000 22 | 23 | Target analysis 24 | 25 | Metric Actual Expected 26 | Mean 21.407895 23.015819 27 | Std 8.632097 9.375315 28 | p25 16.325000 17.400000 29 | Median 20.000000 21.750000 30 | p75 24.125000 26.600000 31 | 32 | Performance metrics 33 | 34 | Metric Actual Expected Diff A - E 35 | Mean absolute error 2.482286 2.546775 -0.064488 36 | Mean squared error 12.583966 12.187764 0.396202 37 | Median absolute error 2.059913 1.947342 0.112571 38 | Explained variance 0.831908 0.861340 -0.029432 39 | R^2 0.831117 0.861340 -0.030222 40 | MPE -0.032197 -0.024922 -0.007275 41 | MAPE 0.125897 0.125992 -0.000095 42 | SMAPE 0.061339 0.060410 0.000929 43 | MdAPE 0.097021 0.091783 0.005238 44 | SMdAPE 0.049889 0.046868 0.003021 45 | 46 | -------------------------------------------------------------------------------- /tests/results/test_scorecard_monitoring_information.txt: -------------------------------------------------------------------------------- 1 | optbinning (Version 0.14.0) 2 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 3 | 4 | Number of records A : 152 5 | Number of records E : 354 6 | Number of variables : 13 7 | Time : 0.1124 sec 8 | 9 | optbinning (Version 0.14.0) 10 | Copyright (c) 2019-2022 Guillermo Navas-Palencia, Apache License 2.0 11 | 12 | Statistics 13 | Number of records Actual 152 14 | Number of records Expected 354 15 | Number of scorecard variables 13 16 | Target type continuous 17 | 18 | Timing 19 | Total time 0.11 sec 20 | System stability 0.07 sec ( 60.67%) 21 | Variables stability 0.04 sec ( 38.99%) 22 | 23 | optbinning (Version 0.14.0) 24 | Copyright (c) 2019-2022 Guillermo 
Navas-Palencia, Apache License 2.0 25 | 26 | Begin options 27 | scorecard yes * U 28 | psi_method cart * d 29 | psi_n_bins 20 * d 30 | psi_min_bin_size 0.05 * d 31 | show_digits 2 * d 32 | verbose False * d 33 | End options 34 | 35 | Statistics 36 | Number of records Actual 152 37 | Number of records Expected 354 38 | Number of scorecard variables 13 39 | Target type continuous 40 | 41 | Timing 42 | Total time 0.11 sec 43 | System stability 0.07 sec ( 60.67%) 44 | Variables stability 0.04 sec ( 38.99%) 45 | 46 | -------------------------------------------------------------------------------- /tests/results/test_scorecard_monitoring_verbose.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_scorecard_monitoring_verbose.txt -------------------------------------------------------------------------------- /tests/results/test_scorecard_verbose.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guillermo-navas-palencia/optbinning/0720b8dd2b60a723cfb16e1f830da1d62598171a/tests/results/test_scorecard_verbose.txt -------------------------------------------------------------------------------- /tests/test_binning_piecewise.py: -------------------------------------------------------------------------------- 1 | """ 2 | OptimalPWBinning testing. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2022 7 | 8 | import pandas as pd 9 | 10 | from pytest import approx, raises 11 | 12 | from optbinning import OptimalPWBinning 13 | from sklearn.datasets import load_breast_cancer 14 | from sklearn.exceptions import NotFittedError 15 | 16 | 17 | data = load_breast_cancer() 18 | df = pd.DataFrame(data.data, columns=data.feature_names) 19 | 20 | variable = "mean radius" 21 | x = df[variable].values 22 | y = data.target 23 | 24 | 25 | def test_params(): 26 | with raises(TypeError): 27 | optb = OptimalPWBinning(name=1) 28 | optb.fit(x, y) 29 | 30 | with raises(TypeError): 31 | optb = OptimalPWBinning(estimator=2) 32 | optb.fit(x, y) 33 | 34 | with raises(ValueError): 35 | optb = OptimalPWBinning(objective="new") 36 | optb.fit(x, y) 37 | 38 | with raises(ValueError): 39 | optb = OptimalPWBinning(degree=0.2) 40 | optb.fit(x, y) 41 | 42 | with raises(TypeError): 43 | optb = OptimalPWBinning(continuous=1) 44 | optb.fit(x, y) 45 | 46 | with raises(ValueError): 47 | optb = OptimalPWBinning(prebinning_method="new") 48 | optb.fit(x, y) 49 | 50 | with raises(ValueError): 51 | optb = OptimalPWBinning(min_prebin_size=0.9) 52 | optb.fit(x, y) 53 | 54 | with raises(ValueError): 55 | optb = OptimalPWBinning(min_n_bins=1.2) 56 | optb.fit(x, y) 57 | 58 | with raises(ValueError): 59 | optb = OptimalPWBinning(max_n_bins=1.2) 60 | optb.fit(x, y) 61 | 62 | with raises(ValueError): 63 | optb = OptimalPWBinning(min_n_bins=10, max_n_bins=5) 64 | optb.fit(x, y) 65 | 66 | with raises(ValueError): 67 | optb = OptimalPWBinning(min_bin_size=0.6) 68 | optb.fit(x, y) 69 | 70 | with raises(ValueError): 71 | optb = OptimalPWBinning(max_bin_size=1.1) 72 | optb.fit(x, y) 73 | 74 | with raises(ValueError): 75 | optb = OptimalPWBinning(min_bin_size=0.3, max_bin_size=0.2) 76 | optb.fit(x, y) 77 | 78 | with raises(ValueError): 79 | optb = OptimalPWBinning(monotonic_trend="new") 80 | optb.fit(x, y) 81 | 82 | with raises(ValueError): 83 | optb = 
OptimalPWBinning(monotonic_trend="convex", degree=2) 84 | optb.fit(x, y) 85 | 86 | with raises(ValueError): 87 | optb = OptimalPWBinning(n_subsamples=1001.2) 88 | optb.fit(x, y) 89 | 90 | with raises(ValueError): 91 | optb = OptimalPWBinning(max_pvalue=1.1) 92 | optb.fit(x, y) 93 | 94 | with raises(ValueError): 95 | optb = OptimalPWBinning(max_pvalue_policy="new_policy") 96 | optb.fit(x, y) 97 | 98 | with raises(ValueError): 99 | optb = OptimalPWBinning(outlier_detector="new_method") 100 | optb.fit(x, y) 101 | 102 | with raises(TypeError): 103 | optb = OptimalPWBinning(outlier_detector="range", 104 | outlier_params="pass") 105 | optb.fit(x, y) 106 | 107 | with raises(TypeError): 108 | optb = OptimalPWBinning(user_splits={"a": [1, 2]}) 109 | optb.fit(x, y) 110 | 111 | with raises(ValueError): 112 | optb = OptimalPWBinning(user_splits=None, 113 | user_splits_fixed=[True, True]) 114 | optb.fit(x, y) 115 | 116 | with raises(TypeError): 117 | optb = OptimalPWBinning(user_splits=[1, 2], 118 | user_splits_fixed=(True, True)) 119 | optb.fit(x, y) 120 | 121 | with raises(ValueError): 122 | optb = OptimalPWBinning(user_splits=[1, 2], 123 | user_splits_fixed=[True, 1]) 124 | optb.fit(x, y) 125 | 126 | with raises(ValueError): 127 | optb = OptimalPWBinning(user_splits=[1, 2], 128 | user_splits_fixed=[True]) 129 | optb.fit(x, y) 130 | 131 | with raises(TypeError): 132 | optb = OptimalPWBinning(special_codes={1, 2, 3}) 133 | optb.fit(x, y) 134 | 135 | with raises(ValueError): 136 | optb = OptimalPWBinning(split_digits=9) 137 | optb.fit(x, y) 138 | 139 | with raises(ValueError): 140 | optb = OptimalPWBinning(solver=None) 141 | optb.fit(x, y) 142 | 143 | with raises(ValueError): 144 | optb = OptimalPWBinning(h_epsilon=0.9) 145 | optb.fit(x, y) 146 | 147 | with raises(ValueError): 148 | optb = OptimalPWBinning(quantile=0) 149 | optb.fit(x, y) 150 | 151 | with raises(ValueError): 152 | optb = OptimalPWBinning(regularization='l3') 153 | optb.fit(x, y) 154 | 155 | with 
raises(ValueError): 156 | optb = OptimalPWBinning(reg_l1=-0.5) 157 | optb.fit(x, y) 158 | 159 | with raises(ValueError): 160 | optb = OptimalPWBinning(reg_l2=-0.5) 161 | optb.fit(x, y) 162 | 163 | with raises(TypeError): 164 | optb = OptimalPWBinning(random_state='None') 165 | optb.fit(x, y) 166 | 167 | with raises(TypeError): 168 | optb = OptimalPWBinning(verbose=1) 169 | optb.fit(x, y) 170 | 171 | 172 | def test_default(): 173 | optb = OptimalPWBinning(name=variable) 174 | optb.fit(x, y) 175 | 176 | optb.binning_table.build() 177 | assert optb.binning_table.iv == approx(5.87474602, rel=1e-6) 178 | 179 | with raises(ValueError): 180 | optb.binning_table.plot(metric="new_metric") 181 | 182 | optb.binning_table.plot( 183 | metric="woe", savefig="tests/results/test_binning_piecewise.png") 184 | 185 | 186 | def test_default_discontinuous(): 187 | optb = OptimalPWBinning(name=variable, continuous=False) 188 | optb.fit(x, y) 189 | 190 | optb.binning_table.build() 191 | assert optb.binning_table.iv == approx(5.84465825, rel=1e-6) 192 | 193 | 194 | def test_bounds_transform(): 195 | optb = OptimalPWBinning(name=variable) 196 | optb.fit(x, y, lb=0.001, ub=0.999) 197 | 198 | x_transform_woe = optb.transform(x, metric="woe") 199 | assert x_transform_woe[:4] == approx( 200 | [3.99180564, 4.28245092, 4.17407503, -3.2565373], rel=1e-6) 201 | 202 | x_transform_event_rate = optb.transform(x, metric="event_rate") 203 | assert x_transform_event_rate[:4] == approx( 204 | [0.03015878, 0.02272502, 0.02526056, 0.97763604], rel=1e-6) 205 | 206 | 207 | def test_bounds_fit_transform(): 208 | optb = OptimalPWBinning(name=variable) 209 | 210 | x_transform_woe = optb.fit_transform( 211 | x, y, lb=0.001, ub=0.999, metric="woe") 212 | 213 | assert x_transform_woe[:4] == approx( 214 | [3.9918056, 4.2824509, 4.17407503, -3.25653732], rel=1e-6) 215 | x_transform_event_rate = optb.fit_transform( 216 | x, y, lb=0.001, ub=0.999, metric="event_rate") 217 | assert x_transform_event_rate[:4] == approx( 
218 | [0.03015878, 0.02272502, 0.02526056, 0.97763604], rel=1e-6) 219 | 220 | 221 | def test_solvers(): 222 | for solver in ("auto", "ecos", "osqp"): 223 | optb = OptimalPWBinning(name=variable, solver=solver) 224 | optb.fit(x, y) 225 | 226 | optb.binning_table.build() 227 | assert optb.binning_table.iv == approx(5.87474602, rel=1e-6) 228 | 229 | 230 | def test_user_splits(): 231 | variable = "mean texture" 232 | x = df[variable].values 233 | 234 | user_splits = [14, 15, 16, 17, 20, 21, 22, 27] 235 | user_splits_fixed = [False, True, True, False, False, False, False, False] 236 | 237 | optb = OptimalPWBinning(name=variable, user_splits=user_splits, 238 | user_splits_fixed=user_splits_fixed) 239 | 240 | optb.fit(x, y) 241 | 242 | 243 | def test_information(): 244 | optb = OptimalPWBinning() 245 | 246 | with raises(NotFittedError): 247 | optb.information() 248 | 249 | optb.fit(x, y) 250 | 251 | with raises(ValueError): 252 | optb.information(print_level=-1) 253 | 254 | optb.information(print_level=0) 255 | optb.information(print_level=1) 256 | optb.information(print_level=2) 257 | 258 | optb = OptimalPWBinning() 259 | optb.fit(x, y) 260 | optb.information(print_level=2) 261 | 262 | 263 | def test_verbose(): 264 | optb = OptimalPWBinning(verbose=True) 265 | optb.fit(x, y) 266 | 267 | assert optb.status == "OPTIMAL" 268 | -------------------------------------------------------------------------------- /tests/test_binning_process_sketch.py: -------------------------------------------------------------------------------- 1 | """ 2 | BinningProcessSketch testing. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2021 7 | 8 | import pandas as pd 9 | 10 | from pytest import approx, raises 11 | 12 | from optbinning import BinningProcessSketch 13 | from optbinning import OptimalBinningSketch 14 | from optbinning.exceptions import NotSolvedError 15 | from optbinning.exceptions import NotDataAddedError 16 | from sklearn.datasets import load_breast_cancer 17 | 18 | data = load_breast_cancer() 19 | variable_names = data.feature_names 20 | df = pd.DataFrame(data.data, columns=variable_names) 21 | y = data.target 22 | 23 | 24 | def test_params(): 25 | with raises(TypeError): 26 | BinningProcessSketch(variable_names=1) 27 | 28 | with raises(ValueError): 29 | BinningProcessSketch(variable_names=[], max_n_prebins=-2) 30 | 31 | with raises(ValueError): 32 | BinningProcessSketch(variable_names=[], min_n_bins=-2) 33 | 34 | with raises(ValueError): 35 | BinningProcessSketch(variable_names=[], max_n_bins=-2.2) 36 | 37 | with raises(ValueError): 38 | BinningProcessSketch(variable_names=[], min_n_bins=3, max_n_bins=2) 39 | 40 | with raises(ValueError): 41 | BinningProcessSketch(variable_names=[], min_bin_size=0.6) 42 | 43 | with raises(ValueError): 44 | BinningProcessSketch(variable_names=[], max_bin_size=-0.6) 45 | 46 | with raises(ValueError): 47 | BinningProcessSketch(variable_names=[], min_bin_size=0.5, 48 | max_bin_size=0.3) 49 | 50 | with raises(ValueError): 51 | BinningProcessSketch(variable_names=[], max_pvalue=1.1) 52 | 53 | with raises(ValueError): 54 | BinningProcessSketch(variable_names=[], max_pvalue_policy="new_policy") 55 | 56 | with raises(TypeError): 57 | BinningProcessSketch(variable_names=[], selection_criteria=[]) 58 | 59 | with raises(TypeError): 60 | BinningProcessSketch(variable_names=[], categorical_variables={}) 61 | 62 | with raises(TypeError): 63 | BinningProcessSketch(variable_names=[], categorical_variables=[1, 2]) 64 | 65 | with raises(TypeError): 66 | BinningProcessSketch(variable_names=[], 
special_codes={1, 2, 3}) 67 | 68 | with raises(ValueError): 69 | BinningProcessSketch(variable_names=[], split_digits=9) 70 | 71 | with raises(TypeError): 72 | BinningProcessSketch(variable_names=[], binning_fit_params=[1, 2]) 73 | 74 | with raises(TypeError): 75 | BinningProcessSketch(variable_names=[], 76 | binning_transform_params=[1, 2]) 77 | 78 | with raises(TypeError): 79 | BinningProcessSketch(variable_names=[], verbose=1) 80 | 81 | 82 | def test_default(): 83 | bpsketch = BinningProcessSketch(variable_names) 84 | bpsketch.add(df, y) 85 | bpsketch.solve() 86 | 87 | optb = bpsketch.get_binned_variable("mean radius") 88 | 89 | assert optb.status == "OPTIMAL" 90 | 91 | optb.binning_table.build() 92 | assert optb.binning_table.iv == approx(5.04392547, rel=1e-2) 93 | 94 | 95 | def test_default_merge(): 96 | bpsketch_1 = BinningProcessSketch(variable_names) 97 | bpsketch_2 = BinningProcessSketch(variable_names) 98 | 99 | df_1, y_1 = df.iloc[:200, :], y[:200] 100 | df_2, y_2 = df.iloc[200:, :], y[200:] 101 | 102 | bpsketch_1.add(df_1, y_1) 103 | bpsketch_2.add(df_2, y_2) 104 | bpsketch_1.merge(bpsketch_2) 105 | 106 | bpsketch_1.solve() 107 | 108 | optb = bpsketch_1.get_binned_variable("mean radius") 109 | 110 | assert optb.status == "OPTIMAL" 111 | 112 | optb.binning_table.build() 113 | assert optb.binning_table.iv == approx(5.04392547, rel=1e-2) 114 | 115 | 116 | def test_default_tdigest_merge(): 117 | binning_fit_params = {v: {"sketch": "t-digest"} for v in variable_names} 118 | 119 | bpsketch_1 = BinningProcessSketch(variable_names, 120 | binning_fit_params=binning_fit_params) 121 | bpsketch_2 = BinningProcessSketch(variable_names, 122 | binning_fit_params=binning_fit_params) 123 | 124 | df_1, y_1 = df.iloc[:200, :], y[:200] 125 | df_2, y_2 = df.iloc[200:, :], y[200:] 126 | 127 | bpsketch_1.add(df_1, y_1) 128 | bpsketch_2.add(df_2, y_2) 129 | bpsketch_1.merge(bpsketch_2) 130 | 131 | bpsketch_1.solve() 132 | 133 | optb = bpsketch_1.get_binned_variable("mean 
radius") 134 | 135 | assert optb.status == "OPTIMAL" 136 | 137 | optb.binning_table.build() 138 | assert optb.binning_table.iv == approx(5.04392547, rel=1e-2) 139 | 140 | 141 | def test_default_transform(): 142 | bpsketch = BinningProcessSketch(variable_names) 143 | bpsketch.add(df, y) 144 | 145 | with raises(NotSolvedError): 146 | bpsketch.transform(df, metric="woe") 147 | 148 | bpsketch.solve() 149 | 150 | with raises(TypeError): 151 | X_transform = bpsketch.transform(df.values, metric="woe") 152 | 153 | with raises(ValueError): 154 | X_transform = bpsketch.transform(df, metric="new_woe") 155 | 156 | X_transform = bpsketch.transform(df) 157 | 158 | optb = OptimalBinningSketch() 159 | x = df["mean radius"] 160 | optb.add(x, y) 161 | optb.solve() 162 | 163 | assert optb.transform(x, metric="woe") == approx( 164 | X_transform["mean radius"], rel=1e-6) 165 | 166 | 167 | def test_information(): 168 | bpsketch = BinningProcessSketch(variable_names) 169 | 170 | with raises(NotDataAddedError): 171 | bpsketch.solve() 172 | 173 | bpsketch.add(df, y) 174 | 175 | with raises(NotSolvedError): 176 | bpsketch.information() 177 | 178 | bpsketch.solve() 179 | 180 | with raises(ValueError): 181 | bpsketch.information(print_level=-1) 182 | 183 | bpsketch.information(print_level=0) 184 | bpsketch.information(print_level=1) 185 | bpsketch.information(print_level=2) 186 | -------------------------------------------------------------------------------- /tests/test_continuous_binning_piecewise.py: -------------------------------------------------------------------------------- 1 | """ 2 | ContinuousOptimalPWBinning testing. 
3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2022 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from pytest import approx 12 | 13 | from optbinning import ContinuousOptimalPWBinning 14 | from tests.datasets import load_boston 15 | 16 | data = load_boston() 17 | df = pd.DataFrame(data.data, columns=data.feature_names) 18 | 19 | variable = "LSTAT" 20 | x = df[variable].values 21 | y = data.target 22 | 23 | 24 | def test_default(): 25 | optb = ContinuousOptimalPWBinning(name=variable) 26 | optb.fit(x, y) 27 | 28 | optb.binning_table.build() 29 | optb.binning_table.plot( 30 | savefig="tests/results/test_continuous_binning_piecewise.png") 31 | 32 | 33 | def test_transform(): 34 | optb = ContinuousOptimalPWBinning(name=variable) 35 | optb.fit(x, y) 36 | 37 | x_transform = optb.transform(x) 38 | assert x_transform[:3] == approx( 39 | [31.46014643, 23.87619986, 37.31237732], rel=1e-6) 40 | 41 | 42 | def test_fit_transform(): 43 | optb = ContinuousOptimalPWBinning(name=variable) 44 | 45 | x_transform = optb.fit_transform(x, y) 46 | assert x_transform[:3] == approx( 47 | [31.46014643, 23.87619986, 37.31237732], rel=1e-6) 48 | 49 | 50 | def test_special_codes(): 51 | variable = "INDUS" 52 | x = df[variable].values 53 | 54 | x[:50] = -9 55 | x[50:100] = -8 56 | special_codes = {'special_-9': -9, 'special_-8': -8} 57 | 58 | optb = ContinuousOptimalPWBinning( 59 | name=variable, monotonic_trend="convex", special_codes=special_codes) 60 | optb.fit(x, y) 61 | 62 | x_transform = optb.transform([-9, -8], metric_special=1000) 63 | assert x_transform == approx([1000, 1000], rel=1e-6) 64 | 65 | x_transform = optb.transform([-9, -8], metric_special='empirical') 66 | assert x_transform == approx([20.502000, 24.116000], rel=1e-6) 67 | 68 | optb = ContinuousOptimalPWBinning( 69 | name=variable, monotonic_trend="convex", special_codes=[-9, -8]) 70 | optb.fit(x, y) 71 | 72 | x_transform = optb.transform([-9, -8], metric_special=1000) 73 | assert x_transform 
== approx([1000, 1000], rel=1e-6) 74 | 75 | x_transform = optb.transform([-9, -8], metric_special='empirical') 76 | assert x_transform == approx([22.309, 22.309], rel=1e-6) 77 | 78 | x[45:50] = np.nan 79 | optb = ContinuousOptimalPWBinning( 80 | name=variable, monotonic_trend="convex", special_codes=special_codes) 81 | optb.fit(x, y) 82 | 83 | x_transform = optb.transform([np.nan], metric_missing='empirical') 84 | assert x_transform == approx([17.94], rel=1e-6) 85 | 86 | 87 | def test_verbose(): 88 | optb = ContinuousOptimalPWBinning(verbose=True) 89 | optb.fit(x, y) 90 | 91 | assert optb.status == "OPTIMAL" 92 | -------------------------------------------------------------------------------- /tests/test_mdlp.py: -------------------------------------------------------------------------------- 1 | """ 2 | MDLP testing. 3 | """ 4 | 5 | # Guillermo Navas-Palencia 6 | # Copyright (C) 2020 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from pytest import approx, raises 12 | 13 | from optbinning import MDLP 14 | from sklearn.datasets import load_breast_cancer 15 | from sklearn.exceptions import NotFittedError 16 | 17 | 18 | data = load_breast_cancer() 19 | df = pd.DataFrame(data.data, columns=data.feature_names) 20 | 21 | variable = "mean radius" 22 | x = df[variable].values 23 | y = data.target 24 | 25 | 26 | def test_params(): 27 | with raises(ValueError): 28 | mdlp = MDLP(min_samples_split=-1) 29 | mdlp.fit(x, y) 30 | 31 | with raises(ValueError): 32 | mdlp = MDLP(min_samples_leaf=-1) 33 | mdlp.fit(x, y) 34 | 35 | with raises(ValueError): 36 | mdlp = MDLP(max_candidates=-1) 37 | mdlp.fit(x, y) 38 | 39 | 40 | # def test_numerical_default(): 41 | # mdlp = MDLP() 42 | # mdlp.fit(x, y) 43 | 44 | # assert mdlp.splits == approx([10.945, 13.08729032, 15.00163870, 45 | # 15.10030322, 16.925, 17.88], rel=1e-6) 46 | 47 | 48 | # def test_numerical_practical(): 49 | # min_samples_leaf = int(np.ceil(len(x) * 0.05)) 50 | # mdlp = MDLP(max_candidates=128, 
#                 min_samples_leaf=min_samples_leaf)
#     mdlp.fit(x, y)

#     assert mdlp.splits == approx([10.945, 12.995, 13.71, 15.045, 16.325,
#                                   17.88], rel=1e-6)


def test_splits():
    # Accessing ``splits`` before ``fit`` must raise NotFittedError.
    mdlp = MDLP()

    with raises(NotFittedError):
        mdlp.splits
--------------------------------------------------------------------------------
/tests/test_multiclass_binning.py:
--------------------------------------------------------------------------------
"""
MulticlassOptimalBinning testing.
"""

# Guillermo Navas-Palencia
# Copyright (C) 2020

import pandas as pd

from pytest import approx, raises

from optbinning import MulticlassOptimalBinning
from sklearn.datasets import load_wine
from sklearn.exceptions import NotFittedError


# Shared fixtures: wine dataset, "ash" as the binning variable, the
# three wine classes as the multiclass target.
data = load_wine()
df = pd.DataFrame(data.data, columns=data.feature_names)

variable = "ash"
x = df[variable].values
y = data.target


def test_params():
    # Each invalid constructor parameter must raise the documented
    # exception type (TypeError for wrong types, ValueError for
    # out-of-range or inconsistent values) when fit is called.
    with raises(TypeError):
        optb = MulticlassOptimalBinning(name=1)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(prebinning_method="new_method")
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(solver="new_solver")
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(max_n_prebins=-2)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(min_prebin_size=0.6)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(min_n_bins=-2)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(max_n_bins=-2.2)
        optb.fit(x, y)

    # min_n_bins must not exceed max_n_bins.
    with raises(ValueError):
        optb = MulticlassOptimalBinning(min_n_bins=3, max_n_bins=2)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(min_bin_size=0.6)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(max_bin_size=-0.6)
        optb.fit(x, y)

    # min_bin_size must not exceed max_bin_size.
    with raises(ValueError):
        optb = MulticlassOptimalBinning(min_bin_size=0.5, max_bin_size=0.3)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(monotonic_trend=["new_trend", "auto"])
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(monotonic_trend="new_trend")
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(max_pvalue=1.1)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(max_pvalue_policy="new_policy")
        optb.fit(x, y)

    with raises(TypeError):
        optb = MulticlassOptimalBinning(user_splits={"a": [1, 2]})
        optb.fit(x, y)

    with raises(TypeError):
        optb = MulticlassOptimalBinning(special_codes={1, 2, 3})
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(split_digits=9)
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(mip_solver="new_solver")
        optb.fit(x, y)

    with raises(ValueError):
        optb = MulticlassOptimalBinning(time_limit=-2)
        optb.fit(x, y)

    with raises(TypeError):
        optb = MulticlassOptimalBinning(verbose=1)
        optb.fit(x, y)


def test_numerical_default():
    # Default fit: pin the optimal splits, the binning table metrics and
    # exercise all plot variants (full / no special / no missing).
    optb = MulticlassOptimalBinning()
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
    assert optb.splits == approx([2.1450001, 2.245, 2.31499994, 2.6049999,
                                  2.6450001], rel=1e-6)

    optb.binning_table.build()
    optb.binning_table.analysis()
    assert optb.binning_table.js == approx(0.10989515, rel=1e-6)
    assert optb.binning_table.quality_score == approx(0.05279822, rel=1e-6)
    optb.binning_table.plot(
        savefig="tests/results/test_multiclass_binning.png")
    optb.binning_table.plot(
        add_special=False,
        savefig="tests/results/test_multiclass_binning_no_special.png")
    optb.binning_table.plot(
        add_missing=False,
        savefig="tests/results/test_multiclass_binning_no_missing.png")


def test_numerical_default_solvers():
    # Both solver backends (MIP with BOP, and CP) must reach the same
    # optimal solution.
    optb_mip_bop = MulticlassOptimalBinning(solver="mip", mip_solver="bop")
    optb_mip_bop.fit(x, y)

    optb_cp = MulticlassOptimalBinning(solver="cp")
    optb_cp.fit(x, y)

    for optb in [optb_mip_bop, optb_cp]:
        assert optb.status == "OPTIMAL"
        assert optb.splits == approx([2.1450001, 2.245, 2.31499994, 2.6049999,
                                      2.6450001], rel=1e-6)


def test_numerical_user_splits_fixed():
    # user_splits_fixed validation: requires user_splits, a list of
    # booleans of matching length; then fixed splits must survive the
    # optimization.
    user_splits = [2.1, 2.2, 2.3, 2.6, 2.9]

    # user_splits_fixed without user_splits is invalid.
    with raises(ValueError):
        user_splits_fixed = [False, False, False, True, False]
        optb = MulticlassOptimalBinning(user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    # Must be a list, not a tuple.
    with raises(TypeError):
        user_splits_fixed = (False, False, False, True, False)
        optb = MulticlassOptimalBinning(user_splits=user_splits,
                                        user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    # Elements must be booleans, not ints.
    with raises(ValueError):
        user_splits_fixed = [0, 0, 0, 1, 0]
        optb = MulticlassOptimalBinning(user_splits=user_splits,
                                        user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    # Length must match user_splits.
    with raises(ValueError):
        user_splits_fixed = [False, False, False, False]
        optb = MulticlassOptimalBinning(user_splits=user_splits,
                                        user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    user_splits_fixed = [False, False, False, True, True]

    with raises(ValueError):
        # pure pre-bins
        optb = MulticlassOptimalBinning(user_splits=user_splits,
                                        user_splits_fixed=user_splits_fixed)
        optb.fit(x, y)

    # With a feasible fixed split (2.7), it must appear in the solution.
    user_splits = [2.1, 2.2, 2.3, 2.6, 2.7]
    optb = MulticlassOptimalBinning(user_splits=user_splits,
                                    user_splits_fixed=user_splits_fixed)
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
    assert 2.7 in optb.splits


def test_numerical_user_splits_non_unique():
    # Duplicate user splits are rejected at fit time.
    user_splits = [2.1, 2.2, 2.2, 2.6, 2.9]
    optb = MulticlassOptimalBinning(user_splits=user_splits)

    with raises(ValueError):
        optb.fit(x, y)


def test_numerical_default_transform():
    # transform before fit must raise NotFittedError; after fit, pin the
    # mean-WoE transformation of a few representative values.
    optb = MulticlassOptimalBinning()
    with raises(NotFittedError):
        x_transform = optb.transform(x)

    optb.fit(x, y)

    x_transform = optb.transform([0.3, 2.1, 2.5, 3], metric="mean_woe")
    assert x_transform == approx([0.48973998, 0.48973998, -0.00074357,
                                  0.02189459], rel=1e-5)


def test_numerical_default_fit_transform():
    # fit_transform must be equivalent to fit followed by transform.
    optb = MulticlassOptimalBinning()

    x_transform = optb.fit_transform(x, y, metric="mean_woe")
    assert x_transform[:5] == approx([-0.00074357, 0.48973998, 0.02189459,
                                      -0.00074357, 0.02189459], rel=1e-5)


def test_classes():
    # The three wine classes must be detected.
    optb = MulticlassOptimalBinning()
    optb.fit(x, y)

    assert optb.classes == approx([0, 1, 2])


def test_verbose():
    # Verbose mode must not affect the optimization outcome.
    optb = MulticlassOptimalBinning(verbose=True)
    optb.fit(x, y)

    assert optb.status == "OPTIMAL"
--------------------------------------------------------------------------------
/tests/test_outlier.py:
--------------------------------------------------------------------------------
"""
Outlier classes testing.
"""

# Guillermo Navas-Palencia
# Copyright (C) 2022

import numpy as np
import pandas as pd

from pytest import approx, raises

from optbinning.binning.outlier import ModifiedZScoreDetector
from optbinning.binning.outlier import RangeDetector
from optbinning.binning.outlier import YQuantileDetector
from tests.datasets import load_boston

# Shared fixtures: Boston housing data, LSTAT as x and house price as y.
data = load_boston()
df = pd.DataFrame(data.data, columns=data.feature_names)

variable = "LSTAT"
x = df[variable].values
y = data.target


def test_range_params():
    # Invalid RangeDetector parameters must raise ValueError on fit.
    with raises(ValueError):
        detector = RangeDetector(method="new")
        detector.fit(x)

    # interval_length must be a valid probability (<= 1).
    with raises(ValueError):
        detector = RangeDetector(interval_length=1.5)
        detector.fit(x)


def test_zscore_params():
    # Negative threshold is invalid.
    with raises(ValueError):
        detector = ModifiedZScoreDetector(threshold=-1.5)
        detector.fit(x)


def test_yquantile_params():
    # Invalid YQuantileDetector parameters must raise on fit.
    with raises(ValueError):
        detector = YQuantileDetector(outlier_detector="new")
        detector.fit(x, y)

    # outlier_params must be a dict.
    with raises(TypeError):
        detector = YQuantileDetector(outlier_params=[])
        detector.fit(x, y)

    with raises(ValueError):
        detector = YQuantileDetector(n_bins=-1)
        detector.fit(x, y)

    # 'threshold' is not a valid parameter for the range detector.
    with raises(ValueError):
        detector = YQuantileDetector(
            outlier_detector="range",
            outlier_params={"threshold": 3.7})

        detector.fit(x, y)


def test_range_default():
    # Pin the number of outliers flagged by each interval method.
    detector = RangeDetector(method="ETI")
    detector.fit(x)
    assert np.count_nonzero(detector.get_support()) == 7

    detector = RangeDetector(method="HDI")
    detector.fit(x)
    assert np.count_nonzero(detector.get_support()) == 31


def test_zscore_default():
    # Default modified z-score flags exactly two LSTAT values.
    detector = ModifiedZScoreDetector()
    detector.fit(x)

    mask = detector.get_support()
    assert np.count_nonzero(mask) == 2

    assert x[mask] == approx([37.97, 36.98])


def test_yquantile_default():
    # Pin both the x and y values of the flagged outliers.
    detector = YQuantileDetector()
    detector.fit(x, y)
    mask = detector.get_support()

    assert x[mask] == approx(
        [7.56, 9.59, 7.26, 11.25, 14.79, 7.44, 9.53, 8.88])

    assert y[mask] == approx([39.8, 33.8, 43.1, 31, 30.7, 50, 50, 50])


def test_yquantile_outlier_params():
    # Custom inner detector parameters are forwarded to the range
    # detector.
    detector = YQuantileDetector(n_bins=10, outlier_detector="range",
                                 outlier_params={'method': 'HDI'})

    detector.fit(x, y)
    assert np.count_nonzero(detector.get_support()) == 39
--------------------------------------------------------------------------------
/tests/test_scorecard_plots.py:
--------------------------------------------------------------------------------
"""
Scorecard plots testing.
"""

# Guillermo Navas-Palencia
# Copyright (C) 2020

import numpy as np

from pytest import raises

from optbinning.scorecard import plot_auc_roc
from optbinning.scorecard import plot_cap
from optbinning.scorecard import plot_ks


# Small hand-crafted binary target and predicted probabilities.
y = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0])
y_pred = np.array([0.2, 0.1, 0.6, 0.3, 0.7, 0.2, 0.8, 0.1, 0.9, 0.7, 0.3])


def test_params():
    # All three plot functions share the same parameter validation.
    for plot in (plot_auc_roc, plot_cap, plot_ks):
        # Mismatched y / y_pred lengths.
        with raises(ValueError):
            y_pred_wrong = y_pred[:-1]
            plot(y, y_pred_wrong)

        with raises(TypeError):
            plot(y, y_pred, title=1)

        with raises(TypeError):
            plot(y, y_pred, xlabel=1)

        with raises(TypeError):
            plot(y, y_pred, ylabel=1)

        with raises(TypeError):
            plot(y, y_pred, savefig=1)

        with raises(TypeError):
            plot(y, y_pred, fname=1)

        # savefig=True requires an fname.
        with raises(ValueError):
            plot(y, y_pred, savefig=True, fname=None)


def test_savefig():
    # Each plot must save a figure named after the plot function.
    for plot in (plot_auc_roc, plot_cap, plot_ks):
        plot(y, y_pred, savefig=True,
             fname="tests/results/{}.png".format(plot.__name__))
--------------------------------------------------------------------------------