├── autowoe
│   ├── lib
│   │   ├── __init__.py
│   │   ├── woe
│   │   │   ├── __init__.py
│   │   │   └── woe.py
│   │   ├── optimizer
│   │   │   ├── __init__.py
│   │   │   └── optimizer.py
│   │   ├── pipelines
│   │   │   ├── __init__.py
│   │   │   ├── pipeline_homotopy.py
│   │   │   └── pipeline_feature_special_values.py
│   │   ├── report
│   │   │   ├── __init__.py
│   │   │   ├── utilities_images
│   │   │   │   ├── __init__.py
│   │   │   │   └── utilities_images.py
│   │   │   └── report_generator.py
│   │   ├── selectors
│   │   │   ├── __init__.py
│   │   │   ├── l1.py
│   │   │   ├── selector_last.py
│   │   │   ├── utils.py
│   │   │   ├── composed_selector.py
│   │   │   └── selector_first.py
│   │   ├── utilities
│   │   │   ├── __init__.py
│   │   │   ├── s3.py
│   │   │   ├── cv_split_f.py
│   │   │   ├── utils.py
│   │   │   ├── eli5_permutation.py
│   │   │   ├── refit.py
│   │   │   └── sql.py
│   │   ├── cat_encoding
│   │   │   ├── __init__.py
│   │   │   └── cat_encoding.py
│   │   ├── types_handler
│   │   │   ├── __init__.py
│   │   │   ├── features_checkers_handlers.py
│   │   │   └── types_handler.py
│   │   └── logging.py
│   └── __init__.py
├── _config.yml
├── poetry.toml
├── .pre-commit-config.yaml
├── setup.cfg
├── ruff.toml
├── tests
│   ├── conftest.py
│   └── integration
│       ├── test_regression_task.py
│       ├── test_eda_allfeatures.py
│       ├── test_dates_and_stat_model.py
│       ├── test_autotyping.py
│       ├── test_marked_values.py
│       └── test_basic_usage_and_params.py
├── .github
│   └── workflows
│       ├── tests_macos.yml
│       ├── tests_ubuntu.yml
│       ├── tests_windows.yml
│       └── CI.yml
├── tox.ini
├── pyproject.toml
├── README.md
├── parameters_info.md
├── .gitignore
├── examples
│   ├── Tutorial_2__Dates_and_stat_model.ipynb
│   └── Tutorial_1__Basic_usage_and_params.ipynb
└── LICENSE

/autowoe/lib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/woe/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/optimizer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/report/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/selectors/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/utilities/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/cat_encoding/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/types_handler/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-architect
2 |
--------------------------------------------------------------------------------
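The tree above shows how the package is laid out; its public entry points are the top-level `AutoWoE` and `ReportDeco` exports from `autowoe/__init__.py` (reproduced further down in this dump). As a quick orientation, here is a minimal, hypothetical quick-start sketch distilled from `tests/conftest.py` and `tests/integration/test_eda_allfeatures.py`; the data path and the `number_*`/`string_*`/`target` column names are assumptions borrowed from those tests, not part of the repository files reproduced below.

```python
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from autowoe import AutoWoE

# Assumed demo dataset and column naming, taken from the integration tests;
# adjust the path and column names to your own data.
df = pd.read_csv("examples/data/train_demo.csv", index_col="line_id", low_memory=False)
num_cols = [c for c in df.columns if c.startswith("number")]
cat_cols = [c for c in df.columns if c.startswith("string")]
df = df[num_cols + cat_cols + ["target"]]

train_df, test_df = train_test_split(df, stratify=df["target"], test_size=0.4, random_state=42)

model = AutoWoE(task="BIN", n_jobs=1, verbose=0)  # binary classification task
model.fit(train=train_df, target_name="target")   # binning, WoE encoding, selection, logistic regression

test_pred = model.predict_proba(test_df)          # probability of the positive class
print(roc_auc_score(test_df["target"], test_pred))
```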
/autowoe/lib/report/utilities_images/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | create = true 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.11.2 4 | hooks: 5 | - id: ruff 6 | args: ["--fix"] 7 | - id: ruff-format 8 | args: ["--diff"] 9 | 10 | - repo: https://github.com/pre-commit/pre-commit-hooks 11 | rev: v3.4.0 12 | hooks: 13 | - id: trailing-whitespace 14 | - id: end-of-file-fixer 15 | - id: debug-statements 16 | -------------------------------------------------------------------------------- /autowoe/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | from .lib.autowoe import AutoWoE 6 | from .lib.report.report import ReportDeco 7 | 8 | __all__ = ["AutoWoE", "ReportDeco"] 9 | 10 | if os.getenv("DOCUMENTATION_ENV") is None: 11 | try: 12 | import importlib.metadata as importlib_metadata 13 | except ModuleNotFoundError: 14 | import importlib_metadata 15 | 16 | __version__ = importlib_metadata.version(__name__) 17 | 18 | np.random.seed(42) 19 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [darglint] 2 | docstring_style = google 3 | strictness = short 4 | ignore = DAR401, DAR402 5 | 6 | 7 | [flake8] 8 | max-line-length = 120 9 | ignore = D100, D103, D104, D105, D107, E402, E203, W503, W605, E722, E501 10 | docstring-convention = google 11 | # per-file-ignores = 12 | exclude = 13 | .git 14 | __pycache__ 15 | setup.py 16 | build 17 | dist 18 | releases 19 | .venv 20 | .tox 21 | .mypy_cache 22 | .pytest_cache 23 | .vscode 24 | .github 25 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | # Exclude a variety of commonly ignored directories. 2 | exclude = [ 3 | "tests", 4 | ".git", 5 | "__pycache__", 6 | "setup.py", 7 | "build", 8 | "dist", 9 | "releases", 10 | ".venv", 11 | ".tox", 12 | ".mypy_cache", 13 | ".pytest_cache", 14 | ".vscode", 15 | ".github", 16 | ] 17 | 18 | # Same as Black. 
19 | line-length = 120 20 | indent-width = 4 21 | 22 | target-version = "py38" 23 | 24 | [lint] 25 | select = [ 26 | "S", "B", "A", "D", "F", "E", "N", "I", "PD", "PERF", 27 | "UP032", 28 | "PERF401", 29 | "TID252", 30 | "C4"] 31 | ignore = [ 32 | "D203", 33 | "D213", 34 | "PD011", 35 | "PD901", 36 | "D107", 37 | "D104", 38 | "D103", 39 | "E501", 40 | "S101", 41 | "N803", 42 | "N806", 43 | "D401", 44 | "B904", 45 | "PLR0912" 46 | ] 47 | 48 | fixable = ["ALL"] 49 | 50 | 51 | [format] 52 | quote-style = "double" 53 | 54 | indent-style = "space" 55 | 56 | skip-magic-trailing-comma = false 57 | 58 | line-ending = "auto" 59 | 60 | docstring-code-format = true 61 | 62 | docstring-code-line-length = "dynamic" 63 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | 9 | RANDOM_STATE = 42 10 | np.random.seed(RANDOM_STATE) 11 | 12 | 13 | DATA_DIR = "examples/data/" 14 | 15 | 16 | @pytest.fixture() 17 | def train_data(): 18 | train = pd.read_csv( 19 | DATA_DIR + "train_demo.csv", 20 | low_memory=False, 21 | index_col="line_id", 22 | parse_dates=["datetime_" + str(i) for i in range(2)], 23 | ) 24 | return train 25 | 26 | 27 | @pytest.fixture() 28 | def test_data(): 29 | test = pd.read_csv( 30 | DATA_DIR + "test_demo.csv", index_col="line_id", parse_dates=["datetime_" + str(i) for i in range(2)] 31 | ) 32 | return test 33 | 34 | 35 | @pytest.fixture() 36 | def test_target(): 37 | test_target = pd.read_csv(DATA_DIR + "test-target_demo.csv")["target"] 38 | return test_target 39 | 40 | 41 | @pytest.fixture() 42 | def cat_data(): 43 | data = pd.read_csv(DATA_DIR + "data_cat.csv") 44 | return data 45 | 46 | 47 | @pytest.fixture() 48 | def regression_data(): 49 | data = pd.read_csv(DATA_DIR + "regression_dataset.csv") 50 | return data 51 | -------------------------------------------------------------------------------- /.github/workflows/tests_macos.yml: -------------------------------------------------------------------------------- 1 | name: tests_macos 2 | 3 | on: 4 | # # At 20:59 every day (23:59 MSK) 5 | # schedule: 6 | # - cron: 59 20 * * * 7 | 8 | # Manually triggerable in github 9 | workflow_dispatch: 10 | 11 | workflow_run: 12 | workflows: ["tests_ubuntu"] 13 | branches: [master] 14 | types: 15 | - completed 16 | 17 | jobs: 18 | macos-tests: 19 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 20 | runs-on: macos-latest 21 | strategy: 22 | fail-fast: true 23 | matrix: 24 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 25 | 26 | steps: 27 | - uses: actions/checkout@v2 28 | 29 | - name: Set up Python ${{ matrix.python-version }} 30 | uses: actions/setup-python@v2 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | 34 | - name: install deps for MacOS 35 | run: brew update && brew install libomp cairo pango gdk-pixbuf libffi 36 | 37 | - name: install with pip 38 | run: | 39 | pip install tox==4.23.2 40 | pip install tox-gh-actions==3.2.0 41 | 42 | - name: test with tox 43 | run: | 44 | tox -- -vv 45 | -------------------------------------------------------------------------------- /.github/workflows/tests_ubuntu.yml: -------------------------------------------------------------------------------- 1 | name: tests_ubuntu 2 | 3 | on: 4 | # # At 20:59 every day (23:59 MSK) 5 | # schedule: 6 | # - cron: 59 20 * * * 7 | 8 | # 
Manually triggerable in github 9 | workflow_dispatch: 10 | 11 | workflow_run: 12 | workflows: ["CI"] 13 | types: 14 | - completed 15 | 16 | jobs: 17 | ubuntu-tests: 18 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 19 | runs-on: ubuntu-latest 20 | strategy: 21 | fail-fast: true 22 | matrix: 23 | python-version: ["3.8", "3.9", "3.11", "3.12"] # "3.10" is tested in CI 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v4 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | 33 | - name: install deps for Ubuntu 34 | run: sudo apt-get install build-essential libcairo2 libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 libffi-dev shared-mime-info 35 | 36 | - name: install tox 37 | run: | 38 | python3 -m pip install --upgrade pip 39 | pip3 install tox==4.23.2 40 | pip3 install tox-gh-actions==3.2.0 41 | 42 | - name: test with tox 43 | run: | 44 | tox -- -vv 45 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | min_version = 3.28.0 3 | isolated_build = True 4 | envlist = 5 | py{38, 39, 310, 311, 312}, 6 | lint, 7 | docs, 8 | typing, 9 | build 10 | codespell 11 | 12 | [tox:.package] 13 | # note tox will use the same python version as under what tox is installed to package 14 | # so unless this is python 3 you can require a given python version for the packaging 15 | # environment via the basepython key 16 | basepython = python3 17 | 18 | [gh-actions] 19 | python = 20 | 3.8: py38 21 | 3.9: py39 22 | 3.10: py310 23 | 3.11: py311 24 | 3.12: py312 25 | 26 | [gh-actions:env] 27 | PLATFORM = 28 | ubuntu-latest: linux 29 | macos-latest: macos 30 | windows-latest: windows 31 | 32 | [testenv] 33 | allowlist_externals = make 34 | package = wheel 35 | deps = 36 | .[all] 37 | pytest >= 6.2.5 38 | commands = pytest {posargs} -v --basetemp="{envtmpdir}" --log-level=DEBUG 39 | 40 | [testenv:lint] 41 | deps = 42 | pre-commit == 2.15.0 43 | commands = 44 | pre-commit install 45 | pre-commit run --all-files 46 | 47 | [testenv:build] 48 | deps = 49 | poetry >= 1.1.7 50 | commands = 51 | poetry build 52 | 53 | [testenv:codespell] 54 | deps = 55 | codespell == 2.3.0 56 | commands = 57 | codespell --skip="*.js,*.csv,*.ipynb,*shaptxt*" 58 | -------------------------------------------------------------------------------- /tests/integration/test_regression_task.py: -------------------------------------------------------------------------------- 1 | import time 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.metrics import r2_score 4 | 5 | from autowoe import AutoWoE 6 | 7 | DATA_DIR = "examples/data/" 8 | 9 | 10 | def test_regression_task(regression_data): 11 | 12 | df = regression_data 13 | 14 | TARGET_NAME = "Target" 15 | 16 | train_df, test_df = train_test_split(df, test_size=0.4, random_state=42, shuffle=True) 17 | 18 | autowoe = AutoWoE( 19 | task="REG", 20 | monotonic=True, 21 | interpreted_model=True, 22 | regularized_refit=True, 23 | metric_th=0.0, 24 | n_jobs=1, 25 | verbose=0, 26 | ) 27 | 28 | start_fit_time = time.time() 29 | autowoe.fit(train=train_df, target_name=TARGET_NAME) 30 | 31 | assert time.time() - start_fit_time < 25 32 | 33 | start_predicts_time = time.time() 34 | 35 | train_pred = autowoe.predict(train_df) 36 | test_pred = autowoe.predict(test_df) 37 | 38 | train_pred = autowoe.predict(train_df) 39 | 40 | assert 
time.time() - start_predicts_time < 0.3, f"Pred time is {time.time() - start_predicts_time}, >= 0.3" 41 | 42 | r2_train = r2_score(train_df[TARGET_NAME], train_pred) 43 | r2_test = r2_score(test_df[TARGET_NAME], test_pred) 44 | 45 | assert r2_train > 0.8 46 | assert r2_test > 0.76 47 | 48 | autowoe.get_sql_inference_query("FEATURE_TABLE") 49 | -------------------------------------------------------------------------------- /.github/workflows/tests_windows.yml: -------------------------------------------------------------------------------- 1 | name: tests_windows 2 | 3 | on: 4 | # # At 20:59 every day (23:59 MSK) 5 | # schedule: 6 | # - cron: 59 20 * * * 7 | 8 | # Manually triggerable in github 9 | workflow_dispatch: 10 | 11 | workflow_run: 12 | workflows: ["tests_ubuntu"] 13 | branches: [master] 14 | types: 15 | - completed 16 | 17 | jobs: 18 | windows-tests: 19 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 20 | runs-on: windows-latest 21 | strategy: 22 | fail-fast: true 23 | matrix: 24 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 25 | 26 | steps: 27 | - uses: actions/checkout@v2 28 | 29 | - name: Set up Python ${{ matrix.python-version }} 30 | uses: actions/setup-python@v2 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | 34 | - name: setup-msys2 35 | uses: msys2/setup-msys2@v2 36 | with: 37 | msystem: MINGW64 38 | update: true 39 | install: >- 40 | mingw-w64-x86_64-cairo 41 | 42 | - name: install deps for Windows 43 | run: pip3 install pycairo 44 | 45 | - name: install tox 46 | run: | 47 | python3 -m pip install --upgrade pip 48 | pip3 install tox==4.23.2 49 | pip3 install tox-gh-actions==3.2.0 50 | 51 | - name: test with tox 52 | run: | 53 | tox -- -vv 54 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "AutoWoE" 3 | version = "1.3.4" 4 | description = "Library for automatic interpretable model building (Whitebox AutoML)" 5 | authors = ["Vakhrushev Anton "] 6 | readme = "README.md" 7 | homepage = "https://github.com/AILab-MLTools/AutoMLWhitebox" 8 | repository = "https://github.com/AILab-MLTools/AutoMLWhitebox" 9 | classifiers = [ 10 | "Programming Language :: Python :: 3.8", 11 | "Programming Language :: Python :: 3.9", 12 | "Programming Language :: Python :: 3.10", 13 | "Programming Language :: Python :: 3.11", 14 | "Programming Language :: Python :: 3.12", 15 | "Operating System :: OS Independent", 16 | "Intended Audience :: Science/Research", 17 | "Development Status :: 3 - Alpha", 18 | "Environment :: Console", 19 | "Natural Language :: English", 20 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 21 | "Typing :: Typed" 22 | ] 23 | 24 | [tool.poetry.dependencies] 25 | python = ">=3.8" 26 | numpy = "*" 27 | scipy = "*" 28 | pandas = "*" 29 | scikit-learn = "*" 30 | lightgbm = "*" 31 | sphinx = "*" 32 | sphinx-rtd-theme = "*" 33 | joblib = "*" 34 | pytz = "*" 35 | pytest = "*" 36 | jinja2 = "*" 37 | matplotlib = "*" 38 | seaborn = "*" 39 | tqdm = "^4.62.3" 40 | StrEnum = "^0.4.7" 41 | 42 | 43 | 44 | [tool.poetry.dev-dependencies] 45 | notebook = "^6.4.6" 46 | black = "20.8b1" 47 | pre-commit = "2.15.0" 48 | mypy = "^0.910" 49 | tox = "*" 50 | darglint = "^1.8.1" 51 | flake8-docstrings = "^1.6.0" 52 | isort = "5.7.0" 53 | jupyter-contrib-nbextensions = "^0.5.1" 54 | jupyter_nbextensions_configurator = "^0.4.1" 55 | 56 | [build-system] 57 | requires = ["poetry>=0.12"] 
58 | build-backend = "poetry.masonry.api" 59 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/s3.py: -------------------------------------------------------------------------------- 1 | """S3 wrapper.""" 2 | # ruff: noqa 3 | 4 | import s3fs 5 | 6 | 7 | class S3Client(s3fs.S3FileSystem): 8 | """Класс-обёртка для доступа к хранилищу S3. 9 | 10 | Используется для доступа к объектам хранилища с использованием интерфейса файловой системы. 11 | 12 | """ 13 | 14 | def __init__(self, aws_access_key_id, aws_secret_access_key, namespace=None, endpoint_url=None, **kwargs): 15 | """Конструктор объекта файловой системы на S3 SberCloud. 16 | 17 | Args: 18 | aws_access_key_id: Публичный ключ доступа к бакету S3 19 | aws_secret_access_key: Приватный ключ доступа к бакету S3 20 | namespace: Идентификатор пространства пользователя в хранилище SberCloud. Используется для формировании 21 | URL web-сервиса S3 SberCloud. Если не задан, то необходимо задать URL в аргументе endpoint_url 22 | endpoint_url: URL web-сервиса S3 SberCloud. Если не задан, то URL будет автоматически 23 | сконструирован на основании значения namespace. 24 | kwargs: Дополнительные параметры, передаваемые конструктору s3fs.S3FileSystem 25 | 26 | """ 27 | if not namespace and not endpoint_url: 28 | raise ValueError("Either namespace or endpoint_url is required") 29 | 30 | self.namespace = namespace 31 | self.aws_access_key_id = aws_access_key_id 32 | self.aws_secret_access_key = aws_secret_access_key 33 | self.endpoint_url = endpoint_url 34 | 35 | super(S3Client, self).__init__( 36 | key=self.aws_access_key_id, 37 | secret=self.aws_secret_access_key, 38 | client_kwargs={"endpoint_url": self.endpoint_url}, 39 | **kwargs, 40 | ) 41 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | # Manually triggerable in github 5 | workflow_dispatch: 6 | 7 | push: 8 | 9 | 10 | pull_request: 11 | 12 | 13 | jobs: 14 | pre-commit: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: checkout 18 | uses: actions/checkout@v4 19 | 20 | - uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.10" 23 | 24 | - name: pre-commit 25 | uses: pre-commit/action@v3.0.1 26 | 27 | codespell: 28 | runs-on: ubuntu-latest 29 | 30 | steps: 31 | - name: checkout 32 | uses: actions/checkout@v4 33 | 34 | - name: codespell 35 | run: | 36 | pip install codespell 37 | codespell --skip="*.js,*.csv,*.ipynb,*shaptxt*" 38 | 39 | linux-py310-tests: 40 | needs: pre-commit 41 | runs-on: ubuntu-latest 42 | if: | 43 | ( github.event_name == 'push' ) && ( needs.pre-commit.result == 'success' ) 44 | || 45 | ( github.event_name == 'pull_request' ) && ( needs.pre-commit.result == 'success' ) 46 | || 47 | ( github.event_name == 'workflow_dispatch' ) && ( needs.pre-commit.result == 'success' ) 48 | 49 | steps: 50 | - uses: actions/checkout@v4 51 | 52 | - name: Set up Python 53 | uses: actions/setup-python@v4 54 | 55 | - uses: Gr1N/setup-poetry@v8 56 | with: 57 | poetry-version: 1.8.0 58 | 59 | # - name: update pip if python 3.12 60 | # run: pip install setuptools && python -m ensurepip --upgrade 61 | 62 | - name: install deps for Ubuntu 63 | run: sudo apt-get install build-essential libcairo2 libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 libffi-dev shared-mime-info 64 | 65 | - name: install tox 66 | run: | 67 | python3 -m pip install --upgrade pip 68 | pip3 
install tox==4.23.2 69 | pip3 install tox-gh-actions==3.2.0 70 | 71 | - name: test with tox 72 | run: | 73 | tox 74 | -------------------------------------------------------------------------------- /autowoe/lib/pipelines/pipeline_homotopy.py: -------------------------------------------------------------------------------- 1 | # noqa: D100 2 | 3 | import lightgbm as lgb 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.model_selection import StratifiedKFold 7 | 8 | from autowoe.lib.utilities.utils import TaskType, flatten 9 | 10 | 11 | class HTransform: 12 | """Homotopy transform. 13 | 14 | Args: 15 | x: Feature. 16 | y: Target. 17 | cv_splits: Number of splits. 18 | 19 | """ 20 | 21 | def __init__(self, task: TaskType, x: pd.Series, y: pd.Series, cv_splits: int = 5): 22 | self.x, self.y = x, y 23 | self._task = task 24 | # TODO: for what ? 25 | self.cv = self._get_cv(cv_splits) 26 | 27 | @staticmethod 28 | def _get_cv(cv_splits: int) -> StratifiedKFold: 29 | return StratifiedKFold(n_splits=cv_splits, random_state=323, shuffle=True) 30 | 31 | def __call__(self, tree_params: dict) -> np.ndarray: 32 | """Return the boundaries of the split by the transmitted sample and parameters. 33 | 34 | Args: 35 | tree_params: dict or lightgbm tree params 36 | 37 | Returns: 38 | Splitting. 39 | 40 | """ 41 | default_tree_params = { 42 | "boosting_type": "rf", 43 | "objective": "binary" if self._task == TaskType.BIN else "regression", 44 | "bagging_freq": 1, 45 | "bagging_fraction": 0.999, 46 | "feature_fraction": 0.999, 47 | "bagging_seed": 323, 48 | "verbosity": -1, 49 | } 50 | 51 | unite_params = {**default_tree_params, **tree_params, "num_threads": 1} 52 | lgb_train = lgb.Dataset(self.x.values.astype(np.float32)[:, np.newaxis], label=self.y) 53 | gbm = lgb.train(params=unite_params, train_set=lgb_train, num_boost_round=1) 54 | 55 | d_tree_prop = flatten(gbm.dump_model()["tree_info"][0]) 56 | limits = {d_tree_prop[key] for key in d_tree_prop if "threshold" in key} 57 | 58 | limits = list(limits) 59 | limits.sort() 60 | 61 | return np.unique(limits) 62 | -------------------------------------------------------------------------------- /autowoe/lib/logging.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | import logging 4 | import sys 5 | import warnings 6 | 7 | 8 | logging.captureWarnings(True) 9 | 10 | debug_log_format = f"%(asctime)s - [%(levelname)s] - %(name)s - (%(filename)s).%(funcName)s(%(lineno)d) - %(message)s" 11 | default_log_format = f"%(message)s" 12 | 13 | 14 | def verbosity_to_loglevel(verbosity): 15 | if verbosity <= 0: 16 | log_level = logging.ERROR 17 | warnings.filterwarnings("ignore") 18 | elif verbosity == 1: 19 | log_level = logging.WARNING 20 | elif verbosity == 2: 21 | log_level = logging.INFO 22 | else: 23 | log_level = logging.DEBUG 24 | 25 | return log_level 26 | 27 | 28 | def get_file_handler(): 29 | file_handler = logging.FileHandler("x.log") 30 | file_handler.setLevel(logging.WARNING) 31 | file_handler.setFormatter(logging.Formatter(default_log_format)) 32 | return file_handler 33 | 34 | 35 | def get_stream_handler(stream, level=None, handler_filter=None): 36 | stream_handler = logging.StreamHandler(stream) 37 | stream_handler.setFormatter(logging.Formatter(default_log_format)) 38 | 39 | if level: 40 | stream_handler.setLevel(level) 41 | 42 | if handler_filter: 43 | stream_handler.addFilter(handler_filter) 44 | 45 | return stream_handler 46 | 47 | 48 | def get_logger(name=None, level=None): 49 | class 
InfoFilter(logging.Filter): 50 | def filter(self, rec): 51 | return rec.levelno in (logging.DEBUG, logging.INFO) 52 | 53 | logger = logging.getLogger(name) 54 | 55 | if level: 56 | logger.setLevel(level) 57 | 58 | if logger.hasHandlers(): 59 | logger.handlers.clear() 60 | 61 | logger.addHandler(get_stream_handler(stream=None, level=logging.WARNING)) 62 | logger.addHandler(get_stream_handler(stream=sys.stdout, level=logging.DEBUG, handler_filter=InfoFilter())) 63 | 64 | logger.propagate = False 65 | 66 | return logger 67 | 68 | 69 | class DuplicateFilter(object): 70 | def __init__(self): 71 | self.msgs = set() 72 | 73 | def filter(self, record): 74 | rv = record.msg not in self.msgs 75 | self.msgs.add(record.msg) 76 | return rv 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## AutoWoE library 2 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/AutoWoE)](https://pypi.org/project/AutoWoE) 3 | [![PyPI - Version](https://img.shields.io/pypi/v/AutoWoE)](https://pypi.org/project/AutoWoE) 4 | ![pypi - Downloads](https://img.shields.io/pypi/dm/AutoWoE?color=green&label=PyPI%20downloads&logo=pypi&logoColor=green) 5 | [![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/sb-ai-lab/AutoMLWhitebox/CI.yml)](https://github.com/sb-ai-lab/AutoMLWhitebox/actions/workflows/CI.yml?query=branch%3Amaster) 6 | 7 | This is the repository for **AutoWoE** library, developed by LightAutoML group. This library can be used for automatic creation of interpretable ML model based on feature binning, WoE features transformation, feature selection and Logistic Regression. 8 | 9 | **Authors:** Vakhrushev Anton, Grigorii Penkin, Alexander Kirilin 10 | 11 | **Library setup** can be done by one of three scenarios below: 12 | 13 | 1. Installation from PyPI: 14 | ```bash 15 | pip install autowoe 16 | ``` 17 | 2. Installation from source code 18 | 19 | First of all you need to install [git](https://git-scm.com/downloads) and [poetry](https://python-poetry.org/docs/#installation). 20 | 21 | ```bash 22 | 23 | # Load WhiteBox source code 24 | git clone https://github.com/AILab-MLTools/AutoMLWhitebox.git 25 | 26 | cd AutoMLWhiteBox/ 27 | 28 | # !!!Choose only one item!!! 29 | 30 | # 1. Recommended: Create virtual environment inside your project directory 31 | poetry config virtualenvs.in-project true 32 | 33 | # 2. Global installation: Don't create virtual environment 34 | poetry config virtualenvs.create false --local 35 | 36 | # For more information read poetry docs 37 | 38 | # Install WhiteBox 39 | poetry install 40 | 41 | ``` 42 | 43 | 44 | **Usage tutorials** are in Jupyter notebooks in the repository root. For **parameters description** take a look at `parameters_info.md`. 45 | 46 | **Bugs / Questions / Suggestions:** 47 | - Seek prompt advice in [Telegram group](https://t.me/joinchat/sp8P7sdAqaU0YmRi). 48 | - Open bug reports and feature requests on GitHub [issues](https://github.com/sb-ai-lab/AutoMLWhitebox/issues). 
49 | - Also follow our [Telegram channel](https://t.me/lightautoml) 50 | -------------------------------------------------------------------------------- /tests/integration/test_eda_allfeatures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import roc_auc_score 5 | 6 | from pandas import Series 7 | 8 | from autowoe import AutoWoE 9 | 10 | 11 | def test_eda_all_features(train_data): 12 | df = train_data 13 | 14 | TARGET_NAME = "target" 15 | 16 | num_features = [col for col in df.columns if col.startswith("number")][:10] 17 | cat_features = [col for col in df.columns if col.startswith("string")][:5] 18 | 19 | df = df[num_features + cat_features + [TARGET_NAME]] 20 | 21 | train_df, test_df = train_test_split(df, stratify=df[TARGET_NAME], test_size=0.4, random_state=42, shuffle=True) 22 | 23 | autowoe = AutoWoE( 24 | task="BIN", 25 | n_jobs=1, 26 | verbose=0, 27 | # turn off initial importance selection - this step force all features to pass into the binning stage 28 | imp_th=-1, 29 | ) 30 | 31 | autowoe.fit(train=train_df, target_name=TARGET_NAME) 32 | 33 | test_pred = autowoe.predict_proba(test_df) 34 | 35 | score = roc_auc_score(test_df[TARGET_NAME], test_pred) 36 | 37 | assert np.isclose(score, 0.6186, atol=1e-4), f"Real score is {score}" 38 | 39 | enc = autowoe.test_encoding(train_df, list(autowoe.woe_dict.keys()), bins=True) 40 | fails_counter = 0 41 | for col in enc.columns: 42 | start_time = time.time() 43 | 44 | grp = enc.groupby(col).size() 45 | woe = autowoe.woe_dict[col] 46 | 47 | woe_val = Series(woe.cod_dict).reset_index() 48 | woe_val.columns = [col, "WoE"] 49 | woe_val["count"] = woe_val[col].map(grp).fillna(0).values.astype(int) 50 | if woe.f_type == "cat": 51 | woe_val["bin"] = woe_val[col] 52 | else: 53 | split = list(woe.split.astype(np.float32)) 54 | mapper = {n: f"({x}; {y}]" for (n, (x, y)) in enumerate(zip(["-inf"] + split, split + ["inf"]))} 55 | woe_val["bin"] = woe_val[col].map(mapper) 56 | woe_val["bin"] = np.where(woe_val["bin"].isnull().values, woe_val[col], woe_val["bin"]) 57 | 58 | if time.time() - start_time > 0.3: 59 | fails_counter += 1 60 | assert fails_counter <= 1, f"There were {fails_counter} fails, it's more than 1" 61 | -------------------------------------------------------------------------------- /autowoe/lib/selectors/l1.py: -------------------------------------------------------------------------------- 1 | """Selector based on Lasso.""" 2 | 3 | from typing import Dict, List, Tuple, TypeVar 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from autowoe.lib.selectors.utils import Result, l1_select 9 | from autowoe.lib.utilities.utils import TaskType 10 | 11 | from .utils import F_LIST_TYPE, FEATURE 12 | 13 | WoE = TypeVar("WoE") 14 | 15 | 16 | class L1: 17 | """L1 selector. 18 | 19 | Args: 20 | interpreted_model: Build interpreted model. 21 | train: Train features. 22 | target: Train target. 23 | n_jobs: Number of threads. 24 | cv_split: Cross-Val splits. 
25 | 26 | """ 27 | 28 | def __init__( 29 | self, 30 | task: TaskType, 31 | interpreted_model: bool, 32 | train: pd.DataFrame, 33 | target: pd.Series, 34 | n_jobs: int, 35 | cv_split: Dict[int, Tuple[List[int], List[int]]], 36 | ): 37 | self.task = task 38 | self.train = train 39 | self.target = target 40 | 41 | self.__interpreted_model = interpreted_model 42 | self.__n_jobs = n_jobs 43 | self.__features = train.columns 44 | self.__cv_split = cv_split 45 | 46 | def __call__( 47 | self, features_fit: List[FEATURE], l1_grid_size: int, l1_exp_scale: float, metric_tol: float = 1e-4 48 | ) -> Tuple[F_LIST_TYPE, Result]: 49 | """Run selector. 50 | 51 | Args: 52 | features_fit: List of features. 53 | l1_grid_size: Number of points on grid. 54 | l1_exp_scale: Maximum value of `C`. 55 | metric_tol: Metric tolerance. 56 | 57 | Returns: 58 | Selected features, summary info. 59 | 60 | 61 | """ 62 | np.random.seed(323) 63 | features_fit_ = features_fit.copy() 64 | dataset = self.train[features_fit_], self.target 65 | 66 | best_features, result = l1_select( 67 | self.task, 68 | interpreted_model=self.__interpreted_model, 69 | n_jobs=self.__n_jobs, 70 | dataset=dataset, 71 | l1_grid_size=l1_grid_size, 72 | l1_exp_scale=l1_exp_scale, 73 | cv_split=self.__cv_split, 74 | metric_tol=metric_tol, 75 | ) 76 | 77 | return best_features, result 78 | -------------------------------------------------------------------------------- /tests/integration/test_dates_and_stat_model.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from sklearn.metrics import roc_auc_score 4 | 5 | from autowoe import ReportDeco, AutoWoE 6 | 7 | 8 | def test_dates_and_stat_model(train_data, test_data, test_target): 9 | 10 | train = train_data 11 | test = test_data 12 | 13 | test["target"] = test_target.values 14 | 15 | num_col = list(filter(lambda x: "numb" in x, train.columns)) 16 | num_feature_type = {x: "real" for x in num_col} 17 | 18 | date_col = list(filter(lambda x: "datetime" in x, train.columns)) 19 | date_feature_type = {x: (None, ("d", "wd")) for x in date_col} 20 | 21 | features_type = dict(**num_feature_type, **date_feature_type) 22 | # подробно параметры описаны в Example_1 23 | auto_woe = AutoWoE( 24 | monotonic=True, max_bin_count=4, oof_woe=False, regularized_refit=False, p_val=0.05, debug=False, verbose=0 25 | ) 26 | auto_woe = ReportDeco(auto_woe) 27 | 28 | start_fit_time = time.time() 29 | auto_woe.fit( 30 | train[num_col + date_col + ["target"]], 31 | target_name="target", 32 | features_type=features_type, 33 | ) 34 | 35 | assert time.time() - start_fit_time < 50, f"Fit time is {time.time() - start_fit_time}, it's more than 50" 36 | 37 | start_pred_time = time.time() 38 | pred = auto_woe.predict_proba(test) 39 | 40 | assert time.time() - start_pred_time < 5, f"Predict time is {time.time() - start_pred_time}, it's more than 5" 41 | 42 | score = roc_auc_score(test["target"], pred) 43 | 44 | assert score > 0.78 45 | 46 | report_params = { 47 | "automl_date_column": "report_month", # колонка с датой в формате params['datetimeFormat'] 48 | "output_path": "./AUTOWOE_REPORT_2", # папка, куда сгенерится отчет и сложатся нужные файлы 49 | "report_name": "___НАЗВАНИЕ ОТЧЕТА___", 50 | "report_version_id": 1, 51 | "city": "Воронеж", 52 | "model_aim": "___ЦЕЛЬ ПОСТРОЕНИЯ МОДЕЛИ___", 53 | "model_name": "___НАЗВАНИЕ МОДЕЛИ___", 54 | "zakazchik": "___ЗАКАЗЧИК___", 55 | "high_level_department": "___ПОДРАЗДЕЛЕНИЕ___", 56 | "ds_name": "___РАЗРАБОТЧИК МОДЕЛИ___", 57 | 
"target_descr": "___ОПИСАНИЕ ЦЕЛЕВОГО СОБЫТИЯ___", 58 | "non_target_descr": "___ОПИСАНИЕ НЕЦЕЛЕВОГО СОБЫТИЯ___", 59 | } 60 | 61 | auto_woe.generate_report(report_params) 62 | 63 | # import shutil 64 | 65 | # shutil.rmtree("AUTOWOE_REPORT_2") 66 | -------------------------------------------------------------------------------- /parameters_info.md: -------------------------------------------------------------------------------- 1 | ## Whitebox pipeline parameters: 2 | 3 | ### General params: 4 | 5 | - n_jobs 6 | - debug 7 | 8 | ### 0) Simple typing and trash removal 9 | #### 0.0) Remove trash feats 10 | 11 | Medium: 12 | - th_nan 13 | - th_const 14 | 15 | #### 0.1) Typing (auto and user defined) 16 | 17 | Critical: 18 | - features_type (dict) {'age': 'real', 'education': 'cat', 'birth_date': (None, ("d", "wd"), ...} 19 | 20 | #### 0.2) Dates and categories encoding 21 | 22 | Critical: 23 | - features_type (for datetimes) 24 | 25 | Optional: 26 | - cat_alpha (int) - greater means more conservative encoding 27 | 28 | 29 | ### 1) Initial feature selection (selection based on gbm importance) 30 | 31 | Critical: 32 | - select_type (None or int) 33 | - imp_type (if type(select_type) is int 'perm_imt'/'feature_imp') 34 | 35 | Optional: 36 | - imt_th (float) - threshold for select_type is None 37 | 38 | ### 2) Binning: 39 | 40 | Critical: 41 | - monotonic / features_monotone_constraints 42 | - max_bin_count / max_bin_count 43 | - min_bin_size 44 | 45 | - cat_merge_to 46 | - nan_merge_to 47 | 48 | Medium: 49 | - force_single_split 50 | 51 | Optional: 52 | - min_bin_mults 53 | - min_gains_to_split 54 | 55 | ### 3) WoE estimation WoE = LN( ((% 0 in bin) / (% 0 in sample)) / ((% 1 in bin) / (% 1 in sample)) ): 56 | 57 | Critical: 58 | - oof_woe 59 | 60 | Optional: 61 | - woe_diff_th 62 | - n_folds (if oof_woe) 63 | 64 | ### 4) Post selection: 65 | 66 | #### 4.0) Partial dependencies with target 67 | 68 | Critical: 69 | - auc_th 70 | 71 | #### 4.1) VIF 72 | 73 | Critical: 74 | - vif_th 75 | 76 | #### 4.2) Partial correlcations 77 | 78 | Critical: 79 | - pearson_th 80 | 81 | ### 5) Model based selection 82 | 83 | Optional: 84 | - n_folds 85 | - l1_grid_size 86 | - l1_exp_scale 87 | 88 | 89 | ### 6) Final model refit: 90 | 91 | Critical: 92 | - regularized_refit 93 | - p_val (if not regularized_refit) 94 | - validation (if not regularized_refit) 95 | 96 | Optional: 97 | - interpreted_model 98 | - l1_grid_size (if regularized_refit) 99 | - l1_exp_scale (if regularized_refit) 100 | 101 | ### 7) Report generation 102 | 103 | - report_params 104 | -------------------------------------------------------------------------------- /tests/integration/test_autotyping.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from sklearn.metrics import roc_auc_score 4 | 5 | from autowoe import ReportDeco, AutoWoE 6 | 7 | 8 | def test_autotyping(cat_data): 9 | 10 | data = cat_data 11 | 12 | train = data.iloc[:14000, :] 13 | test = data.iloc[14000:, :] 14 | 15 | # подробно параметры описаны в Example_1 16 | auto_woe = AutoWoE( 17 | monotonic=False, 18 | max_bin_count=5, 19 | oof_woe=True, 20 | regularized_refit=True, 21 | p_val=0.05, 22 | debug=False, 23 | verbose=0, 24 | cat_merge_to="to_maxp", 25 | nan_merge_to="to_maxp", 26 | ) 27 | auto_woe = ReportDeco(auto_woe) 28 | 29 | autowoe_fit_params = { 30 | "train": train, 31 | "target_name": "isFraud", 32 | } 33 | start_fit_time = time.time() 34 | auto_woe.fit(**autowoe_fit_params) 35 | 36 | assert time.time() - 
start_fit_time < 60, f"Fit time is {time.time() - start_fit_time}, it's more than 60 seconds" 37 | 38 | start_pred_time = time.time() 39 | pred = auto_woe.predict_proba(test) 40 | 41 | assert ( 42 | time.time() - start_pred_time < 5 43 | ), f"Prediction time is {time.time() - start_pred_time}, it's more than 5 seconds" 44 | 45 | score = roc_auc_score(test[autowoe_fit_params["target_name"]], pred) 46 | 47 | assert score > 0.8 48 | 49 | values = {} 50 | for value in auto_woe.private_features_type.values(): 51 | if value not in values: 52 | values[value] = 0 53 | values[value] += 1 54 | 55 | assert ( 56 | values["cat"] == 12 and values["real"] == 61 57 | ), f"There're should be 12 cat and 61 reals, but we have {values['cat']} cats and {values['real']} reals" 58 | 59 | report_params = { 60 | "automl_date_column": "report_month", # колонка с датой в формате params['datetimeFormat'] 61 | "output_path": "./AUTOWOE_REPORT_3", # папка, куда сгенерится отчет и сложатся нужные файлы 62 | "report_name": "___НАЗВАНИЕ ОТЧЕТА___", 63 | "report_version_id": 1, 64 | "city": "Воронеж", 65 | "model_aim": "___ЦЕЛЬ ПОСТРОЕНИЯ МОДЕЛИ___", 66 | "model_name": "___НАЗВАНИЕ МОДЕЛИ___", 67 | "zakazchik": "___ЗАКАЗЧИК___", 68 | "high_level_department": "___ПОДРАЗДЕЛЕНИЕ___", 69 | "ds_name": "___РАЗРАБОТЧИК МОДЕЛИ___", 70 | "target_descr": "___ОПИСАНИЕ ЦЕЛЕВОГО СОБЫТИЯ___", 71 | "non_target_descr": "___ОПИСАНИЕ НЕЦЕЛЕВОГО СОБЫТИЯ___", 72 | } 73 | 74 | auto_woe.generate_report(report_params) 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | # C extensions 6 | *.so 7 | # DS_store 8 | .DS_Store 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | # lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg MANIFEST 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template before PyInstaller builds the exe, so as to 29 | # inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .nox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | *.py, 46 | .hypothesis/ 47 | .pytest_cache/ 48 | cover/ 49 | # Translations 50 | *.mo 51 | *.pot 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | db.sqlite3 56 | db.sqlite3-journal 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | # Scrapy stuff: 61 | .scrapy 62 | # Sphinx documentation 63 | docs/_build/ 64 | # PyBuilder 65 | .pybuilder/ 66 | target/ 67 | # Jupyter Notebook 68 | .ipynb_checkpoints 69 | # IPython 70 | profile_default/ 71 | ipython_config.py 72 | # pyenv 73 | # For a library or package, you might want to ignore these files since the code is intended to run in multiple 74 | # environments; otherwise, check them in: .python-version pipenv 75 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. However, in case of 76 | #collaboration, if having platform-specific dependencies or dependencies having no cross-platform support, pipenv 77 | #may install dependencies that don't work, or not install all needed dependencies. 
Pipfile.lock 78 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 79 | __pypackages__/ 80 | # Celery stuff 81 | celerybeat-schedule celerybeat.pid 82 | # SageMath parsed files 83 | *.sage.py 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | autowoe_venv/ 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | # Rope project settings 97 | .ropeproject 98 | # mkdocs documentation 99 | /site 100 | # mypy 101 | .mypy_cache/ 102 | .dmypy.json 103 | dmypy.json 104 | # Pyre type checker 105 | .pyre/ 106 | # pytype static type analyzer 107 | .pytype/ 108 | # Cython debug symbols 109 | cython_debug/ 110 | .idea/ 111 | .vscode/ 112 | temp/ 113 | -------------------------------------------------------------------------------- /autowoe/lib/cat_encoding/cat_encoding.py: -------------------------------------------------------------------------------- 1 | # noqa: D100 2 | 3 | from copy import deepcopy 4 | from typing import Dict, List, Union 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | class CatEncoding: 11 | """Class for categorical data converting/reconverting to float values. 12 | 13 | Args: 14 | data: Data for encoding. First column - feature, second - target. 15 | 16 | """ 17 | 18 | def __init__(self, data: pd.DataFrame): 19 | self.data = data 20 | self.col = data.columns 21 | 22 | self.data_info = pd.DataFrame(index=data.index, columns=[self.col[0], "mean_enc"]) 23 | self.data_info[self.col[0]] = self.data[self.col[0]].values 24 | 25 | def __call__( 26 | self, cv_index_split: Dict[int, List[int]], nan_index: np.array, cat_alpha: float = 1.0 27 | ) -> pd.DataFrame: 28 | """Mean_target encoding by cross-val. 29 | 30 | Args: 31 | cv_index_split: CV indexes. 32 | nan_index: Indexes of nan-values. 33 | cat_alpha: Smooth coefficient alpha. 34 | 35 | Returns: 36 | Encoded values. 37 | 38 | """ 39 | cv_index_split_ = deepcopy(cv_index_split) 40 | feature, target = self.col 41 | 42 | for key in cv_index_split_: 43 | train_index, test_index = cv_index_split_[key] 44 | train_index, test_index = np.setdiff1d(train_index, nan_index), np.setdiff1d(test_index, nan_index) 45 | 46 | data_sl = self.data.iloc[train_index] 47 | d_agg = data_sl.groupby(feature)[target].agg(["sum", "count"]) 48 | d_agg = (d_agg["sum"] + cat_alpha * data_sl[target].mean()) / (d_agg["count"] + cat_alpha) 49 | 50 | d_agg = d_agg.to_dict() 51 | self.data_info.iloc[test_index, 1] = self.data_info.iloc[test_index, 0].map(d_agg) 52 | 53 | train_f = self.data.copy() 54 | train_f.iloc[:, 0] = self.data_info["mean_enc"].values 55 | return train_f 56 | 57 | def mean_target_reverse(self, split: Union[List[float], np.ndarray]) -> Dict[int, int]: 58 | """Reverse mean-target. 59 | 60 | Should be run after '__call__' 61 | 62 | Args: 63 | split: Splits. 64 | 65 | Returns: 66 | Mapping. 
67 | 68 | """ 69 | df = self.data_info.copy() 70 | df["split_cat"] = np.searchsorted(split, df.mean_enc.values) 71 | 72 | crosstab = pd.crosstab(df[self.col[0]], df.split_cat) 73 | crosstab = crosstab.div(crosstab.sum(axis=1), axis=0) 74 | max_cat = np.argmax(crosstab.values, axis=1) 75 | 76 | # словарь соответствий: имя категории -> номер бина 77 | return dict(zip(crosstab.index, max_cat)) 78 | -------------------------------------------------------------------------------- /autowoe/lib/types_handler/features_checkers_handlers.py: -------------------------------------------------------------------------------- 1 | """Type feature checkers.""" 2 | 3 | from typing import Optional, Tuple, cast 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | F_UNIQUE = 5 9 | 10 | 11 | def dates_checker(feature: pd.Series) -> bool: 12 | """Check that feature belongs to the datetime. 13 | 14 | Args: 15 | feature: Values. 16 | 17 | Returns: 18 | Flag. 19 | 20 | """ 21 | try: 22 | feature = pd.to_datetime(feature) 23 | if (feature.min().year <= 1975) or (feature.min().year is np.nan): 24 | return False 25 | else: 26 | return True 27 | except ValueError: 28 | return False 29 | except Exception: 30 | raise ValueError("Something is wrong with object types") 31 | 32 | 33 | def dates_handler( 34 | feature: pd.Series, feature_type: Tuple[Optional[str], Tuple[str, ...]] = (None, ("wd", "m", "y", "d")) 35 | ) -> Tuple: 36 | """Handle datetime feature. 37 | 38 | feature_type ("%Y%d%m", ("m", "d", "wd", "h", "min")), (None, ("m", "d", "wd", "h", "min")) 39 | 40 | Args: 41 | feature: Datetime values. 42 | feature_type: Tuple of date format and seasonality. 43 | 44 | Returns: 45 | Processed datetime, feature_type. 46 | 47 | """ 48 | date_format = feature_type[0] 49 | seasonality = feature_type[1] 50 | 51 | if not len(seasonality): 52 | raise ValueError("Seasonality is empty!") 53 | 54 | seas2func = { 55 | "y": lambda x: x.year, 56 | "m": lambda x: x.month, 57 | "d": lambda x: x.day, 58 | "wd": lambda x: x.weekday(), 59 | "h": lambda x: x.hour, 60 | "min": lambda x: x.minute, 61 | } 62 | 63 | new_features = [] 64 | new_feature = cast(pd.Series, pd.to_datetime(feature, format=date_format)) 65 | 66 | for seas in seasonality: 67 | new_feature_name = str(new_feature.name) + "__F__" + seas 68 | 69 | new_feature_ = new_feature.map(lambda x: seas2func[seas](x)) # noqa: B023 70 | new_features.append((new_feature_name, new_feature_)) 71 | 72 | return new_features, feature_type 73 | 74 | 75 | def cat_checker(feature: pd.Series) -> bool: 76 | """Check that feature belongs to the category. 77 | 78 | Args: 79 | feature: Values. 80 | 81 | Returns: 82 | Flag. 
83 | 84 | """ 85 | dtypes = [object, str] 86 | if np.__version__ < "1.18.0": 87 | dtypes.append(np.str) 88 | if feature.dtype in dtypes: 89 | return True 90 | 91 | feature_unique = feature.unique() 92 | if 2 < feature_unique.shape[0] <= F_UNIQUE and np.all(feature_unique.astype(np.int64) == feature_unique): 93 | return True 94 | else: 95 | return False 96 | -------------------------------------------------------------------------------- /tests/integration/test_marked_values.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import time 4 | 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.metrics import roc_auc_score 7 | 8 | from autowoe import AutoWoE 9 | 10 | 11 | def test_marked_values(train_data): 12 | df = train_data 13 | 14 | TARGET_NAME = "target" 15 | 16 | num_features = [col for col in df.columns if col.startswith("number")][:10] 17 | cat_features = [col for col in df.columns if col.startswith("string")][:5] 18 | 19 | df = df[num_features + cat_features + [TARGET_NAME]] 20 | 21 | df.iloc[:10, 0] = -1 22 | df.iloc[10:20, 0] = -2 23 | df.iloc[:20, 1] = 1234567890 24 | df.iloc[:20, 11] = "Special" 25 | 26 | train_df, test_df = train_test_split(df, stratify=df[TARGET_NAME], test_size=0.4, random_state=42, shuffle=True) 27 | 28 | assert all(train_df["string_1"].head(1) == "other") 29 | 30 | autowoe = AutoWoE(task="BIN", n_jobs=1, verbose=0) 31 | 32 | assert autowoe._params["l1_exp_scale"] == 4 33 | assert autowoe._params["imp_type"] == "feature_imp" 34 | assert autowoe._params["population_size"] is None 35 | assert not autowoe._params["monotonic"] 36 | 37 | none_params = ( 38 | "woe_dict", 39 | "train_df", 40 | "split_dict", 41 | "target", 42 | "clf", 43 | "features_fit", 44 | "_cv_split", 45 | "_private_features_type", 46 | "_public_features_type", 47 | "_weights", 48 | "_intercept", 49 | "_p_vals", 50 | "feature_history", 51 | ) 52 | for param in none_params: 53 | assert autowoe.__dict__[param] is None, f"This value should be None, but it's {autowoe.__dict__[param]}" 54 | 55 | start_fit_time = time.time() 56 | autowoe.fit( 57 | train=train_df, 58 | target_name=TARGET_NAME, 59 | features_mark_values={"number_0": (-1, -2), "number_1": (1234567890,), "string_1": ("Special",)}, 60 | ) 61 | 62 | assert time.time() - start_fit_time < 10, f"Fit time is {time.time() - start_fit_time}, it's more than 10" 63 | 64 | start_predict_time = time.time() 65 | test_pred = autowoe.predict_proba(test_df) 66 | assert time.time() - start_predict_time < 0.05, f"Diff is {time.time() - start_predict_time}, >= 0.05" 67 | 68 | score = roc_auc_score(test_df[TARGET_NAME], test_pred) 69 | 70 | assert score > 0.58 71 | 72 | assert autowoe.get_sql_inference_query("FEATURE_TABLE") 73 | 74 | representation = autowoe.get_model_represenation() 75 | 76 | features_representation = pd.DataFrame(representation["features"]) 77 | 78 | assert all( 79 | np.isclose(features_representation["number_9"]["splits"], [7072.0, 11699.5, 13292.5]) 80 | ), "There are different splits" 81 | 82 | assert np.isclose(representation["intercept"], -4.5482746), "There are different intercept coef" 83 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/cv_split_f.py: -------------------------------------------------------------------------------- 1 | """Cross validation utilities.""" 2 | 3 | from typing import Iterable, Optional 4 | 5 | import numpy as np 6 | from 
sklearn.model_selection import GroupKFold, StratifiedKFold 7 | 8 | from autowoe.lib.utilities.utils import TaskType 9 | 10 | 11 | def cv_split_f( 12 | x, y, task: TaskType, group_kf: Optional[Iterable] = None, n_splits: int = 6, random_state: int = 42 13 | ) -> dict: 14 | """Get CV-splits. 15 | 16 | Args: 17 | x: Features. 18 | y: Target. 19 | task: Task. 20 | group_kf: Groups. 21 | n_splits: Number of splits. 22 | random_state: Random state. 23 | 24 | Returns: 25 | CV-splits. 26 | 27 | """ 28 | if task == TaskType.BIN: 29 | if group_kf is not None: 30 | gkf = GroupKFold(n_splits=n_splits) 31 | return dict(enumerate(gkf.split(X=x, y=y, groups=group_kf))) 32 | else: 33 | skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state) 34 | return dict(enumerate(skf.split(X=x, y=y))) 35 | else: 36 | skf = StratifiedKFoldReg(n_splits=n_splits, shuffle=True, random_state=random_state) 37 | return dict(enumerate(skf.split(X=x, y=y))) 38 | 39 | 40 | class StratifiedKFoldReg(StratifiedKFold): 41 | """Stratification for continuous variable. 42 | 43 | Stratification method 'sorted' was taken from: 44 | (https://github.com/scikit-learn/scikit-learn/issues/4757) 45 | 46 | Args: 47 | method: Method for stratification 48 | n_y_bins: Number of target bins. Default: None. 49 | 50 | """ 51 | 52 | def __init__(self, method: Optional[str] = None, n_y_bins: Optional[int] = None, **kwargs): 53 | self._method = method 54 | self._n_y_bins = n_y_bins 55 | 56 | super().__init__(**kwargs) 57 | 58 | def split(self, X, y, groups=None): 59 | """Generate indices to split data into training and test set.""" 60 | if self._method is None: 61 | return self._sorted_split(X, y, groups) 62 | else: 63 | raise NotImplementedError 64 | 65 | def _sorted_split(self, X, y, groups=None): 66 | n_samples = len(y) 67 | 68 | n_labels = int(np.floor(n_samples / self.n_splits)) 69 | y_labels_sorted = np.concatenate([np.repeat(ii, self.n_splits) for ii in range(n_labels)]) 70 | 71 | mod = np.mod(n_samples, self.n_splits) 72 | 73 | _, labels_idx = np.unique(y_labels_sorted, return_index=True) 74 | rand_label_ix = np.random.choice(labels_idx, mod, replace=False) 75 | y_labels_sorted = np.insert(y_labels_sorted, rand_label_ix, y_labels_sorted[rand_label_ix]) 76 | 77 | map_labels_y = dict(zip(np.argsort(y), y_labels_sorted)) 78 | 79 | y_labels = np.array([map_labels_y[ii] for ii in range(n_samples)]) 80 | 81 | return super().split(X, y_labels, groups) 82 | 83 | def _bins_split(self, X, y, groups=None): 84 | y_labels = y 85 | return super().split(X, y_labels, groups) 86 | -------------------------------------------------------------------------------- /autowoe/lib/selectors/selector_last.py: -------------------------------------------------------------------------------- 1 | """Post-selection.""" 2 | 3 | from typing import Any, Dict, List, Optional, Tuple, TypeVar 4 | 5 | import pandas as pd 6 | 7 | from autowoe.lib.utilities.utils import Result, TaskType 8 | 9 | from .composed_selector import ComposedSelector 10 | from .l1 import L1 11 | from .utils import F_LIST_TYPE 12 | 13 | __all__ = ["Selector"] 14 | 15 | WoE = TypeVar("WoE") 16 | 17 | 18 | class Selector: 19 | """Class for post-selection of features. 20 | 21 | Args: 22 | interpreted_model: Build interpreted model. 23 | task: Task. 24 | train: Train features. 25 | target: Train target. 26 | features_type: Features types. 27 | n_jobs: Number of threads. 28 | cv_split: Cross-Val splits. 
29 | features_mark_values: 30 | 31 | """ 32 | 33 | def __init__( 34 | self, 35 | interpreted_model: bool, 36 | task: TaskType, 37 | train: pd.DataFrame, 38 | target: pd.Series, 39 | features_type: Dict[str, str], 40 | n_jobs: int, 41 | cv_split: Dict[int, Tuple[List[int], List[int]]], 42 | features_mark_values: Optional[Dict[str, Tuple[Any]]], 43 | ): 44 | self.__features_fit = list(features_type.keys()) 45 | self.__pearson_selector = ComposedSelector(train, target, task, features_mark_values) 46 | self.__main_selector = L1( 47 | task, train=train, target=target, interpreted_model=interpreted_model, n_jobs=n_jobs, cv_split=cv_split 48 | ) 49 | self.train = train 50 | self.target = target 51 | 52 | self.__interpreted_model = interpreted_model 53 | self.__n_jobs = n_jobs 54 | self.__features = train.columns 55 | self.__cv_split = cv_split 56 | 57 | @property 58 | def features_fit(self): 59 | """Input features.""" 60 | return self.__features_fit 61 | 62 | def __call__( 63 | self, 64 | feature_history: Dict[str, str], 65 | pearson_th: float, 66 | vif_th: float, 67 | metric_th: float, 68 | l1_grid_size: int, 69 | l1_exp_scale: float, 70 | metric_tol: float = 1e-4, 71 | ) -> Tuple[F_LIST_TYPE, Result]: 72 | """Run selector. 73 | 74 | Args: 75 | pearson_th: Pearson threshold. 76 | vif_th: VIF threshold 77 | metric_th: Metric threshold. 78 | l1_grid_size: Number of points on grid. 79 | l1_exp_scale: Maximum values of `C`. 80 | metric_tol: Metric tolerance. 81 | feature_history: HIstory of features filtering. 82 | 83 | Returns: 84 | Selected features, summary L1-selector info. 85 | 86 | """ 87 | features_fit = self.__pearson_selector( 88 | feature_history, self.features_fit, pearson_th=pearson_th, metric_th=metric_th, vif_th=vif_th 89 | ) 90 | features_before = set(features_fit) 91 | features_fit, result = self.__main_selector( 92 | features_fit=features_fit, l1_grid_size=l1_grid_size, l1_exp_scale=l1_exp_scale, metric_tol=metric_tol 93 | ) 94 | if feature_history is not None: 95 | features_diff = features_before - set(features_fit) 96 | for feat in features_diff: 97 | feature_history[feat] = f"Pruned by {self.__main_selector.__class__.__name__} selector" 98 | 99 | return features_fit, result 100 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/utils.py: -------------------------------------------------------------------------------- 1 | """Utility.""" 2 | 3 | from collections import namedtuple 4 | from typing import Any, Callable, Dict, Hashable, Iterable, Set, Tuple, Union 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from strenum import StrEnum 9 | 10 | Result = namedtuple("Result", ["score", "reg_alpha", "is_neg", "min_weights"]) 11 | 12 | 13 | class TaskType(StrEnum): 14 | """Solvable task types.""" 15 | 16 | BIN: "TaskType" = "BIN" # type: ignore 17 | REG: "TaskType" = "REG" # type: ignore 18 | 19 | 20 | def drop_keys(dict_: Dict, keys: Iterable[Hashable]) -> Dict: 21 | """Drop multiple keys from dict. 22 | 23 | Args: 24 | dict_: Dictionary. 25 | keys: Dropped keys. 26 | 27 | Returns: 28 | Filtered dictornary. 29 | 30 | """ 31 | for key in keys: 32 | dict_.pop(key) 33 | return dict_ 34 | 35 | 36 | def flatten(d: dict, parent_key: str = "", sep: str = "_"): 37 | """Flatten Dictionary of dictionaries. 38 | 39 | Args: 40 | d: Dictionary with nested dictionaries. 41 | parent_key: Parent outer key. 42 | sep: Separator for merged keys. 43 | 44 | Returns: 45 | Expanded Dictionary. 
46 | 47 | """ 48 | items = [] 49 | for k, v in d.items(): 50 | new_key = parent_key + sep + k if parent_key else k 51 | if isinstance(v, dict): 52 | items.extend(flatten(v, new_key, sep=sep).items()) 53 | else: 54 | items.append((new_key, v)) 55 | return dict(items) 56 | 57 | 58 | def get_task_type(values: np.ndarray) -> TaskType: 59 | """Determine task type. 60 | 61 | Args: 62 | values: Array of values. 63 | 64 | Returns: 65 | task. 66 | 67 | """ 68 | n_unique_values = np.unique(values).shape[0] 69 | 70 | task: str 71 | if n_unique_values == 1: 72 | raise RuntimeError("Only unique value in target") 73 | elif n_unique_values == 2: 74 | task = TaskType.BIN 75 | else: 76 | task = TaskType.REG 77 | 78 | return task 79 | 80 | 81 | def feature_changing( 82 | feature_history: Dict[str, str], 83 | step_name: str, 84 | features_before: Union[Dict[str, str], Set[str]], 85 | func: Callable, 86 | *args, 87 | **kwargs, 88 | ) -> Tuple[Any, Any]: 89 | """Safe feature filtering. 90 | 91 | Args: 92 | feature_history: History changes of features processing. 93 | step_name: Name of step. 94 | features_before: Features before processing. 95 | func: Filtering function. 96 | args: Function positional arguments. 97 | kwargs: Function named arguments. 98 | 99 | Returns: 100 | output: 101 | filter_features: 102 | 103 | """ 104 | # features_before: Set[str] 105 | if isinstance(features_before, dict): 106 | features_before = set(features_before.keys()) 107 | else: 108 | features_before = set(features_before) 109 | 110 | output, filter_features = func(*args, **kwargs) 111 | if isinstance(filter_features, dict): 112 | features_after = set(filter_features.keys()) 113 | elif isinstance(filter_features, pd.Series): 114 | features_after = set(filter_features.index) 115 | elif isinstance(filter_features, Iterable): 116 | features_after = set(filter_features) 117 | else: 118 | raise RuntimeError("Can't extract features after function call.") 119 | 120 | features_diff = features_before - features_after 121 | for feature in features_diff: 122 | feature_history[feature] = step_name 123 | 124 | return output, filter_features 125 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/eli5_permutation.py: -------------------------------------------------------------------------------- 1 | # module was taken from eli5 lib as is 2 | # this was made to simplify dependencies 3 | # ruff: noqa 4 | 5 | """ELI5 library. 6 | 7 | A module for computing feature importances by measuring how score decreases 8 | when a feature is not available. It contains basic building blocks; 9 | there is a full-featured sklearn-compatible implementation 10 | in :class:`~.PermutationImportance`. 11 | A similar method is described in Breiman, "Random Forests", Machine Learning, 12 | 45(1), 5-32, 2001 (available online at 13 | https://www.stat.berkeley.edu/%7Ebreiman/randomforest2001.pdf), with an 14 | application to random forests. It is known in literature as 15 | "Mean Decrease Accuracy (MDA)" or "permutation importance". 16 | 17 | """ 18 | 19 | from __future__ import absolute_import 20 | 21 | from typing import Any 22 | from typing import Callable 23 | from typing import List 24 | from typing import Tuple 25 | 26 | import numpy as np 27 | 28 | from sklearn.utils import check_random_state 29 | 30 | 31 | def iter_shuffled(X, columns_to_shuffle=None, pre_shuffle=False, random_state=None): 32 | """Return an iterator of X matrices which have one or more columns shuffled. 
33 | 34 | After each iteration yielded matrix is mutated inplace, so 35 | if you want to use multiple of them at the same time, make copies. 36 | ``columns_to_shuffle`` is a sequence of column numbers to shuffle. 37 | By default, all columns are shuffled once, i.e. columns_to_shuffle 38 | is ``range(X.shape[1])``. 39 | If ``pre_shuffle`` is True, a copy of ``X`` is shuffled once, and then 40 | result takes shuffled columns from this copy. If it is False, 41 | columns are shuffled on fly. ``pre_shuffle = True`` can be faster 42 | if there is a lot of columns, or if columns are used multiple times. 43 | 44 | # noqa: DAR101 45 | # noqa: DAR301 46 | 47 | """ 48 | rng = check_random_state(random_state) 49 | 50 | if columns_to_shuffle is None: 51 | columns_to_shuffle = range(X.shape[1]) 52 | 53 | if pre_shuffle: 54 | X_shuffled = X.copy() 55 | rng.shuffle(X_shuffled) 56 | 57 | X_res = X.copy() 58 | for columns in columns_to_shuffle: 59 | if pre_shuffle: 60 | X_res[:, columns] = X_shuffled[:, columns] 61 | else: 62 | rng.shuffle(X_res[:, columns]) 63 | yield X_res 64 | X_res[:, columns] = X[:, columns] 65 | 66 | 67 | def get_score_importances( 68 | score_func, # type: Callable[[Any, Any], float] 69 | X, 70 | y, 71 | n_iter=5, # type: int 72 | columns_to_shuffle=None, 73 | random_state=None, 74 | ): 75 | # type: (...) -> Tuple[float, List[np.ndarray]] 76 | """Return ``(base_score, score_decreases)`` tuple with the base score and score decreases when a feature is not available. 77 | 78 | ``base_score`` is ``score_func(X, y)``; ``score_decreases`` 79 | is a list of length ``n_iter`` with feature importance arrays 80 | (each array is of shape ``n_features``); feature importances are computed 81 | as score decrease when a feature is not available. 82 | ``n_iter`` iterations of the basic algorithm is done, each iteration 83 | starting from a different random seed. 
84 | If you just want feature importances, you can take a mean of the result:: 85 | import numpy as np 86 | from eli5.permutation_importance import get_score_importances 87 | 88 | base_score, score_decreases = get_score_importances(score_func, X, y) 89 | feature_importances = np.mean(score_decreases, axis=0) 90 | 91 | # noqa: DAR301 92 | # noqa: DAR101 93 | # noqa: DAR201 94 | 95 | """ 96 | rng = check_random_state(random_state) 97 | base_score = score_func(X, y) 98 | scores_decreases = [] 99 | for i in range(n_iter): 100 | scores_shuffled = _get_scores_shufled(score_func, X, y, columns_to_shuffle=columns_to_shuffle, random_state=rng) 101 | scores_decreases.append(-scores_shuffled + base_score) 102 | return base_score, scores_decreases 103 | 104 | 105 | def _get_scores_shufled(score_func, X, y, columns_to_shuffle=None, random_state=None): 106 | Xs = iter_shuffled(X, columns_to_shuffle, random_state=random_state) 107 | return np.array([score_func(X_shuffled, y) for X_shuffled in Xs]) 108 | -------------------------------------------------------------------------------- /autowoe/lib/report/report_generator.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | import os 4 | 5 | from datetime import datetime 6 | from shutil import copyfile 7 | 8 | from jinja2 import Environment 9 | from jinja2 import FileSystemLoader 10 | 11 | from ..logging import get_logger 12 | 13 | 14 | logger = get_logger(__name__) 15 | 16 | 17 | class ReportGenerator: 18 | def __init__(self): 19 | self.env = Environment(loader=FileSystemLoader(searchpath=os.path.dirname(__file__))) 20 | self.base_template = self.env.get_template("report_en_v2.html") 21 | 22 | def write_report_to_file(self, report_params): 23 | with open(os.path.join(report_params["output_path"], "autowoe_report.html"), "w", encoding="utf-8") as f: 24 | f.write( 25 | self.base_template.render( 26 | report_name=str(report_params["report_name"]), 27 | report_version=str(report_params["report_version_id"]), 28 | city=str(report_params["city"]), 29 | year=str(datetime.now().year), 30 | model_aim=str(report_params["model_aim"]), 31 | model_name=str(report_params["model_name"]), 32 | zakazchik=str(report_params["zakazchik"]), 33 | high_level_department=str(report_params["high_level_department"]), 34 | ds_name=str(report_params["ds_name"]), 35 | target_descr=str(report_params["target_descr"]), 36 | non_target_descr=str(report_params["non_target_descr"]), 37 | count_train=report_params["count_train"], 38 | train_target_cnt=report_params["train_target_cnt"], 39 | train_nontarget_cnt=report_params["train_nontarget_cnt"], 40 | train_target_perc=report_params["train_target_perc"], 41 | train_auc_full=report_params["train_auc_full"], 42 | train_gini_full=report_params["train_gini_full"], 43 | count_test=report_params["count_test"], 44 | test_target_cnt=report_params["test_target_cnt"], 45 | test_nontarget_cnt=report_params["test_nontarget_cnt"], 46 | test_target_perc=report_params["test_target_perc"], 47 | test_auc_full=report_params["test_auc_full"], 48 | test_gini_full=report_params["test_gini_full"], 49 | train_gini_confint=report_params["train_gini_confint"], 50 | test_gini_confint=report_params["test_gini_confint"], 51 | model_coef=report_params["model_coef"], 52 | p_vals=report_params["p_vals"], 53 | p_vals_test=report_params["p_vals_test"], 54 | final_nan_stat=report_params["final_nan_stat"], 55 | features_roc_auc=report_params["features_roc_auc"], 56 | features_woe=report_params["features_woe"], 57 | 
woe_bars=report_params["woe_bars"], 58 | backlash_plots=report_params["backlash_plots"], 59 | train_vif=report_params["train_vif"], 60 | psi_total=report_params["psi_total"], 61 | psi_zeros=report_params["psi_zeros"], 62 | psi_ones=report_params["psi_ones"], 63 | psi_binned_total=report_params["psi_binned_total"], 64 | psi_binned_zeros=report_params["psi_binned_zeros"], 65 | psi_binned_ones=report_params["psi_binned_ones"], 66 | scorecard=report_params["scorecard"], 67 | feature_history=report_params["feature_history"], 68 | feature_contribution=report_params["feature_contribution"], 69 | corr_map_table=report_params["corr_map_table"], 70 | binned_p_stats_train=report_params["binned_p_stats_train"], 71 | binned_p_stats_test=report_params["binned_p_stats_test"], 72 | dategrouped_value=report_params["dategrouped_value"], 73 | dategrouped_gini=report_params["dategrouped_gini"], 74 | dategrouped_nan=report_params["dategrouped_nan"], 75 | ) 76 | ) 77 | 78 | def generate_report(self, report_params): 79 | copyfile( 80 | os.path.join(os.path.dirname(__file__), "shaptxt"), os.path.join(report_params["output_path"], "shap.js") 81 | ) 82 | 83 | self.write_report_to_file(report_params) 84 | 85 | logger.info(f"Successfully wrote {os.path.join(report_params['output_path'], 'autowoe_report.html')}.") 86 | -------------------------------------------------------------------------------- /tests/integration/test_basic_usage_and_params.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | 4 | from sklearn.metrics import roc_auc_score 5 | 6 | from autowoe import ReportDeco, AutoWoE 7 | 8 | 9 | def test_basic_usage_and_params(train_data, test_data, test_target): 10 | 11 | train = train_data 12 | 13 | train = train.iloc[:, 50:100] 14 | 15 | num_col = list(filter(lambda x: "numb" in x, train.columns)) 16 | num_feature_type = {x: "real" for x in num_col} 17 | 18 | date_col = filter(lambda x: "datetime" in x, train.columns) 19 | for col in date_col: 20 | train[col + "_year"] = train[col].map(lambda x: x.year) 21 | train[col + "_weekday"] = train[col].map(lambda x: x.weekday()) 22 | train[col + "_month"] = train[col].map(lambda x: x.month) 23 | 24 | test = test_data 25 | 26 | date_col = filter(lambda x: "datetime" in x, test.columns) 27 | for col in date_col: 28 | test[col + "_year"] = test[col].map(lambda x: x.year) 29 | test[col + "_weekday"] = test[col].map(lambda x: x.weekday()) 30 | test[col + "_month"] = test[col].map(lambda x: x.month) 31 | 32 | test["target"] = test_target.values 33 | 34 | cat_col = list(filter(lambda x: "str" in x, train.columns)) 35 | cat_feature_type = {x: "cat" for x in cat_col} 36 | 37 | year_col = list(filter(lambda x: "_year" in x, train.columns)) 38 | year_feature_type = {x: "cat" for x in year_col} 39 | 40 | weekday_col = list(filter(lambda x: "_weekday" in x, train.columns)) 41 | weekday_feature_type = {x: "cat" for x in weekday_col} 42 | 43 | month_col = list(filter(lambda x: "_month" in x, train.columns)) 44 | month_feature_type = {x: "cat" for x in month_col} 45 | 46 | features = cat_col + year_col + weekday_col + month_col + num_col 47 | 48 | features_type = dict( 49 | **num_feature_type, **cat_feature_type, **year_feature_type, **weekday_feature_type, **month_feature_type 50 | ) 51 | 52 | features_monotone_constraints = {"number_74": "auto", "number_83": "auto"} 53 | 54 | max_bin_count = {"number_47": 3, "number_51": 2} 55 | 56 | auto_woe = AutoWoE( 57 | task="BIN", 58 | interpreted_model=True, 59 | 
monotonic=False, 60 | max_bin_count=5, 61 | select_type=None, 62 | pearson_th=0.9, 63 | auc_th=0.505, 64 | vif_th=10.0, 65 | imp_th=0, 66 | th_const=32, 67 | force_single_split=True, 68 | th_nan=0.01, 69 | th_cat=0.005, 70 | woe_diff_th=0.01, 71 | min_bin_size=0.01, 72 | min_bin_mults=(2, 4), 73 | min_gains_to_split=(0.0, 0.5, 1.0), 74 | auc_tol=1e-4, 75 | cat_alpha=100, 76 | cat_merge_to="to_woe_0", 77 | nan_merge_to="to_woe_0", 78 | oof_woe=True, 79 | n_folds=6, 80 | n_jobs=4, 81 | l1_grid_size=20, 82 | l1_exp_scale=6, 83 | imp_type="feature_imp", 84 | regularized_refit=False, 85 | p_val=0.05, 86 | debug=False, 87 | verbose=0, 88 | ) 89 | 90 | auto_woe = ReportDeco(auto_woe) 91 | 92 | start_fit_time = time.time() 93 | auto_woe.fit( 94 | train[features + ["target"]], 95 | target_name="target", 96 | features_type=features_type, 97 | group_kf=None, 98 | max_bin_count=max_bin_count, 99 | features_monotone_constraints=features_monotone_constraints, 100 | validation=test, 101 | ) 102 | 103 | assert time.time() - start_fit_time < 25, f"Fit time is {time.time() - start_fit_time}, it's more than 25" 104 | 105 | start_predict_time = time.time() 106 | pred = auto_woe.predict_proba(test) 107 | assert ( 108 | time.time() - start_predict_time < 3.5 109 | ), f"Predict time is {time.time() - start_predict_time}, it's more than 3.5" 110 | 111 | score_1 = roc_auc_score(test["target"], pred) 112 | 113 | assert score_1 > 0.76 114 | 115 | # assert np.isclose(score_1, 0.7791178), f"Real score is {score_1}" 116 | 117 | pred = auto_woe.predict_proba(test[["number_72"]], report=False) 118 | score_2 = roc_auc_score(test["target"], pred) 119 | 120 | assert np.isclose(score_1, score_2), f"Scores {score_1} and {score_2} musts be equal" 121 | 122 | report_params = { 123 | "automl_date_column": "report_month", # колонка с датой в формате params['datetimeFormat'] 124 | "output_path": "./AUTOWOE_REPORT_1", # папка, куда сгенерится отчет и сложатся нужные файлы 125 | "report_name": "___НАЗВАНИЕ ОТЧЕТА___", 126 | "report_version_id": 1, 127 | "city": "Воронеж", 128 | "model_aim": "___ЦЕЛЬ ПОСТРОЕНИЯ МОДЕЛИ___", 129 | "model_name": "___НАЗВАНИЕ МОДЕЛИ___", 130 | "zakazchik": "___ЗАКАЗЧИК___", 131 | "high_level_department": "___ПОДРАЗДЕЛЕНИЕ___", 132 | "ds_name": "___РАЗРАБОТЧИК МОДЕЛИ___", 133 | "target_descr": "___ОПИСАНИЕ ЦЕЛЕВОГО СОБЫТИЯ___", 134 | "non_target_descr": "___ОПИСАНИЕ НЕЦЕЛЕВОГО СОБЫТИЯ___", 135 | } 136 | 137 | auto_woe.generate_report(report_params) 138 | -------------------------------------------------------------------------------- /autowoe/lib/optimizer/optimizer.py: -------------------------------------------------------------------------------- 1 | """Optimization of decision tree parameters.""" 2 | 3 | from collections import OrderedDict 4 | from copy import copy 5 | from itertools import product 6 | from typing import Any, Dict, Iterable, List, Tuple, Union 7 | 8 | import lightgbm as lgb 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from autowoe.lib.utilities.cv_split_f import cv_split_f 13 | from autowoe.lib.utilities.utils import TaskType 14 | 15 | # TODO: Do we need random state here? 16 | np.random.seed(232) 17 | 18 | 19 | class TreeParamOptimizer: 20 | """Optimizer of decision tree parameters. 21 | 22 | Args: 23 | data: Dataset. First column - feature, second - Target. 24 | params_range: OrderedDict with parameters and ranges for binning algorithms 25 | Ex. 
params_range = OrderedDict({"max_depth": (4, 7, 17, 2, 3), "min_child_samples": (40000, 20000, 5000),}) 26 | 27 | """ 28 | 29 | _cv_metric_map = {"auc": "auc", "mse": "l2"} 30 | 31 | def __init__(self, data: pd.DataFrame, task: TaskType, params_range: Dict[str, tuple], n_folds: int = 5): 32 | self._params_range = copy(params_range) 33 | self._task = task 34 | self._metric = "auc" if self._task == TaskType.BIN else "mse" 35 | 36 | ds_params = {} 37 | try: 38 | ds_params["min_data_in_bin"] = self._params_range.pop("min_data_in_bin")[0] 39 | except KeyError: 40 | pass 41 | 42 | # TODO: Fix double saved data 43 | self._X = pd.DataFrame(data.iloc[:, 0]) 44 | self._y = data.iloc[:, 1] 45 | 46 | self._lgb_train = lgb.Dataset(data=self._X.copy(), label=self._y.copy(), params=ds_params) 47 | self.n_folds = n_folds 48 | self._params_stats = None 49 | 50 | def __get_folds(self, random_state): 51 | skf = cv_split_f(self._X, self._y, self._task, None, self.n_folds, random_state) 52 | 53 | # folds = np.zeros(self._lgb_train.data.shape[0]) 54 | # for fold_idx, tt_idx in skf.items(): 55 | # _, test_idx = tt_idx 56 | # folds[test_idx] = fold_idx 57 | 58 | # return skf.items() 59 | 60 | for v in skf.values(): 61 | yield v 62 | 63 | @property 64 | def __params_gen(self) -> Iterable[Tuple]: 65 | return product(*self._params_range.values()) 66 | 67 | def __get_scores(self, params: Dict[str, Any], n: int) -> List[float]: 68 | """Scores for set of parameters. 69 | 70 | Args: 71 | params: Tree parameters. 72 | n: The amount of cross-validation to evaluate hyperparameters 73 | 74 | Returns: 75 | Scores. 76 | 77 | """ 78 | default_tree_params = { 79 | "boosting_type": "gbdt", 80 | "learning_rate": 1, 81 | "objective": "binary" if self._task == TaskType.BIN else "regression", 82 | "bagging_freq": 1, 83 | "bagging_fraction": 1, 84 | "feature_fraction": 1, 85 | "bagging_seed": 323, 86 | "n_jobs": 1, 87 | "verbosity": -1, 88 | } 89 | unite_params = {**params, **default_tree_params} 90 | 91 | score_add_string = "" 92 | if lgb.__version__ >= "4.1.0": 93 | score_add_string = "valid " 94 | 95 | scores = [] 96 | for seed in range(n): 97 | folds = self.__get_folds(seed) 98 | cv_results = lgb.cv( 99 | params=unite_params, train_set=self._lgb_train, num_boost_round=1, folds=folds, metrics=self._metric 100 | ) 101 | scores.append(cv_results[score_add_string + f"{self._cv_metric_map[self._metric]}-mean"]) 102 | 103 | return scores 104 | 105 | def __get_stats(self, stats: List[List[float]]): 106 | """Calculate statistics of scores. 107 | 108 | Args: 109 | stats: Scores [combinations of parameters, cv-s, number of folds in cv] 110 | 111 | """ 112 | stats = np.array(stats) 113 | median_, std_ = np.median(stats, axis=(1, 2)), np.std(stats, axis=(1, 2)) 114 | 115 | scores = zip(*(median_ if self._task == TaskType.BIN else -median_, -std_)) 116 | id_best = max(enumerate(scores), key=lambda x: x[1])[0] 117 | 118 | stat_score = zip(*(median_, std_)) 119 | self._params_stats = OrderedDict((key, value) for (key, value) in zip(self.__params_gen, stat_score)), id_best 120 | 121 | def __call__(self, n: int) -> Dict[str, Union[int, str, None]]: 122 | """Execute optimization. 123 | 124 | Args: 125 | n: Number of iterations. 126 | 127 | Returns: 128 | Best parameters. 
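The `params_range` grid above is enumerated exhaustively. A small sketch of the same enumeration, using the example ranges from the class docstring; the scoring step is only described in the comment:

from collections import OrderedDict
from itertools import product

params_range = OrderedDict({"max_depth": (4, 7, 17, 2, 3), "min_child_samples": (40000, 20000, 5000)})

for values in product(*params_range.values()):
    params = dict(zip(params_range.keys(), values))
    # each combination, e.g. {'max_depth': 4, 'min_child_samples': 40000},
    # is scored with lgb.cv and the one with the best median CV score wins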
129 | 130 | """ 131 | scores_ = [] 132 | for val in self.__params_gen: 133 | params = {key[1]: val[key[0]] for key in enumerate(self._params_range.keys())} 134 | scores_.append(self.__get_scores(params, n)) 135 | self.__get_stats(scores_) 136 | 137 | opt_params = list(self._params_stats[0].keys())[self._params_stats[1]] 138 | return dict(zip(self._params_range.keys(), opt_params)) 139 | -------------------------------------------------------------------------------- /autowoe/lib/types_handler/types_handler.py: -------------------------------------------------------------------------------- 1 | """Type processing.""" 2 | 3 | import collections 4 | from copy import deepcopy 5 | from typing import Any, Dict, Hashable, Optional 6 | 7 | import pandas as pd 8 | 9 | from .features_checkers_handlers import cat_checker, dates_checker, dates_handler 10 | 11 | 12 | class TypesHandler: 13 | """Класс для автоматического определения типов признаков. 14 | 15 | Базовая имплементация порядка разработки: 16 | 17 | 0. 18 | 0.a) Парсим то, что указал юзер 19 | 0.b) Даты парсим c указанием сезонности ("m", "d", "wd", "h", "min") 20 | (месяц, день, день недели, час, минута) 21 | 1. 22 | Если стринга, то категория 23 | 2. 24 | Если отношение shape[1] к количеству уникальных значений >> 5, то категория 25 | 26 | Args: 27 | train: 28 | public_features_type: 29 | max_bin_count: 30 | features_monotone_constraints: 31 | features_mark_values: 32 | 33 | """ 34 | 35 | def __init__( 36 | self, 37 | train: pd.DataFrame, 38 | public_features_type: Dict[Hashable, Any], 39 | max_bin_count: Optional[Dict[Hashable, Optional[int]]] = None, 40 | features_monotone_constraints: Optional[dict] = None, 41 | features_mark_values: Optional[dict] = None, 42 | ): 43 | self.__train = deepcopy(train) 44 | self.__public_features_type = deepcopy(public_features_type) 45 | self.__private_features_type: Dict[str, Any] = {} 46 | 47 | if max_bin_count is None: 48 | max_bin_count = {} 49 | self.__max_bin_count = collections.defaultdict(lambda: None, max_bin_count) 50 | 51 | if features_monotone_constraints is None: 52 | features_monotone_constraints = {} 53 | self.__features_monotone_constraints = collections.defaultdict(lambda: "0", features_monotone_constraints) 54 | 55 | @property 56 | def train(self): 57 | """Train data (Read only).""" 58 | return self.__train 59 | 60 | @property 61 | def public_features_type(self): 62 | """Public features types (Read only).""" 63 | return self.__public_features_type 64 | 65 | @property 66 | def private_features_type(self): 67 | """Private features types (Read only).""" 68 | return self.__private_features_type 69 | 70 | @property 71 | def max_bin_count(self): 72 | """Maximum bin count.""" 73 | return self.__max_bin_count 74 | 75 | @property 76 | def features_monotone_constraints(self): 77 | """Feature monotone constraints.""" 78 | return self.__features_monotone_constraints 79 | 80 | def __feature_handler(self, feature_name): 81 | if dates_checker(self.__train[feature_name]): 82 | new_features, feature_type = dates_handler(self.__train[feature_name]) 83 | self.__public_features_type[feature_name] = feature_type 84 | for new_feature_name, new_feature in new_features: 85 | self.__train[new_feature_name] = new_feature 86 | self.__max_bin_count[new_feature_name] = self.max_bin_count[feature_name] 87 | self.__private_features_type[new_feature_name] = "real" 88 | self.__features_monotone_constraints[new_feature_name] = self.features_monotone_constraints[ 89 | feature_name 90 | ] 91 | 92 | elif 
cat_checker(self.__train[feature_name]): 93 | self.__public_features_type[feature_name] = "cat" 94 | self.__private_features_type[feature_name] = "cat" 95 | self.__features_monotone_constraints[feature_name] = "1" 96 | else: 97 | self.__public_features_type[feature_name] = "real" 98 | self.__private_features_type[feature_name] = "real" 99 | 100 | def transform(self): 101 | """Основной метод данного класса. 102 | 103 | Если feature_type[feature] == None, то парсим тип признака 104 | Иначе происходит обработка указанных типов. 105 | Возможные типы признаков: 106 | "cat" 107 | "real" 108 | ("%Y%d%m", ("m", "d", "wd", "h", "min")) 109 | 110 | Returns: 111 | Info. 112 | 113 | """ 114 | for feature_name in self.public_features_type: 115 | if not self.public_features_type[feature_name]: 116 | self.__feature_handler(feature_name) 117 | elif isinstance(self.public_features_type[feature_name], tuple): # переданы данные для дат 118 | new_features, _ = dates_handler(self.train[feature_name], self.public_features_type[feature_name]) 119 | for new_feature_name, new_feature in new_features: 120 | self.__train[new_feature_name] = new_feature 121 | self.__max_bin_count[new_feature_name] = self.max_bin_count[feature_name] 122 | self.__private_features_type[new_feature_name] = "real" 123 | self.__features_monotone_constraints[new_feature_name] = self.__features_monotone_constraints[ 124 | feature_name 125 | ] 126 | 127 | elif self.public_features_type[feature_name] == "cat": 128 | self.__private_features_type[feature_name] = "cat" 129 | self.__features_monotone_constraints[feature_name] = "1" 130 | 131 | elif self.public_features_type[feature_name] == "real": 132 | self.__private_features_type[feature_name] = "real" 133 | self.__train[feature_name] = pd.to_numeric(self.train[feature_name], errors="coerce") 134 | 135 | else: 136 | raise ValueError("The specified data type is not supported") 137 | 138 | return ( 139 | self.train, 140 | self.public_features_type, 141 | self.private_features_type, 142 | self.max_bin_count, 143 | self.features_monotone_constraints, 144 | ) 145 | -------------------------------------------------------------------------------- /autowoe/lib/selectors/utils.py: -------------------------------------------------------------------------------- 1 | # noqa: D100 2 | 3 | from collections import namedtuple 4 | from typing import List, Mapping, Sequence, Tuple, Union 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.linear_model import LassoCV, LogisticRegressionCV 9 | from sklearn.metrics import roc_auc_score 10 | from sklearn.model_selection import BaseCrossValidator 11 | from sklearn.svm import l1_min_c 12 | 13 | from autowoe.lib.logging import get_logger 14 | from autowoe.lib.utilities.utils import TaskType 15 | 16 | logger = get_logger(__name__) 17 | 18 | Result = namedtuple("Result", ["score", "reg_alpha", "is_neg", "min_weights"]) 19 | 20 | FEATURE = Union[str, int, float] 21 | F_LIST_TYPE = Sequence[FEATURE] 22 | 23 | 24 | def scorer(estimator, x_train, y): 25 | """Evaluate ROC-AUC.""" 26 | return roc_auc_score(y, estimator.predict_proba(x_train)[:, 1]) 27 | 28 | 29 | class PredefinedFolds(BaseCrossValidator): 30 | """Predefined Folds.""" 31 | 32 | def __init__(self, cv_split: Mapping[int, Tuple[Sequence[int], Sequence[int]]]): 33 | self.cv_split = cv_split 34 | 35 | def _iter_test_indices( 36 | self, x_train: np.ndarray = None, y: np.ndarray = None, groups: np.ndarray = None 37 | ) -> np.ndarray: 38 | """Generates integer indices corresponding to test sets. 
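Stepping back to the `TypesHandler.transform` contract shown above: a user-provided type may be `None` (auto-detect), "cat", "real", or a date format plus seasonal components. A hedged example of such a mapping; the column names are invented:

features_type = {
    "income": "real",                         # numeric feature
    "region": "cat",                          # categorical feature
    "reg_date": ("%Y%d%m", ("m", "wd")),      # parse as date, derive month and weekday columns
    "score": None,                            # let TypesHandler infer the type
}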
39 | 40 | Args: 41 | x_train: Train features. 42 | y: Train target. 43 | groups: Groups. 44 | 45 | Yields: 46 | test set indexes. 47 | 48 | """ 49 | for n in self.cv_split: 50 | yield self.cv_split[n][1] 51 | 52 | def get_n_splits(self, *args, **kwargs) -> int: 53 | """Number of splits.""" 54 | return len(self.cv_split) 55 | 56 | 57 | def analyze_result( 58 | model: Union[LogisticRegressionCV, LassoCV], features_names: Sequence[str], interpreted_model: bool = True 59 | ) -> List[Result]: 60 | """Analyze the result of the searching coefficient regularization. 61 | 62 | Args: 63 | model: Linear model. 64 | features_names: List of features names. 65 | interpreted_model: Build interpreted model. 66 | 67 | Returns: 68 | Summary. 69 | 70 | """ 71 | scores = model.scores_[1] 72 | cs_scores = scores.mean(axis=0) 73 | 74 | cs_len = scores.shape[1] 75 | coef_ = np.moveaxis(model.coefs_paths_[1][:, :, :-1], 1, 0) 76 | 77 | if interpreted_model: 78 | cs_negs = (coef_.reshape((cs_len, -1)) <= 0).all(axis=1) 79 | else: 80 | cs_negs = [True] * cs_len 81 | 82 | cs_min_weights = [pd.Series(coef_[x].min(axis=0), index=features_names) for x in range(cs_len)] # .sort_values() 83 | 84 | results = [ 85 | Result(score, c, is_neg, min_weights) 86 | for (score, c, is_neg, min_weights) in zip(cs_scores, model.Cs, cs_negs, cs_min_weights) 87 | ] 88 | 89 | return results 90 | 91 | 92 | def l1_select( 93 | task: TaskType, 94 | interpreted_model: bool, 95 | n_jobs: int, 96 | dataset: Tuple[pd.DataFrame, pd.Series], 97 | l1_grid_size: int, 98 | l1_exp_scale: float, 99 | cv_split: Mapping[int, Tuple[Sequence[int], Sequence[int]]], 100 | metric_tol: float = 1e-4, 101 | ) -> Tuple[F_LIST_TYPE, Result]: 102 | """Select the main features according to the lasso model. 103 | 104 | Args: 105 | task: Task. 106 | interpreted_model: Create interpreted model. 107 | n_jobs: Number of threads. 108 | dataset: Tuple of features and target. 109 | l1_grid_size: Number of points on grid. 110 | l1_exp_scale: Maximum value of `C`. 111 | cv_split: Cross-Val splits. 112 | metric_tol: Metric tolerance. 113 | 114 | Returns: 115 | Selected features, summary info. 
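`cv_split` throughout the library is a plain mapping `{fold_index: (train_indices, test_indices)}`, and `PredefinedFolds` above simply replays those test indices as an sklearn splitter. A sketch of building such a mapping with KFold on synthetic data:

import numpy as np
from sklearn.model_selection import KFold

X = np.random.rand(100, 3)
cv_split = {
    i: (train_idx, test_idx)
    for i, (train_idx, test_idx) in enumerate(KFold(n_splits=5, shuffle=True, random_state=42).split(X))
}
# cv_split[0] -> (train row indices, test row indices) of the first fold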
116 | 117 | """ 118 | # fit model with crossvalidation 119 | cv = PredefinedFolds(cv_split) 120 | if task == TaskType.BIN: 121 | # get grid for cs 122 | cs = l1_min_c(dataset[0], dataset[1], loss="log", fit_intercept=True) * np.logspace( 123 | 0, l1_exp_scale, l1_grid_size 124 | ) 125 | logger.info(f"C parameter range in [{cs[0]}:{cs[-1]}], {l1_grid_size} values") 126 | 127 | model = LogisticRegressionCV( 128 | Cs=cs, 129 | solver="saga", 130 | tol=1e-5, 131 | cv=cv, 132 | penalty="l1", 133 | scoring=scorer, 134 | intercept_scaling=10000.0, 135 | max_iter=1000, 136 | n_jobs=n_jobs, 137 | random_state=42, 138 | ) 139 | else: 140 | # get grid for cs 141 | cs = np.logspace(0, l1_exp_scale, l1_grid_size + 1) 142 | alphas = 1.0 / cs[1:][::-1] 143 | logger.info(f"Alphas parameter range in [{alphas[0]}:{alphas[-1]}], {l1_grid_size} values") 144 | 145 | model = LassoCV( 146 | alphas=alphas, cv=cv, positive=interpreted_model, tol=1e-5, max_iter=1000, n_jobs=n_jobs, random_state=42 147 | ) 148 | 149 | model.fit(dataset[0].values, dataset[1].values) 150 | 151 | features_fit: List[str] 152 | if task == TaskType.BIN: 153 | # analyze cv results 154 | result = analyze_result(model, dataset[0].columns, interpreted_model) 155 | 156 | # perform selection 157 | # filter bad weights models 158 | scores_neg = [x for x in result if x.is_neg] 159 | # get top score from avail models 160 | max_score = max([x.score for x in result]) 161 | # get score with tolerance 162 | ok_score = max_score - metric_tol 163 | # select first model that is ok with tolerance 164 | res = None 165 | for res in scores_neg: 166 | if res.score >= ok_score: 167 | break 168 | 169 | # get selected features 170 | features_fit = [x for (x, y) in zip(dataset[0].columns, res.min_weights) if y != 0] 171 | logger.info(res) 172 | else: 173 | features_fit = [x for (x, y) in zip(dataset[0].columns, model.coef_) if y != 0] 174 | res = Result( 175 | score=model.mse_path_.mean(axis=1).min(), 176 | reg_alpha=model.alpha_, 177 | is_neg=[True] * model.coef_.shape[0], 178 | min_weights=np.min(model.coef_), 179 | ) 180 | 181 | return features_fit, res 182 | -------------------------------------------------------------------------------- /autowoe/lib/selectors/composed_selector.py: -------------------------------------------------------------------------------- 1 | """Compose several selector.""" 2 | 3 | from copy import copy 4 | from typing import Any, Dict, List, Optional, Tuple, TypeVar 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.metrics import r2_score, roc_auc_score 9 | 10 | from autowoe.lib.logging import get_logger 11 | from autowoe.lib.utilities.utils import TaskType, feature_changing 12 | 13 | from .utils import F_LIST_TYPE 14 | 15 | logger = get_logger(__name__) 16 | 17 | WoE = TypeVar("WoE") 18 | 19 | 20 | class ComposedSelector: 21 | """Compose feature selector. 22 | 23 | Sequential filtering of features by rules: 24 | 1) Unique WoE value. 25 | 2) Single feature model has metric lower than threshold. 26 | 3) VIF of feature greater than threshold. 27 | 4) There are features with a pair correlation above the threshold. 28 | 29 | Metrics: 30 | 1) BIN - AUC 31 | 2) REG - R2 32 | 33 | Args: 34 | train: Train features. 35 | target: Train target. 36 | task: Task. 37 | features_mark_values: Marked values of features. 
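For the binary task, `l1_select` above anchors its regularization grid at the smallest `C` for which any coefficient can become non-zero. A standalone sketch of that grid on synthetic data (parameter values are illustrative):

import numpy as np
from sklearn.svm import l1_min_c

X, y = np.random.rand(200, 5), (np.random.rand(200) > 0.5).astype(int)

l1_grid_size, l1_exp_scale = 20, 4
cs = l1_min_c(X, y, loss="log", fit_intercept=True) * np.logspace(0, l1_exp_scale, l1_grid_size)
# cs spans [C_min, C_min * 10**l1_exp_scale] on a log scale, matching the grid built above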
38 | 39 | """ 40 | 41 | default_metric_th = {TaskType.BIN: 0.5, TaskType.REG: 0.0} 42 | 43 | def __init__( 44 | self, 45 | train: pd.DataFrame, 46 | target: pd.Series, 47 | task: TaskType, 48 | features_mark_values: Optional[Dict[str, Tuple[Any]]], 49 | ): 50 | self.train = train 51 | self.target = target 52 | self.task = task 53 | self.features_mark_values = features_mark_values 54 | # precompute corrs 55 | 56 | if features_mark_values is not None: 57 | mask_good_values = pd.Series([True] * train.shape[0]) 58 | for col, mvs in features_mark_values.items(): 59 | if col in train.columns: 60 | mask_good_values = mask_good_values & (~train[col].isin(mvs)) 61 | else: 62 | mask_good_values = pd.Series([True] * train.shape[0], index=train.index) 63 | train_values = train[mask_good_values].values 64 | 65 | cc = np.abs(np.corrcoef(train_values, rowvar=False)) 66 | self.precomp_corr = pd.DataFrame(cc, index=train.columns, columns=train.columns) 67 | 68 | metrics = [] 69 | for col in train.columns: 70 | if task == TaskType.BIN: 71 | m = 1 - roc_auc_score(target, train[col]) 72 | else: 73 | m = r2_score(target, train[col]) 74 | metrics.append(m) 75 | self.precomp_metrics = pd.Series(metrics, index=train.columns) 76 | 77 | @staticmethod 78 | def __compare_msg(closure, value, msg=None): 79 | flg = closure(value) 80 | if not flg: 81 | logger.info(msg) 82 | return flg 83 | 84 | def __call__( 85 | self, 86 | feature_history: Dict[str, str], 87 | features_fit: List[str], 88 | pearson_th: float = 0.9, 89 | metric_th: Optional[float] = None, 90 | vif_th: float = 5.0, 91 | ) -> F_LIST_TYPE: 92 | """Filtered features.""" 93 | if metric_th is None: 94 | metric_th = self.default_metric_th[self.task] 95 | 96 | candidates = copy(features_fit) 97 | features_before = set(candidates) 98 | 99 | # откинем константные 100 | _, filter_features = feature_changing( 101 | feature_history, 102 | "Constant WoE value", 103 | features_before, 104 | lambda candidates: ( 105 | None, 106 | [ 107 | col 108 | for col in candidates 109 | if self.__compare_msg( 110 | lambda x: ~np.isnan(self.precomp_corr.loc[x, x]), 111 | col, 112 | f"Feature {col} removed due to single WOE value", 113 | ) 114 | ], 115 | ), # func 116 | candidates, # args 117 | # ..., # kwargs 118 | ) 119 | 120 | # откинем с низкой метрикой 121 | _, filter_features = feature_changing( 122 | feature_history, 123 | "Low metric value", # TODO: feature name 124 | filter_features, 125 | lambda candidates: ( 126 | None, 127 | [ 128 | col 129 | for col in candidates 130 | if self.__compare_msg( 131 | lambda x: self.precomp_metrics[x] >= metric_th, 132 | col, 133 | f"Feature {col} removed due to low metric value {self.precomp_metrics[col]}", 134 | ) 135 | ], 136 | ), # func 137 | filter_features, # args 138 | # ..., # kwargs 139 | ) 140 | candidates = filter_features 141 | 142 | # итеративный виф 143 | max_vif = np.inf 144 | while max_vif > vif_th: 145 | corrs = self.precomp_corr.loc[candidates, candidates] 146 | # fix singularity 147 | corrs = corrs.values + np.diag(np.ones(corrs.shape[0]) * 1e-4) 148 | vifs = np.linalg.inv(corrs).diagonal() 149 | 150 | max_vif_idx = vifs.argmax() 151 | max_vif = vifs[max_vif_idx] 152 | 153 | if max_vif >= vif_th: 154 | logger.info(f"Feature {candidates[max_vif_idx]} removed due to high VIF value = {max_vif}") 155 | if feature_history is not None: 156 | feature_history[candidates[max_vif_idx]] = f"High VIF value = {round(max_vif, 2)}" 157 | candidates = [x for (n, x) in enumerate(candidates) if n != max_vif_idx] 158 | 159 | # попарные 
корреляции 160 | # отсортируем по убыванию метрики 161 | order_ = np.array([self.precomp_metrics[x] for x in candidates]).argsort()[::-1] 162 | candidates = [candidates[x] for x in order_] 163 | 164 | n = 0 165 | while n < (len(candidates) - 1): 166 | partial_corrs = self.precomp_corr.loc[candidates[n], candidates[n + 1 :]] 167 | big_partial_corrs = partial_corrs[partial_corrs >= pearson_th] 168 | if len(big_partial_corrs) > 0: 169 | logger.info( 170 | ( 171 | f"Features {list(big_partial_corrs.index.values)}: " 172 | f"metric = {list(self.precomp_metrics[big_partial_corrs.index])} was removed due to " 173 | f"corr = {list(big_partial_corrs.values)} with feat {candidates[n]}: " 174 | f"metric = {self.precomp_metrics[candidates[n]]}" 175 | ) 176 | ) 177 | if feature_history is not None: 178 | for feat in big_partial_corrs.index.values: 179 | feature_history[feat] = f"High correlation with feat {candidates[n]}" 180 | 181 | candidates = [x for x in candidates if x not in set(big_partial_corrs.index.values)] 182 | n += 1 183 | 184 | return candidates 185 | -------------------------------------------------------------------------------- /autowoe/lib/selectors/selector_first.py: -------------------------------------------------------------------------------- 1 | """Selection of features according to the importance of the model.""" 2 | 3 | import logging 4 | from copy import deepcopy 5 | from typing import Any, Dict, Hashable, Optional, Tuple, Union 6 | 7 | import lightgbm as lgb 8 | import numpy as np 9 | import pandas as pd 10 | from pandas import DataFrame 11 | from sklearn.metrics import mean_squared_error, roc_auc_score 12 | from sklearn.model_selection import train_test_split 13 | 14 | from autowoe.lib.logging import get_logger 15 | from autowoe.lib.utilities.eli5_permutation import get_score_importances 16 | from autowoe.lib.utilities.utils import TaskType, drop_keys 17 | 18 | pd.options.mode.chained_assignment = None 19 | 20 | logger = get_logger(__name__) 21 | 22 | root_logger = logging.getLogger() 23 | level = root_logger.getEffectiveLevel() 24 | 25 | if level in (logging.CRITICAL, logging.ERROR, logging.WARNING): 26 | verbose_eval = 0 # False 27 | elif level == logging.INFO: 28 | verbose_eval = 100 29 | else: 30 | verbose_eval = 10 31 | 32 | 33 | def nan_constant_selector( 34 | data: DataFrame, features_type: Dict[Hashable, str], th_const: float = 32 35 | ) -> Tuple[DataFrame, Dict[Hashable, str]]: 36 | """Selector NaN / Const columns. 37 | 38 | Filters columns with a large number of NaN-values or with almost constant values. 39 | 40 | Args: 41 | data: DataFrame 42 | features_type: Dict[Hashable, str] 43 | th_const: Constant threshold. Filters if the number of valid values is less than the threshold. 44 | 45 | Returns: 46 | Data, features list. 
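The iterative VIF step in `ComposedSelector` above reads VIFs off the diagonal of the inverse correlation matrix. A short numeric sketch of that identity on synthetic collinear data:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(500, 3), columns=["a", "b", "c"])
df["c"] = 0.9 * df["a"] + 0.1 * np.random.rand(500)   # make "c" nearly collinear with "a"

corr = np.corrcoef(df.values, rowvar=False)
vifs = np.linalg.inv(corr).diagonal()
print(dict(zip(df.columns, vifs.round(2))))           # "a" and "c" get the largest VIFs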
47 | 48 | """ 49 | th_ = data.shape[0] - th_const 50 | 51 | features_to_drop = [] 52 | 53 | for col in features_type: 54 | nan_count = data[col].isna().sum() 55 | if nan_count >= th_: 56 | features_to_drop.append(col) 57 | else: 58 | vc = data[col].value_counts().values[0] 59 | if vc >= th_: 60 | features_to_drop.append(col) 61 | 62 | logger.info(f" features {features_to_drop} contain too many nans or identical values") 63 | data = data.drop(columns=features_to_drop, axis=1) 64 | features_type = drop_keys(features_type, features_to_drop) 65 | return data, features_type 66 | 67 | 68 | def get_score_function(model, task: TaskType): 69 | """Score function for task - {BIN: ROC_AUC, REG: MSE}.""" 70 | if task == TaskType.BIN: 71 | return lambda x, y: roc_auc_score(y, model.predict_proba(x)[:, 1]) 72 | else: 73 | return lambda x, y: -mean_squared_error(y, model.predict(x)) 74 | 75 | 76 | def feature_imp_selector( 77 | data: DataFrame, 78 | task: TaskType, 79 | features_type: Dict[Hashable, str], 80 | features_mark_values: Optional[Dict[str, Tuple[Any]]], 81 | target_name: Hashable, 82 | imp_th: float, 83 | imp_type: str, 84 | select_type: Union[None, int], 85 | process_num: int, 86 | ) -> Tuple[DataFrame, Dict[Hashable, str]]: 87 | """Features selection by imp_type. 88 | 89 | Available FS: 90 | - lgbm feature_importance 91 | - permutation importance 92 | 93 | Args: 94 | data: Dataset. 95 | task: Task. 96 | features_type: Features types. 97 | features_mark_values: Marked values of feature. 98 | target_name: Target column name. 99 | imp_th: Importance threshold. 100 | imp_type: Importance type ("feature_imp" -- feature_importances, "perm_imp" -- permutation_importances). 101 | select_type: Type of first feature selection. 102 | - If `None` then choose features with `feature_importance > 0`. 103 | - If `int` then choose the N-th best features. 104 | process_num: Number of threads. 105 | 106 | Returns: 107 | Data, features. 
108 | 109 | """ 110 | data_ = deepcopy(data) 111 | 112 | if features_mark_values: 113 | for col, mvs in features_mark_values.items(): 114 | data_ = data_[~data_[col].isin(mvs)] 115 | 116 | categorical_feature = [key for key in features_type if features_type[key] == "cat"] 117 | if categorical_feature: 118 | data_[categorical_feature] = data_[categorical_feature].astype("category") 119 | 120 | train, test = train_test_split(data_, test_size=0.2, random_state=42) 121 | params = {"boosting_type": "gbdt", "n_jobs": process_num, "bagging_seed": 323, "min_gain_to_split": 0.01} 122 | 123 | if task == TaskType.BIN: 124 | params["objective"] = "binary" 125 | params["metric"] = "auc" 126 | elif task == TaskType.REG: 127 | params["objective"] = "regression" 128 | params["metric"] = "mse" 129 | else: 130 | raise RuntimeError("Wrong task value") 131 | 132 | if imp_type == "feature_imp": 133 | lgb_train = lgb.Dataset( 134 | data=train.drop(target_name, axis=1), label=train[target_name], categorical_feature=categorical_feature 135 | ) 136 | lgb_test = lgb.Dataset( 137 | data=test.drop(target_name, axis=1), label=test[target_name], categorical_feature=categorical_feature 138 | ) 139 | 140 | lgb_kwargs = {"params": params, "train_set": lgb_train, "valid_sets": [lgb_test], "valid_names": ["val_test"]} 141 | if lgb.__version__ >= "3.3.0": 142 | lgb_kwargs["callbacks"] = [lgb.log_evaluation(period=verbose_eval), lgb.early_stopping(10, False, True)] 143 | else: 144 | lgb_kwargs["early_stopping_rounds"] = 10 145 | lgb_kwargs["verbose_eval"] = verbose_eval 146 | 147 | model = lgb.train(**lgb_kwargs) 148 | imp_dict = dict(zip(train.drop(target_name, axis=1).columns, model.feature_importance())) 149 | elif imp_type == "perm_imp": 150 | if task == TaskType.BIN: 151 | model = lgb.LGBMClassifier(**params) 152 | else: 153 | model = lgb.LGBMRegressor(**params) 154 | 155 | for cat in categorical_feature: 156 | vc = train[cat].value_counts() 157 | 158 | vc = vc[vc > 1] 159 | vc = vc + np.arange(vc.shape[0]) / vc.shape[0] 160 | train[cat] = train[cat].map(vc).astype(np.float32).fillna(0).values 161 | test[cat] = test[cat].map(vc).astype(np.float32).fillna(0).values 162 | 163 | test_ = test.drop(target_name, axis=1).astype(np.float32).values 164 | 165 | model.fit( 166 | X=train.drop(target_name, axis=1).astype(np.float32).values, 167 | y=train[target_name].values, 168 | eval_set=[(test_, test[target_name].values)], 169 | eval_names=["val_set"], 170 | eval_metric=params["metric"], 171 | early_stopping_rounds=10, 172 | verbose=verbose_eval, 173 | ) 174 | _, score_decreases = get_score_importances( 175 | score_func=get_score_function(model, task), X=test_, y=test[target_name] 176 | ) 177 | col = list(train.columns) 178 | col.remove(target_name) 179 | imp_dict = dict(zip(col, np.array(score_decreases).min(axis=0, initial=None))) 180 | else: 181 | raise ValueError("imp_type is feature_imp or perm_imp") 182 | 183 | if isinstance(select_type, int): 184 | features_to_drop, _ = zip(*sorted(imp_dict.items(), key=lambda x: x[1], reverse=True)) 185 | features_to_drop = list(features_to_drop[select_type:]) 186 | elif select_type is None: 187 | features_to_drop = [x for x in imp_dict if imp_dict[x] <= imp_th] 188 | else: 189 | raise ValueError("select_type is None or int > 0") 190 | logger.info(f" features {features_to_drop} have low importance") 191 | data = data.drop(columns=features_to_drop, axis=1) 192 | features_type = drop_keys(features_type, features_to_drop) 193 | return data, features_type 194 | 
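A hedged usage sketch of the vendored permutation-importance routine that `feature_imp_selector` relies on when `imp_type` is "perm_imp"; the model and data here are synthetic stand-ins, not the library's LightGBM setup:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from autowoe.lib.utilities.eli5_permutation import get_score_importances

X = np.random.rand(300, 4)
y = (X[:, 0] + 0.1 * np.random.rand(300) > 0.55).astype(int)

model = LogisticRegression().fit(X, y)


def score_func(X_, y_):
    return roc_auc_score(y_, model.predict_proba(X_)[:, 1])


base_score, score_decreases = get_score_importances(score_func, X, y, n_iter=3)
importances = np.mean(score_decreases, axis=0)   # column 0 should dominate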
-------------------------------------------------------------------------------- /autowoe/lib/pipelines/pipeline_feature_special_values.py: -------------------------------------------------------------------------------- 1 | """Process nan values.""" 2 | 3 | from collections import defaultdict 4 | from copy import deepcopy 5 | from typing import Any, Dict, Hashable, Optional, Set, Tuple, TypeVar 6 | 7 | import pandas as pd 8 | 9 | from autowoe.lib.selectors.utils import F_LIST_TYPE 10 | 11 | TKey = TypeVar("TKey") 12 | TValue = TypeVar("TValue") 13 | 14 | 15 | def _opt2val(name: str, options: Set[str]) -> Dict[str, str]: 16 | fmt = "__{NAME}_{VAL}__" 17 | return {k: fmt.format(NAME=name, VAL=k.rsplit("_")[-1]) for k in options} 18 | 19 | 20 | def _values(d: Dict[TKey, TValue]) -> Set[TValue]: 21 | return {v for _, v in d.items()} 22 | 23 | 24 | DEFAULT_OPTIONS_SPECIAL_VALUES: Set[str] = {"to_woe_0", "to_maxfreq", "to_minp", "to_maxp"} 25 | EXTEND_OPTIONS_SPECIAL_VALUES: Set[str] = {*DEFAULT_OPTIONS_SPECIAL_VALUES, "to_nan"} 26 | 27 | NAN_MERGE_CASES = _opt2val("NaN", DEFAULT_OPTIONS_SPECIAL_VALUES) 28 | SMALL_MERGE_CASES = _opt2val("Small", EXTEND_OPTIONS_SPECIAL_VALUES) 29 | MARK_MERGE_CASES = _opt2val("Mark", EXTEND_OPTIONS_SPECIAL_VALUES) 30 | 31 | 32 | NAN_SET = {*_values(NAN_MERGE_CASES), "__NaN__"} 33 | SMALL_SET = {*_values(SMALL_MERGE_CASES), "__Small__"} 34 | MARK_SET = {*_values(MARK_MERGE_CASES), "__Mark__"} 35 | 36 | CATEGORY_SPECIAL_SET = {*SMALL_SET, *NAN_SET, *MARK_SET} - {"__NaN__", "__Small__", "__Mark__"} 37 | REAL_SPECIAL_SET = {*NAN_SET, *MARK_SET} # - {"__NaN__", "__Small__", "__Mark__"} 38 | 39 | 40 | def is_mark_prefix(s): 41 | """Mark encode.""" 42 | return isinstance(s, str) and s.startswith("__Mark__") 43 | 44 | 45 | class FeatureSpecialValues: 46 | """Class for processing special values in features. 47 | 48 | Вещественные признаки в отдельную группу. Если сэмплов меньше, чем 49 | th_nan, то присавиваем woe 0. И на train и на test 50 | -------------------------------------------------------------------------------- 51 | Категориальные признаки. Если категория небольшая (число сэмплов меньше, чем th_cat), 52 | то кодируем её отельным числом. Если nan то кодируем по аналогии с 53 | вещественным случаем с помощью th_nan. Если на тесте встречаем категорию, 54 | которой не было на train, то отправляем её в nan, маленькие категории, в woe со значением 0. 55 | 56 | Groups of special values: 57 | 1. NaN-values (real, categorical features). 58 | 2. Small groups (categorical features). 59 | 3. Mark values (real, categorical features). 60 | 61 | Real features processing: 62 | 1. If there are fewer samples than `th_nan`, then assign `WoE` to 0. 63 | 64 | Categorical features processing: 65 | 1. Small category (number of samples less than `th_cat`) -> 66 | 2. Processing NaN-values as in real variables. 67 | 3. Category that didn't occur in the train dataset is assigned a NaN. 68 | 4. Category that didn't occur in the train dataset is assigned a NaN. 69 | 70 | Args: 71 | th_nan: Threshold for NaN-values process. 72 | th_cat: Threshold for category values process. 
73 | cat_merge_to: 74 | nan_merge_to: 75 | 76 | """ 77 | 78 | def __init__( 79 | self, 80 | th_nan: float = 32, 81 | th_cat: float = 32, 82 | th_mark: float = 32, 83 | cat_merge_to: str = "to_woe_0", 84 | nan_merge_to: str = "to_woe_0", 85 | mark_merge_to: str = "to_woe_0", 86 | mark_values: Optional[Dict[str, Any]] = None, 87 | ): 88 | self._th_nan = th_nan 89 | self._th_cat = th_cat 90 | self._th_mark = th_mark 91 | self._cat_merge_to = cat_merge_to 92 | self._nan_merge_to = nan_merge_to 93 | self._mark_merge_to = mark_merge_to 94 | self._mark_values = mark_values 95 | 96 | self._features_type = None 97 | self.cat_encoding = None # Словарь с кодированием по группам категориальных признаков 98 | self.all_encoding = None 99 | self.mark_encoding = None 100 | self._spec_values = None 101 | 102 | def fit_transform( 103 | self, train: pd.DataFrame, features_type: Dict[Hashable, str] 104 | ) -> Tuple[pd.DataFrame, Dict[Hashable, Dict[str, float]]]: 105 | """Fit/transform. 106 | 107 | Args: 108 | train: Dataset. 109 | features_type: Type of features. {"cat" - category, "real" - real} 110 | 111 | Returns: 112 | Processed dataset, special values. 113 | 114 | """ 115 | train_ = deepcopy(train) 116 | all_encoding = {} 117 | cat_encoding = {} 118 | mark_encoding = defaultdict(dict) 119 | spec_values = {} 120 | self._features_type = features_type 121 | for col in self._features_type: 122 | d = {} 123 | 124 | if self._mark_values is not None and col in self._mark_values: 125 | mark_values_mask = train_[col].isin(self._mark_values[col]) 126 | 127 | fill_val = None 128 | if mark_values_mask.sum() < self._th_mark: 129 | enc_type = MARK_MERGE_CASES[self._mark_merge_to] 130 | if enc_type == "__Mark_0__": 131 | fill_val = 0 132 | # d[enc_type] = fill_val 133 | else: 134 | enc_type = "__Mark__" 135 | 136 | # if self._features_type[col] != "cat": 137 | # d[enc_type] = None 138 | 139 | for mv in self._mark_values[col]: 140 | enc_type_t = enc_type + f"{mv}__" if enc_type == "__Mark__" else enc_type 141 | train_.loc[train_[col] == mv, col] = enc_type_t 142 | mark_encoding[col][mv] = enc_type_t 143 | # if self._features_type[col] != "cat": 144 | d[enc_type_t] = fill_val 145 | else: 146 | mark_values_mask = pd.Series(data=[False] * train_.shape[0], index=train_.index) 147 | 148 | if self._features_type[col] == "cat": 149 | vc = train_.loc[~mark_values_mask, col].value_counts() 150 | big_cat = set(vc.index) 151 | vc = vc.loc[vc < self._th_cat] 152 | vc_sum, small_cat = vc.sum(), set(vc.index) 153 | if vc_sum < self._th_nan: # TODO: _th_nan -> _th_cat ? 154 | # Случай когда суммарно всех небольших категорий все равно мало 155 | enc_type = SMALL_MERGE_CASES[self._cat_merge_to] 156 | fill_val = 0 if enc_type == "__Small_0__" else None 157 | d[enc_type] = fill_val 158 | else: 159 | enc_type = "__Small__" 160 | # d[enc_type] = None 161 | 162 | if train_.loc[:, col].dtypes is not object: # trouble when we have numerical col 163 | train_[col] = train_[col].astype(object) 164 | 165 | train_.loc[train_[col].isin(small_cat), col] = enc_type 166 | cat_encoding[col] = big_cat.difference(small_cat), small_cat, enc_type 167 | # Небольшие категории, которые будем кодировать отдельно 168 | 169 | nan_count = train_[col].isna().sum() 170 | 171 | if nan_count < self._th_nan: 172 | enc_type = NAN_MERGE_CASES[self._nan_merge_to] 173 | fill_val = 0 if enc_type == "__NaN_0__" else None 174 | d[enc_type] = fill_val 175 | else: 176 | enc_type = "__NaN__" # Большое число пропусков. 
Кодируем как обычную категорию 177 | # исключаем NaN из специальных значений для категорий 178 | if self._features_type[col] != "cat": 179 | d[enc_type] = None 180 | 181 | spec_values[col] = d 182 | 183 | train_[col] = train_[col].fillna(enc_type) 184 | all_encoding[col] = enc_type 185 | 186 | self.cat_encoding = cat_encoding 187 | self.all_encoding = all_encoding 188 | self.mark_encoding = mark_encoding 189 | self._spec_values = spec_values 190 | 191 | return train_, spec_values 192 | 193 | def transform(self, test: pd.DataFrame, features: F_LIST_TYPE): 194 | """Transform dataset. 195 | 196 | Args: 197 | test: Test dataset. 198 | features: List of features for processing. 199 | 200 | Returns: 201 | Processed dataset. 202 | 203 | """ 204 | test_ = test[features].copy() 205 | 206 | for col in features: 207 | if self._mark_values is not None and col in self._mark_values: 208 | mark_values_mask = test_[col].isin(self._mark_values[col]) 209 | if mark_values_mask.sum() > 0: 210 | test_.loc[mark_values_mask, col] = test_.loc[mark_values_mask, col].map(self.mark_encoding[col]) 211 | else: 212 | mark_values_mask = pd.Series(data=[False] * test.shape[0], index=test.index) 213 | 214 | if self._features_type[col] == "cat": 215 | big_cat, _, small_pad = self.cat_encoding[col] 216 | test_.loc[~(test_[col].isin(big_cat) | test_[col].isna() | mark_values_mask), col] = small_pad 217 | 218 | test_[col] = test_[col].fillna(self.all_encoding[col]) 219 | 220 | return test_, deepcopy(self._spec_values) 221 | -------------------------------------------------------------------------------- /autowoe/lib/woe/woe.py: -------------------------------------------------------------------------------- 1 | """Weight of evidence.""" 2 | 3 | from copy import deepcopy 4 | from typing import Dict, List, Tuple 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from pandas.core.frame import DataFrame 9 | 10 | from autowoe.lib.pipelines.pipeline_feature_special_values import is_mark_prefix 11 | from autowoe.lib.utilities.utils import TaskType 12 | 13 | 14 | class WoE: 15 | """Class for WoE transformation. 16 | 17 | Args: 18 | f_type: Feature type. {"cat" - categorical, "real" - real}. 19 | split: Splits. Formats: 20 | - real type: [-27, 1, 4, 5, 12, 100] 21 | - cat type: {12: 1, 17: 1, 20: 2, 35: 3} 22 | woe_diff_th: WoE difference threshold. 23 | target_type: Type of target value. 
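The two `split` shapes documented above in action: real splits are ordered bin edges applied with `np.searchsorted` (as in `__codding` further down), categorical splits map raw values to group ids. A small sketch with the example splits from the docstring:

import numpy as np

split_real = [-27, 1, 4, 5, 12, 100]          # bin edges for a real feature
split_cat = {12: 1, 17: 1, 20: 2, 35: 3}      # raw category -> bin id for a cat feature

values = np.array([-100, 0, 4, 50, 1000])
bins = np.searchsorted(split_real, values, side="left")
# bins -> [0, 1, 2, 5, 6]: the index of the first edge that is >= the value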
24 | 25 | """ 26 | 27 | def __init__(self, f_type: str, split: List[float], woe_diff_th: float = 0.0, target_type: TaskType = TaskType.BIN): 28 | self.f_type = f_type 29 | self.split = split 30 | # новая фича - нуллы могут отнестись к ближайшей группе, если достаточно данных 31 | self.woe_diff = woe_diff_th 32 | self.target_type = target_type 33 | 34 | self.iv = None 35 | self.cod_dict = None 36 | 37 | def __codding(self, x: pd.Series): 38 | """Encode values.""" 39 | if self.f_type == "cat": 40 | x_cod = x.map(self.split) 41 | elif self.f_type == "real": 42 | x_cod = np.searchsorted(self.split, x.values, side="left") # check 43 | x_cod = pd.Series(data=x_cod, index=x.index) 44 | else: 45 | raise ValueError("_f_type is cat or real") 46 | return x_cod 47 | 48 | @staticmethod 49 | def _bucket_woe(x, total_good: int, total_bad: int): 50 | t_bad = x["bad"] 51 | t_good = x["count_nonzero"] 52 | t_bad = 0.5 if t_bad == 0 else t_bad 53 | t_good = 0.5 if t_good == 0 else t_good 54 | return np.log((t_bad / total_bad) / (t_good / total_good)) 55 | 56 | def __woe(self, df: pd.DataFrame) -> Tuple[Dict, DataFrame, Tuple[float, ...]]: 57 | """Calculate WoE coefficient for each category values.""" 58 | df.columns = [0, "target"] 59 | stat = df.groupby(0)["target"].agg(["mean", np.count_nonzero, np.size]) 60 | 61 | if self.target_type == TaskType.BIN: 62 | stat["bad"] = stat["size"] - stat["count_nonzero"] 63 | t_good = np.maximum(stat["count_nonzero"].sum(), 0.5) # Если меток вообще нет 64 | t_bad = np.maximum(stat["bad"].sum(), 0.5) # Если меток вообще нет 65 | 66 | stat["woe"] = stat.apply( 67 | lambda x: self._bucket_woe(x, t_good, t_bad), axis=1 68 | ) # ||P.Correction|-> + np.log(t_good / t_bad)|| 69 | iv_stat = (stat["bad"] / t_bad - stat["count_nonzero"] / t_good) * stat["woe"] # Кульбака-Лейблера 70 | self.iv = iv_stat.sum() 71 | 72 | return stat["woe"].to_dict(), stat, (t_good, t_bad) 73 | elif self.target_type == TaskType.REG: 74 | stat["woe"] = stat["mean"] 75 | iv_stat = stat["woe"].abs() * stat["size"] / stat["size"].sum() 76 | self.iv = iv_stat.sum() 77 | 78 | return stat["woe"].to_dict(), stat, None 79 | 80 | def __df_cod_transform(self, x: pd.Series, spec_values): 81 | x_ = deepcopy(x) 82 | if isinstance(spec_values, list): 83 | spec_values_ = spec_values.copy() 84 | elif isinstance(spec_values, dict): 85 | spec_values_ = spec_values.keys() 86 | else: 87 | spec_values_ = [] 88 | 89 | x_.loc[x_.isin(spec_values_)] = -np.inf 90 | df_cod = self.__codding(x_) 91 | 92 | if len(x.loc[x.isin(spec_values_)]) == 0 or len(spec_values_) == 0: 93 | return df_cod 94 | 95 | if df_cod.dtypes is not object: 96 | df_cod = df_cod.astype(object) 97 | 98 | df_cod.loc[x.isin(spec_values_)] = x.loc[x.isin(spec_values_)] 99 | 100 | return df_cod 101 | 102 | def fit(self, x, y, spec_values): 103 | """Fit WoE transformation. 104 | 105 | Args: 106 | x: Feature. 107 | y: Target. 108 | spec_values: Special values. Если значение не None, то кодируем WoE по дефолту, если же нет, то кодируем 0 109 | 110 | Returns: 111 | df. 
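A worked numeric sketch of `_bucket_woe` above for the binary case, WoE = log((bad_i / total_bad) / (good_i / total_good)), together with the bin's IV contribution; "good" counts target == 1, matching `count_nonzero` in the code, and the counts are made up:

import numpy as np

good, bad = 30, 70                 # target == 1 and target == 0 counts in one bin
total_good, total_bad = 200, 800   # the same counts over the whole sample

woe = np.log((bad / total_bad) / (good / total_good))
iv_contribution = (bad / total_bad - good / total_good) * woe
print(round(woe, 3), round(iv_contribution, 3))   # -0.539 0.034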
112 | 113 | """ 114 | df_cod = self.__df_cod_transform(x, spec_values) 115 | df_cod = pd.concat([df_cod, y], axis=1) 116 | stat, total, t_stat = self.__woe(df_cod) 117 | 118 | if self.target_type == TaskType.BIN: 119 | t_good, t_bad = t_stat 120 | 121 | good_stats = total.loc[ 122 | [x for x in total.index if type(x) in [int, float] or x in ("__Small__", "__NaN__") or is_mark_prefix(x)] 123 | ] 124 | 125 | # первая обработка - мерджим близкие нуллы/категории 126 | nsm_values = ( 127 | [x for x in spec_values if "NaN" in x] 128 | + [x for x in spec_values if "Small" in x] 129 | + [x for x in spec_values if "Mark" in x] 130 | ) 131 | 132 | for key in nsm_values: 133 | if (key in ("__Small__", "__NaN__") or is_mark_prefix(key)) and key in good_stats.index: 134 | check_row = good_stats.loc[key] 135 | diff = (good_stats["woe"] - check_row["woe"]).abs() 136 | min_diff = diff[diff > 0].min() 137 | 138 | if min_diff < self.woe_diff: 139 | idx = diff <= min_diff 140 | # если ближайший слишком близко - мерджим 141 | 142 | if self.target_type == TaskType.BIN: 143 | good_stats.loc[idx, "woe"] = self._bucket_woe( 144 | good_stats.loc[idx, ["bad", "count_nonzero"]].sum(axis=0), t_good, t_bad 145 | ) 146 | good_stats.loc[idx, "size"] = good_stats.loc[idx, "size"].sum() 147 | good_stats.loc[idx, "mean"] = good_stats.loc[idx, "count_nonzero"].sum() / good_stats["size"] 148 | elif self.target_type == TaskType.REG: 149 | gs = good_stats.loc[idx, ["woe", "size"]].copy() 150 | t_gs_size = gs["size"].sum() 151 | good_stats.loc[idx, "woe"] = (gs["woe"] * gs["size"]).sum() / t_gs_size 152 | good_stats.loc[idx, "size"] = t_gs_size 153 | good_stats.loc[idx, "mean"] = good_stats.loc[idx, "woe"] 154 | 155 | # TODO: re-right 156 | for key in good_stats.index.values: 157 | stat[key] = good_stats.loc[key, "woe"] 158 | 159 | # далее обработка нуллов и маленьких категорий 160 | for key in nsm_values: 161 | woe_val = None 162 | 163 | if key in ("__Mark_0__", "__Small_0__", "__NaN_0__"): 164 | woe_val = 0 165 | 166 | elif key in ("__Mark_maxfreq__", "__Small_maxfreq__", "__NaN_maxfreq__"): 167 | idx = good_stats["size"].values.argmax() 168 | woe_val = good_stats.iloc[idx]["woe"] 169 | 170 | elif key in ("__Mark_maxp__", "__Small_maxp__", "__NaN_maxp__"): 171 | # Отберем только тех, по кому что-то нормальное можно оценить 172 | idx = good_stats["mean"].values.argmax() 173 | woe_val = good_stats.iloc[idx]["woe"] 174 | 175 | elif key in ("__Mark_minp__", "__Small_minp__", "__NaN_minp__"): 176 | # Отберем только тех, по кому что-то нормальное можно оценить 177 | idx = good_stats["mean"].values.argmin() 178 | woe_val = good_stats.iloc[idx]["woe"] 179 | 180 | elif key in ("__Small__", "__NaN__") or is_mark_prefix(key): 181 | continue 182 | 183 | stat[key] = woe_val 184 | 185 | self.cod_dict = stat 186 | return df_cod 187 | 188 | def fit_transform(self, x: pd.Series, y: pd.Series, spec_values): 189 | """Fit/transfor. 190 | 191 | Args: 192 | x: Feature. 193 | y: Target. 194 | spec_values: Special values. Если значение не None, то кодируем WoE по дефолту, если же нет, то кодируем 0 195 | 196 | Returns: 197 | Transformed feature. 198 | 199 | """ 200 | df_cod = self.fit(x, y, spec_values) 201 | df_cod = df_cod[0].map(self.cod_dict).copy() 202 | return df_cod 203 | 204 | def transform(self, x: pd.Series, spec_values): 205 | """Transform by WoE. 206 | 207 | Args: 208 | x: Feature. 209 | spec_values: Special values. 210 | 211 | Returns: 212 | Transformed feature. 
213 | 214 | """ 215 | df_cod = self.__df_cod_transform(x, spec_values) 216 | df_cod = df_cod.map(self.cod_dict) 217 | return df_cod 218 | 219 | def split_feature(self, x: pd.Series, spec_values): 220 | """Split by Bins. 221 | 222 | Args: 223 | x: Feature. 224 | spec_values: Special values. 225 | 226 | Returns: 227 | Transformed feature. 228 | 229 | """ 230 | df_cod = self.__df_cod_transform(x, spec_values) 231 | return df_cod 232 | 233 | def fit_transform_cv(self, x: pd.Series, y: pd.Series, spec_values, cv_index_split: Dict[int, List[int]]): 234 | """Cross-Val WoE encoding. 235 | 236 | Args: 237 | x: Feature. 238 | y: Target. 239 | spec_values: Special values. Если значаение не None, то кодируем WoE по дефолту, если же нет, то кодируем 0 240 | cv_index_split: Cross-Val splits. 241 | 242 | Returns: 243 | Encoded feature. 244 | 245 | """ 246 | x_ = deepcopy(x) 247 | for value in cv_index_split.values(): 248 | train_index, test_index = value 249 | self.fit(x.iloc[train_index], y.iloc[train_index], spec_values) 250 | x_.iloc[test_index] = self.transform(x.iloc[test_index], spec_values).astype(x.dtype) 251 | return x_.astype(float) 252 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/refit.py: -------------------------------------------------------------------------------- 1 | """Additional functional for refitting model.""" 2 | 3 | from copy import deepcopy 4 | from typing import Optional, Tuple, cast 5 | 6 | import numpy as np 7 | import sklearn 8 | from scipy import linalg, stats 9 | from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression 10 | from sklearn.svm import l1_min_c 11 | 12 | from autowoe.lib.logging import get_logger 13 | 14 | from .utils import TaskType 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | def refit_reg( 20 | task: TaskType, 21 | x_train: np.ndarray, 22 | y: np.ndarray, 23 | l1_grid_size: int, 24 | l1_exp_scale: float, 25 | max_penalty: float, 26 | interp: bool = True, 27 | ) -> Tuple[np.ndarray, float, np.ndarray]: 28 | """Final model refit with regularization. 29 | 30 | Args: 31 | task: Task. 32 | x_train: Train features. 33 | y: Train target. 34 | l1_grid_size: Number of point at regularized grid. 35 | l1_exp_scale: Maximum value of `C` coefficient. 36 | max_penalty: maximum value of `C` coefficient. 37 | interp: Interpreted model. 38 | 39 | Returns: 40 | Weights , intercept of model, features mask. 
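`fit_transform_cv` above produces an out-of-fold encoding: statistics are estimated on the train part of each split and applied only to the held-out rows, which limits target leakage. A schematic pandas-only sketch of the same idea, with a simple mean encoding standing in for the WoE statistic:

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

x = pd.Series(np.random.randint(0, 3, size=100).astype(float))
y = pd.Series((np.random.rand(100) > 0.5).astype(int))

encoded = x.copy()
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=42).split(x):
    fold_stats = y.iloc[train_idx].groupby(x.iloc[train_idx]).mean()        # fit on train folds only
    encoded.iloc[test_idx] = x.iloc[test_idx].map(fold_stats).to_numpy()    # apply to the held-out fold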
41 | 42 | """ 43 | weights, intercepts = [], [] 44 | if task == TaskType.BIN: 45 | clf = LogisticRegression(penalty="l1", solver="saga", warm_start=True, intercept_scaling=100000) 46 | cs = l1_min_c(x_train, y, loss="log", fit_intercept=True) * np.logspace(0, l1_exp_scale, l1_grid_size) 47 | cs = cs[cs <= max_penalty] 48 | # add final penalty 49 | if cs[-1] < max_penalty: 50 | cs = list(cs) 51 | cs.append(max_penalty) 52 | 53 | # fit path 54 | for c in cs: 55 | clf.set_params(C=c) 56 | clf.fit(x_train, y) 57 | weights.append(deepcopy(clf.coef_[0])) 58 | intercepts.append(clf.intercept_[0]) 59 | 60 | if not interp: 61 | w, i = weights[-1], intercepts[-1] 62 | neg = w != 0 63 | return w[neg], i, neg 64 | 65 | for w, i in zip(weights[::-1], intercepts[::-1]): 66 | pos = (w > 0).sum() 67 | if pos > 0: 68 | continue 69 | 70 | neg = w < 0 71 | return w[neg], i, neg 72 | else: 73 | cs_max_penalty = 1 / max_penalty 74 | model = Lasso(warm_start=True, positive=interp) 75 | cs = np.logspace(0, l1_exp_scale, l1_grid_size + 1) 76 | cs = cs[cs <= cs_max_penalty] 77 | # add final penalty 78 | if cs[-1] < cs_max_penalty: 79 | cs = list(cs) 80 | cs.append(cs_max_penalty) 81 | cs = np.array(cs) 82 | 83 | alphas = (1.0 / cs[1:])[::-1] 84 | 85 | for alpha in alphas: 86 | model.set_params(alpha=alpha) 87 | model.fit(x_train, y) 88 | weights.append(model.coef_) 89 | intercepts.append(model.intercept_) 90 | 91 | w, i = weights[0], intercepts[0] 92 | pos = w >= 0 93 | 94 | return w[pos], i, pos 95 | 96 | raise ValueError("No negative weights grid") 97 | 98 | 99 | def refit_simple( 100 | task: TaskType, 101 | x_train: np.ndarray, 102 | y: np.ndarray, 103 | interp: bool = True, 104 | p_val: float = 0.05, 105 | x_val: Optional[np.ndarray] = None, 106 | y_val: Optional[np.ndarray] = None, 107 | n_jobs: int = -1, 108 | ) -> Tuple[np.ndarray, float, np.ndarray, np.ndarray, np.ndarray]: 109 | """Final model refit with stat model mode. 110 | 111 | Args: 112 | task: Task. 113 | x_train: Train features. 114 | y: Train target. 115 | interp: Interpreted model. 116 | p_val: P-value. 117 | x_val: Validation features. 118 | y_val: Validation target. 119 | n_jobs: Number of threads. 120 | 121 | Returns: 122 | weights, intercept, features mask, p values, b vars. 
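Example:
    A minimal sketch (`x_woe`, `y` and `feature_names` are assumed to be the
    WoE-encoded train matrix, the binary target and the corresponding column
    names; they are not defined in this module):

        w, b, mask, p_vals, b_var = refit_simple(TaskType.BIN, x_woe, y, interp=True, p_val=0.05)
        kept = [f for f, ok in zip(feature_names, mask) if ok]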
123 |
124 | """
125 | sl_ok = np.ones(x_train.shape[1], dtype=bool)
126 |
127 | n = -1
128 |
129 | logreg_penalty = None if sklearn.__version__ >= "1.2.0" else "none"
130 |
131 | while True:
132 | n += 1
133 | assert sl_ok.sum() > 0, "No features left to fit on iter"
134 |
135 | logger.info(f"Iter {n} of final refit starts with {sl_ok.sum()} features")
136 |
137 | x_train_ = x_train[:, sl_ok]
138 | # indices in the original feature array
139 | ok_idx = np.arange(x_train.shape[1])[sl_ok]
140 |
141 | if task == TaskType.BIN:
142 | model = LogisticRegression(penalty=logreg_penalty, solver="lbfgs", warm_start=False, intercept_scaling=1)
143 | model.fit(x_train_, y)
144 | model_coef = model.coef_[0]
145 | model_intercept = model.intercept_[0]
146 | else:
147 | model = LinearRegression(n_jobs=n_jobs)
148 | model.fit(x_train_, y)
149 | model_coef = model.coef_
150 | model_intercept = model.intercept_
151 |
152 | # check negative coefs here if interp
153 | sl_pos_coef = np.zeros((x_train_.shape[1],), dtype=bool)
154 | if interp:
155 | sl_pos_coef = model.coef_[0] >= 0 if task == TaskType.BIN else model.coef_[0] <= 0
156 |
157 | # if at least one coefficient is non-negative - drop the largest one and refit
158 | if sl_pos_coef.sum() > 0:
159 | max_coef_idx = model_coef.argmax()
160 | sl_ok[ok_idx[max_coef_idx]] = False
161 | continue
162 |
163 | # if all coefficients pass the sign check - look at the p-values
164 | if task == TaskType.BIN:
165 | p_vals, b_var = calc_p_val(x_train_, model_coef, model_intercept)
166 | else:
167 | p_vals, b_var = calc_p_val_reg(x_train_, y, model_coef, model_intercept)
168 |
169 | # without the intercept
170 | p_vals_f = p_vals[:-1]
171 |
172 | model_p_vals = p_vals.copy()
173 | model_b_var = b_var.copy() if b_var is not None else None
174 |
175 | # if at least one p-value exceeds p_val - drop the feature with the largest one and refit
176 | if p_vals_f.max() > p_val:
177 | max_p_val_idx = p_vals_f.argmax()
178 | sl_ok[ok_idx[max_p_val_idx]] = False
179 | continue
180 |
181 | if x_val is not None:
182 | # the same check on the validation sample
183 | logger.info("Validation data checks")
184 | x_val_ = x_val[:, sl_ok]
185 |
186 | p_vals, b_var = calc_p_val_on_valid(x_val_, y_val, task, n_jobs)
187 | p_vals_f = p_vals[:-1]
188 |
189 | # if at least one p-value exceeds p_val - drop the feature with the largest one and refit
190 | if p_vals_f.max() > p_val:
191 | max_p_val_idx = p_vals_f.argmax()
192 | sl_ok[ok_idx[max_p_val_idx]] = False
193 | continue
194 |
195 | weights = cast(np.ndarray, model_coef)
196 | intercept = cast(float, model_intercept)
197 |
198 | return weights, intercept, sl_ok, cast(np.ndarray, model_p_vals), cast(np.ndarray, model_b_var)
199 |
200 |
201 | def calc_p_val(x_train: np.ndarray, weights: np.ndarray, intercept: float) -> Tuple[np.ndarray, np.ndarray]:
202 | """Calculate p-values for coefficient estimates.
203 |
204 | Args:
205 | x_train: Train features.
206 | weights: Model weights.
207 | intercept: Model intercept coefficient.
208 |
209 | Returns:
210 | p values, b vars.
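Note:
    This is the standard Wald test for logistic regression: with an intercept
    column appended to `x_train` and predicted probabilities
    p = 1 / (1 + exp(-x @ coef)), the observed information matrix is
    H = x.T @ diag(p * (1 - p)) @ x, `b_var` is the diagonal of H^-1, the Wald
    statistic is w_j = coef_j ** 2 / b_var_j, and the p-value is
    1 - chi2(1).cdf(w_j).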
211 | 212 | """ 213 | coef_ = np.concatenate([weights, [intercept]]) 214 | x_train = np.concatenate([x_train, np.ones((x_train.shape[0], 1))], axis=1) 215 | prob_ = 1 / (1 + np.exp(-np.dot(x_train, coef_))) 216 | prob_ = prob_ * (1 - prob_) 217 | hess = np.dot((prob_[:, np.newaxis] * x_train).T, x_train) 218 | 219 | inv_hess = np.linalg.inv(hess) 220 | b_var = inv_hess.diagonal() 221 | w_stat = (coef_**2) / b_var 222 | 223 | p_vals = 1 - stats.chi2(1).cdf(w_stat) 224 | 225 | return p_vals, b_var 226 | 227 | 228 | def calc_p_val_on_valid( 229 | x_train: np.ndarray, y: np.ndarray, task: TaskType, n_jobs: int = -1 230 | ) -> Tuple[np.ndarray, np.ndarray]: 231 | """Fit algo and calc p-values. 232 | 233 | Args: 234 | x_train: Train features. 235 | y: Train target. 236 | task: Task. 237 | n_jobs: Number of threads. 238 | 239 | Returns: 240 | p values, b vars. 241 | 242 | """ 243 | logreg_penalty = None if sklearn.__version__ >= "1.2.0" else "none" 244 | 245 | if task == TaskType.BIN: 246 | model = LogisticRegression(penalty=logreg_penalty, solver="lbfgs", warm_start=False, intercept_scaling=1) 247 | model.fit(x_train, y) 248 | 249 | return calc_p_val(x_train, model.coef_[0], model.intercept_[0]) 250 | else: 251 | model = LinearRegression(n_jobs=n_jobs) 252 | model.fit(x_train, y) 253 | 254 | return calc_p_val_reg(x_train, y, model.coef_, model.intercept_) 255 | 256 | 257 | def calc_p_val_reg( 258 | x_train: np.ndarray, y_train: np.ndarray, weights: np.ndarray, intercept: float 259 | ) -> Tuple[np.ndarray, np.ndarray]: 260 | """Calculate p values for regression task.""" 261 | n, k = x_train.shape 262 | y_pred = (np.dot(x_train, weights) + intercept).T 263 | 264 | # Change X and Y into numpy matrices. x also has a column of ones added to it. 265 | x = np.hstack((np.matrix(x_train), np.ones((n, 1)))) 266 | y_train = np.matrix(y_train).T 267 | 268 | # Degrees of freedom. 269 | freedom_degrees = float(n - k - 1) 270 | 271 | # Sample variance. 272 | sse = np.sum(np.square(y_pred - y_train), axis=0) 273 | sampleVariance = sse / freedom_degrees 274 | 275 | # Sample variance for x. 276 | sampleVarianceX = x.T * x 277 | 278 | # Covariance Matrix = [(s^2)(X'X)^-1]^0.5. (sqrtm = matrix square root. ugly) 279 | covarianceMatrix = linalg.sqrtm(sampleVariance[0, 0] * sampleVarianceX.I) 280 | 281 | # Standard errors for the difference coefficients: the diagonal elements of the covariance matrix. 282 | se = covarianceMatrix.diagonal() # [1:] 283 | 284 | # T statistic for each beta. 285 | betasTStat = np.zeros(len(se)) 286 | for i in range(len(se) - 1): 287 | betasTStat[i] = weights[i] / se[i] 288 | betasTStat[-1] = intercept / se[-1] 289 | 290 | # P-value for each beta. This is a two sided t-test, since the betas can be 291 | # positive or negative. 
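# stats.t.cdf is the lower-tail CDF, so 1 - cdf(|t|) below is the upper-tail
# probability of the t statistic with n - k - 1 degrees of freedom (a strictly
# two-sided p-value would double this tail probability).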
292 | betasPValue = 1 - stats.t.cdf(abs(betasTStat), freedom_degrees) 293 | 294 | return betasPValue, None 295 | -------------------------------------------------------------------------------- /examples/Tutorial_2__Dates_and_stat_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from sklearn.metrics import roc_auc_score\n", 11 | "\n", 12 | "from autowoe import AutoWoE, ReportDeco" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "### Чтение и подготовка обучающей выборки" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "train = pd.read_csv(\n", 29 | " \"./data/train_demo.csv\", low_memory=False, index_col=\"line_id\", parse_dates=[\"datetime_\" + str(i) for i in range(2)]\n", 30 | ")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "### Чтение и подготовка тестовой выборки" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "test = pd.read_csv(\"./data/test_demo.csv\", index_col=\"line_id\", parse_dates=[\"datetime_\" + str(i) for i in range(2)])\n", 47 | "\n", 48 | "test_target = pd.read_csv(\"./data/test-target_demo.csv\")[\"target\"]\n", 49 | "test[\"target\"] = test_target.values" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "### Параметры модели" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Для обучения модели рекомендуется указать тип признаков для обучения.\n", 64 | "Поэтому создается словарь features_type с ключами: \n", 65 | "\n", 66 | " \"real\" -- вещественный признак\n", 67 | " \n", 68 | " \"cat\" -- категориальный.\n", 69 | " \n", 70 | " __\"date\"-- (\"%Y%d%m\", (\"m\", \"d\", \"wd\", \"h\", \"min\"))__\n", 71 | " \n", 72 | " Для признаков, которые не размечены, типы будут определены автоматом. 
Такой вариант будет работать, но качество порядочно просядет\n", 73 | " \n", 74 | "__Попробуем указать даты с форматом None (автоопределение) и сезонностью - день месяца и день недели__" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "#### features_type" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "num_col = list(filter(lambda x: \"numb\" in x, train.columns))\n", 91 | "num_feature_type = dict.fromkeys(num_col, \"real\")\n", 92 | "\n", 93 | "date_col = list(filter(lambda x: \"datetime\" in x, train.columns))\n", 94 | "date_feature_type = dict.fromkeys(date_col, (None, (\"d\", \"wd\")))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "features_type = dict(**num_feature_type, **date_feature_type)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# подробно параметры описаны в Example_1\n", 113 | "auto_woe = AutoWoE(\n", 114 | " monotonic=True, max_bin_count=4, oof_woe=False, regularized_refit=False, p_val=0.05, debug=False, verbose=0\n", 115 | ")\n", 116 | "auto_woe = ReportDeco(auto_woe)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 7, 122 | "metadata": { 123 | "scrolled": true 124 | }, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "[LightGBM] [Info] Number of positive: 63, number of negative: 5537\n", 131 | "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010733 seconds.\n", 132 | "You can set `force_col_wise=true` to remove the overhead.\n", 133 | "[LightGBM] [Info] Total Bins 11532\n", 134 | "[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 652\n", 135 | "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011250 -> initscore=-4.476073\n", 136 | "[LightGBM] [Info] Start training from score -4.476073\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "auto_woe.fit(train[num_col + date_col + [\"target\"]], target_name=\"target\", features_type=features_type)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "metadata": { 148 | "scrolled": true 149 | }, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "0.7911446119486321" 155 | ] 156 | }, 157 | "execution_count": 8, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "pred = auto_woe.predict_proba(test)\n", 164 | "roc_auc_score(test[\"target\"], pred)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "##### Замечание\n", 172 | "ReportDeco - обертка для построения отчета. Она не обязательна для обучения и применения модели, но обязательна для построения отчета (см последнюю ячейку)." 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Значения коэфициентов и p-values\n", 180 | "\n", 181 | "При указании regularized_refit=False будет произведена оценка p-value на коэфициенты модели. 
Коэфициенты с p-value выше указанного порога не будут включены в модель" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 9, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "number_254 -0.487530\n", 193 | "number_10 -0.475665\n", 194 | "number_345 -0.707849\n", 195 | "number_759 -0.763258\n", 196 | "number_761 -0.894294\n", 197 | "number_706 -0.648337\n", 198 | "number_1 -1.044868\n", 199 | "number_368 -1.062441\n", 200 | "datetime_1__F__d -1.232442\n", 201 | "dtype: float64" 202 | ] 203 | }, 204 | "execution_count": 9, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "auto_woe.features_fit" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 10, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/plain": [ 221 | "-4.545016720125766" 222 | ] 223 | }, 224 | "execution_count": 10, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "auto_woe.intercept" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 11, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "number_254 0.013034\n", 242 | "number_10 0.030010\n", 243 | "number_345 0.004663\n", 244 | "number_759 0.001166\n", 245 | "number_761 0.000357\n", 246 | "number_706 0.006792\n", 247 | "number_1 0.001364\n", 248 | "number_368 0.000006\n", 249 | "datetime_1__F__d 0.003993\n", 250 | "Intercept_ 0.000000\n", 251 | "dtype: float64" 252 | ] 253 | }, 254 | "execution_count": 11, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "auto_woe.p_vals" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "### Формирование отчета" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 12, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stderr", 277 | "output_type": "stream", 278 | "text": [ 279 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 280 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 281 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 282 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 283 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 284 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 285 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 286 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 287 | "No artists with labels found to put in legend. 
Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 288 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "report_params = {\n", 294 | " \"automl_date_column\": \"report_month\", # колонка с датой в формате params['datetimeFormat']\n", 295 | " \"output_path\": \"./AUTOWOE_REPORT_2\", # папка, куда сгенерится отчет и сложатся нужные файлы\n", 296 | " \"report_name\": \"___НАЗВАНИЕ ОТЧЕТА___\",\n", 297 | " \"report_version_id\": 1,\n", 298 | " \"city\": \"Воронеж\",\n", 299 | " \"model_aim\": \"___ЦЕЛЬ ПОСТРОЕНИЯ МОДЕЛИ___\",\n", 300 | " \"model_name\": \"___НАЗВАНИЕ МОДЕЛИ___\",\n", 301 | " \"zakazchik\": \"___ЗАКАЗЧИК___\",\n", 302 | " \"high_level_department\": \"___ПОДРАЗДЕЛЕНИЕ___\",\n", 303 | " \"ds_name\": \"___РАЗРАБОТЧИК МОДЕЛИ___\",\n", 304 | " \"target_descr\": \"___ОПИСАНИЕ ЦЕЛЕВОГО СОБЫТИЯ___\",\n", 305 | " \"non_target_descr\": \"___ОПИСАНИЕ НЕЦЕЛЕВОГО СОБЫТИЯ___\",\n", 306 | "}\n", 307 | "\n", 308 | "auto_woe.generate_report(report_params)" 309 | ] 310 | } 311 | ], 312 | "metadata": { 313 | "kernelspec": { 314 | "display_name": "Anaconda_py38", 315 | "language": "python", 316 | "name": "anaconda_py38" 317 | }, 318 | "language_info": { 319 | "codemirror_mode": { 320 | "name": "ipython", 321 | "version": 3 322 | }, 323 | "file_extension": ".py", 324 | "mimetype": "text/x-python", 325 | "name": "python", 326 | "nbconvert_exporter": "python", 327 | "pygments_lexer": "ipython3", 328 | "version": "3.8.5" 329 | }, 330 | "stem_cell": { 331 | "cell_type": "raw", 332 | "metadata": { 333 | "pycharm": { 334 | "metadata": false 335 | } 336 | }, 337 | "source": "" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 1 342 | } 343 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 (с) Ryzhkov Alexander, Vakhrushev Anton, Savchenko Maksim, 190 | Simakov Dmitrii, Damdinov Rinchin, Kirilin Alexander, 191 | Bunakov Vasilii 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 
204 | -------------------------------------------------------------------------------- /autowoe/lib/report/utilities_images/utilities_images.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from typing import List 4 | from typing import Union 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import pandas as pd 9 | import seaborn as sns 10 | 11 | from sklearn.metrics import roc_auc_score 12 | from sklearn.metrics import roc_curve 13 | 14 | 15 | def plot_bars(df, path, title=None): 16 | sns.set(style="whitegrid", font_scale=1.5) 17 | pl = df.plot(figsize=(10, 10), kind="bar", cmap="Accent", width=1) 18 | if title: 19 | pl.title.set_text(title) 20 | pl.get_figure().savefig(path, bbox_inches="tight") 21 | plt.close() 22 | 23 | 24 | def plot_roc_curve_image(y_true, y_pred, path): 25 | sns.set(style="whitegrid", font_scale=1.5) 26 | plt.figure(figsize=(10, 10)) 27 | 28 | fpr_reg, tpr_reg, _ = roc_curve(y_true, y_pred) 29 | auc_score_reg = roc_auc_score(y_true, y_score=y_pred) 30 | 31 | lw = 2 32 | plt.plot( 33 | fpr_reg, 34 | tpr_reg, 35 | color="darkorange", 36 | lw=lw, 37 | label=f"WhiteBox модель (GINI = {(2 * auc_score_reg - 1):.3f})", 38 | ) 39 | plt.plot([0, 1], [0, 1], color="red", lw=lw, linestyle="--", label="Random Model") 40 | plt.xlim([-0.05, 1.05]) 41 | plt.ylim([-0.05, 1.05]) 42 | plt.xlabel("False Positive Rate") 43 | plt.ylabel("True Positive Rate") 44 | lgd = plt.legend(bbox_to_anchor=(0.5, -0.15), loc="upper center", ncol=2) 45 | plt.xticks(np.arange(0, 1.01, 0.05), rotation=45) 46 | plt.yticks(np.arange(0, 1.01, 0.05)) 47 | plt.grid(color="gray", linestyle="-", linewidth=1) 48 | plt.title(f"ROC кривая (GINI = {(2 * auc_score_reg - 1):.3f})") 49 | plt.savefig(path, bbox_extra_artists=(lgd,), bbox_inches="tight") 50 | plt.close() 51 | 52 | 53 | def plot_double_roc_curve(train_y_true, train_y_pred, test_y_true, test_y_pred, path): 54 | sns.set(style="whitegrid", font_scale=1.5) 55 | plt.figure(figsize=(10, 10)) 56 | 57 | train_fpr_reg, train_tpr_reg, _ = roc_curve(train_y_true, train_y_pred) 58 | train_auc_score_reg = roc_auc_score(train_y_true, y_score=train_y_pred) 59 | test_fpr_reg, test_tpr_reg, _ = roc_curve(test_y_true, test_y_pred) 60 | test_auc_score_reg = roc_auc_score(test_y_true, y_score=test_y_pred) 61 | 62 | lw = 2 63 | plt.plot( 64 | train_fpr_reg, 65 | train_tpr_reg, 66 | color="darkorange", 67 | lw=lw, 68 | label=f"По данным train (GINI = {(2 * train_auc_score_reg - 1):.3f})", 69 | ) 70 | plt.plot( 71 | test_fpr_reg, 72 | test_tpr_reg, 73 | color="blue", 74 | lw=lw, 75 | label=f"По данным test (GINI = {(2 * test_auc_score_reg - 1):.3f})", 76 | ) 77 | plt.plot([0, 1], [0, 1], color="red", lw=lw, linestyle="--", label="Random Model") 78 | plt.xlim([-0.05, 1.05]) 79 | plt.ylim([-0.05, 1.05]) 80 | plt.xlabel("False Positive Rate") 81 | plt.ylabel("True Positive Rate") 82 | plt.legend(loc="lower right") 83 | plt.xticks(np.arange(0, 1.01, 0.05), rotation=45) 84 | plt.yticks(np.arange(0, 1.01, 0.05)) 85 | plt.grid(color="gray", linestyle="-", linewidth=1) 86 | plt.title("ROC кривая") 87 | plt.savefig(path, bbox_inches="tight") 88 | plt.close() 89 | 90 | 91 | def plot_roc_curve_feature_image(feature_name, y_true, y_pred, path): 92 | sns.set(style="whitegrid", font_scale=1.5) 93 | plt.figure(figsize=(10, 10)) 94 | 95 | fpr_reg, tpr_reg, _ = roc_curve(y_true, y_pred) 96 | auc_score_reg = roc_auc_score(y_true, y_score=y_pred) 97 | 98 | lw = 2 99 | plt.plot( 100 | fpr_reg, 101 | tpr_reg, 102 | 
color="darkorange", 103 | lw=lw, 104 | label=feature_name + f" (GINI = {(2 * auc_score_reg - 1):.3f})", 105 | ) 106 | plt.plot([0, 1], [0, 1], color="red", lw=lw, linestyle="--", label="Random Model") 107 | plt.xlim([-0.05, 1.05]) 108 | plt.ylim([-0.05, 1.05]) 109 | plt.xlabel("False Positive Rate") 110 | plt.ylabel("True Positive Rate") 111 | lgd = plt.legend(bbox_to_anchor=(0.5, -0.15), loc="upper center", ncol=2) 112 | plt.xticks(np.arange(0, 1.01, 0.05), rotation=45) 113 | plt.yticks(np.arange(0, 1.01, 0.05)) 114 | plt.grid(color="gray", linestyle="-", linewidth=1) 115 | plt.title(f"ROC curve(GINI = {(2 * auc_score_reg - 1):.3f})" + f" of feature {feature_name}") 116 | plt.savefig(path, bbox_extra_artists=(lgd,), bbox_inches="tight") 117 | plt.close() 118 | 119 | 120 | def plot_model_weights(features, path): 121 | sns.set(style="whitegrid", font_scale=1.5) 122 | fig = plt.figure(figsize=(20, 5)) 123 | ax = fig.add_axes([0, 0, 1, 1]) 124 | ax.bar(features.index, features.values, color="g") 125 | lgd = plt.legend(bbox_to_anchor=(0.5, -0.15), loc="upper center", ncol=2) 126 | plt.title("Model coefs", fontsize=28) 127 | plt.xlabel("Features", fontsize=20) 128 | plt.ylabel("Coef values", fontsize=20) 129 | plt.xticks(fontsize=15, rotation=90) 130 | plt.yticks(fontsize=15) 131 | plt.savefig(path, bbox_extra_artists=(lgd,), bbox_inches="tight") 132 | plt.close() 133 | 134 | 135 | def plot_feature_split(feature_name, features, path): 136 | sns.set(style="whitegrid", font_scale=1.5) 137 | fig = plt.figure(figsize=(15, 5)) 138 | ax = fig.add_axes([0, 0, 1, 1]) 139 | ax.bar(features.index, features.values, color="g") 140 | lgd = plt.legend(bbox_to_anchor=(0.5, -0.15), loc="upper center", ncol=2) 141 | plt.title("Split of feature " + feature_name + " and woe values") 142 | plt.xlabel("Bins", fontsize=20) 143 | plt.ylabel("WoE values", fontsize=20) 144 | plt.xticks(fontsize=15) 145 | plt.yticks(fontsize=15) 146 | plt.savefig(path, bbox_extra_artists=(lgd,), bbox_inches="tight") 147 | plt.close() 148 | 149 | 150 | def plot_ginis(data_enc, target, path): 151 | sns.set(style="whitegrid", font_scale=1.5) 152 | feats = list(data_enc.columns) 153 | aucs = [roc_auc_score(target, -data_enc[col].values) for col in feats] 154 | ginis = [(x - 0.5) * 2 for x in aucs] 155 | ginis = pd.Series(ginis, index=feats).sort_values(ascending=True) 156 | pl = ginis.plot(kind="barh", figsize=(10, 10)) 157 | pl.get_figure().savefig(path, bbox_inches="tight") 158 | plt.close() 159 | 160 | 161 | def plot_woe_bars(train_enc, train_target, test_enc, test_target, target_name, column, path): 162 | sns.set(style="whitegrid", font_scale=1.5) 163 | names = ["train", "test"] 164 | samples = [] 165 | for df, target in zip([train_enc, test_enc], [train_target, test_target]): 166 | df_copy = df.copy().round(3) 167 | df_copy[target_name] = target 168 | samples.append(df_copy) 169 | 170 | samples = [ 171 | x[[target_name, column]].groupby(column)[target_name].agg(["mean", "count"]).reset_index() for x in samples 172 | ] 173 | 174 | for df in samples: 175 | df["count"] /= df["count"].sum() 176 | 177 | df.rename({"count": "Freq", "mean": "DefaultRate", column: "WOE: " + column}, inplace=True, axis=1) 178 | 179 | total = pd.concat(samples, axis=0, ignore_index=True) 180 | order = total["WOE: " + column].drop_duplicates().sort_values().values 181 | order = pd.Series(np.arange(order.shape[0]), index=order) 182 | 183 | total["_sample_"] = np.concatenate([[n] * x.shape[0] for (n, x) in zip(names, samples)]) 184 | 185 | plt.figure(figsize=(10, 
10)) 186 | ax = sns.barplot(x="WOE: " + column, hue="_sample_", y="Freq", data=total, palette=sns.color_palette("Accent", 7)) 187 | ax2 = ax.twinx() 188 | 189 | for df, name in zip(samples, names): 190 | df.set_index(df["WOE: " + column].map(order).values)["DefaultRate"].plot(ax=ax2, label=name, marker="x") 191 | ax2.legend(title="_sample_") 192 | 193 | plt.savefig(path, bbox_inches="tight") 194 | plt.close() 195 | 196 | 197 | def plot_backlash_check(predict_proba, data_enc, target, col, path): 198 | sns.set(style="whitegrid", font_scale=1.5) 199 | df = pd.DataFrame({"pred": predict_proba, col: data_enc[col], "Target": target}) 200 | grp = df.groupby(col)[["pred", "Target"]].mean() 201 | grp.plot(figsize=(10, 10)).get_figure().savefig(path, bbox_inches="tight") 202 | plt.close() 203 | 204 | 205 | def plot_binned(data_binned, path1, path2): 206 | sns.set(style="whitegrid", font_scale=1.5) 207 | pl = (data_binned.groupby("ScoreBin").size().sort_index() / data_binned.shape[0]).plot(figsize=(10, 10), kind="bar") 208 | pl.get_figure().savefig(path1, bbox_inches="tight") 209 | plt.close() 210 | 211 | neg = data_binned[data_binned["Target"] == 0].groupby("ScoreBin").size().sort_index() / ( 212 | data_binned.shape[0] - data_binned["Target"].sum() 213 | ) # .plot(kind='bar', cmap='Accent') 214 | 215 | pos = data_binned[data_binned["Target"] == 1].groupby("ScoreBin").size().sort_index() / ( 216 | data_binned["Target"].sum() 217 | ) # .plot(kind='bar', cmap='Accent', color='blue') 218 | 219 | pl = pd.DataFrame({"positive": pos, "negative": neg}).plot(figsize=(10, 10), kind="bar", cmap="Accent", width=1) 220 | pl.get_figure().savefig(path2, bbox_inches="tight") 221 | plt.close() 222 | 223 | 224 | def plot_binned_stats(data_binned, path): 225 | sns.set(style="whitegrid", font_scale=1.5) 226 | pl = data_binned[["ScoreBin", "P"]].boxplot(by="ScoreBin", rot=90, figsize=(10, 10)) 227 | pl.get_figure().savefig(path, bbox_inches="tight") 228 | plt.close() 229 | 230 | 231 | def plot_corr_heatmap(corr_map, path): 232 | sns.set(style="whitegrid", font_scale=1.5) 233 | plt.figure(figsize=(20, 10)) 234 | pl = sns.heatmap(corr_map, annot=True, annot_kws={"size": 8}, fmt=".1g") 235 | pl.get_figure().savefig(path, bbox_inches="tight") 236 | plt.close() 237 | 238 | 239 | def plot_mean_target(train_binned, test_binned, path): 240 | sns.set(style="whitegrid", font_scale=1.5) 241 | train_stat = train_binned.groupby("ScoreBin").agg(mean_target=("Target", "mean")) 242 | test_stat = test_binned.groupby("ScoreBin").agg(mean_target=("Target", "mean")) 243 | df = pd.DataFrame({"train_mean_target": train_stat["mean_target"], "test_mean_target": test_stat["mean_target"]}) 244 | pl = df.plot(figsize=(10, 10), kind="bar", cmap="Accent", width=1) 245 | pl.get_figure().savefig(path, bbox_inches="tight") 246 | plt.close() 247 | 248 | 249 | def plot_grouped( 250 | df: List[pd.DataFrame], 251 | group_columns: Union[str, List[str]], 252 | group_name: str = None, 253 | plot_kind: str = "point", 254 | path: str = None, 255 | ): 256 | """Построить график аггрегированных значений для тренировочных и валидационных данных. 257 | 258 | Данные датафреймов аггрегируются либо по столбцу group_column, 259 | который должен быть в каждом датафрейме, либо по последовательностям 260 | group_data_train и group_data_test для тренировочного и валидационного датафрейма соответственно. 
261 | 262 | Args: 263 | df: Данные (список датафремов) для построения графиков 264 | group_columns: Имя столбца или нескольких столбцов, по которым будет производиться аггрегация. 265 | group_name: Название оси Х на графике, вдоль которой будет производиться группировка значений. 266 | Если не задано, будут использованы названия столбцов group_columns. 267 | plot_kind: Тип графика. Возможны значения "point", "bar" и "line". 268 | path: Путь к файлу, в который будет сохранено изображение. Если не задан, то изображение не сохраняется. 269 | 270 | """ 271 | 272 | if not df: 273 | return 274 | 275 | if isinstance(group_columns, str): 276 | group_columns = [group_columns] 277 | 278 | group_name = group_name or (group_columns if isinstance(group_columns, str) else "_".join(group_columns)) 279 | 280 | mdf = pd.concat(list(map(lambda x: pd.melt(x, id_vars=group_columns), df))) 281 | mdf = mdf.sort_values(by=group_columns) 282 | mdf[group_name] = mdf[group_columns].astype(str).agg("/".join, axis=1) 283 | mdf = mdf[["variable", "value", group_name]] 284 | 285 | # bins = max(df_train[group_columns].nunique(dropna=False), df_test[group_columns].nunique(dropna=False)) 286 | # if bins > max_bins: 287 | 288 | sns.set(style="whitegrid", font_scale=1.5) 289 | if plot_kind == "point": 290 | plot = sns.catplot(x=group_name, y="value", hue="variable", kind="point", data=mdf, height=10) 291 | plot.set_xticklabels(rotation=30) 292 | elif plot_kind == "line": 293 | sns.set(rc={"figure.figsize": (10, 10)}) 294 | plot = sns.lineplot(x=group_name, y="value", hue="variable", data=mdf, sort=False) 295 | plt.xticks(rotation=30) 296 | elif plot_kind == "box": 297 | plot = sns.boxplot(x=group_name, y="value", hue="variable", data=mdf, showfliers=False) 298 | plot.set_xticklabels(rotation=30) 299 | # elif plot_kind == 'bar': 300 | # mdf = mdf.groupby(by=group_name).agg('mean') 301 | # plot = mdf.plot(figsize=(10, 10), kind='bar', cmap='Accent', width=0.8) 302 | # plt.xticks(rotation=30) 303 | else: 304 | raise ValueError(f"Invalid plot kind: {plot_kind}") 305 | 306 | if path: 307 | plot.get_figure().savefig(path, bbox_inches="tight") 308 | 309 | plt.close() 310 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/sql.py: -------------------------------------------------------------------------------- 1 | """SQL-query utilities.""" 2 | 3 | from typing import Any, Dict, List, Optional, Tuple, Union 4 | 5 | from autowoe.lib.pipelines.pipeline_feature_special_values import MARK_SET, NAN_SET, SMALL_SET, is_mark_prefix 6 | from autowoe.lib.utilities.utils import TaskType 7 | from autowoe.lib.woe.woe import WoE 8 | 9 | 10 | def prepare_number( 11 | woe_dict: WoE, 12 | name: str, 13 | r_val: int = 3, 14 | round_features: int = 5, 15 | nan_pattern: str = "({0} IS NULL OR {0} = 'NaN')", 16 | preprocessing: Optional[str] = None, 17 | mark_values: Optional[Dict[str, Tuple[Any]]] = None, 18 | mark_encoding: Optional[Dict[Any, str]] = None, 19 | ): 20 | """Get encoding case when for number. 21 | 22 | Args: 23 | woe_dict: Dictionary of WoE values. 24 | name: Name of feature. 25 | r_val: Numbers after the decimal point. 26 | round_features: Numbers after the decimal point. 27 | nan_pattern: Expression for nan processing. 28 | preprocessing: Name preprocessing. 29 | mark_values: List of marked values. 30 | mark_encoding: Map marked value to code. 31 | 32 | Returns: 33 | sql query part for number. 
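Example:
    For a numeric feature the generated expression has roughly the following
    shape (the feature name, split points and WoE values below are made up for
    illustration):

        CASE
          WHEN (number_1 IS NULL OR number_1 = 'NaN') THEN 0.214
          WHEN number_1 <= 0.5 THEN -0.532
          WHEN number_1 <= 3.75 THEN 0.108
          ELSE 0.741
        END AS number_1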
34 | 35 | """ 36 | # value in case 37 | feature_mark_values = [] if mark_values is None else mark_values.get(name, []) 38 | 39 | f_val = name 40 | if preprocessing is not None: 41 | f_val = preprocessing.format(name) 42 | 43 | # search for NaN encoding 44 | for grp in woe_dict.cod_dict: 45 | if type(grp) is str and grp.startswith("__NaN_"): 46 | nan_val = round(woe_dict.cod_dict[grp], r_val) 47 | break 48 | else: 49 | raise ValueError("NaN encoding value does not exists in woe_dict") 50 | 51 | nan_case = nan_pattern.format(f_val) 52 | feature = f"""CASE\n WHEN {nan_case} THEN {nan_val}\n""" 53 | 54 | # if feature_mark_values is not None: 55 | # for grp in woe_dict.cod_dict: 56 | # if type(grp) is str and grp.startswith("__Mark_"): 57 | # mark_val = round(woe_dict.cod_dict[grp], r_val) 58 | # break 59 | 60 | # mark_case = ", ".join(str(m) for m in feature_mark_values) 61 | # feature += """ WHEN {} IN ({}) THEN {}\n""".format(f_val, mark_case, mark_val) 62 | 63 | # create regular bins 64 | for grp, val in enumerate(woe_dict.split): 65 | enc_val = round(woe_dict.cod_dict[grp], r_val) 66 | feature += f""" WHEN {f_val} <= {round(val, round_features)} THEN {enc_val}\n""" 67 | 68 | for mv in feature_mark_values: 69 | # enc = "__Mark__{}__".format(mv) 70 | enc = mark_encoding[name][mv] 71 | enc_val = round(woe_dict.cod_dict[enc], r_val) 72 | feature += f""" WHEN {f_val} == {mv} THEN {enc_val}\n""" 73 | 74 | # create last else val 75 | enc_val = round(woe_dict.cod_dict[len(woe_dict.split)], r_val) 76 | feature += f""" ELSE {enc_val}\nEND AS {name}""" 77 | 78 | return feature 79 | 80 | 81 | def check_cat_symb(x: Union[str, Any]) -> str: 82 | """Wrap to quotes. 83 | 84 | Args: 85 | x: Value. 86 | 87 | Returns: 88 | quoted string. 89 | 90 | """ 91 | if type(x) is str: 92 | x = f"'{x}'" 93 | else: 94 | x = str(x) 95 | 96 | return x 97 | 98 | 99 | def prepare_category( 100 | woe_dict, 101 | name: str, 102 | r_val: int = 3, 103 | nan_pattern: str = "({0} IS NULL OR LOWER(CAST({0} AS VARCHAR(50))) = 'nan')", 104 | preprocessing: Optional[str] = None, 105 | mark_values: Optional[Dict[str, List[Any]]] = None, 106 | mark_encoding: Optional[Dict[Any, str]] = None, 107 | ): 108 | """Get encoding case when for category. 109 | 110 | Args: 111 | woe_dict: Dictionary of WoE values. 112 | name: Name of feature. 113 | r_val: Numbers after the decimal point. 114 | nan_pattern: Expression for nan processing. 115 | preprocessing: Name preprocessing. 116 | mark_values: List of mark values. 117 | mark_encoding: Map marked value to code. 118 | 119 | Returns: 120 | sql query part for category. 
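Example:
    For a categorical feature the generated expression looks roughly like this
    (categories and WoE values are made up for illustration; rare categories
    fall into the ELSE branch with the Small-group encoding):

        CASE
          WHEN (str_0 IS NULL OR LOWER(CAST(str_0 AS VARCHAR(50))) = 'nan') THEN 0.387
          WHEN str_0 IN ('A', 'B') THEN -0.215
          ELSE -0.049
        END AS str_0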
121 | 122 | """ 123 | feature_mark_values = [] if mark_values is None else mark_values.get(name, []) 124 | 125 | # value in case 126 | f_val = name 127 | if preprocessing is not None: 128 | f_val = preprocessing.format(name) 129 | 130 | # search for Mark, NaN and Small encodings 131 | nan_val, small_val, small_grp = None, None, None 132 | for grp in woe_dict.split: 133 | if type(grp) is str: 134 | if grp.startswith("__NaN_"): 135 | nan_grp = woe_dict.split[grp] 136 | nan_val = round(woe_dict.cod_dict[nan_grp], r_val) 137 | 138 | if grp.startswith("__Small_"): 139 | small_grp = woe_dict.split[grp] 140 | small_val = round(woe_dict.cod_dict[small_grp], r_val) 141 | 142 | # if grp.startswith("__Mark_"): 143 | # mark_grp = woe_dict.split[grp] 144 | # mark_val = round(woe_dict.cod_dict[mark_grp], r_val) 145 | 146 | # search for small in cod_dict 147 | for grp in woe_dict.cod_dict: 148 | if type(grp) is str: 149 | if grp.startswith("__NaN_"): 150 | nan_val = round(woe_dict.cod_dict[grp], r_val) 151 | if grp.startswith("__Small_"): 152 | small_val = round(woe_dict.cod_dict[grp], r_val) 153 | small_grp = -1 154 | 155 | assert nan_val is not None, "NaN encoding value does not exists in woe_dict" 156 | # assert small_val is not None, "Small encoding value does not exists in woe_dict" 157 | # TODO: assert for mark val 158 | 159 | feature = """CASE\n""" 160 | if nan_val != small_val: 161 | nan_case = nan_pattern.format(f_val) 162 | feature += f""" WHEN {nan_case} THEN {nan_val}\n""" 163 | 164 | # if feature_mark_values is not None: 165 | # mark_case = [] 166 | # for m in feature_mark_values: 167 | # if isinstance(m, str): 168 | # fmt = "'{}'".format(m) 169 | # else: 170 | # fmt = str(m) 171 | # mark_case.append(fmt) 172 | # mark_case = ", ".join(mark_case) 173 | # feature += """ WHEN {} IN ({}) THEN {}\n""".format(f_val, mark_case, mark_val) 174 | 175 | # create regular bins 176 | passed = {small_grp} 177 | for grp in woe_dict.split.values(): 178 | if grp not in passed: 179 | search_vals = [ 180 | x 181 | for x in woe_dict.split 182 | if woe_dict.split[x] == grp and x not in {*SMALL_SET, *NAN_SET, *MARK_SET} and not is_mark_prefix(x) 183 | ] 184 | length = len(search_vals) 185 | search_vals = list(map(check_cat_symb, search_vals)) 186 | 187 | # filter NaN and Small cases separately 188 | enc_val = round(woe_dict.cod_dict[grp], r_val) 189 | if length > 1: 190 | feature += f""" WHEN {f_val} IN ({", ".join(search_vals)}) THEN {enc_val}\n""" 191 | elif length == 1: 192 | feature += f""" WHEN {f_val} == {search_vals[0]} THEN {enc_val}\n""" 193 | 194 | passed.add(grp) 195 | 196 | for mv in feature_mark_values: 197 | # enc = "__Mark__{}__".format(mv) 198 | enc = mark_encoding[name][mv] 199 | idx_enc = woe_dict.split[enc] 200 | enc_val = round(woe_dict.cod_dict[idx_enc], r_val) 201 | feature += f""" WHEN {f_val} == {check_cat_symb(mv)} THEN {enc_val}\n""" 202 | 203 | # create last ELSE with small 204 | feature += f""" ELSE {small_val}\nEND AS {name}""" 205 | 206 | return feature 207 | 208 | 209 | def set_indent(x: str, n: int = 2): 210 | """Indentation in spaces for a line. 211 | 212 | Args: 213 | x: String. 214 | n: Number of spaces. 215 | 216 | Returns: 217 | Shifted string. 
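Example:
    set_indent("CASE\nEND", n=2) returns "  CASE\n  END" - every line,
    including the first one, is prefixed with two spaces.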
218 | 219 | """ 220 | indent = " " * n 221 | 222 | x = indent + x 223 | x = x.replace("\n", "\n" + indent) 224 | 225 | return x 226 | 227 | 228 | def get_encoded_table( 229 | model, 230 | table_name, 231 | round_woe=3, 232 | round_features=5, 233 | nan_pattern_numbers="({0} IS NULL OR {0} = 'NaN')", 234 | nan_pattern_category="({0} IS NULL OR LOWER(CAST({0} AS VARCHAR(50))) = 'nan')", 235 | preprocessing=None, 236 | mark_values=None, 237 | mark_encoding=None, 238 | ): 239 | """Get encoding table. 240 | 241 | Args: 242 | model: Model. 243 | table_name: Feature table name. 244 | round_woe: Numbers after the decimal point. 245 | round_features: Numbers after the decimal point. 246 | nan_pattern_numbers: Expression for nan processing in number feature. 247 | nan_pattern_category: Expression for nan processing in category feature. 248 | preprocessing: Name processing. 249 | mark_values: List of mark values. 250 | mark_encoding: Map marked value to code. 251 | 252 | Returns: 253 | query. 254 | 255 | """ 256 | if preprocessing is None: 257 | preprocessing = {} 258 | 259 | query = """SELECT\n""" 260 | 261 | for n, name in enumerate(model.features_fit.index): 262 | woe_dict = model.woe_dict[name] 263 | 264 | prep = None 265 | if name in preprocessing: 266 | prep = preprocessing[name] 267 | 268 | if woe_dict.f_type == "cat": 269 | feature = prepare_category( 270 | woe_dict, name, round_woe, nan_pattern_category, prep, mark_values, mark_encoding 271 | ) 272 | else: 273 | feature = prepare_number( 274 | woe_dict, name, round_woe, round_features, nan_pattern_numbers, prep, mark_values, mark_encoding 275 | ) 276 | 277 | query += set_indent(feature) 278 | 279 | if (n + 1) != len(model.features_fit): 280 | query += "," 281 | 282 | query += "\n" 283 | 284 | query += f"""FROM {table_name}""" 285 | 286 | return query 287 | 288 | 289 | def get_weights_query(model, table_name, output_name="PROB", alias="WOE_TAB", bypass_encoded=False, round_wts=3): 290 | """Calc prob over woe table. 291 | 292 | Args: 293 | model: Model. 294 | table_name: WoE table name. 295 | output_name: Output name. 296 | alias: Alias. 297 | bypass_encoded: Add encoded features to result query. 298 | round_wts: Round. 299 | 300 | Returns: 301 | query. 
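Example:
    For a binary task the resulting query has roughly the following shape
    (feature names and coefficient values are illustrative; the inner table is
    the encoded-features query passed in as `table_name`):

        SELECT
          1 / (1 + EXP(-(
          -4.545
          -0.488*WOE_TAB.number_254
          -1.232*WOE_TAB.datetime_1__F__d
          ))) as PROB
        FROM (...) as WOE_TAB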
302 | 303 | """ 304 | if model.params["task"] == TaskType.BIN: 305 | # query = """SELECT\n 1 / (1 + EXP(-({0}\n ))) as {3}{1}\nFROM {2} as {4}""" 306 | query = """SELECT\n 1 / (1 + EXP(-({LIN_FUN}\n ))) as {OUTPUT_NAME}{WOE_VALS}\nFROM {TABLE_NAME} as {ALIAS}""" 307 | else: 308 | # query = """SELECT\n ( {0}\n ) as {3}{1}\nFROM {2} as {4}""" 309 | query = """SELECT\n ( {S} * ( {LIN_FUN}\n) + {M}\n ) as {OUTPUT_NAME}{WOE_VALS}\nFROM {TABLE_NAME} as {ALIAS}""" 310 | 311 | dot = f"\n {round(model.intercept, round_wts)}" 312 | 313 | for name, val in zip(model.features_fit.index, model.features_fit.values): 314 | sign = "" if val < 0 else "+" 315 | dot += f"""\n {sign}{round(val, round_wts)}*{alias}.{name}""" 316 | 317 | other = "" 318 | if bypass_encoded: 319 | other = f""",\n {alias}.*""" 320 | 321 | # return query.format(dot, other, table_name, output_name, alias) 322 | query_args = { 323 | "LIN_FUN": dot, 324 | "WOE_VALS": other, 325 | "TABLE_NAME": table_name, 326 | "OUTPUT_NAME": output_name, 327 | "ALIAS": alias, 328 | } 329 | if model.params["task"] == TaskType.REG: 330 | query_args["S"] = round(model._target_std, round_wts) 331 | query_args["M"] = round(model._target_mean, round_wts) 332 | 333 | return query.format(**query_args) 334 | 335 | 336 | def get_sql_inference_query( 337 | model, 338 | table_name, 339 | round_digits=3, 340 | round_features=5, 341 | output_name="PROB", 342 | alias="WOE_TAB", 343 | bypass_encoded=True, 344 | template=None, 345 | nan_pattern_numbers="({0} IS NULL OR {0} = 'NaN')", 346 | nan_pattern_category="({0} IS NULL OR LOWER(CAST({0} AS VARCHAR(50))) = 'nan')", 347 | preprocessing=None, 348 | mark_values=None, 349 | mark_encoding=None, 350 | ): 351 | """Get sql query. 352 | 353 | Args: 354 | model: Model. 355 | table_name: Feature table name. 356 | round_digits: Round digits. 357 | round_features: Round digits of features. 358 | output_name: Output name. 359 | alias: Alias. 360 | bypass_encoded: Add encoded features to result query. 361 | template: T. 362 | nan_pattern_numbers: Expression for nan processing in number feature. 363 | nan_pattern_category: Expression for nan processing in category feature. 364 | preprocessing: Name preprocessing. 365 | mark_values: List of marked values. 366 | mark_encoding: Map marked value to code. 367 | 368 | Returns: 369 | query. 
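Example:
    A minimal sketch (`model` is assumed to be a fitted AutoWoE instance and
    "FEATURES" the name of the table holding the raw features):

        query = get_sql_inference_query(model, "FEATURES", output_name="PROB")
        print(query)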
370 | 371 | """ 372 | assert template in ["td"] or template is None, "Unknown template" 373 | 374 | if template == "td": 375 | nan_pattern_numbers = "{0} IS NULL" 376 | nan_pattern_category = "{0} IS NULL" 377 | 378 | # get table with features 379 | encode_table = "({0})".format( 380 | get_encoded_table( 381 | model, 382 | table_name, 383 | round_digits, 384 | round_features, 385 | nan_pattern_numbers, 386 | nan_pattern_category, 387 | preprocessing, 388 | mark_values, 389 | mark_encoding, 390 | ) 391 | ) 392 | encode_table = """\n """ + set_indent(encode_table) 393 | 394 | # get table with weights 395 | query = get_weights_query( 396 | model, encode_table, output_name=output_name, bypass_encoded=bypass_encoded, alias=alias, round_wts=round_digits 397 | ) 398 | 399 | return query 400 | -------------------------------------------------------------------------------- /examples/Tutorial_1__Basic_usage_and_params.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from sklearn.metrics import roc_auc_score\n", 11 | "\n", 12 | "from autowoe import AutoWoE, ReportDeco" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "### Чтение и подготовка обучающей выборки" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "train = pd.read_csv(\n", 29 | " \"./data/train_demo.csv\", low_memory=False, index_col=\"line_id\", parse_dates=[\"datetime_\" + str(i) for i in range(2)]\n", 30 | ")\n", 31 | "\n", 32 | "train = train.iloc[:, 50:100]\n", 33 | "\n", 34 | "num_col = list(filter(lambda x: \"numb\" in x, train.columns))\n", 35 | "num_feature_type = dict.fromkeys(num_col, \"real\")\n", 36 | "\n", 37 | "date_col = filter(lambda x: \"datetime\" in x, train.columns)\n", 38 | "for col in date_col:\n", 39 | " train[col + \"_year\"] = train[col].map(lambda x: x.year)\n", 40 | " train[col + \"_weekday\"] = train[col].map(lambda x: x.weekday())\n", 41 | " train[col + \"_month\"] = train[col].map(lambda x: x.month)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### Чтение и подготовка тестовой выборки" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "test = pd.read_csv(\"./data/test_demo.csv\", index_col=\"line_id\", parse_dates=[\"datetime_\" + str(i) for i in range(2)])\n", 58 | "\n", 59 | "date_col = filter(lambda x: \"datetime\" in x, test.columns)\n", 60 | "for col in date_col:\n", 61 | " test[col + \"_year\"] = test[col].map(lambda x: x.year)\n", 62 | " test[col + \"_weekday\"] = test[col].map(lambda x: x.weekday())\n", 63 | " test[col + \"_month\"] = test[col].map(lambda x: x.month)\n", 64 | "\n", 65 | "test_target = pd.read_csv(\"./data/test-target_demo.csv\")[\"target\"]\n", 66 | "test[\"target\"] = test_target.values" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### Параметры модели" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "Для обучения модели рекомендуется указать тип признаков для обучения.\n", 81 | "Поэтому создается словарь features_type с ключами: \n", 82 | "\n", 83 | "\n", 84 | "\"real\" -- вещественный признак,\n", 85 | "\n", 86 | 
"\"cat\" -- категориальный.\n", 87 | "\n", 88 | "Для признаков, которые не размечены, типы будут определены автоматом. Такой вариант будет работать, но качество порядочно просядет" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "#### features_type" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "cat_col = list(filter(lambda x: \"str\" in x, train.columns))\n", 105 | "cat_feature_type = dict.fromkeys(cat_col, \"cat\")\n", 106 | "\n", 107 | "year_col = list(filter(lambda x: \"_year\" in x, train.columns))\n", 108 | "year_feature_type = dict.fromkeys(year_col, \"cat\")\n", 109 | "\n", 110 | "weekday_col = list(filter(lambda x: \"_weekday\" in x, train.columns))\n", 111 | "weekday_feature_type = dict.fromkeys(weekday_col, \"cat\")\n", 112 | "\n", 113 | "month_col = list(filter(lambda x: \"_month\" in x, train.columns))\n", 114 | "month_feature_type = dict.fromkeys(month_col, \"cat\")" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 5, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "features = cat_col + year_col + weekday_col + month_col + num_col" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "#### Feature level constrains" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 6, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "features_type = dict(\n", 140 | " **num_feature_type, **cat_feature_type, **year_feature_type, **weekday_feature_type, **month_feature_type\n", 141 | ")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "- `features_monotone_constraints` - также можно указать зависимость целевой переменной от признака. Если заранее известно, что при возрастании признака feature_1, то эту информацию можно учесть в модели, добавив в словарь пару {feature_1: \"1\"}. Если же зависимость признака от целевой переменной обратная, то можно указать {feature_1: \"-1\"} Если про зависимость ничего неизвестно, но хочется, чтобы она была монотонная, можно указать 'auto'. Можно указать {feature_1: \"0\"}, в случае, если установлено общее ограничение на монотонность, чтобы не распространять его на эту фичу. Если специальных условий нет, то можно не собирать этот дикт\n", 149 | "\n", 150 | "\n", 151 | "Рекомендуемое использование:\n", 152 | "\n", 153 | "1) В случае, если задано общее условие на монотонность, то можно собрать дикт {feature_1: \"0\", feature_2: \"0\"}, чтобы игнорировать это ограничение для признаков feature_1, feature_2\n", 154 | "\n", 155 | "2) В случае, если не задано общее условие на монотонность, то можно собрать дикт {feature_1: \"auto\", feature_2: \"auto\"}, чтобы установить это ограничение для признаков feature_1, feature_2" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 7, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "features_monotone_constraints = {\"number_74\": \"auto\", \"number_83\": \"auto\"}" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "- `max_bin_count` - через словарь max_bin_count можно задать число бинов для WoE кодирования, если для какого-то признака оно отлично от общего. 
" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 8, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "max_bin_count = {\"number_47\": 3, \"number_51\": 2}" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "#### Рекомендация\n", 188 | "В общем случае, в первый момент построения модели лучше не указывать специальных ограничений в features_monotone_constraints и max_bin_count. Если в результате анализа полученной модели разбиение оказалось неинтерпретируемым или нестабильным по отдельным признакам, но в целом по модели ок, то ограничить сложность разбиения отдельных призаков имеет смысл. Если разбивка большинства признаков в модели оказалась неудовлетворительная, то рекомендуется в первую очередь настраивать глобальные ограничения (см параметры модели max_bin_count, monotonic, min_bin_size и др ниже)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "#### Общие параметры модели\n", 196 | "\n", 197 | "- `interpreted_model` - требуется ли интерпретируемость модели (условие на знак в коэффициентах логистической регрессии)\n", 198 | "\n", 199 | "- `monotonic` - Глобальное условие на монотонность. Если указано True, то для всех признаков по умолчанию будут строится только монотонные разбиения. Указать специальные условия для отдельных признаков можно используя features_monotone_constraints аргумент метода .fit\n", 200 | "\n", 201 | "- `max_bin_count` - Глобальное ограничение на число бинов. Указать специальные условия для отдельных признаков можно используя max_bin_count аргумент метода .fit\n", 202 | "\n", 203 | "- `select_type` - способ ПРЕДВАРИТЕЛЬНОГО!!! (ЭТО ВАЖНО) отбора признаков. Если указать None, то будут отобраны признаки, у которых importance больше imp_th. Если указвать, например 50, то после предварительного отобра останется только 50 признаков самых важных признаков. Крайне не рекомендуется сильно ограничивать\n", 204 | "\n", 205 | "- `pearson_th` - пороговое значен для корреляции Пирсона. Используется на финальной стадии отбора признаков.\n", 206 | "Если корреляция вух признаков по модулю больше pearson_th, то будет выброшен тот, у которого \n", 207 | "информативность меньше\n", 208 | "\n", 209 | "- `auc_th` - пороговое значнеи для одномерной оценки качества признака\n", 210 | "\n", 211 | "- `vif_th` - пороговое значнеи для VIF признака\n", 212 | "\n", 213 | "- `imp_th` - порог по которому будет произведен отбор признаков, если указать select_type=None (см. ниже).\n", 214 | "\n", 215 | "- `th_const` порог по которому признак будет считаться константным. Все константные признаки в модели не учитываются. Если число валидных значений больше трешхолда, то колонка не константная (int). В случае указания float, трешхолд будет определяться как размер_выборки * th_const\n", 216 | "\n", 217 | "- `force_single_split` - иногда в силу ограничений на min_bin_size невозможно построить ниодной группировки на переменную. force_single_split=True заставит в этом случае построить единственно возмоджный сплит, в случае если при этом выделяется группа размера более чем th_const. 
218 |     "\n",
219 |     "\n",
220 |     "- `th_nan` - the threshold above which a separate category is created for missing values.\n",
221 |     "If the number of missing values is less than th_nan, the WoE value for them is set to zero.\n",
222 |     "Otherwise the missing values are put into a separate group and a WoE value\n",
223 |     "is estimated for them separately.\n",
224 |     "It also affects rare categories (below th_cat). If such categories in total amount to less than th_nan, they are handled according to the rule defined by `cat_merge_to`, otherwise they are estimated as a group\n",
225 |     "\n",
226 |     "- `th_cat` - the threshold below which infrequent categories of categorical features are merged into a separate group\n",
227 |     "\n",
228 |     "\n",
229 |     "- `woe_diff_th` - allows NaNs and rare categories to be merged with some bin if the difference in WoE is less than woe_diff_th\n",
230 |     "\n",
231 |     "\n",
232 |     "- `min_bin_size` - the minimum bin size when grouping. Either an int (number of observations) or a float (fraction of the sample)\n",
233 |     "\n",
234 |     "- `min_bin_mults` - while building the bins the candidate values min_bin_size, \n",
235 |     "min_bin_size * min_bin_mults[0], min_bin_size * min_bin_mults[1] ... are tried. Floats > 1 are expected. The default is (2, 4); usually there is no need to change it\n",
236 |     "\n",
237 |     "- `min_gains_to_split` - the candidate values of the regularizer that are tried while building the binning\n",
238 |     "\n",
239 |     "\n",
240 |     "- `auc_tol` - the AUC tolerance. We assume that auc_tol of quality relative to the maximum can be sacrificed to make the model simpler\n",
241 |     "\n",
242 |     "\n",
243 |     "- `cat_alpha` - the regularizer of the category encoder\n",
244 |     "\n",
245 |     "\n",
246 |     "\n",
247 |     "- `cat_merge_to` - the group for rare (below th_cat) categories and for categories unseen in the test set\n",
248 |     "    \"to_nan\" -- into the nan group, \n",
249 |     "    \"to_woe_0\" -- a separate group with WoE = 0,\n",
250 |     "    \"to_maxfreq\" - into the largest group,\n",
251 |     "    \"to_maxp\" - into the group with the highest event rate,\n",
252 |     "    \"to_minp\" - into the group with the lowest event rate\n",
253 |     "    \n",
254 |     "- `nan_merge_to` - the group for NaNs\n",
255 |     "    \"to_woe_0\" -- a separate group with WoE = 0,\n",
256 |     "    \"to_maxfreq\" - into the largest group,\n",
257 |     "    \"to_maxp\" - into the group with the highest event rate,\n",
258 |     "    \"to_minp\" - into the group with the lowest event rate \n",
259 |     "    \n",
260 |     "    \n",
261 |     "- `oof_woe` - if oof_woe=True, the WoE encoding is computed out of fold via cross-validation; if False, it is computed on the whole training sample at once.\n",
262 |     "\n",
263 |     "- `n_folds` - the number of folds for the internal cross-validation\n",
264 |     "\n",
265 |     "\n",
266 |     "- `n_jobs` - the number of processes the model will use \n",
267 |     "\n",
268 |     "- `l1_grid_size` - at one of its steps the model uses LASSO feature selection. l1_grid_size -- the size of the grid used to search over C\n",
269 |     "\n",
270 |     "- `l1_exp_scale` - the scale of the grid for the L1 selection. 4 corresponds to a maximum C of about 3-4. Increase it if a less regularized model is needed\n",
271 |     "\n",
272 |     "- `imp_type` - how feature importance is measured -- feature importance (\"feature_imp\" - generally a more complex model) or permutation importance (\"perm_imp\" - generally a simpler model)\n",
273 |     "\n",
274 |     "- `regularized_refit` - after feature selection the resulting model is refitted on all the data. This flag controls whether L1 regularization is applied during the refit. If not, in the interpretable mode the model is refitted iteratively until all weights become negative. If yes, the same is achieved by tightening L1. Setting it to False can be useful when a statistical model is needed, i.e. p-values for the estimates\n",
275 |     "\n",
276 |     "- `p_val` - the acceptable p-value level for the model estimates when a statistical model is fitted (regularized_refit=False)"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": 9,
282 |    "metadata": {},
283 |    "outputs": [],
284 |    "source": [
285 |     "auto_woe = AutoWoE(\n",
286 |     "    task=\"BIN\",\n",
287 |     "    interpreted_model=True,\n",
288 |     "    monotonic=False,\n",
289 |     "    max_bin_count=5,\n",
290 |     "    select_type=None,\n",
291 |     "    pearson_th=0.9,\n",
292 |     "    auc_th=0.505,\n",
293 |     "    vif_th=10.0,\n",
294 |     "    imp_th=0,\n",
295 |     "    th_const=32,\n",
296 |     "    force_single_split=True,\n",
297 |     "    th_nan=0.01,\n",
298 |     "    th_cat=0.005,\n",
299 |     "    woe_diff_th=0.01,\n",
300 |     "    min_bin_size=0.01,\n",
301 |     "    min_bin_mults=(2, 4),\n",
302 |     "    min_gains_to_split=(0.0, 0.5, 1.0),\n",
303 |     "    auc_tol=1e-4,\n",
304 |     "    cat_alpha=100,\n",
305 |     "    cat_merge_to=\"to_woe_0\",\n",
306 |     "    nan_merge_to=\"to_woe_0\",\n",
307 |     "    oof_woe=True,\n",
308 |     "    n_folds=6,\n",
309 |     "    n_jobs=4,\n",
310 |     "    l1_grid_size=20,\n",
311 |     "    l1_exp_scale=6,\n",
312 |     "    imp_type=\"feature_imp\",\n",
313 |     "    regularized_refit=False,\n",
314 |     "    p_val=0.05,\n",
315 |     "    debug=False,\n",
316 |     "    verbose=0,\n",
317 |     ")\n",
318 |     "\n",
319 |     "auto_woe = ReportDeco(auto_woe)"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "markdown",
324 |    "metadata": {},
325 |    "source": [
326 |     "- `train` - the training sample\n",
327 |     "\n",
328 |     "- `target_name` - the name of the target column\n",
329 |     "\n",
330 |     "- `features_type` - see the description of the features_type dict above. None can be passed for automatic typing, but this is not recommended\n",
331 |     "\n",
332 |     "- `group_kf` - the name of the group column for GroupKFold https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GroupKFold.html\n",
333 |     "\n",
334 |     "- `max_bin_count` - see the description of the max_bin_count dict above. It can be omitted if there are no special conditions. The setting common to all features is given in __init__\n",
335 |     "\n",
336 |     "- `features_monotone_constraints` - see the description of the features_monotone_constraints dict above. It can be omitted if there are no special conditions. The setting common to all features is given in __init__\n",
337 |     "\n",
338 |     "- `validation` - an optional validation set used when building/selecting features. It can be omitted. At the moment it is used for 1) p-value based feature selection when fitting the statistical model\n"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": 10,
344 |    "metadata": {
345 |     "scrolled": false
346 |    },
347 |    "outputs": [
348 |     {
349 |      "name": "stdout",
350 |      "output_type": "stream",
351 |      "text": [
352 |       "[LightGBM] [Info] Number of positive: 63, number of negative: 5537\n",
353 |       "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001103 seconds.\n",
354 |       "You can set `force_row_wise=true` to remove the overhead.\n",
355 |       "And if memory is not enough, you can set `force_col_wise=true`.\n",
356 |       "[LightGBM] [Info] Total Bins 379\n",
357 |       "[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 49\n",
358 |       "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011250 -> initscore=-4.476073\n",
359 |       "[LightGBM] [Info] Start training from score -4.476073\n"
360 |      ]
361 |     }
362 |    ],
363 |    "source": [
364 |     "auto_woe.fit(\n",
365 |     "    train[features + [\"target\"]],\n",
366 |     "    target_name=\"target\",\n",
367 |     "    features_type=features_type,\n",
368 |     "    group_kf=None,\n",
369 |     "    max_bin_count=max_bin_count,\n",
370 |     "    features_monotone_constraints=features_monotone_constraints,\n",
371 |     "    validation=test,\n",
372 |     ")"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": 11,
378 |    "metadata": {
379 |     "scrolled": false
380 |    },
381 |    "outputs": [
382 |     {
383 |      "data": {
384 |       "text/plain": [
385 |        "0.7791178112786152"
386 |       ]
387 |      },
388 |      "execution_count": 11,
389 |      "metadata": {},
390 |      "output_type": "execute_result"
391 |     }
392 |    ],
393 |    "source": [
394 |     "pred = auto_woe.predict_proba(test)\n",
395 |     "roc_auc_score(test[\"target\"], pred)"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": 12,
401 |    "metadata": {},
402 |    "outputs": [
403 |     {
404 |      "data": {
405 |       "text/plain": [
406 |        "0.7791178112786152"
407 |       ]
408 |      },
409 |      "execution_count": 12,
410 |      "metadata": {},
411 |      "output_type": "execute_result"
412 |     }
413 |    ],
414 |    "source": [
415 |     "pred = auto_woe.predict_proba(test[[\"number_72\"]], report=False)\n",
416 |     "roc_auc_score(test[\"target\"], pred)"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "code",
421 |    "execution_count": 13,
422 |    "metadata": {},
423 |    "outputs": [
424 |     {
425 |      "name": "stdout",
426 |      "output_type": "stream",
427 |      "text": [
428 |       "SELECT\n",
429 |       "  1 / (1 + EXP(-(\n",
430 |       "    -4.517\n",
431 |       "    -0.946*WOE_TAB.number_72\n",
432 |       "  ))) as PROB,\n",
433 |       "  WOE_TAB.*\n",
434 |       "FROM \n",
435 |       "    (SELECT\n",
436 |       "    CASE\n",
437 |       "      WHEN (number_72 IS NULL OR number_72 = 'NaN') THEN -0.974\n",
438 |       "      WHEN number_72 <= 0.0 THEN 0.296\n",
439 |       "      ELSE -1.96\n",
440 |       "    END AS number_72\n",
441 |       "  FROM table) as WOE_TAB\n"
442 |      ]
443 |     }
444 |    ],
445 |    "source": [
446 |     "print(auto_woe.get_sql_inference_query(\"table\"))"
447 |    ]
448 |   },
449 |   {
450 |    "cell_type": "markdown",
451 |    "metadata": {},
452 |    "source": [
453 |     "### Useful methods of the model"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "markdown",
458 |    "metadata": {},
459 |    "source": [
460 |     "- `private_features_type` - the feature typing\n",
461 |     "- `get_woe` - the binning and the WoE values in each bin\n",
462 |     "- `get_split` - the split borders. Especially useful for categorical features\n",
463 |     "\n",
464 |     "\n",
465 |     "##### Note: \n",
466 |     "ReportDeco is a wrapper for report generation. It is not required for training and applying the model, but it is required for building the report (see the last cell).\n",
467 |     "To access the attributes of the model itself, use the auto_woe.model attribute of the decorator\n",
468 |     "All attributes of the model object are also available through the report object.\n",
469 |     "However, the pickle of the report object will be considerably larger, so for inference it is better to save only auto_woe.model\n"
470 |    ]
471 |   },
472 |   {
473 |    "cell_type": "markdown",
474 |    "metadata": {},
475 |    "source": [
476 |     "### Report generation"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": 14,
482 |    "metadata": {},
483 |    "outputs": [
484 |     {
485 |      "name": "stderr",
486 |      "output_type": "stream",
487 |      "text": [
488 |       "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n",
489 |       "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n"
490 |      ]
491 |     }
492 |    ],
493 |    "source": [
494 |     "report_params = {\n",
495 |     "    \"automl_date_column\": \"report_month\",  # the date column in the params['datetimeFormat'] format\n",
496 |     "    \"output_path\": \"./AUTOWOE_REPORT_1\",  # the folder where the report and the required files will be generated\n",
497 |     "    \"report_name\": \"___REPORT NAME___\",\n",
498 |     "    \"report_version_id\": 1,\n",
499 |     "    \"city\": \"Воронеж\",\n",
500 |     "    \"model_aim\": \"___PURPOSE OF THE MODEL___\",\n",
501 |     "    \"model_name\": \"___MODEL NAME___\",\n",
502 |     "    \"zakazchik\": \"___CUSTOMER___\",\n",
503 |     "    \"high_level_department\": \"___DEPARTMENT___\",\n",
504 |     "    \"ds_name\": \"___MODEL DEVELOPER___\",\n",
505 |     "    \"target_descr\": \"___TARGET EVENT DESCRIPTION___\",\n",
506 |     "    \"non_target_descr\": \"___NON-TARGET EVENT DESCRIPTION___\",\n",
507 |     "}\n",
508 |     "\n",
509 |     "auto_woe.generate_report(report_params)"
510 |    ]
511 |   }
512 |  ],
513 |  "metadata": {
514 |   "kernelspec": {
515 |    "display_name": "Anaconda_py38",
516 |    "language": "python",
517 |    "name": "anaconda_py38"
518 |   },
519 |   "language_info": {
520 |    "codemirror_mode": {
521 |     "name": "ipython",
522 |     "version": 3
523 |    },
524 |    "file_extension": ".py",
525 |    "mimetype": "text/x-python",
526 |    "name": "python",
527 |    "nbconvert_exporter": "python",
528 |    "pygments_lexer": "ipython3",
529 |    "version": "3.8.5"
530 |   },
531 |   "stem_cell": {
532 |    "cell_type": "raw",
533 |    "metadata": {
534 |     "pycharm": {
535 |      "metadata": false
536 |     }
537 |    },
538 |    "source": ""
539 |   }
540 |  },
541 |  "nbformat": 4,
542 |  "nbformat_minor": 1
543 | }
544 | 
--------------------------------------------------------------------------------
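The note in the tutorial above recommends keeping only `auto_woe.model` when the fitted model is saved for inference. A minimal sketch of that workflow is given below; it is not part of the repository, the file name and the use of `pickle` are illustrative assumptions, and `auto_woe` and `test` refer to the objects created in the tutorial notebook.

```python
import pickle

# Persist only the inner AutoWoE model, not the ReportDeco wrapper (file name is assumed).
with open("autowoe_model.pkl", "wb") as f:
    pickle.dump(auto_woe.model, f)

# Reload the inner model later for inference.
with open("autowoe_model.pkl", "rb") as f:
    model = pickle.load(f)

scores = model.predict_proba(test)                    # scoring works without the report wrapper
sql_query = model.get_sql_inference_query("table")    # the SQL inference query is also available on the inner model
```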