├── autowoe
│   ├── lib
│   │   ├── __init__.py
│   │   ├── woe
│   │   │   ├── __init__.py
│   │   │   └── woe.py
│   │   ├── optimizer
│   │   │   ├── __init__.py
│   │   │   └── optimizer.py
│   │   ├── pipelines
│   │   │   ├── __init__.py
│   │   │   ├── pipeline_homotopy.py
│   │   │   └── pipeline_feature_special_values.py
│   │   ├── report
│   │   │   ├── __init__.py
│   │   │   ├── utilities_images
│   │   │   │   ├── __init__.py
│   │   │   │   └── utilities_images.py
│   │   │   └── report_generator.py
│   │   ├── selectors
│   │   │   ├── __init__.py
│   │   │   ├── l1.py
│   │   │   ├── selector_last.py
│   │   │   ├── utils.py
│   │   │   ├── composed_selector.py
│   │   │   └── selector_first.py
│   │   ├── utilities
│   │   │   ├── __init__.py
│   │   │   ├── s3.py
│   │   │   ├── cv_split_f.py
│   │   │   ├── utils.py
│   │   │   ├── eli5_permutation.py
│   │   │   ├── refit.py
│   │   │   └── sql.py
│   │   ├── cat_encoding
│   │   │   ├── __init__.py
│   │   │   └── cat_encoding.py
│   │   ├── types_handler
│   │   │   ├── __init__.py
│   │   │   ├── features_checkers_handlers.py
│   │   │   └── types_handler.py
│   │   └── logging.py
│   └── __init__.py
├── _config.yml
├── poetry.toml
├── .pre-commit-config.yaml
├── setup.cfg
├── ruff.toml
├── tests
│   ├── conftest.py
│   └── integration
│       ├── test_regression_task.py
│       ├── test_eda_allfeatures.py
│       ├── test_dates_and_stat_model.py
│       ├── test_autotyping.py
│       ├── test_marked_values.py
│       └── test_basic_usage_and_params.py
├── .github
│   └── workflows
│       ├── tests_macos.yml
│       ├── tests_ubuntu.yml
│       ├── tests_windows.yml
│       └── CI.yml
├── tox.ini
├── pyproject.toml
├── README.md
├── parameters_info.md
├── .gitignore
├── examples
│   ├── Tutorial_2__Dates_and_stat_model.ipynb
│   └── Tutorial_1__Basic_usage_and_params.ipynb
└── LICENSE

/autowoe/lib/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/woe/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/optimizer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/report/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/selectors/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/utilities/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/cat_encoding/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/autowoe/lib/types_handler/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-architect
2 |
--------------------------------------------------------------------------------
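The tree above shows how the package is laid out; its public entry points are the top-level `AutoWoE` and `ReportDeco` exports from `autowoe/__init__.py` (reproduced further down in this dump). As a quick orientation, here is a minimal, hypothetical quick-start sketch distilled from `tests/conftest.py` and `tests/integration/test_eda_allfeatures.py`; the data path and the `number_*`/`string_*`/`target` column names are assumptions borrowed from those tests, not part of the repository files reproduced below.

```python
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from autowoe import AutoWoE

# Assumed demo dataset and column naming, taken from the integration tests;
# adjust the path and column names to your own data.
df = pd.read_csv("examples/data/train_demo.csv", index_col="line_id", low_memory=False)
num_cols = [c for c in df.columns if c.startswith("number")]
cat_cols = [c for c in df.columns if c.startswith("string")]
df = df[num_cols + cat_cols + ["target"]]

train_df, test_df = train_test_split(df, stratify=df["target"], test_size=0.4, random_state=42)

model = AutoWoE(task="BIN", n_jobs=1, verbose=0)  # binary classification task
model.fit(train=train_df, target_name="target")   # binning, WoE encoding, selection, logistic regression

test_pred = model.predict_proba(test_df)          # probability of the positive class
print(roc_auc_score(test_df["target"], test_pred))
```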
/autowoe/lib/report/utilities_images/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | create = true 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.11.2 4 | hooks: 5 | - id: ruff 6 | args: ["--fix"] 7 | - id: ruff-format 8 | args: ["--diff"] 9 | 10 | - repo: https://github.com/pre-commit/pre-commit-hooks 11 | rev: v3.4.0 12 | hooks: 13 | - id: trailing-whitespace 14 | - id: end-of-file-fixer 15 | - id: debug-statements 16 | -------------------------------------------------------------------------------- /autowoe/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | 5 | from .lib.autowoe import AutoWoE 6 | from .lib.report.report import ReportDeco 7 | 8 | __all__ = ["AutoWoE", "ReportDeco"] 9 | 10 | if os.getenv("DOCUMENTATION_ENV") is None: 11 | try: 12 | import importlib.metadata as importlib_metadata 13 | except ModuleNotFoundError: 14 | import importlib_metadata 15 | 16 | __version__ = importlib_metadata.version(__name__) 17 | 18 | np.random.seed(42) 19 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [darglint] 2 | docstring_style = google 3 | strictness = short 4 | ignore = DAR401, DAR402 5 | 6 | 7 | [flake8] 8 | max-line-length = 120 9 | ignore = D100, D103, D104, D105, D107, E402, E203, W503, W605, E722, E501 10 | docstring-convention = google 11 | # per-file-ignores = 12 | exclude = 13 | .git 14 | __pycache__ 15 | setup.py 16 | build 17 | dist 18 | releases 19 | .venv 20 | .tox 21 | .mypy_cache 22 | .pytest_cache 23 | .vscode 24 | .github 25 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | # Exclude a variety of commonly ignored directories. 2 | exclude = [ 3 | "tests", 4 | ".git", 5 | "__pycache__", 6 | "setup.py", 7 | "build", 8 | "dist", 9 | "releases", 10 | ".venv", 11 | ".tox", 12 | ".mypy_cache", 13 | ".pytest_cache", 14 | ".vscode", 15 | ".github", 16 | ] 17 | 18 | # Same as Black. 
19 | line-length = 120 20 | indent-width = 4 21 | 22 | target-version = "py38" 23 | 24 | [lint] 25 | select = [ 26 | "S", "B", "A", "D", "F", "E", "N", "I", "PD", "PERF", 27 | "UP032", 28 | "PERF401", 29 | "TID252", 30 | "C4"] 31 | ignore = [ 32 | "D203", 33 | "D213", 34 | "PD011", 35 | "PD901", 36 | "D107", 37 | "D104", 38 | "D103", 39 | "E501", 40 | "S101", 41 | "N803", 42 | "N806", 43 | "D401", 44 | "B904", 45 | "PLR0912" 46 | ] 47 | 48 | fixable = ["ALL"] 49 | 50 | 51 | [format] 52 | quote-style = "double" 53 | 54 | indent-style = "space" 55 | 56 | skip-magic-trailing-comma = false 57 | 58 | line-ending = "auto" 59 | 60 | docstring-code-format = true 61 | 62 | docstring-code-line-length = "dynamic" 63 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | 8 | 9 | RANDOM_STATE = 42 10 | np.random.seed(RANDOM_STATE) 11 | 12 | 13 | DATA_DIR = "examples/data/" 14 | 15 | 16 | @pytest.fixture() 17 | def train_data(): 18 | train = pd.read_csv( 19 | DATA_DIR + "train_demo.csv", 20 | low_memory=False, 21 | index_col="line_id", 22 | parse_dates=["datetime_" + str(i) for i in range(2)], 23 | ) 24 | return train 25 | 26 | 27 | @pytest.fixture() 28 | def test_data(): 29 | test = pd.read_csv( 30 | DATA_DIR + "test_demo.csv", index_col="line_id", parse_dates=["datetime_" + str(i) for i in range(2)] 31 | ) 32 | return test 33 | 34 | 35 | @pytest.fixture() 36 | def test_target(): 37 | test_target = pd.read_csv(DATA_DIR + "test-target_demo.csv")["target"] 38 | return test_target 39 | 40 | 41 | @pytest.fixture() 42 | def cat_data(): 43 | data = pd.read_csv(DATA_DIR + "data_cat.csv") 44 | return data 45 | 46 | 47 | @pytest.fixture() 48 | def regression_data(): 49 | data = pd.read_csv(DATA_DIR + "regression_dataset.csv") 50 | return data 51 | -------------------------------------------------------------------------------- /.github/workflows/tests_macos.yml: -------------------------------------------------------------------------------- 1 | name: tests_macos 2 | 3 | on: 4 | # # At 20:59 every day (23:59 MSK) 5 | # schedule: 6 | # - cron: 59 20 * * * 7 | 8 | # Manually triggerable in github 9 | workflow_dispatch: 10 | 11 | workflow_run: 12 | workflows: ["tests_ubuntu"] 13 | branches: [master] 14 | types: 15 | - completed 16 | 17 | jobs: 18 | macos-tests: 19 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 20 | runs-on: macos-latest 21 | strategy: 22 | fail-fast: true 23 | matrix: 24 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 25 | 26 | steps: 27 | - uses: actions/checkout@v2 28 | 29 | - name: Set up Python ${{ matrix.python-version }} 30 | uses: actions/setup-python@v2 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | 34 | - name: install deps for MacOS 35 | run: brew update && brew install libomp cairo pango gdk-pixbuf libffi 36 | 37 | - name: install with pip 38 | run: | 39 | pip install tox==4.23.2 40 | pip install tox-gh-actions==3.2.0 41 | 42 | - name: test with tox 43 | run: | 44 | tox -- -vv 45 | -------------------------------------------------------------------------------- /.github/workflows/tests_ubuntu.yml: -------------------------------------------------------------------------------- 1 | name: tests_ubuntu 2 | 3 | on: 4 | # # At 20:59 every day (23:59 MSK) 5 | # schedule: 6 | # - cron: 59 20 * * * 7 | 8 | # 
Manually triggerable in github 9 | workflow_dispatch: 10 | 11 | workflow_run: 12 | workflows: ["CI"] 13 | types: 14 | - completed 15 | 16 | jobs: 17 | ubuntu-tests: 18 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 19 | runs-on: ubuntu-latest 20 | strategy: 21 | fail-fast: true 22 | matrix: 23 | python-version: ["3.8", "3.9", "3.11", "3.12"] # "3.10" is tested in CI 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | 28 | - name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v4 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | 33 | - name: install deps for Ubuntu 34 | run: sudo apt-get install build-essential libcairo2 libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 libffi-dev shared-mime-info 35 | 36 | - name: install tox 37 | run: | 38 | python3 -m pip install --upgrade pip 39 | pip3 install tox==4.23.2 40 | pip3 install tox-gh-actions==3.2.0 41 | 42 | - name: test with tox 43 | run: | 44 | tox -- -vv 45 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | min_version = 3.28.0 3 | isolated_build = True 4 | envlist = 5 | py{38, 39, 310, 311, 312}, 6 | lint, 7 | docs, 8 | typing, 9 | build 10 | codespell 11 | 12 | [tox:.package] 13 | # note tox will use the same python version as under what tox is installed to package 14 | # so unless this is python 3 you can require a given python version for the packaging 15 | # environment via the basepython key 16 | basepython = python3 17 | 18 | [gh-actions] 19 | python = 20 | 3.8: py38 21 | 3.9: py39 22 | 3.10: py310 23 | 3.11: py311 24 | 3.12: py312 25 | 26 | [gh-actions:env] 27 | PLATFORM = 28 | ubuntu-latest: linux 29 | macos-latest: macos 30 | windows-latest: windows 31 | 32 | [testenv] 33 | allowlist_externals = make 34 | package = wheel 35 | deps = 36 | .[all] 37 | pytest >= 6.2.5 38 | commands = pytest {posargs} -v --basetemp="{envtmpdir}" --log-level=DEBUG 39 | 40 | [testenv:lint] 41 | deps = 42 | pre-commit == 2.15.0 43 | commands = 44 | pre-commit install 45 | pre-commit run --all-files 46 | 47 | [testenv:build] 48 | deps = 49 | poetry >= 1.1.7 50 | commands = 51 | poetry build 52 | 53 | [testenv:codespell] 54 | deps = 55 | codespell == 2.3.0 56 | commands = 57 | codespell --skip="*.js,*.csv,*.ipynb,*shaptxt*" 58 | -------------------------------------------------------------------------------- /tests/integration/test_regression_task.py: -------------------------------------------------------------------------------- 1 | import time 2 | from sklearn.model_selection import train_test_split 3 | from sklearn.metrics import r2_score 4 | 5 | from autowoe import AutoWoE 6 | 7 | DATA_DIR = "examples/data/" 8 | 9 | 10 | def test_regression_task(regression_data): 11 | 12 | df = regression_data 13 | 14 | TARGET_NAME = "Target" 15 | 16 | train_df, test_df = train_test_split(df, test_size=0.4, random_state=42, shuffle=True) 17 | 18 | autowoe = AutoWoE( 19 | task="REG", 20 | monotonic=True, 21 | interpreted_model=True, 22 | regularized_refit=True, 23 | metric_th=0.0, 24 | n_jobs=1, 25 | verbose=0, 26 | ) 27 | 28 | start_fit_time = time.time() 29 | autowoe.fit(train=train_df, target_name=TARGET_NAME) 30 | 31 | assert time.time() - start_fit_time < 25 32 | 33 | start_predicts_time = time.time() 34 | 35 | train_pred = autowoe.predict(train_df) 36 | test_pred = autowoe.predict(test_df) 37 | 38 | train_pred = autowoe.predict(train_df) 39 | 40 | assert 
time.time() - start_predicts_time < 0.3, f"Pred time is {time.time() - start_predicts_time}, >= 0.3" 41 | 42 | r2_train = r2_score(train_df[TARGET_NAME], train_pred) 43 | r2_test = r2_score(test_df[TARGET_NAME], test_pred) 44 | 45 | assert r2_train > 0.8 46 | assert r2_test > 0.76 47 | 48 | autowoe.get_sql_inference_query("FEATURE_TABLE") 49 | -------------------------------------------------------------------------------- /.github/workflows/tests_windows.yml: -------------------------------------------------------------------------------- 1 | name: tests_windows 2 | 3 | on: 4 | # # At 20:59 every day (23:59 MSK) 5 | # schedule: 6 | # - cron: 59 20 * * * 7 | 8 | # Manually triggerable in github 9 | workflow_dispatch: 10 | 11 | workflow_run: 12 | workflows: ["tests_ubuntu"] 13 | branches: [master] 14 | types: 15 | - completed 16 | 17 | jobs: 18 | windows-tests: 19 | if: ${{ github.event.workflow_run.conclusion == 'success' }} 20 | runs-on: windows-latest 21 | strategy: 22 | fail-fast: true 23 | matrix: 24 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 25 | 26 | steps: 27 | - uses: actions/checkout@v2 28 | 29 | - name: Set up Python ${{ matrix.python-version }} 30 | uses: actions/setup-python@v2 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | 34 | - name: setup-msys2 35 | uses: msys2/setup-msys2@v2 36 | with: 37 | msystem: MINGW64 38 | update: true 39 | install: >- 40 | mingw-w64-x86_64-cairo 41 | 42 | - name: install deps for Windows 43 | run: pip3 install pycairo 44 | 45 | - name: install tox 46 | run: | 47 | python3 -m pip install --upgrade pip 48 | pip3 install tox==4.23.2 49 | pip3 install tox-gh-actions==3.2.0 50 | 51 | - name: test with tox 52 | run: | 53 | tox -- -vv 54 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "AutoWoE" 3 | version = "1.3.4" 4 | description = "Library for automatic interpretable model building (Whitebox AutoML)" 5 | authors = ["Vakhrushev Anton "] 6 | readme = "README.md" 7 | homepage = "https://github.com/AILab-MLTools/AutoMLWhitebox" 8 | repository = "https://github.com/AILab-MLTools/AutoMLWhitebox" 9 | classifiers = [ 10 | "Programming Language :: Python :: 3.8", 11 | "Programming Language :: Python :: 3.9", 12 | "Programming Language :: Python :: 3.10", 13 | "Programming Language :: Python :: 3.11", 14 | "Programming Language :: Python :: 3.12", 15 | "Operating System :: OS Independent", 16 | "Intended Audience :: Science/Research", 17 | "Development Status :: 3 - Alpha", 18 | "Environment :: Console", 19 | "Natural Language :: English", 20 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 21 | "Typing :: Typed" 22 | ] 23 | 24 | [tool.poetry.dependencies] 25 | python = ">=3.8" 26 | numpy = "*" 27 | scipy = "*" 28 | pandas = "*" 29 | scikit-learn = "*" 30 | lightgbm = "*" 31 | sphinx = "*" 32 | sphinx-rtd-theme = "*" 33 | joblib = "*" 34 | pytz = "*" 35 | pytest = "*" 36 | jinja2 = "*" 37 | matplotlib = "*" 38 | seaborn = "*" 39 | tqdm = "^4.62.3" 40 | StrEnum = "^0.4.7" 41 | 42 | 43 | 44 | [tool.poetry.dev-dependencies] 45 | notebook = "^6.4.6" 46 | black = "20.8b1" 47 | pre-commit = "2.15.0" 48 | mypy = "^0.910" 49 | tox = "*" 50 | darglint = "^1.8.1" 51 | flake8-docstrings = "^1.6.0" 52 | isort = "5.7.0" 53 | jupyter-contrib-nbextensions = "^0.5.1" 54 | jupyter_nbextensions_configurator = "^0.4.1" 55 | 56 | [build-system] 57 | requires = ["poetry>=0.12"] 
58 | build-backend = "poetry.masonry.api" 59 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/s3.py: -------------------------------------------------------------------------------- 1 | """S3 wrapper.""" 2 | # ruff: noqa 3 | 4 | import s3fs 5 | 6 | 7 | class S3Client(s3fs.S3FileSystem): 8 | """Класс-обёртка для доступа к хранилищу S3. 9 | 10 | Используется для доступа к объектам хранилища с использованием интерфейса файловой системы. 11 | 12 | """ 13 | 14 | def __init__(self, aws_access_key_id, aws_secret_access_key, namespace=None, endpoint_url=None, **kwargs): 15 | """Конструктор объекта файловой системы на S3 SberCloud. 16 | 17 | Args: 18 | aws_access_key_id: Публичный ключ доступа к бакету S3 19 | aws_secret_access_key: Приватный ключ доступа к бакету S3 20 | namespace: Идентификатор пространства пользователя в хранилище SberCloud. Используется для формировании 21 | URL web-сервиса S3 SberCloud. Если не задан, то необходимо задать URL в аргументе endpoint_url 22 | endpoint_url: URL web-сервиса S3 SberCloud. Если не задан, то URL будет автоматически 23 | сконструирован на основании значения namespace. 24 | kwargs: Дополнительные параметры, передаваемые конструктору s3fs.S3FileSystem 25 | 26 | """ 27 | if not namespace and not endpoint_url: 28 | raise ValueError("Either namespace or endpoint_url is required") 29 | 30 | self.namespace = namespace 31 | self.aws_access_key_id = aws_access_key_id 32 | self.aws_secret_access_key = aws_secret_access_key 33 | self.endpoint_url = endpoint_url 34 | 35 | super(S3Client, self).__init__( 36 | key=self.aws_access_key_id, 37 | secret=self.aws_secret_access_key, 38 | client_kwargs={"endpoint_url": self.endpoint_url}, 39 | **kwargs, 40 | ) 41 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | # Manually triggerable in github 5 | workflow_dispatch: 6 | 7 | push: 8 | 9 | 10 | pull_request: 11 | 12 | 13 | jobs: 14 | pre-commit: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: checkout 18 | uses: actions/checkout@v4 19 | 20 | - uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.10" 23 | 24 | - name: pre-commit 25 | uses: pre-commit/action@v3.0.1 26 | 27 | codespell: 28 | runs-on: ubuntu-latest 29 | 30 | steps: 31 | - name: checkout 32 | uses: actions/checkout@v4 33 | 34 | - name: codespell 35 | run: | 36 | pip install codespell 37 | codespell --skip="*.js,*.csv,*.ipynb,*shaptxt*" 38 | 39 | linux-py310-tests: 40 | needs: pre-commit 41 | runs-on: ubuntu-latest 42 | if: | 43 | ( github.event_name == 'push' ) && ( needs.pre-commit.result == 'success' ) 44 | || 45 | ( github.event_name == 'pull_request' ) && ( needs.pre-commit.result == 'success' ) 46 | || 47 | ( github.event_name == 'workflow_dispatch' ) && ( needs.pre-commit.result == 'success' ) 48 | 49 | steps: 50 | - uses: actions/checkout@v4 51 | 52 | - name: Set up Python 53 | uses: actions/setup-python@v4 54 | 55 | - uses: Gr1N/setup-poetry@v8 56 | with: 57 | poetry-version: 1.8.0 58 | 59 | # - name: update pip if python 3.12 60 | # run: pip install setuptools && python -m ensurepip --upgrade 61 | 62 | - name: install deps for Ubuntu 63 | run: sudo apt-get install build-essential libcairo2 libpango-1.0-0 libpangocairo-1.0-0 libgdk-pixbuf2.0-0 libffi-dev shared-mime-info 64 | 65 | - name: install tox 66 | run: | 67 | python3 -m pip install --upgrade pip 68 | pip3 
install tox==4.23.2 69 | pip3 install tox-gh-actions==3.2.0 70 | 71 | - name: test with tox 72 | run: | 73 | tox 74 | -------------------------------------------------------------------------------- /autowoe/lib/pipelines/pipeline_homotopy.py: -------------------------------------------------------------------------------- 1 | # noqa: D100 2 | 3 | import lightgbm as lgb 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.model_selection import StratifiedKFold 7 | 8 | from autowoe.lib.utilities.utils import TaskType, flatten 9 | 10 | 11 | class HTransform: 12 | """Homotopy transform. 13 | 14 | Args: 15 | x: Feature. 16 | y: Target. 17 | cv_splits: Number of splits. 18 | 19 | """ 20 | 21 | def __init__(self, task: TaskType, x: pd.Series, y: pd.Series, cv_splits: int = 5): 22 | self.x, self.y = x, y 23 | self._task = task 24 | # TODO: for what ? 25 | self.cv = self._get_cv(cv_splits) 26 | 27 | @staticmethod 28 | def _get_cv(cv_splits: int) -> StratifiedKFold: 29 | return StratifiedKFold(n_splits=cv_splits, random_state=323, shuffle=True) 30 | 31 | def __call__(self, tree_params: dict) -> np.ndarray: 32 | """Return the boundaries of the split by the transmitted sample and parameters. 33 | 34 | Args: 35 | tree_params: dict or lightgbm tree params 36 | 37 | Returns: 38 | Splitting. 39 | 40 | """ 41 | default_tree_params = { 42 | "boosting_type": "rf", 43 | "objective": "binary" if self._task == TaskType.BIN else "regression", 44 | "bagging_freq": 1, 45 | "bagging_fraction": 0.999, 46 | "feature_fraction": 0.999, 47 | "bagging_seed": 323, 48 | "verbosity": -1, 49 | } 50 | 51 | unite_params = {**default_tree_params, **tree_params, "num_threads": 1} 52 | lgb_train = lgb.Dataset(self.x.values.astype(np.float32)[:, np.newaxis], label=self.y) 53 | gbm = lgb.train(params=unite_params, train_set=lgb_train, num_boost_round=1) 54 | 55 | d_tree_prop = flatten(gbm.dump_model()["tree_info"][0]) 56 | limits = {d_tree_prop[key] for key in d_tree_prop if "threshold" in key} 57 | 58 | limits = list(limits) 59 | limits.sort() 60 | 61 | return np.unique(limits) 62 | -------------------------------------------------------------------------------- /autowoe/lib/logging.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | import logging 4 | import sys 5 | import warnings 6 | 7 | 8 | logging.captureWarnings(True) 9 | 10 | debug_log_format = f"%(asctime)s - [%(levelname)s] - %(name)s - (%(filename)s).%(funcName)s(%(lineno)d) - %(message)s" 11 | default_log_format = f"%(message)s" 12 | 13 | 14 | def verbosity_to_loglevel(verbosity): 15 | if verbosity <= 0: 16 | log_level = logging.ERROR 17 | warnings.filterwarnings("ignore") 18 | elif verbosity == 1: 19 | log_level = logging.WARNING 20 | elif verbosity == 2: 21 | log_level = logging.INFO 22 | else: 23 | log_level = logging.DEBUG 24 | 25 | return log_level 26 | 27 | 28 | def get_file_handler(): 29 | file_handler = logging.FileHandler("x.log") 30 | file_handler.setLevel(logging.WARNING) 31 | file_handler.setFormatter(logging.Formatter(default_log_format)) 32 | return file_handler 33 | 34 | 35 | def get_stream_handler(stream, level=None, handler_filter=None): 36 | stream_handler = logging.StreamHandler(stream) 37 | stream_handler.setFormatter(logging.Formatter(default_log_format)) 38 | 39 | if level: 40 | stream_handler.setLevel(level) 41 | 42 | if handler_filter: 43 | stream_handler.addFilter(handler_filter) 44 | 45 | return stream_handler 46 | 47 | 48 | def get_logger(name=None, level=None): 49 | class 
InfoFilter(logging.Filter): 50 | def filter(self, rec): 51 | return rec.levelno in (logging.DEBUG, logging.INFO) 52 | 53 | logger = logging.getLogger(name) 54 | 55 | if level: 56 | logger.setLevel(level) 57 | 58 | if logger.hasHandlers(): 59 | logger.handlers.clear() 60 | 61 | logger.addHandler(get_stream_handler(stream=None, level=logging.WARNING)) 62 | logger.addHandler(get_stream_handler(stream=sys.stdout, level=logging.DEBUG, handler_filter=InfoFilter())) 63 | 64 | logger.propagate = False 65 | 66 | return logger 67 | 68 | 69 | class DuplicateFilter(object): 70 | def __init__(self): 71 | self.msgs = set() 72 | 73 | def filter(self, record): 74 | rv = record.msg not in self.msgs 75 | self.msgs.add(record.msg) 76 | return rv 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## AutoWoE library 2 | [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/AutoWoE)](https://pypi.org/project/AutoWoE) 3 | [![PyPI - Version](https://img.shields.io/pypi/v/AutoWoE)](https://pypi.org/project/AutoWoE) 4 | ![pypi - Downloads](https://img.shields.io/pypi/dm/AutoWoE?color=green&label=PyPI%20downloads&logo=pypi&logoColor=green) 5 | [![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/sb-ai-lab/AutoMLWhitebox/CI.yml)](https://github.com/sb-ai-lab/AutoMLWhitebox/actions/workflows/CI.yml?query=branch%3Amaster) 6 | 7 | This is the repository for **AutoWoE** library, developed by LightAutoML group. This library can be used for automatic creation of interpretable ML model based on feature binning, WoE features transformation, feature selection and Logistic Regression. 8 | 9 | **Authors:** Vakhrushev Anton, Grigorii Penkin, Alexander Kirilin 10 | 11 | **Library setup** can be done by one of three scenarios below: 12 | 13 | 1. Installation from PyPI: 14 | ```bash 15 | pip install autowoe 16 | ``` 17 | 2. Installation from source code 18 | 19 | First of all you need to install [git](https://git-scm.com/downloads) and [poetry](https://python-poetry.org/docs/#installation). 20 | 21 | ```bash 22 | 23 | # Load WhiteBox source code 24 | git clone https://github.com/AILab-MLTools/AutoMLWhitebox.git 25 | 26 | cd AutoMLWhiteBox/ 27 | 28 | # !!!Choose only one item!!! 29 | 30 | # 1. Recommended: Create virtual environment inside your project directory 31 | poetry config virtualenvs.in-project true 32 | 33 | # 2. Global installation: Don't create virtual environment 34 | poetry config virtualenvs.create false --local 35 | 36 | # For more information read poetry docs 37 | 38 | # Install WhiteBox 39 | poetry install 40 | 41 | ``` 42 | 43 | 44 | **Usage tutorials** are in Jupyter notebooks in the repository root. For **parameters description** take a look at `parameters_info.md`. 45 | 46 | **Bugs / Questions / Suggestions:** 47 | - Seek prompt advice in [Telegram group](https://t.me/joinchat/sp8P7sdAqaU0YmRi). 48 | - Open bug reports and feature requests on GitHub [issues](https://github.com/sb-ai-lab/AutoMLWhitebox/issues). 
49 | - Also follow our [Telegram channel](https://t.me/lightautoml) 50 | -------------------------------------------------------------------------------- /tests/integration/test_eda_allfeatures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.metrics import roc_auc_score 5 | 6 | from pandas import Series 7 | 8 | from autowoe import AutoWoE 9 | 10 | 11 | def test_eda_all_features(train_data): 12 | df = train_data 13 | 14 | TARGET_NAME = "target" 15 | 16 | num_features = [col for col in df.columns if col.startswith("number")][:10] 17 | cat_features = [col for col in df.columns if col.startswith("string")][:5] 18 | 19 | df = df[num_features + cat_features + [TARGET_NAME]] 20 | 21 | train_df, test_df = train_test_split(df, stratify=df[TARGET_NAME], test_size=0.4, random_state=42, shuffle=True) 22 | 23 | autowoe = AutoWoE( 24 | task="BIN", 25 | n_jobs=1, 26 | verbose=0, 27 | # turn off initial importance selection - this step force all features to pass into the binning stage 28 | imp_th=-1, 29 | ) 30 | 31 | autowoe.fit(train=train_df, target_name=TARGET_NAME) 32 | 33 | test_pred = autowoe.predict_proba(test_df) 34 | 35 | score = roc_auc_score(test_df[TARGET_NAME], test_pred) 36 | 37 | assert np.isclose(score, 0.6186, atol=1e-4), f"Real score is {score}" 38 | 39 | enc = autowoe.test_encoding(train_df, list(autowoe.woe_dict.keys()), bins=True) 40 | fails_counter = 0 41 | for col in enc.columns: 42 | start_time = time.time() 43 | 44 | grp = enc.groupby(col).size() 45 | woe = autowoe.woe_dict[col] 46 | 47 | woe_val = Series(woe.cod_dict).reset_index() 48 | woe_val.columns = [col, "WoE"] 49 | woe_val["count"] = woe_val[col].map(grp).fillna(0).values.astype(int) 50 | if woe.f_type == "cat": 51 | woe_val["bin"] = woe_val[col] 52 | else: 53 | split = list(woe.split.astype(np.float32)) 54 | mapper = {n: f"({x}; {y}]" for (n, (x, y)) in enumerate(zip(["-inf"] + split, split + ["inf"]))} 55 | woe_val["bin"] = woe_val[col].map(mapper) 56 | woe_val["bin"] = np.where(woe_val["bin"].isnull().values, woe_val[col], woe_val["bin"]) 57 | 58 | if time.time() - start_time > 0.3: 59 | fails_counter += 1 60 | assert fails_counter <= 1, f"There were {fails_counter} fails, it's more than 1" 61 | -------------------------------------------------------------------------------- /autowoe/lib/selectors/l1.py: -------------------------------------------------------------------------------- 1 | """Selector based on Lasso.""" 2 | 3 | from typing import Dict, List, Tuple, TypeVar 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from autowoe.lib.selectors.utils import Result, l1_select 9 | from autowoe.lib.utilities.utils import TaskType 10 | 11 | from .utils import F_LIST_TYPE, FEATURE 12 | 13 | WoE = TypeVar("WoE") 14 | 15 | 16 | class L1: 17 | """L1 selector. 18 | 19 | Args: 20 | interpreted_model: Build interpreted model. 21 | train: Train features. 22 | target: Train target. 23 | n_jobs: Number of threads. 24 | cv_split: Cross-Val splits. 
25 | 26 | """ 27 | 28 | def __init__( 29 | self, 30 | task: TaskType, 31 | interpreted_model: bool, 32 | train: pd.DataFrame, 33 | target: pd.Series, 34 | n_jobs: int, 35 | cv_split: Dict[int, Tuple[List[int], List[int]]], 36 | ): 37 | self.task = task 38 | self.train = train 39 | self.target = target 40 | 41 | self.__interpreted_model = interpreted_model 42 | self.__n_jobs = n_jobs 43 | self.__features = train.columns 44 | self.__cv_split = cv_split 45 | 46 | def __call__( 47 | self, features_fit: List[FEATURE], l1_grid_size: int, l1_exp_scale: float, metric_tol: float = 1e-4 48 | ) -> Tuple[F_LIST_TYPE, Result]: 49 | """Run selector. 50 | 51 | Args: 52 | features_fit: List of features. 53 | l1_grid_size: Number of points on grid. 54 | l1_exp_scale: Maximum value of `C`. 55 | metric_tol: Metric tolerance. 56 | 57 | Returns: 58 | Selected features, summary info. 59 | 60 | 61 | """ 62 | np.random.seed(323) 63 | features_fit_ = features_fit.copy() 64 | dataset = self.train[features_fit_], self.target 65 | 66 | best_features, result = l1_select( 67 | self.task, 68 | interpreted_model=self.__interpreted_model, 69 | n_jobs=self.__n_jobs, 70 | dataset=dataset, 71 | l1_grid_size=l1_grid_size, 72 | l1_exp_scale=l1_exp_scale, 73 | cv_split=self.__cv_split, 74 | metric_tol=metric_tol, 75 | ) 76 | 77 | return best_features, result 78 | -------------------------------------------------------------------------------- /tests/integration/test_dates_and_stat_model.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from sklearn.metrics import roc_auc_score 4 | 5 | from autowoe import ReportDeco, AutoWoE 6 | 7 | 8 | def test_dates_and_stat_model(train_data, test_data, test_target): 9 | 10 | train = train_data 11 | test = test_data 12 | 13 | test["target"] = test_target.values 14 | 15 | num_col = list(filter(lambda x: "numb" in x, train.columns)) 16 | num_feature_type = {x: "real" for x in num_col} 17 | 18 | date_col = list(filter(lambda x: "datetime" in x, train.columns)) 19 | date_feature_type = {x: (None, ("d", "wd")) for x in date_col} 20 | 21 | features_type = dict(**num_feature_type, **date_feature_type) 22 | # подробно параметры описаны в Example_1 23 | auto_woe = AutoWoE( 24 | monotonic=True, max_bin_count=4, oof_woe=False, regularized_refit=False, p_val=0.05, debug=False, verbose=0 25 | ) 26 | auto_woe = ReportDeco(auto_woe) 27 | 28 | start_fit_time = time.time() 29 | auto_woe.fit( 30 | train[num_col + date_col + ["target"]], 31 | target_name="target", 32 | features_type=features_type, 33 | ) 34 | 35 | assert time.time() - start_fit_time < 50, f"Fit time is {time.time() - start_fit_time}, it's more than 50" 36 | 37 | start_pred_time = time.time() 38 | pred = auto_woe.predict_proba(test) 39 | 40 | assert time.time() - start_pred_time < 5, f"Predict time is {time.time() - start_pred_time}, it's more than 5" 41 | 42 | score = roc_auc_score(test["target"], pred) 43 | 44 | assert score > 0.78 45 | 46 | report_params = { 47 | "automl_date_column": "report_month", # колонка с датой в формате params['datetimeFormat'] 48 | "output_path": "./AUTOWOE_REPORT_2", # папка, куда сгенерится отчет и сложатся нужные файлы 49 | "report_name": "___НАЗВАНИЕ ОТЧЕТА___", 50 | "report_version_id": 1, 51 | "city": "Воронеж", 52 | "model_aim": "___ЦЕЛЬ ПОСТРОЕНИЯ МОДЕЛИ___", 53 | "model_name": "___НАЗВАНИЕ МОДЕЛИ___", 54 | "zakazchik": "___ЗАКАЗЧИК___", 55 | "high_level_department": "___ПОДРАЗДЕЛЕНИЕ___", 56 | "ds_name": "___РАЗРАБОТЧИК МОДЕЛИ___", 57 | 
"target_descr": "___ОПИСАНИЕ ЦЕЛЕВОГО СОБЫТИЯ___", 58 | "non_target_descr": "___ОПИСАНИЕ НЕЦЕЛЕВОГО СОБЫТИЯ___", 59 | } 60 | 61 | auto_woe.generate_report(report_params) 62 | 63 | # import shutil 64 | 65 | # shutil.rmtree("AUTOWOE_REPORT_2") 66 | -------------------------------------------------------------------------------- /parameters_info.md: -------------------------------------------------------------------------------- 1 | ## Whitebox pipeline parameters: 2 | 3 | ### General params: 4 | 5 | - n_jobs 6 | - debug 7 | 8 | ### 0) Simple typing and trash removal 9 | #### 0.0) Remove trash feats 10 | 11 | Medium: 12 | - th_nan 13 | - th_const 14 | 15 | #### 0.1) Typing (auto and user defined) 16 | 17 | Critical: 18 | - features_type (dict) {'age': 'real', 'education': 'cat', 'birth_date': (None, ("d", "wd"), ...} 19 | 20 | #### 0.2) Dates and categories encoding 21 | 22 | Critical: 23 | - features_type (for datetimes) 24 | 25 | Optional: 26 | - cat_alpha (int) - greater means more conservative encoding 27 | 28 | 29 | ### 1) Initial feature selection (selection based on gbm importance) 30 | 31 | Critical: 32 | - select_type (None or int) 33 | - imp_type (if type(select_type) is int 'perm_imt'/'feature_imp') 34 | 35 | Optional: 36 | - imt_th (float) - threshold for select_type is None 37 | 38 | ### 2) Binning: 39 | 40 | Critical: 41 | - monotonic / features_monotone_constraints 42 | - max_bin_count / max_bin_count 43 | - min_bin_size 44 | 45 | - cat_merge_to 46 | - nan_merge_to 47 | 48 | Medium: 49 | - force_single_split 50 | 51 | Optional: 52 | - min_bin_mults 53 | - min_gains_to_split 54 | 55 | ### 3) WoE estimation WoE = LN( ((% 0 in bin) / (% 0 in sample)) / ((% 1 in bin) / (% 1 in sample)) ): 56 | 57 | Critical: 58 | - oof_woe 59 | 60 | Optional: 61 | - woe_diff_th 62 | - n_folds (if oof_woe) 63 | 64 | ### 4) Post selection: 65 | 66 | #### 4.0) Partial dependencies with target 67 | 68 | Critical: 69 | - auc_th 70 | 71 | #### 4.1) VIF 72 | 73 | Critical: 74 | - vif_th 75 | 76 | #### 4.2) Partial correlcations 77 | 78 | Critical: 79 | - pearson_th 80 | 81 | ### 5) Model based selection 82 | 83 | Optional: 84 | - n_folds 85 | - l1_grid_size 86 | - l1_exp_scale 87 | 88 | 89 | ### 6) Final model refit: 90 | 91 | Critical: 92 | - regularized_refit 93 | - p_val (if not regularized_refit) 94 | - validation (if not regularized_refit) 95 | 96 | Optional: 97 | - interpreted_model 98 | - l1_grid_size (if regularized_refit) 99 | - l1_exp_scale (if regularized_refit) 100 | 101 | ### 7) Report generation 102 | 103 | - report_params 104 | -------------------------------------------------------------------------------- /tests/integration/test_autotyping.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from sklearn.metrics import roc_auc_score 4 | 5 | from autowoe import ReportDeco, AutoWoE 6 | 7 | 8 | def test_autotyping(cat_data): 9 | 10 | data = cat_data 11 | 12 | train = data.iloc[:14000, :] 13 | test = data.iloc[14000:, :] 14 | 15 | # подробно параметры описаны в Example_1 16 | auto_woe = AutoWoE( 17 | monotonic=False, 18 | max_bin_count=5, 19 | oof_woe=True, 20 | regularized_refit=True, 21 | p_val=0.05, 22 | debug=False, 23 | verbose=0, 24 | cat_merge_to="to_maxp", 25 | nan_merge_to="to_maxp", 26 | ) 27 | auto_woe = ReportDeco(auto_woe) 28 | 29 | autowoe_fit_params = { 30 | "train": train, 31 | "target_name": "isFraud", 32 | } 33 | start_fit_time = time.time() 34 | auto_woe.fit(**autowoe_fit_params) 35 | 36 | assert time.time() - 
start_fit_time < 60, f"Fit time is {time.time() - start_fit_time}, it's more than 60 seconds" 37 | 38 | start_pred_time = time.time() 39 | pred = auto_woe.predict_proba(test) 40 | 41 | assert ( 42 | time.time() - start_pred_time < 5 43 | ), f"Prediction time is {time.time() - start_pred_time}, it's more than 5 seconds" 44 | 45 | score = roc_auc_score(test[autowoe_fit_params["target_name"]], pred) 46 | 47 | assert score > 0.8 48 | 49 | values = {} 50 | for value in auto_woe.private_features_type.values(): 51 | if value not in values: 52 | values[value] = 0 53 | values[value] += 1 54 | 55 | assert ( 56 | values["cat"] == 12 and values["real"] == 61 57 | ), f"There're should be 12 cat and 61 reals, but we have {values['cat']} cats and {values['real']} reals" 58 | 59 | report_params = { 60 | "automl_date_column": "report_month", # колонка с датой в формате params['datetimeFormat'] 61 | "output_path": "./AUTOWOE_REPORT_3", # папка, куда сгенерится отчет и сложатся нужные файлы 62 | "report_name": "___НАЗВАНИЕ ОТЧЕТА___", 63 | "report_version_id": 1, 64 | "city": "Воронеж", 65 | "model_aim": "___ЦЕЛЬ ПОСТРОЕНИЯ МОДЕЛИ___", 66 | "model_name": "___НАЗВАНИЕ МОДЕЛИ___", 67 | "zakazchik": "___ЗАКАЗЧИК___", 68 | "high_level_department": "___ПОДРАЗДЕЛЕНИЕ___", 69 | "ds_name": "___РАЗРАБОТЧИК МОДЕЛИ___", 70 | "target_descr": "___ОПИСАНИЕ ЦЕЛЕВОГО СОБЫТИЯ___", 71 | "non_target_descr": "___ОПИСАНИЕ НЕЦЕЛЕВОГО СОБЫТИЯ___", 72 | } 73 | 74 | auto_woe.generate_report(report_params) 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | # C extensions 6 | *.so 7 | # DS_store 8 | .DS_Store 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | # lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg MANIFEST 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template before PyInstaller builds the exe, so as to 29 | # inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .nox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | *.py, 46 | .hypothesis/ 47 | .pytest_cache/ 48 | cover/ 49 | # Translations 50 | *.mo 51 | *.pot 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | db.sqlite3 56 | db.sqlite3-journal 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | # Scrapy stuff: 61 | .scrapy 62 | # Sphinx documentation 63 | docs/_build/ 64 | # PyBuilder 65 | .pybuilder/ 66 | target/ 67 | # Jupyter Notebook 68 | .ipynb_checkpoints 69 | # IPython 70 | profile_default/ 71 | ipython_config.py 72 | # pyenv 73 | # For a library or package, you might want to ignore these files since the code is intended to run in multiple 74 | # environments; otherwise, check them in: .python-version pipenv 75 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. However, in case of 76 | #collaboration, if having platform-specific dependencies or dependencies having no cross-platform support, pipenv 77 | #may install dependencies that don't work, or not install all needed dependencies. 
Pipfile.lock 78 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 79 | __pypackages__/ 80 | # Celery stuff 81 | celerybeat-schedule celerybeat.pid 82 | # SageMath parsed files 83 | *.sage.py 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | autowoe_venv/ 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | # Rope project settings 97 | .ropeproject 98 | # mkdocs documentation 99 | /site 100 | # mypy 101 | .mypy_cache/ 102 | .dmypy.json 103 | dmypy.json 104 | # Pyre type checker 105 | .pyre/ 106 | # pytype static type analyzer 107 | .pytype/ 108 | # Cython debug symbols 109 | cython_debug/ 110 | .idea/ 111 | .vscode/ 112 | temp/ 113 | -------------------------------------------------------------------------------- /autowoe/lib/cat_encoding/cat_encoding.py: -------------------------------------------------------------------------------- 1 | # noqa: D100 2 | 3 | from copy import deepcopy 4 | from typing import Dict, List, Union 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | 10 | class CatEncoding: 11 | """Class for categorical data converting/reconverting to float values. 12 | 13 | Args: 14 | data: Data for encoding. First column - feature, second - target. 15 | 16 | """ 17 | 18 | def __init__(self, data: pd.DataFrame): 19 | self.data = data 20 | self.col = data.columns 21 | 22 | self.data_info = pd.DataFrame(index=data.index, columns=[self.col[0], "mean_enc"]) 23 | self.data_info[self.col[0]] = self.data[self.col[0]].values 24 | 25 | def __call__( 26 | self, cv_index_split: Dict[int, List[int]], nan_index: np.array, cat_alpha: float = 1.0 27 | ) -> pd.DataFrame: 28 | """Mean_target encoding by cross-val. 29 | 30 | Args: 31 | cv_index_split: CV indexes. 32 | nan_index: Indexes of nan-values. 33 | cat_alpha: Smooth coefficient alpha. 34 | 35 | Returns: 36 | Encoded values. 37 | 38 | """ 39 | cv_index_split_ = deepcopy(cv_index_split) 40 | feature, target = self.col 41 | 42 | for key in cv_index_split_: 43 | train_index, test_index = cv_index_split_[key] 44 | train_index, test_index = np.setdiff1d(train_index, nan_index), np.setdiff1d(test_index, nan_index) 45 | 46 | data_sl = self.data.iloc[train_index] 47 | d_agg = data_sl.groupby(feature)[target].agg(["sum", "count"]) 48 | d_agg = (d_agg["sum"] + cat_alpha * data_sl[target].mean()) / (d_agg["count"] + cat_alpha) 49 | 50 | d_agg = d_agg.to_dict() 51 | self.data_info.iloc[test_index, 1] = self.data_info.iloc[test_index, 0].map(d_agg) 52 | 53 | train_f = self.data.copy() 54 | train_f.iloc[:, 0] = self.data_info["mean_enc"].values 55 | return train_f 56 | 57 | def mean_target_reverse(self, split: Union[List[float], np.ndarray]) -> Dict[int, int]: 58 | """Reverse mean-target. 59 | 60 | Should be run after '__call__' 61 | 62 | Args: 63 | split: Splits. 64 | 65 | Returns: 66 | Mapping. 
67 | 68 | """ 69 | df = self.data_info.copy() 70 | df["split_cat"] = np.searchsorted(split, df.mean_enc.values) 71 | 72 | crosstab = pd.crosstab(df[self.col[0]], df.split_cat) 73 | crosstab = crosstab.div(crosstab.sum(axis=1), axis=0) 74 | max_cat = np.argmax(crosstab.values, axis=1) 75 | 76 | # словарь соответствий: имя категории -> номер бина 77 | return dict(zip(crosstab.index, max_cat)) 78 | -------------------------------------------------------------------------------- /autowoe/lib/types_handler/features_checkers_handlers.py: -------------------------------------------------------------------------------- 1 | """Type feature checkers.""" 2 | 3 | from typing import Optional, Tuple, cast 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | F_UNIQUE = 5 9 | 10 | 11 | def dates_checker(feature: pd.Series) -> bool: 12 | """Check that feature belongs to the datetime. 13 | 14 | Args: 15 | feature: Values. 16 | 17 | Returns: 18 | Flag. 19 | 20 | """ 21 | try: 22 | feature = pd.to_datetime(feature) 23 | if (feature.min().year <= 1975) or (feature.min().year is np.nan): 24 | return False 25 | else: 26 | return True 27 | except ValueError: 28 | return False 29 | except Exception: 30 | raise ValueError("Something is wrong with object types") 31 | 32 | 33 | def dates_handler( 34 | feature: pd.Series, feature_type: Tuple[Optional[str], Tuple[str, ...]] = (None, ("wd", "m", "y", "d")) 35 | ) -> Tuple: 36 | """Handle datetime feature. 37 | 38 | feature_type ("%Y%d%m", ("m", "d", "wd", "h", "min")), (None, ("m", "d", "wd", "h", "min")) 39 | 40 | Args: 41 | feature: Datetime values. 42 | feature_type: Tuple of date format and seasonality. 43 | 44 | Returns: 45 | Processed datetime, feature_type. 46 | 47 | """ 48 | date_format = feature_type[0] 49 | seasonality = feature_type[1] 50 | 51 | if not len(seasonality): 52 | raise ValueError("Seasonality is empty!") 53 | 54 | seas2func = { 55 | "y": lambda x: x.year, 56 | "m": lambda x: x.month, 57 | "d": lambda x: x.day, 58 | "wd": lambda x: x.weekday(), 59 | "h": lambda x: x.hour, 60 | "min": lambda x: x.minute, 61 | } 62 | 63 | new_features = [] 64 | new_feature = cast(pd.Series, pd.to_datetime(feature, format=date_format)) 65 | 66 | for seas in seasonality: 67 | new_feature_name = str(new_feature.name) + "__F__" + seas 68 | 69 | new_feature_ = new_feature.map(lambda x: seas2func[seas](x)) # noqa: B023 70 | new_features.append((new_feature_name, new_feature_)) 71 | 72 | return new_features, feature_type 73 | 74 | 75 | def cat_checker(feature: pd.Series) -> bool: 76 | """Check that feature belongs to the category. 77 | 78 | Args: 79 | feature: Values. 80 | 81 | Returns: 82 | Flag. 
83 | 84 | """ 85 | dtypes = [object, str] 86 | if np.__version__ < "1.18.0": 87 | dtypes.append(np.str) 88 | if feature.dtype in dtypes: 89 | return True 90 | 91 | feature_unique = feature.unique() 92 | if 2 < feature_unique.shape[0] <= F_UNIQUE and np.all(feature_unique.astype(np.int64) == feature_unique): 93 | return True 94 | else: 95 | return False 96 | -------------------------------------------------------------------------------- /tests/integration/test_marked_values.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import time 4 | 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.metrics import roc_auc_score 7 | 8 | from autowoe import AutoWoE 9 | 10 | 11 | def test_marked_values(train_data): 12 | df = train_data 13 | 14 | TARGET_NAME = "target" 15 | 16 | num_features = [col for col in df.columns if col.startswith("number")][:10] 17 | cat_features = [col for col in df.columns if col.startswith("string")][:5] 18 | 19 | df = df[num_features + cat_features + [TARGET_NAME]] 20 | 21 | df.iloc[:10, 0] = -1 22 | df.iloc[10:20, 0] = -2 23 | df.iloc[:20, 1] = 1234567890 24 | df.iloc[:20, 11] = "Special" 25 | 26 | train_df, test_df = train_test_split(df, stratify=df[TARGET_NAME], test_size=0.4, random_state=42, shuffle=True) 27 | 28 | assert all(train_df["string_1"].head(1) == "other") 29 | 30 | autowoe = AutoWoE(task="BIN", n_jobs=1, verbose=0) 31 | 32 | assert autowoe._params["l1_exp_scale"] == 4 33 | assert autowoe._params["imp_type"] == "feature_imp" 34 | assert autowoe._params["population_size"] is None 35 | assert not autowoe._params["monotonic"] 36 | 37 | none_params = ( 38 | "woe_dict", 39 | "train_df", 40 | "split_dict", 41 | "target", 42 | "clf", 43 | "features_fit", 44 | "_cv_split", 45 | "_private_features_type", 46 | "_public_features_type", 47 | "_weights", 48 | "_intercept", 49 | "_p_vals", 50 | "feature_history", 51 | ) 52 | for param in none_params: 53 | assert autowoe.__dict__[param] is None, f"This value should be None, but it's {autowoe.__dict__[param]}" 54 | 55 | start_fit_time = time.time() 56 | autowoe.fit( 57 | train=train_df, 58 | target_name=TARGET_NAME, 59 | features_mark_values={"number_0": (-1, -2), "number_1": (1234567890,), "string_1": ("Special",)}, 60 | ) 61 | 62 | assert time.time() - start_fit_time < 10, f"Fit time is {time.time() - start_fit_time}, it's more than 10" 63 | 64 | start_predict_time = time.time() 65 | test_pred = autowoe.predict_proba(test_df) 66 | assert time.time() - start_predict_time < 0.05, f"Diff is {time.time() - start_predict_time}, >= 0.05" 67 | 68 | score = roc_auc_score(test_df[TARGET_NAME], test_pred) 69 | 70 | assert score > 0.58 71 | 72 | assert autowoe.get_sql_inference_query("FEATURE_TABLE") 73 | 74 | representation = autowoe.get_model_represenation() 75 | 76 | features_representation = pd.DataFrame(representation["features"]) 77 | 78 | assert all( 79 | np.isclose(features_representation["number_9"]["splits"], [7072.0, 11699.5, 13292.5]) 80 | ), "There are different splits" 81 | 82 | assert np.isclose(representation["intercept"], -4.5482746), "There are different intercept coef" 83 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/cv_split_f.py: -------------------------------------------------------------------------------- 1 | """Cross validation utilities.""" 2 | 3 | from typing import Iterable, Optional 4 | 5 | import numpy as np 6 | from 
sklearn.model_selection import GroupKFold, StratifiedKFold 7 | 8 | from autowoe.lib.utilities.utils import TaskType 9 | 10 | 11 | def cv_split_f( 12 | x, y, task: TaskType, group_kf: Optional[Iterable] = None, n_splits: int = 6, random_state: int = 42 13 | ) -> dict: 14 | """Get CV-splits. 15 | 16 | Args: 17 | x: Features. 18 | y: Target. 19 | task: Task. 20 | group_kf: Groups. 21 | n_splits: Number of splits. 22 | random_state: Random state. 23 | 24 | Returns: 25 | CV-splits. 26 | 27 | """ 28 | if task == TaskType.BIN: 29 | if group_kf is not None: 30 | gkf = GroupKFold(n_splits=n_splits) 31 | return dict(enumerate(gkf.split(X=x, y=y, groups=group_kf))) 32 | else: 33 | skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state) 34 | return dict(enumerate(skf.split(X=x, y=y))) 35 | else: 36 | skf = StratifiedKFoldReg(n_splits=n_splits, shuffle=True, random_state=random_state) 37 | return dict(enumerate(skf.split(X=x, y=y))) 38 | 39 | 40 | class StratifiedKFoldReg(StratifiedKFold): 41 | """Stratification for continuous variable. 42 | 43 | Stratification method 'sorted' was taken from: 44 | (https://github.com/scikit-learn/scikit-learn/issues/4757) 45 | 46 | Args: 47 | method: Method for stratification 48 | n_y_bins: Number of target bins. Default: None. 49 | 50 | """ 51 | 52 | def __init__(self, method: Optional[str] = None, n_y_bins: Optional[int] = None, **kwargs): 53 | self._method = method 54 | self._n_y_bins = n_y_bins 55 | 56 | super().__init__(**kwargs) 57 | 58 | def split(self, X, y, groups=None): 59 | """Generate indices to split data into training and test set.""" 60 | if self._method is None: 61 | return self._sorted_split(X, y, groups) 62 | else: 63 | raise NotImplementedError 64 | 65 | def _sorted_split(self, X, y, groups=None): 66 | n_samples = len(y) 67 | 68 | n_labels = int(np.floor(n_samples / self.n_splits)) 69 | y_labels_sorted = np.concatenate([np.repeat(ii, self.n_splits) for ii in range(n_labels)]) 70 | 71 | mod = np.mod(n_samples, self.n_splits) 72 | 73 | _, labels_idx = np.unique(y_labels_sorted, return_index=True) 74 | rand_label_ix = np.random.choice(labels_idx, mod, replace=False) 75 | y_labels_sorted = np.insert(y_labels_sorted, rand_label_ix, y_labels_sorted[rand_label_ix]) 76 | 77 | map_labels_y = dict(zip(np.argsort(y), y_labels_sorted)) 78 | 79 | y_labels = np.array([map_labels_y[ii] for ii in range(n_samples)]) 80 | 81 | return super().split(X, y_labels, groups) 82 | 83 | def _bins_split(self, X, y, groups=None): 84 | y_labels = y 85 | return super().split(X, y_labels, groups) 86 | -------------------------------------------------------------------------------- /autowoe/lib/selectors/selector_last.py: -------------------------------------------------------------------------------- 1 | """Post-selection.""" 2 | 3 | from typing import Any, Dict, List, Optional, Tuple, TypeVar 4 | 5 | import pandas as pd 6 | 7 | from autowoe.lib.utilities.utils import Result, TaskType 8 | 9 | from .composed_selector import ComposedSelector 10 | from .l1 import L1 11 | from .utils import F_LIST_TYPE 12 | 13 | __all__ = ["Selector"] 14 | 15 | WoE = TypeVar("WoE") 16 | 17 | 18 | class Selector: 19 | """Class for post-selection of features. 20 | 21 | Args: 22 | interpreted_model: Build interpreted model. 23 | task: Task. 24 | train: Train features. 25 | target: Train target. 26 | features_type: Features types. 27 | n_jobs: Number of threads. 28 | cv_split: Cross-Val splits. 
29 | features_mark_values: 30 | 31 | """ 32 | 33 | def __init__( 34 | self, 35 | interpreted_model: bool, 36 | task: TaskType, 37 | train: pd.DataFrame, 38 | target: pd.Series, 39 | features_type: Dict[str, str], 40 | n_jobs: int, 41 | cv_split: Dict[int, Tuple[List[int], List[int]]], 42 | features_mark_values: Optional[Dict[str, Tuple[Any]]], 43 | ): 44 | self.__features_fit = list(features_type.keys()) 45 | self.__pearson_selector = ComposedSelector(train, target, task, features_mark_values) 46 | self.__main_selector = L1( 47 | task, train=train, target=target, interpreted_model=interpreted_model, n_jobs=n_jobs, cv_split=cv_split 48 | ) 49 | self.train = train 50 | self.target = target 51 | 52 | self.__interpreted_model = interpreted_model 53 | self.__n_jobs = n_jobs 54 | self.__features = train.columns 55 | self.__cv_split = cv_split 56 | 57 | @property 58 | def features_fit(self): 59 | """Input features.""" 60 | return self.__features_fit 61 | 62 | def __call__( 63 | self, 64 | feature_history: Dict[str, str], 65 | pearson_th: float, 66 | vif_th: float, 67 | metric_th: float, 68 | l1_grid_size: int, 69 | l1_exp_scale: float, 70 | metric_tol: float = 1e-4, 71 | ) -> Tuple[F_LIST_TYPE, Result]: 72 | """Run selector. 73 | 74 | Args: 75 | pearson_th: Pearson threshold. 76 | vif_th: VIF threshold 77 | metric_th: Metric threshold. 78 | l1_grid_size: Number of points on grid. 79 | l1_exp_scale: Maximum values of `C`. 80 | metric_tol: Metric tolerance. 81 | feature_history: HIstory of features filtering. 82 | 83 | Returns: 84 | Selected features, summary L1-selector info. 85 | 86 | """ 87 | features_fit = self.__pearson_selector( 88 | feature_history, self.features_fit, pearson_th=pearson_th, metric_th=metric_th, vif_th=vif_th 89 | ) 90 | features_before = set(features_fit) 91 | features_fit, result = self.__main_selector( 92 | features_fit=features_fit, l1_grid_size=l1_grid_size, l1_exp_scale=l1_exp_scale, metric_tol=metric_tol 93 | ) 94 | if feature_history is not None: 95 | features_diff = features_before - set(features_fit) 96 | for feat in features_diff: 97 | feature_history[feat] = f"Pruned by {self.__main_selector.__class__.__name__} selector" 98 | 99 | return features_fit, result 100 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/utils.py: -------------------------------------------------------------------------------- 1 | """Utility.""" 2 | 3 | from collections import namedtuple 4 | from typing import Any, Callable, Dict, Hashable, Iterable, Set, Tuple, Union 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from strenum import StrEnum 9 | 10 | Result = namedtuple("Result", ["score", "reg_alpha", "is_neg", "min_weights"]) 11 | 12 | 13 | class TaskType(StrEnum): 14 | """Solvable task types.""" 15 | 16 | BIN: "TaskType" = "BIN" # type: ignore 17 | REG: "TaskType" = "REG" # type: ignore 18 | 19 | 20 | def drop_keys(dict_: Dict, keys: Iterable[Hashable]) -> Dict: 21 | """Drop multiple keys from dict. 22 | 23 | Args: 24 | dict_: Dictionary. 25 | keys: Dropped keys. 26 | 27 | Returns: 28 | Filtered dictornary. 29 | 30 | """ 31 | for key in keys: 32 | dict_.pop(key) 33 | return dict_ 34 | 35 | 36 | def flatten(d: dict, parent_key: str = "", sep: str = "_"): 37 | """Flatten Dictionary of dictionaries. 38 | 39 | Args: 40 | d: Dictionary with nested dictionaries. 41 | parent_key: Parent outer key. 42 | sep: Separator for merged keys. 43 | 44 | Returns: 45 | Expanded Dictionary. 
46 | 47 | """ 48 | items = [] 49 | for k, v in d.items(): 50 | new_key = parent_key + sep + k if parent_key else k 51 | if isinstance(v, dict): 52 | items.extend(flatten(v, new_key, sep=sep).items()) 53 | else: 54 | items.append((new_key, v)) 55 | return dict(items) 56 | 57 | 58 | def get_task_type(values: np.ndarray) -> TaskType: 59 | """Determine task type. 60 | 61 | Args: 62 | values: Array of values. 63 | 64 | Returns: 65 | task. 66 | 67 | """ 68 | n_unique_values = np.unique(values).shape[0] 69 | 70 | task: str 71 | if n_unique_values == 1: 72 | raise RuntimeError("Only unique value in target") 73 | elif n_unique_values == 2: 74 | task = TaskType.BIN 75 | else: 76 | task = TaskType.REG 77 | 78 | return task 79 | 80 | 81 | def feature_changing( 82 | feature_history: Dict[str, str], 83 | step_name: str, 84 | features_before: Union[Dict[str, str], Set[str]], 85 | func: Callable, 86 | *args, 87 | **kwargs, 88 | ) -> Tuple[Any, Any]: 89 | """Safe feature filtering. 90 | 91 | Args: 92 | feature_history: History changes of features processing. 93 | step_name: Name of step. 94 | features_before: Features before processing. 95 | func: Filtering function. 96 | args: Function positional arguments. 97 | kwargs: Function named arguments. 98 | 99 | Returns: 100 | output: 101 | filter_features: 102 | 103 | """ 104 | # features_before: Set[str] 105 | if isinstance(features_before, dict): 106 | features_before = set(features_before.keys()) 107 | else: 108 | features_before = set(features_before) 109 | 110 | output, filter_features = func(*args, **kwargs) 111 | if isinstance(filter_features, dict): 112 | features_after = set(filter_features.keys()) 113 | elif isinstance(filter_features, pd.Series): 114 | features_after = set(filter_features.index) 115 | elif isinstance(filter_features, Iterable): 116 | features_after = set(filter_features) 117 | else: 118 | raise RuntimeError("Can't extract features after function call.") 119 | 120 | features_diff = features_before - features_after 121 | for feature in features_diff: 122 | feature_history[feature] = step_name 123 | 124 | return output, filter_features 125 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/eli5_permutation.py: -------------------------------------------------------------------------------- 1 | # module was taken from eli5 lib as is 2 | # this was made to simplify dependencies 3 | # ruff: noqa 4 | 5 | """ELI5 library. 6 | 7 | A module for computing feature importances by measuring how score decreases 8 | when a feature is not available. It contains basic building blocks; 9 | there is a full-featured sklearn-compatible implementation 10 | in :class:`~.PermutationImportance`. 11 | A similar method is described in Breiman, "Random Forests", Machine Learning, 12 | 45(1), 5-32, 2001 (available online at 13 | https://www.stat.berkeley.edu/%7Ebreiman/randomforest2001.pdf), with an 14 | application to random forests. It is known in literature as 15 | "Mean Decrease Accuracy (MDA)" or "permutation importance". 16 | 17 | """ 18 | 19 | from __future__ import absolute_import 20 | 21 | from typing import Any 22 | from typing import Callable 23 | from typing import List 24 | from typing import Tuple 25 | 26 | import numpy as np 27 | 28 | from sklearn.utils import check_random_state 29 | 30 | 31 | def iter_shuffled(X, columns_to_shuffle=None, pre_shuffle=False, random_state=None): 32 | """Return an iterator of X matrices which have one or more columns shuffled. 
33 | 34 | After each iteration yielded matrix is mutated inplace, so 35 | if you want to use multiple of them at the same time, make copies. 36 | ``columns_to_shuffle`` is a sequence of column numbers to shuffle. 37 | By default, all columns are shuffled once, i.e. columns_to_shuffle 38 | is ``range(X.shape[1])``. 39 | If ``pre_shuffle`` is True, a copy of ``X`` is shuffled once, and then 40 | result takes shuffled columns from this copy. If it is False, 41 | columns are shuffled on fly. ``pre_shuffle = True`` can be faster 42 | if there is a lot of columns, or if columns are used multiple times. 43 | 44 | # noqa: DAR101 45 | # noqa: DAR301 46 | 47 | """ 48 | rng = check_random_state(random_state) 49 | 50 | if columns_to_shuffle is None: 51 | columns_to_shuffle = range(X.shape[1]) 52 | 53 | if pre_shuffle: 54 | X_shuffled = X.copy() 55 | rng.shuffle(X_shuffled) 56 | 57 | X_res = X.copy() 58 | for columns in columns_to_shuffle: 59 | if pre_shuffle: 60 | X_res[:, columns] = X_shuffled[:, columns] 61 | else: 62 | rng.shuffle(X_res[:, columns]) 63 | yield X_res 64 | X_res[:, columns] = X[:, columns] 65 | 66 | 67 | def get_score_importances( 68 | score_func, # type: Callable[[Any, Any], float] 69 | X, 70 | y, 71 | n_iter=5, # type: int 72 | columns_to_shuffle=None, 73 | random_state=None, 74 | ): 75 | # type: (...) -> Tuple[float, List[np.ndarray]] 76 | """Return ``(base_score, score_decreases)`` tuple with the base score and score decreases when a feature is not available. 77 | 78 | ``base_score`` is ``score_func(X, y)``; ``score_decreases`` 79 | is a list of length ``n_iter`` with feature importance arrays 80 | (each array is of shape ``n_features``); feature importances are computed 81 | as score decrease when a feature is not available. 82 | ``n_iter`` iterations of the basic algorithm is done, each iteration 83 | starting from a different random seed. 
84 | If you just want feature importances, you can take a mean of the result:: 85 | import numpy as np 86 | from eli5.permutation_importance import get_score_importances 87 | 88 | base_score, score_decreases = get_score_importances(score_func, X, y) 89 | feature_importances = np.mean(score_decreases, axis=0) 90 | 91 | # noqa: DAR301 92 | # noqa: DAR101 93 | # noqa: DAR201 94 | 95 | """ 96 | rng = check_random_state(random_state) 97 | base_score = score_func(X, y) 98 | scores_decreases = [] 99 | for i in range(n_iter): 100 | scores_shuffled = _get_scores_shufled(score_func, X, y, columns_to_shuffle=columns_to_shuffle, random_state=rng) 101 | scores_decreases.append(-scores_shuffled + base_score) 102 | return base_score, scores_decreases 103 | 104 | 105 | def _get_scores_shufled(score_func, X, y, columns_to_shuffle=None, random_state=None): 106 | Xs = iter_shuffled(X, columns_to_shuffle, random_state=random_state) 107 | return np.array([score_func(X_shuffled, y) for X_shuffled in Xs]) 108 | -------------------------------------------------------------------------------- /autowoe/lib/report/report_generator.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa 2 | 3 | import os 4 | 5 | from datetime import datetime 6 | from shutil import copyfile 7 | 8 | from jinja2 import Environment 9 | from jinja2 import FileSystemLoader 10 | 11 | from ..logging import get_logger 12 | 13 | 14 | logger = get_logger(__name__) 15 | 16 | 17 | class ReportGenerator: 18 | def __init__(self): 19 | self.env = Environment(loader=FileSystemLoader(searchpath=os.path.dirname(__file__))) 20 | self.base_template = self.env.get_template("report_en_v2.html") 21 | 22 | def write_report_to_file(self, report_params): 23 | with open(os.path.join(report_params["output_path"], "autowoe_report.html"), "w", encoding="utf-8") as f: 24 | f.write( 25 | self.base_template.render( 26 | report_name=str(report_params["report_name"]), 27 | report_version=str(report_params["report_version_id"]), 28 | city=str(report_params["city"]), 29 | year=str(datetime.now().year), 30 | model_aim=str(report_params["model_aim"]), 31 | model_name=str(report_params["model_name"]), 32 | zakazchik=str(report_params["zakazchik"]), 33 | high_level_department=str(report_params["high_level_department"]), 34 | ds_name=str(report_params["ds_name"]), 35 | target_descr=str(report_params["target_descr"]), 36 | non_target_descr=str(report_params["non_target_descr"]), 37 | count_train=report_params["count_train"], 38 | train_target_cnt=report_params["train_target_cnt"], 39 | train_nontarget_cnt=report_params["train_nontarget_cnt"], 40 | train_target_perc=report_params["train_target_perc"], 41 | train_auc_full=report_params["train_auc_full"], 42 | train_gini_full=report_params["train_gini_full"], 43 | count_test=report_params["count_test"], 44 | test_target_cnt=report_params["test_target_cnt"], 45 | test_nontarget_cnt=report_params["test_nontarget_cnt"], 46 | test_target_perc=report_params["test_target_perc"], 47 | test_auc_full=report_params["test_auc_full"], 48 | test_gini_full=report_params["test_gini_full"], 49 | train_gini_confint=report_params["train_gini_confint"], 50 | test_gini_confint=report_params["test_gini_confint"], 51 | model_coef=report_params["model_coef"], 52 | p_vals=report_params["p_vals"], 53 | p_vals_test=report_params["p_vals_test"], 54 | final_nan_stat=report_params["final_nan_stat"], 55 | features_roc_auc=report_params["features_roc_auc"], 56 | features_woe=report_params["features_woe"], 57 | 
woe_bars=report_params["woe_bars"], 58 | backlash_plots=report_params["backlash_plots"], 59 | train_vif=report_params["train_vif"], 60 | psi_total=report_params["psi_total"], 61 | psi_zeros=report_params["psi_zeros"], 62 | psi_ones=report_params["psi_ones"], 63 | psi_binned_total=report_params["psi_binned_total"], 64 | psi_binned_zeros=report_params["psi_binned_zeros"], 65 | psi_binned_ones=report_params["psi_binned_ones"], 66 | scorecard=report_params["scorecard"], 67 | feature_history=report_params["feature_history"], 68 | feature_contribution=report_params["feature_contribution"], 69 | corr_map_table=report_params["corr_map_table"], 70 | binned_p_stats_train=report_params["binned_p_stats_train"], 71 | binned_p_stats_test=report_params["binned_p_stats_test"], 72 | dategrouped_value=report_params["dategrouped_value"], 73 | dategrouped_gini=report_params["dategrouped_gini"], 74 | dategrouped_nan=report_params["dategrouped_nan"], 75 | ) 76 | ) 77 | 78 | def generate_report(self, report_params): 79 | copyfile( 80 | os.path.join(os.path.dirname(__file__), "shaptxt"), os.path.join(report_params["output_path"], "shap.js") 81 | ) 82 | 83 | self.write_report_to_file(report_params) 84 | 85 | logger.info(f"Successfully wrote {os.path.join(report_params['output_path'], 'autowoe_report.html')}.") 86 | -------------------------------------------------------------------------------- /tests/integration/test_basic_usage_and_params.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | 4 | from sklearn.metrics import roc_auc_score 5 | 6 | from autowoe import ReportDeco, AutoWoE 7 | 8 | 9 | def test_basic_usage_and_params(train_data, test_data, test_target): 10 | 11 | train = train_data 12 | 13 | train = train.iloc[:, 50:100] 14 | 15 | num_col = list(filter(lambda x: "numb" in x, train.columns)) 16 | num_feature_type = {x: "real" for x in num_col} 17 | 18 | date_col = filter(lambda x: "datetime" in x, train.columns) 19 | for col in date_col: 20 | train[col + "_year"] = train[col].map(lambda x: x.year) 21 | train[col + "_weekday"] = train[col].map(lambda x: x.weekday()) 22 | train[col + "_month"] = train[col].map(lambda x: x.month) 23 | 24 | test = test_data 25 | 26 | date_col = filter(lambda x: "datetime" in x, test.columns) 27 | for col in date_col: 28 | test[col + "_year"] = test[col].map(lambda x: x.year) 29 | test[col + "_weekday"] = test[col].map(lambda x: x.weekday()) 30 | test[col + "_month"] = test[col].map(lambda x: x.month) 31 | 32 | test["target"] = test_target.values 33 | 34 | cat_col = list(filter(lambda x: "str" in x, train.columns)) 35 | cat_feature_type = {x: "cat" for x in cat_col} 36 | 37 | year_col = list(filter(lambda x: "_year" in x, train.columns)) 38 | year_feature_type = {x: "cat" for x in year_col} 39 | 40 | weekday_col = list(filter(lambda x: "_weekday" in x, train.columns)) 41 | weekday_feature_type = {x: "cat" for x in weekday_col} 42 | 43 | month_col = list(filter(lambda x: "_month" in x, train.columns)) 44 | month_feature_type = {x: "cat" for x in month_col} 45 | 46 | features = cat_col + year_col + weekday_col + month_col + num_col 47 | 48 | features_type = dict( 49 | **num_feature_type, **cat_feature_type, **year_feature_type, **weekday_feature_type, **month_feature_type 50 | ) 51 | 52 | features_monotone_constraints = {"number_74": "auto", "number_83": "auto"} 53 | 54 | max_bin_count = {"number_47": 3, "number_51": 2} 55 | 56 | auto_woe = AutoWoE( 57 | task="BIN", 58 | interpreted_model=True, 59 | 
monotonic=False, 60 | max_bin_count=5, 61 | select_type=None, 62 | pearson_th=0.9, 63 | auc_th=0.505, 64 | vif_th=10.0, 65 | imp_th=0, 66 | th_const=32, 67 | force_single_split=True, 68 | th_nan=0.01, 69 | th_cat=0.005, 70 | woe_diff_th=0.01, 71 | min_bin_size=0.01, 72 | min_bin_mults=(2, 4), 73 | min_gains_to_split=(0.0, 0.5, 1.0), 74 | auc_tol=1e-4, 75 | cat_alpha=100, 76 | cat_merge_to="to_woe_0", 77 | nan_merge_to="to_woe_0", 78 | oof_woe=True, 79 | n_folds=6, 80 | n_jobs=4, 81 | l1_grid_size=20, 82 | l1_exp_scale=6, 83 | imp_type="feature_imp", 84 | regularized_refit=False, 85 | p_val=0.05, 86 | debug=False, 87 | verbose=0, 88 | ) 89 | 90 | auto_woe = ReportDeco(auto_woe) 91 | 92 | start_fit_time = time.time() 93 | auto_woe.fit( 94 | train[features + ["target"]], 95 | target_name="target", 96 | features_type=features_type, 97 | group_kf=None, 98 | max_bin_count=max_bin_count, 99 | features_monotone_constraints=features_monotone_constraints, 100 | validation=test, 101 | ) 102 | 103 | assert time.time() - start_fit_time < 25, f"Fit time is {time.time() - start_fit_time}, it's more than 25" 104 | 105 | start_predict_time = time.time() 106 | pred = auto_woe.predict_proba(test) 107 | assert ( 108 | time.time() - start_predict_time < 3.5 109 | ), f"Predict time is {time.time() - start_predict_time}, it's more than 3.5" 110 | 111 | score_1 = roc_auc_score(test["target"], pred) 112 | 113 | assert score_1 > 0.76 114 | 115 | # assert np.isclose(score_1, 0.7791178), f"Real score is {score_1}" 116 | 117 | pred = auto_woe.predict_proba(test[["number_72"]], report=False) 118 | score_2 = roc_auc_score(test["target"], pred) 119 | 120 | assert np.isclose(score_1, score_2), f"Scores {score_1} and {score_2} musts be equal" 121 | 122 | report_params = { 123 | "automl_date_column": "report_month", # колонка с датой в формате params['datetimeFormat'] 124 | "output_path": "./AUTOWOE_REPORT_1", # папка, куда сгенерится отчет и сложатся нужные файлы 125 | "report_name": "___НАЗВАНИЕ ОТЧЕТА___", 126 | "report_version_id": 1, 127 | "city": "Воронеж", 128 | "model_aim": "___ЦЕЛЬ ПОСТРОЕНИЯ МОДЕЛИ___", 129 | "model_name": "___НАЗВАНИЕ МОДЕЛИ___", 130 | "zakazchik": "___ЗАКАЗЧИК___", 131 | "high_level_department": "___ПОДРАЗДЕЛЕНИЕ___", 132 | "ds_name": "___РАЗРАБОТЧИК МОДЕЛИ___", 133 | "target_descr": "___ОПИСАНИЕ ЦЕЛЕВОГО СОБЫТИЯ___", 134 | "non_target_descr": "___ОПИСАНИЕ НЕЦЕЛЕВОГО СОБЫТИЯ___", 135 | } 136 | 137 | auto_woe.generate_report(report_params) 138 | -------------------------------------------------------------------------------- /autowoe/lib/optimizer/optimizer.py: -------------------------------------------------------------------------------- 1 | """Optimization of decision tree parameters.""" 2 | 3 | from collections import OrderedDict 4 | from copy import copy 5 | from itertools import product 6 | from typing import Any, Dict, Iterable, List, Tuple, Union 7 | 8 | import lightgbm as lgb 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from autowoe.lib.utilities.cv_split_f import cv_split_f 13 | from autowoe.lib.utilities.utils import TaskType 14 | 15 | # TODO: Do we need random state here? 16 | np.random.seed(232) 17 | 18 | 19 | class TreeParamOptimizer: 20 | """Optimizer of decision tree parameters. 21 | 22 | Args: 23 | data: Dataset. First column - feature, second - Target. 24 | params_range: OrderedDict with parameters and ranges for binning algorithms 25 | Ex. 
params_range = OrderedDict({"max_depth": (4, 7, 17, 2, 3), "min_child_samples": (40000, 20000, 5000),}) 26 | 27 | """ 28 | 29 | _cv_metric_map = {"auc": "auc", "mse": "l2"} 30 | 31 | def __init__(self, data: pd.DataFrame, task: TaskType, params_range: Dict[str, tuple], n_folds: int = 5): 32 | self._params_range = copy(params_range) 33 | self._task = task 34 | self._metric = "auc" if self._task == TaskType.BIN else "mse" 35 | 36 | ds_params = {} 37 | try: 38 | ds_params["min_data_in_bin"] = self._params_range.pop("min_data_in_bin")[0] 39 | except KeyError: 40 | pass 41 | 42 | # TODO: Fix double saved data 43 | self._X = pd.DataFrame(data.iloc[:, 0]) 44 | self._y = data.iloc[:, 1] 45 | 46 | self._lgb_train = lgb.Dataset(data=self._X.copy(), label=self._y.copy(), params=ds_params) 47 | self.n_folds = n_folds 48 | self._params_stats = None 49 | 50 | def __get_folds(self, random_state): 51 | skf = cv_split_f(self._X, self._y, self._task, None, self.n_folds, random_state) 52 | 53 | # folds = np.zeros(self._lgb_train.data.shape[0]) 54 | # for fold_idx, tt_idx in skf.items(): 55 | # _, test_idx = tt_idx 56 | # folds[test_idx] = fold_idx 57 | 58 | # return skf.items() 59 | 60 | for v in skf.values(): 61 | yield v 62 | 63 | @property 64 | def __params_gen(self) -> Iterable[Tuple]: 65 | return product(*self._params_range.values()) 66 | 67 | def __get_scores(self, params: Dict[str, Any], n: int) -> List[float]: 68 | """Scores for set of parameters. 69 | 70 | Args: 71 | params: Tree parameters. 72 | n: The amount of cross-validation to evaluate hyperparameters 73 | 74 | Returns: 75 | Scores. 76 | 77 | """ 78 | default_tree_params = { 79 | "boosting_type": "gbdt", 80 | "learning_rate": 1, 81 | "objective": "binary" if self._task == TaskType.BIN else "regression", 82 | "bagging_freq": 1, 83 | "bagging_fraction": 1, 84 | "feature_fraction": 1, 85 | "bagging_seed": 323, 86 | "n_jobs": 1, 87 | "verbosity": -1, 88 | } 89 | unite_params = {**params, **default_tree_params} 90 | 91 | score_add_string = "" 92 | if lgb.__version__ >= "4.1.0": 93 | score_add_string = "valid " 94 | 95 | scores = [] 96 | for seed in range(n): 97 | folds = self.__get_folds(seed) 98 | cv_results = lgb.cv( 99 | params=unite_params, train_set=self._lgb_train, num_boost_round=1, folds=folds, metrics=self._metric 100 | ) 101 | scores.append(cv_results[score_add_string + f"{self._cv_metric_map[self._metric]}-mean"]) 102 | 103 | return scores 104 | 105 | def __get_stats(self, stats: List[List[float]]): 106 | """Calculate statistics of scores. 107 | 108 | Args: 109 | stats: Scores [combinations of parameters, cv-s, number of folds in cv] 110 | 111 | """ 112 | stats = np.array(stats) 113 | median_, std_ = np.median(stats, axis=(1, 2)), np.std(stats, axis=(1, 2)) 114 | 115 | scores = zip(*(median_ if self._task == TaskType.BIN else -median_, -std_)) 116 | id_best = max(enumerate(scores), key=lambda x: x[1])[0] 117 | 118 | stat_score = zip(*(median_, std_)) 119 | self._params_stats = OrderedDict((key, value) for (key, value) in zip(self.__params_gen, stat_score)), id_best 120 | 121 | def __call__(self, n: int) -> Dict[str, Union[int, str, None]]: 122 | """Execute optimization. 123 | 124 | Args: 125 | n: Number of iterations. 126 | 127 | Returns: 128 | Best parameters. 
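The `params_range` grid above is enumerated exhaustively. A small sketch of the same enumeration, using the example ranges from the class docstring; the scoring step is only described in the comment:

from collections import OrderedDict
from itertools import product

params_range = OrderedDict({"max_depth": (4, 7, 17, 2, 3), "min_child_samples": (40000, 20000, 5000)})

for values in product(*params_range.values()):
    params = dict(zip(params_range.keys(), values))
    # each combination, e.g. {'max_depth': 4, 'min_child_samples': 40000},
    # is scored with lgb.cv and the one with the best median CV score wins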
129 | 130 | """ 131 | scores_ = [] 132 | for val in self.__params_gen: 133 | params = {key[1]: val[key[0]] for key in enumerate(self._params_range.keys())} 134 | scores_.append(self.__get_scores(params, n)) 135 | self.__get_stats(scores_) 136 | 137 | opt_params = list(self._params_stats[0].keys())[self._params_stats[1]] 138 | return dict(zip(self._params_range.keys(), opt_params)) 139 | -------------------------------------------------------------------------------- /autowoe/lib/types_handler/types_handler.py: -------------------------------------------------------------------------------- 1 | """Type processing.""" 2 | 3 | import collections 4 | from copy import deepcopy 5 | from typing import Any, Dict, Hashable, Optional 6 | 7 | import pandas as pd 8 | 9 | from .features_checkers_handlers import cat_checker, dates_checker, dates_handler 10 | 11 | 12 | class TypesHandler: 13 | """Класс для автоматического определения типов признаков. 14 | 15 | Базовая имплементация порядка разработки: 16 | 17 | 0. 18 | 0.a) Парсим то, что указал юзер 19 | 0.b) Даты парсим c указанием сезонности ("m", "d", "wd", "h", "min") 20 | (месяц, день, день недели, час, минута) 21 | 1. 22 | Если стринга, то категория 23 | 2. 24 | Если отношение shape[1] к количеству уникальных значений >> 5, то категория 25 | 26 | Args: 27 | train: 28 | public_features_type: 29 | max_bin_count: 30 | features_monotone_constraints: 31 | features_mark_values: 32 | 33 | """ 34 | 35 | def __init__( 36 | self, 37 | train: pd.DataFrame, 38 | public_features_type: Dict[Hashable, Any], 39 | max_bin_count: Optional[Dict[Hashable, Optional[int]]] = None, 40 | features_monotone_constraints: Optional[dict] = None, 41 | features_mark_values: Optional[dict] = None, 42 | ): 43 | self.__train = deepcopy(train) 44 | self.__public_features_type = deepcopy(public_features_type) 45 | self.__private_features_type: Dict[str, Any] = {} 46 | 47 | if max_bin_count is None: 48 | max_bin_count = {} 49 | self.__max_bin_count = collections.defaultdict(lambda: None, max_bin_count) 50 | 51 | if features_monotone_constraints is None: 52 | features_monotone_constraints = {} 53 | self.__features_monotone_constraints = collections.defaultdict(lambda: "0", features_monotone_constraints) 54 | 55 | @property 56 | def train(self): 57 | """Train data (Read only).""" 58 | return self.__train 59 | 60 | @property 61 | def public_features_type(self): 62 | """Public features types (Read only).""" 63 | return self.__public_features_type 64 | 65 | @property 66 | def private_features_type(self): 67 | """Private features types (Read only).""" 68 | return self.__private_features_type 69 | 70 | @property 71 | def max_bin_count(self): 72 | """Maximum bin count.""" 73 | return self.__max_bin_count 74 | 75 | @property 76 | def features_monotone_constraints(self): 77 | """Feature monotone constraints.""" 78 | return self.__features_monotone_constraints 79 | 80 | def __feature_handler(self, feature_name): 81 | if dates_checker(self.__train[feature_name]): 82 | new_features, feature_type = dates_handler(self.__train[feature_name]) 83 | self.__public_features_type[feature_name] = feature_type 84 | for new_feature_name, new_feature in new_features: 85 | self.__train[new_feature_name] = new_feature 86 | self.__max_bin_count[new_feature_name] = self.max_bin_count[feature_name] 87 | self.__private_features_type[new_feature_name] = "real" 88 | self.__features_monotone_constraints[new_feature_name] = self.features_monotone_constraints[ 89 | feature_name 90 | ] 91 | 92 | elif 
cat_checker(self.__train[feature_name]): 93 | self.__public_features_type[feature_name] = "cat" 94 | self.__private_features_type[feature_name] = "cat" 95 | self.__features_monotone_constraints[feature_name] = "1" 96 | else: 97 | self.__public_features_type[feature_name] = "real" 98 | self.__private_features_type[feature_name] = "real" 99 | 100 | def transform(self): 101 | """Основной метод данного класса. 102 | 103 | Если feature_type[feature] == None, то парсим тип признака 104 | Иначе происходит обработка указанных типов. 105 | Возможные типы признаков: 106 | "cat" 107 | "real" 108 | ("%Y%d%m", ("m", "d", "wd", "h", "min")) 109 | 110 | Returns: 111 | Info. 112 | 113 | """ 114 | for feature_name in self.public_features_type: 115 | if not self.public_features_type[feature_name]: 116 | self.__feature_handler(feature_name) 117 | elif isinstance(self.public_features_type[feature_name], tuple): # переданы данные для дат 118 | new_features, _ = dates_handler(self.train[feature_name], self.public_features_type[feature_name]) 119 | for new_feature_name, new_feature in new_features: 120 | self.__train[new_feature_name] = new_feature 121 | self.__max_bin_count[new_feature_name] = self.max_bin_count[feature_name] 122 | self.__private_features_type[new_feature_name] = "real" 123 | self.__features_monotone_constraints[new_feature_name] = self.__features_monotone_constraints[ 124 | feature_name 125 | ] 126 | 127 | elif self.public_features_type[feature_name] == "cat": 128 | self.__private_features_type[feature_name] = "cat" 129 | self.__features_monotone_constraints[feature_name] = "1" 130 | 131 | elif self.public_features_type[feature_name] == "real": 132 | self.__private_features_type[feature_name] = "real" 133 | self.__train[feature_name] = pd.to_numeric(self.train[feature_name], errors="coerce") 134 | 135 | else: 136 | raise ValueError("The specified data type is not supported") 137 | 138 | return ( 139 | self.train, 140 | self.public_features_type, 141 | self.private_features_type, 142 | self.max_bin_count, 143 | self.features_monotone_constraints, 144 | ) 145 | -------------------------------------------------------------------------------- /autowoe/lib/selectors/utils.py: -------------------------------------------------------------------------------- 1 | # noqa: D100 2 | 3 | from collections import namedtuple 4 | from typing import List, Mapping, Sequence, Tuple, Union 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.linear_model import LassoCV, LogisticRegressionCV 9 | from sklearn.metrics import roc_auc_score 10 | from sklearn.model_selection import BaseCrossValidator 11 | from sklearn.svm import l1_min_c 12 | 13 | from autowoe.lib.logging import get_logger 14 | from autowoe.lib.utilities.utils import TaskType 15 | 16 | logger = get_logger(__name__) 17 | 18 | Result = namedtuple("Result", ["score", "reg_alpha", "is_neg", "min_weights"]) 19 | 20 | FEATURE = Union[str, int, float] 21 | F_LIST_TYPE = Sequence[FEATURE] 22 | 23 | 24 | def scorer(estimator, x_train, y): 25 | """Evaluate ROC-AUC.""" 26 | return roc_auc_score(y, estimator.predict_proba(x_train)[:, 1]) 27 | 28 | 29 | class PredefinedFolds(BaseCrossValidator): 30 | """Predefined Folds.""" 31 | 32 | def __init__(self, cv_split: Mapping[int, Tuple[Sequence[int], Sequence[int]]]): 33 | self.cv_split = cv_split 34 | 35 | def _iter_test_indices( 36 | self, x_train: np.ndarray = None, y: np.ndarray = None, groups: np.ndarray = None 37 | ) -> np.ndarray: 38 | """Generates integer indices corresponding to test sets. 
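Stepping back to the `TypesHandler.transform` contract shown above: a user-provided type may be `None` (auto-detect), "cat", "real", or a date format plus seasonal components. A hedged example of such a mapping; the column names are invented:

features_type = {
    "income": "real",                         # numeric feature
    "region": "cat",                          # categorical feature
    "reg_date": ("%Y%d%m", ("m", "wd")),      # parse as date, derive month and weekday columns
    "score": None,                            # let TypesHandler infer the type
}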
39 | 40 | Args: 41 | x_train: Train features. 42 | y: Train target. 43 | groups: Groups. 44 | 45 | Yields: 46 | test set indexes. 47 | 48 | """ 49 | for n in self.cv_split: 50 | yield self.cv_split[n][1] 51 | 52 | def get_n_splits(self, *args, **kwargs) -> int: 53 | """Number of splits.""" 54 | return len(self.cv_split) 55 | 56 | 57 | def analyze_result( 58 | model: Union[LogisticRegressionCV, LassoCV], features_names: Sequence[str], interpreted_model: bool = True 59 | ) -> List[Result]: 60 | """Analyze the result of the searching coefficient regularization. 61 | 62 | Args: 63 | model: Linear model. 64 | features_names: List of features names. 65 | interpreted_model: Build interpreted model. 66 | 67 | Returns: 68 | Summary. 69 | 70 | """ 71 | scores = model.scores_[1] 72 | cs_scores = scores.mean(axis=0) 73 | 74 | cs_len = scores.shape[1] 75 | coef_ = np.moveaxis(model.coefs_paths_[1][:, :, :-1], 1, 0) 76 | 77 | if interpreted_model: 78 | cs_negs = (coef_.reshape((cs_len, -1)) <= 0).all(axis=1) 79 | else: 80 | cs_negs = [True] * cs_len 81 | 82 | cs_min_weights = [pd.Series(coef_[x].min(axis=0), index=features_names) for x in range(cs_len)] # .sort_values() 83 | 84 | results = [ 85 | Result(score, c, is_neg, min_weights) 86 | for (score, c, is_neg, min_weights) in zip(cs_scores, model.Cs, cs_negs, cs_min_weights) 87 | ] 88 | 89 | return results 90 | 91 | 92 | def l1_select( 93 | task: TaskType, 94 | interpreted_model: bool, 95 | n_jobs: int, 96 | dataset: Tuple[pd.DataFrame, pd.Series], 97 | l1_grid_size: int, 98 | l1_exp_scale: float, 99 | cv_split: Mapping[int, Tuple[Sequence[int], Sequence[int]]], 100 | metric_tol: float = 1e-4, 101 | ) -> Tuple[F_LIST_TYPE, Result]: 102 | """Select the main features according to the lasso model. 103 | 104 | Args: 105 | task: Task. 106 | interpreted_model: Create interpreted model. 107 | n_jobs: Number of threads. 108 | dataset: Tuple of features and target. 109 | l1_grid_size: Number of points on grid. 110 | l1_exp_scale: Maximum value of `C`. 111 | cv_split: Cross-Val splits. 112 | metric_tol: Metric tolerance. 113 | 114 | Returns: 115 | Selected features, summary info. 
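`cv_split` throughout the library is a plain mapping `{fold_index: (train_indices, test_indices)}`, and `PredefinedFolds` above simply replays those test indices as an sklearn splitter. A sketch of building such a mapping with KFold on synthetic data:

import numpy as np
from sklearn.model_selection import KFold

X = np.random.rand(100, 3)
cv_split = {
    i: (train_idx, test_idx)
    for i, (train_idx, test_idx) in enumerate(KFold(n_splits=5, shuffle=True, random_state=42).split(X))
}
# cv_split[0] -> (train row indices, test row indices) of the first fold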
116 | 117 | """ 118 | # fit model with crossvalidation 119 | cv = PredefinedFolds(cv_split) 120 | if task == TaskType.BIN: 121 | # get grid for cs 122 | cs = l1_min_c(dataset[0], dataset[1], loss="log", fit_intercept=True) * np.logspace( 123 | 0, l1_exp_scale, l1_grid_size 124 | ) 125 | logger.info(f"C parameter range in [{cs[0]}:{cs[-1]}], {l1_grid_size} values") 126 | 127 | model = LogisticRegressionCV( 128 | Cs=cs, 129 | solver="saga", 130 | tol=1e-5, 131 | cv=cv, 132 | penalty="l1", 133 | scoring=scorer, 134 | intercept_scaling=10000.0, 135 | max_iter=1000, 136 | n_jobs=n_jobs, 137 | random_state=42, 138 | ) 139 | else: 140 | # get grid for cs 141 | cs = np.logspace(0, l1_exp_scale, l1_grid_size + 1) 142 | alphas = 1.0 / cs[1:][::-1] 143 | logger.info(f"Alphas parameter range in [{alphas[0]}:{alphas[-1]}], {l1_grid_size} values") 144 | 145 | model = LassoCV( 146 | alphas=alphas, cv=cv, positive=interpreted_model, tol=1e-5, max_iter=1000, n_jobs=n_jobs, random_state=42 147 | ) 148 | 149 | model.fit(dataset[0].values, dataset[1].values) 150 | 151 | features_fit: List[str] 152 | if task == TaskType.BIN: 153 | # analyze cv results 154 | result = analyze_result(model, dataset[0].columns, interpreted_model) 155 | 156 | # perform selection 157 | # filter bad weights models 158 | scores_neg = [x for x in result if x.is_neg] 159 | # get top score from avail models 160 | max_score = max([x.score for x in result]) 161 | # get score with tolerance 162 | ok_score = max_score - metric_tol 163 | # select first model that is ok with tolerance 164 | res = None 165 | for res in scores_neg: 166 | if res.score >= ok_score: 167 | break 168 | 169 | # get selected features 170 | features_fit = [x for (x, y) in zip(dataset[0].columns, res.min_weights) if y != 0] 171 | logger.info(res) 172 | else: 173 | features_fit = [x for (x, y) in zip(dataset[0].columns, model.coef_) if y != 0] 174 | res = Result( 175 | score=model.mse_path_.mean(axis=1).min(), 176 | reg_alpha=model.alpha_, 177 | is_neg=[True] * model.coef_.shape[0], 178 | min_weights=np.min(model.coef_), 179 | ) 180 | 181 | return features_fit, res 182 | -------------------------------------------------------------------------------- /autowoe/lib/selectors/composed_selector.py: -------------------------------------------------------------------------------- 1 | """Compose several selector.""" 2 | 3 | from copy import copy 4 | from typing import Any, Dict, List, Optional, Tuple, TypeVar 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.metrics import r2_score, roc_auc_score 9 | 10 | from autowoe.lib.logging import get_logger 11 | from autowoe.lib.utilities.utils import TaskType, feature_changing 12 | 13 | from .utils import F_LIST_TYPE 14 | 15 | logger = get_logger(__name__) 16 | 17 | WoE = TypeVar("WoE") 18 | 19 | 20 | class ComposedSelector: 21 | """Compose feature selector. 22 | 23 | Sequential filtering of features by rules: 24 | 1) Unique WoE value. 25 | 2) Single feature model has metric lower than threshold. 26 | 3) VIF of feature greater than threshold. 27 | 4) There are features with a pair correlation above the threshold. 28 | 29 | Metrics: 30 | 1) BIN - AUC 31 | 2) REG - R2 32 | 33 | Args: 34 | train: Train features. 35 | target: Train target. 36 | task: Task. 37 | features_mark_values: Marked values of features. 
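For the binary task, `l1_select` above anchors its regularization grid at the smallest `C` for which any coefficient can become non-zero. A standalone sketch of that grid on synthetic data (parameter values are illustrative):

import numpy as np
from sklearn.svm import l1_min_c

X, y = np.random.rand(200, 5), (np.random.rand(200) > 0.5).astype(int)

l1_grid_size, l1_exp_scale = 20, 4
cs = l1_min_c(X, y, loss="log", fit_intercept=True) * np.logspace(0, l1_exp_scale, l1_grid_size)
# cs spans [C_min, C_min * 10**l1_exp_scale] on a log scale, matching the grid built above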
38 | 39 | """ 40 | 41 | default_metric_th = {TaskType.BIN: 0.5, TaskType.REG: 0.0} 42 | 43 | def __init__( 44 | self, 45 | train: pd.DataFrame, 46 | target: pd.Series, 47 | task: TaskType, 48 | features_mark_values: Optional[Dict[str, Tuple[Any]]], 49 | ): 50 | self.train = train 51 | self.target = target 52 | self.task = task 53 | self.features_mark_values = features_mark_values 54 | # precompute corrs 55 | 56 | if features_mark_values is not None: 57 | mask_good_values = pd.Series([True] * train.shape[0]) 58 | for col, mvs in features_mark_values.items(): 59 | if col in train.columns: 60 | mask_good_values = mask_good_values & (~train[col].isin(mvs)) 61 | else: 62 | mask_good_values = pd.Series([True] * train.shape[0], index=train.index) 63 | train_values = train[mask_good_values].values 64 | 65 | cc = np.abs(np.corrcoef(train_values, rowvar=False)) 66 | self.precomp_corr = pd.DataFrame(cc, index=train.columns, columns=train.columns) 67 | 68 | metrics = [] 69 | for col in train.columns: 70 | if task == TaskType.BIN: 71 | m = 1 - roc_auc_score(target, train[col]) 72 | else: 73 | m = r2_score(target, train[col]) 74 | metrics.append(m) 75 | self.precomp_metrics = pd.Series(metrics, index=train.columns) 76 | 77 | @staticmethod 78 | def __compare_msg(closure, value, msg=None): 79 | flg = closure(value) 80 | if not flg: 81 | logger.info(msg) 82 | return flg 83 | 84 | def __call__( 85 | self, 86 | feature_history: Dict[str, str], 87 | features_fit: List[str], 88 | pearson_th: float = 0.9, 89 | metric_th: Optional[float] = None, 90 | vif_th: float = 5.0, 91 | ) -> F_LIST_TYPE: 92 | """Filtered features.""" 93 | if metric_th is None: 94 | metric_th = self.default_metric_th[self.task] 95 | 96 | candidates = copy(features_fit) 97 | features_before = set(candidates) 98 | 99 | # откинем константные 100 | _, filter_features = feature_changing( 101 | feature_history, 102 | "Constant WoE value", 103 | features_before, 104 | lambda candidates: ( 105 | None, 106 | [ 107 | col 108 | for col in candidates 109 | if self.__compare_msg( 110 | lambda x: ~np.isnan(self.precomp_corr.loc[x, x]), 111 | col, 112 | f"Feature {col} removed due to single WOE value", 113 | ) 114 | ], 115 | ), # func 116 | candidates, # args 117 | # ..., # kwargs 118 | ) 119 | 120 | # откинем с низкой метрикой 121 | _, filter_features = feature_changing( 122 | feature_history, 123 | "Low metric value", # TODO: feature name 124 | filter_features, 125 | lambda candidates: ( 126 | None, 127 | [ 128 | col 129 | for col in candidates 130 | if self.__compare_msg( 131 | lambda x: self.precomp_metrics[x] >= metric_th, 132 | col, 133 | f"Feature {col} removed due to low metric value {self.precomp_metrics[col]}", 134 | ) 135 | ], 136 | ), # func 137 | filter_features, # args 138 | # ..., # kwargs 139 | ) 140 | candidates = filter_features 141 | 142 | # итеративный виф 143 | max_vif = np.inf 144 | while max_vif > vif_th: 145 | corrs = self.precomp_corr.loc[candidates, candidates] 146 | # fix singularity 147 | corrs = corrs.values + np.diag(np.ones(corrs.shape[0]) * 1e-4) 148 | vifs = np.linalg.inv(corrs).diagonal() 149 | 150 | max_vif_idx = vifs.argmax() 151 | max_vif = vifs[max_vif_idx] 152 | 153 | if max_vif >= vif_th: 154 | logger.info(f"Feature {candidates[max_vif_idx]} removed due to high VIF value = {max_vif}") 155 | if feature_history is not None: 156 | feature_history[candidates[max_vif_idx]] = f"High VIF value = {round(max_vif, 2)}" 157 | candidates = [x for (n, x) in enumerate(candidates) if n != max_vif_idx] 158 | 159 | # попарные 
корреляции 160 | # отсортируем по убыванию метрики 161 | order_ = np.array([self.precomp_metrics[x] for x in candidates]).argsort()[::-1] 162 | candidates = [candidates[x] for x in order_] 163 | 164 | n = 0 165 | while n < (len(candidates) - 1): 166 | partial_corrs = self.precomp_corr.loc[candidates[n], candidates[n + 1 :]] 167 | big_partial_corrs = partial_corrs[partial_corrs >= pearson_th] 168 | if len(big_partial_corrs) > 0: 169 | logger.info( 170 | ( 171 | f"Features {list(big_partial_corrs.index.values)}: " 172 | f"metric = {list(self.precomp_metrics[big_partial_corrs.index])} was removed due to " 173 | f"corr = {list(big_partial_corrs.values)} with feat {candidates[n]}: " 174 | f"metric = {self.precomp_metrics[candidates[n]]}" 175 | ) 176 | ) 177 | if feature_history is not None: 178 | for feat in big_partial_corrs.index.values: 179 | feature_history[feat] = f"High correlation with feat {candidates[n]}" 180 | 181 | candidates = [x for x in candidates if x not in set(big_partial_corrs.index.values)] 182 | n += 1 183 | 184 | return candidates 185 | -------------------------------------------------------------------------------- /autowoe/lib/selectors/selector_first.py: -------------------------------------------------------------------------------- 1 | """Selection of features according to the importance of the model.""" 2 | 3 | import logging 4 | from copy import deepcopy 5 | from typing import Any, Dict, Hashable, Optional, Tuple, Union 6 | 7 | import lightgbm as lgb 8 | import numpy as np 9 | import pandas as pd 10 | from pandas import DataFrame 11 | from sklearn.metrics import mean_squared_error, roc_auc_score 12 | from sklearn.model_selection import train_test_split 13 | 14 | from autowoe.lib.logging import get_logger 15 | from autowoe.lib.utilities.eli5_permutation import get_score_importances 16 | from autowoe.lib.utilities.utils import TaskType, drop_keys 17 | 18 | pd.options.mode.chained_assignment = None 19 | 20 | logger = get_logger(__name__) 21 | 22 | root_logger = logging.getLogger() 23 | level = root_logger.getEffectiveLevel() 24 | 25 | if level in (logging.CRITICAL, logging.ERROR, logging.WARNING): 26 | verbose_eval = 0 # False 27 | elif level == logging.INFO: 28 | verbose_eval = 100 29 | else: 30 | verbose_eval = 10 31 | 32 | 33 | def nan_constant_selector( 34 | data: DataFrame, features_type: Dict[Hashable, str], th_const: float = 32 35 | ) -> Tuple[DataFrame, Dict[Hashable, str]]: 36 | """Selector NaN / Const columns. 37 | 38 | Filters columns with a large number of NaN-values or with almost constant values. 39 | 40 | Args: 41 | data: DataFrame 42 | features_type: Dict[Hashable, str] 43 | th_const: Constant threshold. Filters if the number of valid values is less than the threshold. 44 | 45 | Returns: 46 | Data, features list. 
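The iterative VIF step in `ComposedSelector` above reads VIFs off the diagonal of the inverse correlation matrix. A short numeric sketch of that identity on synthetic collinear data:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(500, 3), columns=["a", "b", "c"])
df["c"] = 0.9 * df["a"] + 0.1 * np.random.rand(500)   # make "c" nearly collinear with "a"

corr = np.corrcoef(df.values, rowvar=False)
vifs = np.linalg.inv(corr).diagonal()
print(dict(zip(df.columns, vifs.round(2))))           # "a" and "c" get the largest VIFs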
47 | 48 | """ 49 | th_ = data.shape[0] - th_const 50 | 51 | features_to_drop = [] 52 | 53 | for col in features_type: 54 | nan_count = data[col].isna().sum() 55 | if nan_count >= th_: 56 | features_to_drop.append(col) 57 | else: 58 | vc = data[col].value_counts().values[0] 59 | if vc >= th_: 60 | features_to_drop.append(col) 61 | 62 | logger.info(f" features {features_to_drop} contain too many nans or identical values") 63 | data = data.drop(columns=features_to_drop, axis=1) 64 | features_type = drop_keys(features_type, features_to_drop) 65 | return data, features_type 66 | 67 | 68 | def get_score_function(model, task: TaskType): 69 | """Score function for task - {BIN: ROC_AUC, REG: MSE}.""" 70 | if task == TaskType.BIN: 71 | return lambda x, y: roc_auc_score(y, model.predict_proba(x)[:, 1]) 72 | else: 73 | return lambda x, y: -mean_squared_error(y, model.predict(x)) 74 | 75 | 76 | def feature_imp_selector( 77 | data: DataFrame, 78 | task: TaskType, 79 | features_type: Dict[Hashable, str], 80 | features_mark_values: Optional[Dict[str, Tuple[Any]]], 81 | target_name: Hashable, 82 | imp_th: float, 83 | imp_type: str, 84 | select_type: Union[None, int], 85 | process_num: int, 86 | ) -> Tuple[DataFrame, Dict[Hashable, str]]: 87 | """Features selection by imp_type. 88 | 89 | Available FS: 90 | - lgbm feature_importance 91 | - permutation importance 92 | 93 | Args: 94 | data: Dataset. 95 | task: Task. 96 | features_type: Features types. 97 | features_mark_values: Marked values of feature. 98 | target_name: Target column name. 99 | imp_th: Importance threshold. 100 | imp_type: Importance type ("feature_imp" -- feature_importances, "perm_imp" -- permutation_importances). 101 | select_type: Type of first feature selection. 102 | - If `None` then choose features with `feature_importance > 0`. 103 | - If `int` then choose the N-th best features. 104 | process_num: Number of threads. 105 | 106 | Returns: 107 | Data, features. 
108 | 109 | """ 110 | data_ = deepcopy(data) 111 | 112 | if features_mark_values: 113 | for col, mvs in features_mark_values.items(): 114 | data_ = data_[~data_[col].isin(mvs)] 115 | 116 | categorical_feature = [key for key in features_type if features_type[key] == "cat"] 117 | if categorical_feature: 118 | data_[categorical_feature] = data_[categorical_feature].astype("category") 119 | 120 | train, test = train_test_split(data_, test_size=0.2, random_state=42) 121 | params = {"boosting_type": "gbdt", "n_jobs": process_num, "bagging_seed": 323, "min_gain_to_split": 0.01} 122 | 123 | if task == TaskType.BIN: 124 | params["objective"] = "binary" 125 | params["metric"] = "auc" 126 | elif task == TaskType.REG: 127 | params["objective"] = "regression" 128 | params["metric"] = "mse" 129 | else: 130 | raise RuntimeError("Wrong task value") 131 | 132 | if imp_type == "feature_imp": 133 | lgb_train = lgb.Dataset( 134 | data=train.drop(target_name, axis=1), label=train[target_name], categorical_feature=categorical_feature 135 | ) 136 | lgb_test = lgb.Dataset( 137 | data=test.drop(target_name, axis=1), label=test[target_name], categorical_feature=categorical_feature 138 | ) 139 | 140 | lgb_kwargs = {"params": params, "train_set": lgb_train, "valid_sets": [lgb_test], "valid_names": ["val_test"]} 141 | if lgb.__version__ >= "3.3.0": 142 | lgb_kwargs["callbacks"] = [lgb.log_evaluation(period=verbose_eval), lgb.early_stopping(10, False, True)] 143 | else: 144 | lgb_kwargs["early_stopping_rounds"] = 10 145 | lgb_kwargs["verbose_eval"] = verbose_eval 146 | 147 | model = lgb.train(**lgb_kwargs) 148 | imp_dict = dict(zip(train.drop(target_name, axis=1).columns, model.feature_importance())) 149 | elif imp_type == "perm_imp": 150 | if task == TaskType.BIN: 151 | model = lgb.LGBMClassifier(**params) 152 | else: 153 | model = lgb.LGBMRegressor(**params) 154 | 155 | for cat in categorical_feature: 156 | vc = train[cat].value_counts() 157 | 158 | vc = vc[vc > 1] 159 | vc = vc + np.arange(vc.shape[0]) / vc.shape[0] 160 | train[cat] = train[cat].map(vc).astype(np.float32).fillna(0).values 161 | test[cat] = test[cat].map(vc).astype(np.float32).fillna(0).values 162 | 163 | test_ = test.drop(target_name, axis=1).astype(np.float32).values 164 | 165 | model.fit( 166 | X=train.drop(target_name, axis=1).astype(np.float32).values, 167 | y=train[target_name].values, 168 | eval_set=[(test_, test[target_name].values)], 169 | eval_names=["val_set"], 170 | eval_metric=params["metric"], 171 | early_stopping_rounds=10, 172 | verbose=verbose_eval, 173 | ) 174 | _, score_decreases = get_score_importances( 175 | score_func=get_score_function(model, task), X=test_, y=test[target_name] 176 | ) 177 | col = list(train.columns) 178 | col.remove(target_name) 179 | imp_dict = dict(zip(col, np.array(score_decreases).min(axis=0, initial=None))) 180 | else: 181 | raise ValueError("imp_type is feature_imp or perm_imp") 182 | 183 | if isinstance(select_type, int): 184 | features_to_drop, _ = zip(*sorted(imp_dict.items(), key=lambda x: x[1], reverse=True)) 185 | features_to_drop = list(features_to_drop[select_type:]) 186 | elif select_type is None: 187 | features_to_drop = [x for x in imp_dict if imp_dict[x] <= imp_th] 188 | else: 189 | raise ValueError("select_type is None or int > 0") 190 | logger.info(f" features {features_to_drop} have low importance") 191 | data = data.drop(columns=features_to_drop, axis=1) 192 | features_type = drop_keys(features_type, features_to_drop) 193 | return data, features_type 194 | 
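A hedged usage sketch of the vendored permutation-importance routine that `feature_imp_selector` relies on when `imp_type` is "perm_imp"; the model and data here are synthetic stand-ins, not the library's LightGBM setup:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from autowoe.lib.utilities.eli5_permutation import get_score_importances

X = np.random.rand(300, 4)
y = (X[:, 0] + 0.1 * np.random.rand(300) > 0.55).astype(int)

model = LogisticRegression().fit(X, y)


def score_func(X_, y_):
    return roc_auc_score(y_, model.predict_proba(X_)[:, 1])


base_score, score_decreases = get_score_importances(score_func, X, y, n_iter=3)
importances = np.mean(score_decreases, axis=0)   # column 0 should dominate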
-------------------------------------------------------------------------------- /autowoe/lib/pipelines/pipeline_feature_special_values.py: -------------------------------------------------------------------------------- 1 | """Process nan values.""" 2 | 3 | from collections import defaultdict 4 | from copy import deepcopy 5 | from typing import Any, Dict, Hashable, Optional, Set, Tuple, TypeVar 6 | 7 | import pandas as pd 8 | 9 | from autowoe.lib.selectors.utils import F_LIST_TYPE 10 | 11 | TKey = TypeVar("TKey") 12 | TValue = TypeVar("TValue") 13 | 14 | 15 | def _opt2val(name: str, options: Set[str]) -> Dict[str, str]: 16 | fmt = "__{NAME}_{VAL}__" 17 | return {k: fmt.format(NAME=name, VAL=k.rsplit("_")[-1]) for k in options} 18 | 19 | 20 | def _values(d: Dict[TKey, TValue]) -> Set[TValue]: 21 | return {v for _, v in d.items()} 22 | 23 | 24 | DEFAULT_OPTIONS_SPECIAL_VALUES: Set[str] = {"to_woe_0", "to_maxfreq", "to_minp", "to_maxp"} 25 | EXTEND_OPTIONS_SPECIAL_VALUES: Set[str] = {*DEFAULT_OPTIONS_SPECIAL_VALUES, "to_nan"} 26 | 27 | NAN_MERGE_CASES = _opt2val("NaN", DEFAULT_OPTIONS_SPECIAL_VALUES) 28 | SMALL_MERGE_CASES = _opt2val("Small", EXTEND_OPTIONS_SPECIAL_VALUES) 29 | MARK_MERGE_CASES = _opt2val("Mark", EXTEND_OPTIONS_SPECIAL_VALUES) 30 | 31 | 32 | NAN_SET = {*_values(NAN_MERGE_CASES), "__NaN__"} 33 | SMALL_SET = {*_values(SMALL_MERGE_CASES), "__Small__"} 34 | MARK_SET = {*_values(MARK_MERGE_CASES), "__Mark__"} 35 | 36 | CATEGORY_SPECIAL_SET = {*SMALL_SET, *NAN_SET, *MARK_SET} - {"__NaN__", "__Small__", "__Mark__"} 37 | REAL_SPECIAL_SET = {*NAN_SET, *MARK_SET} # - {"__NaN__", "__Small__", "__Mark__"} 38 | 39 | 40 | def is_mark_prefix(s): 41 | """Mark encode.""" 42 | return isinstance(s, str) and s.startswith("__Mark__") 43 | 44 | 45 | class FeatureSpecialValues: 46 | """Class for processing special values in features. 47 | 48 | Вещественные признаки в отдельную группу. Если сэмплов меньше, чем 49 | th_nan, то присавиваем woe 0. И на train и на test 50 | -------------------------------------------------------------------------------- 51 | Категориальные признаки. Если категория небольшая (число сэмплов меньше, чем th_cat), 52 | то кодируем её отельным числом. Если nan то кодируем по аналогии с 53 | вещественным случаем с помощью th_nan. Если на тесте встречаем категорию, 54 | которой не было на train, то отправляем её в nan, маленькие категории, в woe со значением 0. 55 | 56 | Groups of special values: 57 | 1. NaN-values (real, categorical features). 58 | 2. Small groups (categorical features). 59 | 3. Mark values (real, categorical features). 60 | 61 | Real features processing: 62 | 1. If there are fewer samples than `th_nan`, then assign `WoE` to 0. 63 | 64 | Categorical features processing: 65 | 1. Small category (number of samples less than `th_cat`) -> 66 | 2. Processing NaN-values as in real variables. 67 | 3. Category that didn't occur in the train dataset is assigned a NaN. 68 | 4. Category that didn't occur in the train dataset is assigned a NaN. 69 | 70 | Args: 71 | th_nan: Threshold for NaN-values process. 72 | th_cat: Threshold for category values process. 
73 | cat_merge_to: 74 | nan_merge_to: 75 | 76 | """ 77 | 78 | def __init__( 79 | self, 80 | th_nan: float = 32, 81 | th_cat: float = 32, 82 | th_mark: float = 32, 83 | cat_merge_to: str = "to_woe_0", 84 | nan_merge_to: str = "to_woe_0", 85 | mark_merge_to: str = "to_woe_0", 86 | mark_values: Optional[Dict[str, Any]] = None, 87 | ): 88 | self._th_nan = th_nan 89 | self._th_cat = th_cat 90 | self._th_mark = th_mark 91 | self._cat_merge_to = cat_merge_to 92 | self._nan_merge_to = nan_merge_to 93 | self._mark_merge_to = mark_merge_to 94 | self._mark_values = mark_values 95 | 96 | self._features_type = None 97 | self.cat_encoding = None # Словарь с кодированием по группам категориальных признаков 98 | self.all_encoding = None 99 | self.mark_encoding = None 100 | self._spec_values = None 101 | 102 | def fit_transform( 103 | self, train: pd.DataFrame, features_type: Dict[Hashable, str] 104 | ) -> Tuple[pd.DataFrame, Dict[Hashable, Dict[str, float]]]: 105 | """Fit/transform. 106 | 107 | Args: 108 | train: Dataset. 109 | features_type: Type of features. {"cat" - category, "real" - real} 110 | 111 | Returns: 112 | Processed dataset, special values. 113 | 114 | """ 115 | train_ = deepcopy(train) 116 | all_encoding = {} 117 | cat_encoding = {} 118 | mark_encoding = defaultdict(dict) 119 | spec_values = {} 120 | self._features_type = features_type 121 | for col in self._features_type: 122 | d = {} 123 | 124 | if self._mark_values is not None and col in self._mark_values: 125 | mark_values_mask = train_[col].isin(self._mark_values[col]) 126 | 127 | fill_val = None 128 | if mark_values_mask.sum() < self._th_mark: 129 | enc_type = MARK_MERGE_CASES[self._mark_merge_to] 130 | if enc_type == "__Mark_0__": 131 | fill_val = 0 132 | # d[enc_type] = fill_val 133 | else: 134 | enc_type = "__Mark__" 135 | 136 | # if self._features_type[col] != "cat": 137 | # d[enc_type] = None 138 | 139 | for mv in self._mark_values[col]: 140 | enc_type_t = enc_type + f"{mv}__" if enc_type == "__Mark__" else enc_type 141 | train_.loc[train_[col] == mv, col] = enc_type_t 142 | mark_encoding[col][mv] = enc_type_t 143 | # if self._features_type[col] != "cat": 144 | d[enc_type_t] = fill_val 145 | else: 146 | mark_values_mask = pd.Series(data=[False] * train_.shape[0], index=train_.index) 147 | 148 | if self._features_type[col] == "cat": 149 | vc = train_.loc[~mark_values_mask, col].value_counts() 150 | big_cat = set(vc.index) 151 | vc = vc.loc[vc < self._th_cat] 152 | vc_sum, small_cat = vc.sum(), set(vc.index) 153 | if vc_sum < self._th_nan: # TODO: _th_nan -> _th_cat ? 154 | # Случай когда суммарно всех небольших категорий все равно мало 155 | enc_type = SMALL_MERGE_CASES[self._cat_merge_to] 156 | fill_val = 0 if enc_type == "__Small_0__" else None 157 | d[enc_type] = fill_val 158 | else: 159 | enc_type = "__Small__" 160 | # d[enc_type] = None 161 | 162 | if train_.loc[:, col].dtypes is not object: # trouble when we have numerical col 163 | train_[col] = train_[col].astype(object) 164 | 165 | train_.loc[train_[col].isin(small_cat), col] = enc_type 166 | cat_encoding[col] = big_cat.difference(small_cat), small_cat, enc_type 167 | # Небольшие категории, которые будем кодировать отдельно 168 | 169 | nan_count = train_[col].isna().sum() 170 | 171 | if nan_count < self._th_nan: 172 | enc_type = NAN_MERGE_CASES[self._nan_merge_to] 173 | fill_val = 0 if enc_type == "__NaN_0__" else None 174 | d[enc_type] = fill_val 175 | else: 176 | enc_type = "__NaN__" # Большое число пропусков. 
Кодируем как обычную категорию 177 | # исключаем NaN из специальных значений для категорий 178 | if self._features_type[col] != "cat": 179 | d[enc_type] = None 180 | 181 | spec_values[col] = d 182 | 183 | train_[col] = train_[col].fillna(enc_type) 184 | all_encoding[col] = enc_type 185 | 186 | self.cat_encoding = cat_encoding 187 | self.all_encoding = all_encoding 188 | self.mark_encoding = mark_encoding 189 | self._spec_values = spec_values 190 | 191 | return train_, spec_values 192 | 193 | def transform(self, test: pd.DataFrame, features: F_LIST_TYPE): 194 | """Transform dataset. 195 | 196 | Args: 197 | test: Test dataset. 198 | features: List of features for processing. 199 | 200 | Returns: 201 | Processed dataset. 202 | 203 | """ 204 | test_ = test[features].copy() 205 | 206 | for col in features: 207 | if self._mark_values is not None and col in self._mark_values: 208 | mark_values_mask = test_[col].isin(self._mark_values[col]) 209 | if mark_values_mask.sum() > 0: 210 | test_.loc[mark_values_mask, col] = test_.loc[mark_values_mask, col].map(self.mark_encoding[col]) 211 | else: 212 | mark_values_mask = pd.Series(data=[False] * test.shape[0], index=test.index) 213 | 214 | if self._features_type[col] == "cat": 215 | big_cat, _, small_pad = self.cat_encoding[col] 216 | test_.loc[~(test_[col].isin(big_cat) | test_[col].isna() | mark_values_mask), col] = small_pad 217 | 218 | test_[col] = test_[col].fillna(self.all_encoding[col]) 219 | 220 | return test_, deepcopy(self._spec_values) 221 | -------------------------------------------------------------------------------- /autowoe/lib/woe/woe.py: -------------------------------------------------------------------------------- 1 | """Weight of evidence.""" 2 | 3 | from copy import deepcopy 4 | from typing import Dict, List, Tuple 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from pandas.core.frame import DataFrame 9 | 10 | from autowoe.lib.pipelines.pipeline_feature_special_values import is_mark_prefix 11 | from autowoe.lib.utilities.utils import TaskType 12 | 13 | 14 | class WoE: 15 | """Class for WoE transformation. 16 | 17 | Args: 18 | f_type: Feature type. {"cat" - categorical, "real" - real}. 19 | split: Splits. Formats: 20 | - real type: [-27, 1, 4, 5, 12, 100] 21 | - cat type: {12: 1, 17: 1, 20: 2, 35: 3} 22 | woe_diff_th: WoE difference threshold. 23 | target_type: Type of target value. 
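The two `split` shapes documented above in action: real splits are ordered bin edges applied with `np.searchsorted` (as in `__codding` further down), categorical splits map raw values to group ids. A small sketch with the example splits from the docstring:

import numpy as np

split_real = [-27, 1, 4, 5, 12, 100]          # bin edges for a real feature
split_cat = {12: 1, 17: 1, 20: 2, 35: 3}      # raw category -> bin id for a cat feature

values = np.array([-100, 0, 4, 50, 1000])
bins = np.searchsorted(split_real, values, side="left")
# bins -> [0, 1, 2, 5, 6]: the index of the first edge that is >= the value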
24 | 25 | """ 26 | 27 | def __init__(self, f_type: str, split: List[float], woe_diff_th: float = 0.0, target_type: TaskType = TaskType.BIN): 28 | self.f_type = f_type 29 | self.split = split 30 | # новая фича - нуллы могут отнестись к ближайшей группе, если достаточно данных 31 | self.woe_diff = woe_diff_th 32 | self.target_type = target_type 33 | 34 | self.iv = None 35 | self.cod_dict = None 36 | 37 | def __codding(self, x: pd.Series): 38 | """Encode values.""" 39 | if self.f_type == "cat": 40 | x_cod = x.map(self.split) 41 | elif self.f_type == "real": 42 | x_cod = np.searchsorted(self.split, x.values, side="left") # check 43 | x_cod = pd.Series(data=x_cod, index=x.index) 44 | else: 45 | raise ValueError("_f_type is cat or real") 46 | return x_cod 47 | 48 | @staticmethod 49 | def _bucket_woe(x, total_good: int, total_bad: int): 50 | t_bad = x["bad"] 51 | t_good = x["count_nonzero"] 52 | t_bad = 0.5 if t_bad == 0 else t_bad 53 | t_good = 0.5 if t_good == 0 else t_good 54 | return np.log((t_bad / total_bad) / (t_good / total_good)) 55 | 56 | def __woe(self, df: pd.DataFrame) -> Tuple[Dict, DataFrame, Tuple[float, ...]]: 57 | """Calculate WoE coefficient for each category values.""" 58 | df.columns = [0, "target"] 59 | stat = df.groupby(0)["target"].agg(["mean", np.count_nonzero, np.size]) 60 | 61 | if self.target_type == TaskType.BIN: 62 | stat["bad"] = stat["size"] - stat["count_nonzero"] 63 | t_good = np.maximum(stat["count_nonzero"].sum(), 0.5) # Если меток вообще нет 64 | t_bad = np.maximum(stat["bad"].sum(), 0.5) # Если меток вообще нет 65 | 66 | stat["woe"] = stat.apply( 67 | lambda x: self._bucket_woe(x, t_good, t_bad), axis=1 68 | ) # ||P.Correction|-> + np.log(t_good / t_bad)|| 69 | iv_stat = (stat["bad"] / t_bad - stat["count_nonzero"] / t_good) * stat["woe"] # Кульбака-Лейблера 70 | self.iv = iv_stat.sum() 71 | 72 | return stat["woe"].to_dict(), stat, (t_good, t_bad) 73 | elif self.target_type == TaskType.REG: 74 | stat["woe"] = stat["mean"] 75 | iv_stat = stat["woe"].abs() * stat["size"] / stat["size"].sum() 76 | self.iv = iv_stat.sum() 77 | 78 | return stat["woe"].to_dict(), stat, None 79 | 80 | def __df_cod_transform(self, x: pd.Series, spec_values): 81 | x_ = deepcopy(x) 82 | if isinstance(spec_values, list): 83 | spec_values_ = spec_values.copy() 84 | elif isinstance(spec_values, dict): 85 | spec_values_ = spec_values.keys() 86 | else: 87 | spec_values_ = [] 88 | 89 | x_.loc[x_.isin(spec_values_)] = -np.inf 90 | df_cod = self.__codding(x_) 91 | 92 | if len(x.loc[x.isin(spec_values_)]) == 0 or len(spec_values_) == 0: 93 | return df_cod 94 | 95 | if df_cod.dtypes is not object: 96 | df_cod = df_cod.astype(object) 97 | 98 | df_cod.loc[x.isin(spec_values_)] = x.loc[x.isin(spec_values_)] 99 | 100 | return df_cod 101 | 102 | def fit(self, x, y, spec_values): 103 | """Fit WoE transformation. 104 | 105 | Args: 106 | x: Feature. 107 | y: Target. 108 | spec_values: Special values. Если значение не None, то кодируем WoE по дефолту, если же нет, то кодируем 0 109 | 110 | Returns: 111 | df. 
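A worked numeric sketch of `_bucket_woe` above for the binary case, WoE = log((bad_i / total_bad) / (good_i / total_good)), together with the bin's IV contribution; "good" counts target == 1, matching `count_nonzero` in the code, and the counts are made up:

import numpy as np

good, bad = 30, 70                 # target == 1 and target == 0 counts in one bin
total_good, total_bad = 200, 800   # the same counts over the whole sample

woe = np.log((bad / total_bad) / (good / total_good))
iv_contribution = (bad / total_bad - good / total_good) * woe
print(round(woe, 3), round(iv_contribution, 3))   # -0.539 0.034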
112 | 113 | """ 114 | df_cod = self.__df_cod_transform(x, spec_values) 115 | df_cod = pd.concat([df_cod, y], axis=1) 116 | stat, total, t_stat = self.__woe(df_cod) 117 | 118 | if self.target_type == TaskType.BIN: 119 | t_good, t_bad = t_stat 120 | 121 | good_stats = total.loc[ 122 | [x for x in total.index if type(x) in [int, float] or x in ("__Small__", "__NaN__") or is_mark_prefix(x)] 123 | ] 124 | 125 | # первая обработка - мерджим близкие нуллы/категории 126 | nsm_values = ( 127 | [x for x in spec_values if "NaN" in x] 128 | + [x for x in spec_values if "Small" in x] 129 | + [x for x in spec_values if "Mark" in x] 130 | ) 131 | 132 | for key in nsm_values: 133 | if (key in ("__Small__", "__NaN__") or is_mark_prefix(key)) and key in good_stats.index: 134 | check_row = good_stats.loc[key] 135 | diff = (good_stats["woe"] - check_row["woe"]).abs() 136 | min_diff = diff[diff > 0].min() 137 | 138 | if min_diff < self.woe_diff: 139 | idx = diff <= min_diff 140 | # если ближайший слишком близко - мерджим 141 | 142 | if self.target_type == TaskType.BIN: 143 | good_stats.loc[idx, "woe"] = self._bucket_woe( 144 | good_stats.loc[idx, ["bad", "count_nonzero"]].sum(axis=0), t_good, t_bad 145 | ) 146 | good_stats.loc[idx, "size"] = good_stats.loc[idx, "size"].sum() 147 | good_stats.loc[idx, "mean"] = good_stats.loc[idx, "count_nonzero"].sum() / good_stats["size"] 148 | elif self.target_type == TaskType.REG: 149 | gs = good_stats.loc[idx, ["woe", "size"]].copy() 150 | t_gs_size = gs["size"].sum() 151 | good_stats.loc[idx, "woe"] = (gs["woe"] * gs["size"]).sum() / t_gs_size 152 | good_stats.loc[idx, "size"] = t_gs_size 153 | good_stats.loc[idx, "mean"] = good_stats.loc[idx, "woe"] 154 | 155 | # TODO: re-right 156 | for key in good_stats.index.values: 157 | stat[key] = good_stats.loc[key, "woe"] 158 | 159 | # далее обработка нуллов и маленьких категорий 160 | for key in nsm_values: 161 | woe_val = None 162 | 163 | if key in ("__Mark_0__", "__Small_0__", "__NaN_0__"): 164 | woe_val = 0 165 | 166 | elif key in ("__Mark_maxfreq__", "__Small_maxfreq__", "__NaN_maxfreq__"): 167 | idx = good_stats["size"].values.argmax() 168 | woe_val = good_stats.iloc[idx]["woe"] 169 | 170 | elif key in ("__Mark_maxp__", "__Small_maxp__", "__NaN_maxp__"): 171 | # Отберем только тех, по кому что-то нормальное можно оценить 172 | idx = good_stats["mean"].values.argmax() 173 | woe_val = good_stats.iloc[idx]["woe"] 174 | 175 | elif key in ("__Mark_minp__", "__Small_minp__", "__NaN_minp__"): 176 | # Отберем только тех, по кому что-то нормальное можно оценить 177 | idx = good_stats["mean"].values.argmin() 178 | woe_val = good_stats.iloc[idx]["woe"] 179 | 180 | elif key in ("__Small__", "__NaN__") or is_mark_prefix(key): 181 | continue 182 | 183 | stat[key] = woe_val 184 | 185 | self.cod_dict = stat 186 | return df_cod 187 | 188 | def fit_transform(self, x: pd.Series, y: pd.Series, spec_values): 189 | """Fit/transfor. 190 | 191 | Args: 192 | x: Feature. 193 | y: Target. 194 | spec_values: Special values. Если значение не None, то кодируем WoE по дефолту, если же нет, то кодируем 0 195 | 196 | Returns: 197 | Transformed feature. 198 | 199 | """ 200 | df_cod = self.fit(x, y, spec_values) 201 | df_cod = df_cod[0].map(self.cod_dict).copy() 202 | return df_cod 203 | 204 | def transform(self, x: pd.Series, spec_values): 205 | """Transform by WoE. 206 | 207 | Args: 208 | x: Feature. 209 | spec_values: Special values. 210 | 211 | Returns: 212 | Transformed feature. 
213 | 214 | """ 215 | df_cod = self.__df_cod_transform(x, spec_values) 216 | df_cod = df_cod.map(self.cod_dict) 217 | return df_cod 218 | 219 | def split_feature(self, x: pd.Series, spec_values): 220 | """Split by Bins. 221 | 222 | Args: 223 | x: Feature. 224 | spec_values: Special values. 225 | 226 | Returns: 227 | Transformed feature. 228 | 229 | """ 230 | df_cod = self.__df_cod_transform(x, spec_values) 231 | return df_cod 232 | 233 | def fit_transform_cv(self, x: pd.Series, y: pd.Series, spec_values, cv_index_split: Dict[int, List[int]]): 234 | """Cross-Val WoE encoding. 235 | 236 | Args: 237 | x: Feature. 238 | y: Target. 239 | spec_values: Special values. Если значаение не None, то кодируем WoE по дефолту, если же нет, то кодируем 0 240 | cv_index_split: Cross-Val splits. 241 | 242 | Returns: 243 | Encoded feature. 244 | 245 | """ 246 | x_ = deepcopy(x) 247 | for value in cv_index_split.values(): 248 | train_index, test_index = value 249 | self.fit(x.iloc[train_index], y.iloc[train_index], spec_values) 250 | x_.iloc[test_index] = self.transform(x.iloc[test_index], spec_values).astype(x.dtype) 251 | return x_.astype(float) 252 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/refit.py: -------------------------------------------------------------------------------- 1 | """Additional functional for refitting model.""" 2 | 3 | from copy import deepcopy 4 | from typing import Optional, Tuple, cast 5 | 6 | import numpy as np 7 | import sklearn 8 | from scipy import linalg, stats 9 | from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression 10 | from sklearn.svm import l1_min_c 11 | 12 | from autowoe.lib.logging import get_logger 13 | 14 | from .utils import TaskType 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | def refit_reg( 20 | task: TaskType, 21 | x_train: np.ndarray, 22 | y: np.ndarray, 23 | l1_grid_size: int, 24 | l1_exp_scale: float, 25 | max_penalty: float, 26 | interp: bool = True, 27 | ) -> Tuple[np.ndarray, float, np.ndarray]: 28 | """Final model refit with regularization. 29 | 30 | Args: 31 | task: Task. 32 | x_train: Train features. 33 | y: Train target. 34 | l1_grid_size: Number of point at regularized grid. 35 | l1_exp_scale: Maximum value of `C` coefficient. 36 | max_penalty: maximum value of `C` coefficient. 37 | interp: Interpreted model. 38 | 39 | Returns: 40 | Weights , intercept of model, features mask. 
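`fit_transform_cv` above produces an out-of-fold encoding: statistics are estimated on the train part of each split and applied only to the held-out rows, which limits target leakage. A schematic pandas-only sketch of the same idea, with a simple mean encoding standing in for the WoE statistic:

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

x = pd.Series(np.random.randint(0, 3, size=100).astype(float))
y = pd.Series((np.random.rand(100) > 0.5).astype(int))

encoded = x.copy()
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=42).split(x):
    fold_stats = y.iloc[train_idx].groupby(x.iloc[train_idx]).mean()        # fit on train folds only
    encoded.iloc[test_idx] = x.iloc[test_idx].map(fold_stats).to_numpy()    # apply to the held-out fold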
41 | 42 | """ 43 | weights, intercepts = [], [] 44 | if task == TaskType.BIN: 45 | clf = LogisticRegression(penalty="l1", solver="saga", warm_start=True, intercept_scaling=100000) 46 | cs = l1_min_c(x_train, y, loss="log", fit_intercept=True) * np.logspace(0, l1_exp_scale, l1_grid_size) 47 | cs = cs[cs <= max_penalty] 48 | # add final penalty 49 | if cs[-1] < max_penalty: 50 | cs = list(cs) 51 | cs.append(max_penalty) 52 | 53 | # fit path 54 | for c in cs: 55 | clf.set_params(C=c) 56 | clf.fit(x_train, y) 57 | weights.append(deepcopy(clf.coef_[0])) 58 | intercepts.append(clf.intercept_[0]) 59 | 60 | if not interp: 61 | w, i = weights[-1], intercepts[-1] 62 | neg = w != 0 63 | return w[neg], i, neg 64 | 65 | for w, i in zip(weights[::-1], intercepts[::-1]): 66 | pos = (w > 0).sum() 67 | if pos > 0: 68 | continue 69 | 70 | neg = w < 0 71 | return w[neg], i, neg 72 | else: 73 | cs_max_penalty = 1 / max_penalty 74 | model = Lasso(warm_start=True, positive=interp) 75 | cs = np.logspace(0, l1_exp_scale, l1_grid_size + 1) 76 | cs = cs[cs <= cs_max_penalty] 77 | # add final penalty 78 | if cs[-1] < cs_max_penalty: 79 | cs = list(cs) 80 | cs.append(cs_max_penalty) 81 | cs = np.array(cs) 82 | 83 | alphas = (1.0 / cs[1:])[::-1] 84 | 85 | for alpha in alphas: 86 | model.set_params(alpha=alpha) 87 | model.fit(x_train, y) 88 | weights.append(model.coef_) 89 | intercepts.append(model.intercept_) 90 | 91 | w, i = weights[0], intercepts[0] 92 | pos = w >= 0 93 | 94 | return w[pos], i, pos 95 | 96 | raise ValueError("No negative weights grid") 97 | 98 | 99 | def refit_simple( 100 | task: TaskType, 101 | x_train: np.ndarray, 102 | y: np.ndarray, 103 | interp: bool = True, 104 | p_val: float = 0.05, 105 | x_val: Optional[np.ndarray] = None, 106 | y_val: Optional[np.ndarray] = None, 107 | n_jobs: int = -1, 108 | ) -> Tuple[np.ndarray, float, np.ndarray, np.ndarray, np.ndarray]: 109 | """Final model refit with stat model mode. 110 | 111 | Args: 112 | task: Task. 113 | x_train: Train features. 114 | y: Train target. 115 | interp: Interpreted model. 116 | p_val: P-value. 117 | x_val: Validation features. 118 | y_val: Validation target. 119 | n_jobs: Number of threads. 120 | 121 | Returns: 122 | weights, intercept, features mask, p values, b vars. 
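Example:
    A minimal sketch (`x_woe`, `y` and `feature_names` are assumed to be the
    WoE-encoded train matrix, the binary target and the corresponding column
    names; they are not defined in this module):

        w, b, mask, p_vals, b_var = refit_simple(TaskType.BIN, x_woe, y, interp=True, p_val=0.05)
        kept = [f for f, ok in zip(feature_names, mask) if ok]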
123 |
124 | """
125 | sl_ok = np.ones(x_train.shape[1], dtype=bool)
126 |
127 | n = -1
128 |
129 | logreg_penalty = None if sklearn.__version__ >= "1.2.0" else "none"
130 |
131 | while True:
132 | n += 1
133 | assert sl_ok.sum() > 0, "No features left to fit on iter"
134 |
135 | logger.info(f"Iter {n} of final refit starts with {sl_ok.sum()} features")
136 |
137 | x_train_ = x_train[:, sl_ok]
138 | # indices in the original feature array
139 | ok_idx = np.arange(x_train.shape[1])[sl_ok]
140 |
141 | if task == TaskType.BIN:
142 | model = LogisticRegression(penalty=logreg_penalty, solver="lbfgs", warm_start=False, intercept_scaling=1)
143 | model.fit(x_train_, y)
144 | model_coef = model.coef_[0]
145 | model_intercept = model.intercept_[0]
146 | else:
147 | model = LinearRegression(n_jobs=n_jobs)
148 | model.fit(x_train_, y)
149 | model_coef = model.coef_
150 | model_intercept = model.intercept_
151 |
152 | # check negative coefs here if interp
153 | sl_pos_coef = np.zeros((x_train_.shape[1],), dtype=bool)
154 | if interp:
155 | sl_pos_coef = model.coef_[0] >= 0 if task == TaskType.BIN else model.coef_[0] <= 0
156 |
157 | # if at least one coefficient is non-negative - drop the largest one and refit
158 | if sl_pos_coef.sum() > 0:
159 | max_coef_idx = model_coef.argmax()
160 | sl_ok[ok_idx[max_coef_idx]] = False
161 | continue
162 |
163 | # if all coefficients pass the sign check - look at the p-values
164 | if task == TaskType.BIN:
165 | p_vals, b_var = calc_p_val(x_train_, model_coef, model_intercept)
166 | else:
167 | p_vals, b_var = calc_p_val_reg(x_train_, y, model_coef, model_intercept)
168 |
169 | # without the intercept
170 | p_vals_f = p_vals[:-1]
171 |
172 | model_p_vals = p_vals.copy()
173 | model_b_var = b_var.copy() if b_var is not None else None
174 |
175 | # if at least one p-value exceeds p_val - drop the feature with the largest one and refit
176 | if p_vals_f.max() > p_val:
177 | max_p_val_idx = p_vals_f.argmax()
178 | sl_ok[ok_idx[max_p_val_idx]] = False
179 | continue
180 |
181 | if x_val is not None:
182 | # the same check on the validation sample
183 | logger.info("Validation data checks")
184 | x_val_ = x_val[:, sl_ok]
185 |
186 | p_vals, b_var = calc_p_val_on_valid(x_val_, y_val, task, n_jobs)
187 | p_vals_f = p_vals[:-1]
188 |
189 | # if at least one p-value exceeds p_val - drop the feature with the largest one and refit
190 | if p_vals_f.max() > p_val:
191 | max_p_val_idx = p_vals_f.argmax()
192 | sl_ok[ok_idx[max_p_val_idx]] = False
193 | continue
194 |
195 | weights = cast(np.ndarray, model_coef)
196 | intercept = cast(float, model_intercept)
197 |
198 | return weights, intercept, sl_ok, cast(np.ndarray, model_p_vals), cast(np.ndarray, model_b_var)
199 |
200 |
201 | def calc_p_val(x_train: np.ndarray, weights: np.ndarray, intercept: float) -> Tuple[np.ndarray, np.ndarray]:
202 | """Calculate p-values for coefficient estimates.
203 |
204 | Args:
205 | x_train: Train features.
206 | weights: Model weights.
207 | intercept: Model intercept coefficient.
208 |
209 | Returns:
210 | p values, b vars.
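Note:
    This is the standard Wald test for logistic regression: with an intercept
    column appended to `x_train` and predicted probabilities
    p = 1 / (1 + exp(-x @ coef)), the observed information matrix is
    H = x.T @ diag(p * (1 - p)) @ x, `b_var` is the diagonal of H^-1, the Wald
    statistic is w_j = coef_j ** 2 / b_var_j, and the p-value is
    1 - chi2(1).cdf(w_j).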
211 | 212 | """ 213 | coef_ = np.concatenate([weights, [intercept]]) 214 | x_train = np.concatenate([x_train, np.ones((x_train.shape[0], 1))], axis=1) 215 | prob_ = 1 / (1 + np.exp(-np.dot(x_train, coef_))) 216 | prob_ = prob_ * (1 - prob_) 217 | hess = np.dot((prob_[:, np.newaxis] * x_train).T, x_train) 218 | 219 | inv_hess = np.linalg.inv(hess) 220 | b_var = inv_hess.diagonal() 221 | w_stat = (coef_**2) / b_var 222 | 223 | p_vals = 1 - stats.chi2(1).cdf(w_stat) 224 | 225 | return p_vals, b_var 226 | 227 | 228 | def calc_p_val_on_valid( 229 | x_train: np.ndarray, y: np.ndarray, task: TaskType, n_jobs: int = -1 230 | ) -> Tuple[np.ndarray, np.ndarray]: 231 | """Fit algo and calc p-values. 232 | 233 | Args: 234 | x_train: Train features. 235 | y: Train target. 236 | task: Task. 237 | n_jobs: Number of threads. 238 | 239 | Returns: 240 | p values, b vars. 241 | 242 | """ 243 | logreg_penalty = None if sklearn.__version__ >= "1.2.0" else "none" 244 | 245 | if task == TaskType.BIN: 246 | model = LogisticRegression(penalty=logreg_penalty, solver="lbfgs", warm_start=False, intercept_scaling=1) 247 | model.fit(x_train, y) 248 | 249 | return calc_p_val(x_train, model.coef_[0], model.intercept_[0]) 250 | else: 251 | model = LinearRegression(n_jobs=n_jobs) 252 | model.fit(x_train, y) 253 | 254 | return calc_p_val_reg(x_train, y, model.coef_, model.intercept_) 255 | 256 | 257 | def calc_p_val_reg( 258 | x_train: np.ndarray, y_train: np.ndarray, weights: np.ndarray, intercept: float 259 | ) -> Tuple[np.ndarray, np.ndarray]: 260 | """Calculate p values for regression task.""" 261 | n, k = x_train.shape 262 | y_pred = (np.dot(x_train, weights) + intercept).T 263 | 264 | # Change X and Y into numpy matrices. x also has a column of ones added to it. 265 | x = np.hstack((np.matrix(x_train), np.ones((n, 1)))) 266 | y_train = np.matrix(y_train).T 267 | 268 | # Degrees of freedom. 269 | freedom_degrees = float(n - k - 1) 270 | 271 | # Sample variance. 272 | sse = np.sum(np.square(y_pred - y_train), axis=0) 273 | sampleVariance = sse / freedom_degrees 274 | 275 | # Sample variance for x. 276 | sampleVarianceX = x.T * x 277 | 278 | # Covariance Matrix = [(s^2)(X'X)^-1]^0.5. (sqrtm = matrix square root. ugly) 279 | covarianceMatrix = linalg.sqrtm(sampleVariance[0, 0] * sampleVarianceX.I) 280 | 281 | # Standard errors for the difference coefficients: the diagonal elements of the covariance matrix. 282 | se = covarianceMatrix.diagonal() # [1:] 283 | 284 | # T statistic for each beta. 285 | betasTStat = np.zeros(len(se)) 286 | for i in range(len(se) - 1): 287 | betasTStat[i] = weights[i] / se[i] 288 | betasTStat[-1] = intercept / se[-1] 289 | 290 | # P-value for each beta. This is a two sided t-test, since the betas can be 291 | # positive or negative. 
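# stats.t.cdf is the lower-tail CDF, so 1 - cdf(|t|) below is the upper-tail
# probability of the t statistic with n - k - 1 degrees of freedom (a strictly
# two-sided p-value would double this tail probability).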
292 | betasPValue = 1 - stats.t.cdf(abs(betasTStat), freedom_degrees) 293 | 294 | return betasPValue, None 295 | -------------------------------------------------------------------------------- /examples/Tutorial_2__Dates_and_stat_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from sklearn.metrics import roc_auc_score\n", 11 | "\n", 12 | "from autowoe import AutoWoE, ReportDeco" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "### Чтение и подготовка обучающей выборки" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "train = pd.read_csv(\n", 29 | " \"./data/train_demo.csv\", low_memory=False, index_col=\"line_id\", parse_dates=[\"datetime_\" + str(i) for i in range(2)]\n", 30 | ")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "### Чтение и подготовка тестовой выборки" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "test = pd.read_csv(\"./data/test_demo.csv\", index_col=\"line_id\", parse_dates=[\"datetime_\" + str(i) for i in range(2)])\n", 47 | "\n", 48 | "test_target = pd.read_csv(\"./data/test-target_demo.csv\")[\"target\"]\n", 49 | "test[\"target\"] = test_target.values" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "### Параметры модели" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Для обучения модели рекомендуется указать тип признаков для обучения.\n", 64 | "Поэтому создается словарь features_type с ключами: \n", 65 | "\n", 66 | " \"real\" -- вещественный признак\n", 67 | " \n", 68 | " \"cat\" -- категориальный.\n", 69 | " \n", 70 | " __\"date\"-- (\"%Y%d%m\", (\"m\", \"d\", \"wd\", \"h\", \"min\"))__\n", 71 | " \n", 72 | " Для признаков, которые не размечены, типы будут определены автоматом. 
Такой вариант будет работать, но качество порядочно просядет\n", 73 | " \n", 74 | "__Попробуем указать даты с форматом None (автоопределение) и сезонностью - день месяца и день недели__" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "#### features_type" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "num_col = list(filter(lambda x: \"numb\" in x, train.columns))\n", 91 | "num_feature_type = dict.fromkeys(num_col, \"real\")\n", 92 | "\n", 93 | "date_col = list(filter(lambda x: \"datetime\" in x, train.columns))\n", 94 | "date_feature_type = dict.fromkeys(date_col, (None, (\"d\", \"wd\")))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "features_type = dict(**num_feature_type, **date_feature_type)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# подробно параметры описаны в Example_1\n", 113 | "auto_woe = AutoWoE(\n", 114 | " monotonic=True, max_bin_count=4, oof_woe=False, regularized_refit=False, p_val=0.05, debug=False, verbose=0\n", 115 | ")\n", 116 | "auto_woe = ReportDeco(auto_woe)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 7, 122 | "metadata": { 123 | "scrolled": true 124 | }, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "[LightGBM] [Info] Number of positive: 63, number of negative: 5537\n", 131 | "[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010733 seconds.\n", 132 | "You can set `force_col_wise=true` to remove the overhead.\n", 133 | "[LightGBM] [Info] Total Bins 11532\n", 134 | "[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 652\n", 135 | "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011250 -> initscore=-4.476073\n", 136 | "[LightGBM] [Info] Start training from score -4.476073\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "auto_woe.fit(train[num_col + date_col + [\"target\"]], target_name=\"target\", features_type=features_type)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "metadata": { 148 | "scrolled": true 149 | }, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "0.7911446119486321" 155 | ] 156 | }, 157 | "execution_count": 8, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "pred = auto_woe.predict_proba(test)\n", 164 | "roc_auc_score(test[\"target\"], pred)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "##### Замечание\n", 172 | "ReportDeco - обертка для построения отчета. Она не обязательна для обучения и применения модели, но обязательна для построения отчета (см последнюю ячейку)." 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Значения коэфициентов и p-values\n", 180 | "\n", 181 | "При указании regularized_refit=False будет произведена оценка p-value на коэфициенты модели. 
Коэфициенты с p-value выше указанного порога не будут включены в модель" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 9, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "number_254 -0.487530\n", 193 | "number_10 -0.475665\n", 194 | "number_345 -0.707849\n", 195 | "number_759 -0.763258\n", 196 | "number_761 -0.894294\n", 197 | "number_706 -0.648337\n", 198 | "number_1 -1.044868\n", 199 | "number_368 -1.062441\n", 200 | "datetime_1__F__d -1.232442\n", 201 | "dtype: float64" 202 | ] 203 | }, 204 | "execution_count": 9, 205 | "metadata": {}, 206 | "output_type": "execute_result" 207 | } 208 | ], 209 | "source": [ 210 | "auto_woe.features_fit" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 10, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/plain": [ 221 | "-4.545016720125766" 222 | ] 223 | }, 224 | "execution_count": 10, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | } 228 | ], 229 | "source": [ 230 | "auto_woe.intercept" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 11, 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "number_254 0.013034\n", 242 | "number_10 0.030010\n", 243 | "number_345 0.004663\n", 244 | "number_759 0.001166\n", 245 | "number_761 0.000357\n", 246 | "number_706 0.006792\n", 247 | "number_1 0.001364\n", 248 | "number_368 0.000006\n", 249 | "datetime_1__F__d 0.003993\n", 250 | "Intercept_ 0.000000\n", 251 | "dtype: float64" 252 | ] 253 | }, 254 | "execution_count": 11, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "auto_woe.p_vals" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "### Формирование отчета" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 12, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "name": "stderr", 277 | "output_type": "stream", 278 | "text": [ 279 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 280 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 281 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 282 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 283 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 284 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 285 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 286 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 287 | "No artists with labels found to put in legend. 
Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n", 288 | "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "report_params = {\n", 294 | " \"automl_date_column\": \"report_month\", # колонка с датой в формате params['datetimeFormat']\n", 295 | " \"output_path\": \"./AUTOWOE_REPORT_2\", # папка, куда сгенерится отчет и сложатся нужные файлы\n", 296 | " \"report_name\": \"___НАЗВАНИЕ ОТЧЕТА___\",\n", 297 | " \"report_version_id\": 1,\n", 298 | " \"city\": \"Воронеж\",\n", 299 | " \"model_aim\": \"___ЦЕЛЬ ПОСТРОЕНИЯ МОДЕЛИ___\",\n", 300 | " \"model_name\": \"___НАЗВАНИЕ МОДЕЛИ___\",\n", 301 | " \"zakazchik\": \"___ЗАКАЗЧИК___\",\n", 302 | " \"high_level_department\": \"___ПОДРАЗДЕЛЕНИЕ___\",\n", 303 | " \"ds_name\": \"___РАЗРАБОТЧИК МОДЕЛИ___\",\n", 304 | " \"target_descr\": \"___ОПИСАНИЕ ЦЕЛЕВОГО СОБЫТИЯ___\",\n", 305 | " \"non_target_descr\": \"___ОПИСАНИЕ НЕЦЕЛЕВОГО СОБЫТИЯ___\",\n", 306 | "}\n", 307 | "\n", 308 | "auto_woe.generate_report(report_params)" 309 | ] 310 | } 311 | ], 312 | "metadata": { 313 | "kernelspec": { 314 | "display_name": "Anaconda_py38", 315 | "language": "python", 316 | "name": "anaconda_py38" 317 | }, 318 | "language_info": { 319 | "codemirror_mode": { 320 | "name": "ipython", 321 | "version": 3 322 | }, 323 | "file_extension": ".py", 324 | "mimetype": "text/x-python", 325 | "name": "python", 326 | "nbconvert_exporter": "python", 327 | "pygments_lexer": "ipython3", 328 | "version": "3.8.5" 329 | }, 330 | "stem_cell": { 331 | "cell_type": "raw", 332 | "metadata": { 333 | "pycharm": { 334 | "metadata": false 335 | } 336 | }, 337 | "source": "" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 1 342 | } 343 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 (с) Ryzhkov Alexander, Vakhrushev Anton, Savchenko Maksim, 190 | Simakov Dmitrii, Damdinov Rinchin, Kirilin Alexander, 191 | Bunakov Vasilii 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. 
204 | -------------------------------------------------------------------------------- /autowoe/lib/report/utilities_images/utilities_images.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from typing import List 4 | from typing import Union 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import pandas as pd 9 | import seaborn as sns 10 | 11 | from sklearn.metrics import roc_auc_score 12 | from sklearn.metrics import roc_curve 13 | 14 | 15 | def plot_bars(df, path, title=None): 16 | sns.set(style="whitegrid", font_scale=1.5) 17 | pl = df.plot(figsize=(10, 10), kind="bar", cmap="Accent", width=1) 18 | if title: 19 | pl.title.set_text(title) 20 | pl.get_figure().savefig(path, bbox_inches="tight") 21 | plt.close() 22 | 23 | 24 | def plot_roc_curve_image(y_true, y_pred, path): 25 | sns.set(style="whitegrid", font_scale=1.5) 26 | plt.figure(figsize=(10, 10)) 27 | 28 | fpr_reg, tpr_reg, _ = roc_curve(y_true, y_pred) 29 | auc_score_reg = roc_auc_score(y_true, y_score=y_pred) 30 | 31 | lw = 2 32 | plt.plot( 33 | fpr_reg, 34 | tpr_reg, 35 | color="darkorange", 36 | lw=lw, 37 | label=f"WhiteBox модель (GINI = {(2 * auc_score_reg - 1):.3f})", 38 | ) 39 | plt.plot([0, 1], [0, 1], color="red", lw=lw, linestyle="--", label="Random Model") 40 | plt.xlim([-0.05, 1.05]) 41 | plt.ylim([-0.05, 1.05]) 42 | plt.xlabel("False Positive Rate") 43 | plt.ylabel("True Positive Rate") 44 | lgd = plt.legend(bbox_to_anchor=(0.5, -0.15), loc="upper center", ncol=2) 45 | plt.xticks(np.arange(0, 1.01, 0.05), rotation=45) 46 | plt.yticks(np.arange(0, 1.01, 0.05)) 47 | plt.grid(color="gray", linestyle="-", linewidth=1) 48 | plt.title(f"ROC кривая (GINI = {(2 * auc_score_reg - 1):.3f})") 49 | plt.savefig(path, bbox_extra_artists=(lgd,), bbox_inches="tight") 50 | plt.close() 51 | 52 | 53 | def plot_double_roc_curve(train_y_true, train_y_pred, test_y_true, test_y_pred, path): 54 | sns.set(style="whitegrid", font_scale=1.5) 55 | plt.figure(figsize=(10, 10)) 56 | 57 | train_fpr_reg, train_tpr_reg, _ = roc_curve(train_y_true, train_y_pred) 58 | train_auc_score_reg = roc_auc_score(train_y_true, y_score=train_y_pred) 59 | test_fpr_reg, test_tpr_reg, _ = roc_curve(test_y_true, test_y_pred) 60 | test_auc_score_reg = roc_auc_score(test_y_true, y_score=test_y_pred) 61 | 62 | lw = 2 63 | plt.plot( 64 | train_fpr_reg, 65 | train_tpr_reg, 66 | color="darkorange", 67 | lw=lw, 68 | label=f"По данным train (GINI = {(2 * train_auc_score_reg - 1):.3f})", 69 | ) 70 | plt.plot( 71 | test_fpr_reg, 72 | test_tpr_reg, 73 | color="blue", 74 | lw=lw, 75 | label=f"По данным test (GINI = {(2 * test_auc_score_reg - 1):.3f})", 76 | ) 77 | plt.plot([0, 1], [0, 1], color="red", lw=lw, linestyle="--", label="Random Model") 78 | plt.xlim([-0.05, 1.05]) 79 | plt.ylim([-0.05, 1.05]) 80 | plt.xlabel("False Positive Rate") 81 | plt.ylabel("True Positive Rate") 82 | plt.legend(loc="lower right") 83 | plt.xticks(np.arange(0, 1.01, 0.05), rotation=45) 84 | plt.yticks(np.arange(0, 1.01, 0.05)) 85 | plt.grid(color="gray", linestyle="-", linewidth=1) 86 | plt.title("ROC кривая") 87 | plt.savefig(path, bbox_inches="tight") 88 | plt.close() 89 | 90 | 91 | def plot_roc_curve_feature_image(feature_name, y_true, y_pred, path): 92 | sns.set(style="whitegrid", font_scale=1.5) 93 | plt.figure(figsize=(10, 10)) 94 | 95 | fpr_reg, tpr_reg, _ = roc_curve(y_true, y_pred) 96 | auc_score_reg = roc_auc_score(y_true, y_score=y_pred) 97 | 98 | lw = 2 99 | plt.plot( 100 | fpr_reg, 101 | tpr_reg, 102 | 
color="darkorange", 103 | lw=lw, 104 | label=feature_name + f" (GINI = {(2 * auc_score_reg - 1):.3f})", 105 | ) 106 | plt.plot([0, 1], [0, 1], color="red", lw=lw, linestyle="--", label="Random Model") 107 | plt.xlim([-0.05, 1.05]) 108 | plt.ylim([-0.05, 1.05]) 109 | plt.xlabel("False Positive Rate") 110 | plt.ylabel("True Positive Rate") 111 | lgd = plt.legend(bbox_to_anchor=(0.5, -0.15), loc="upper center", ncol=2) 112 | plt.xticks(np.arange(0, 1.01, 0.05), rotation=45) 113 | plt.yticks(np.arange(0, 1.01, 0.05)) 114 | plt.grid(color="gray", linestyle="-", linewidth=1) 115 | plt.title(f"ROC curve(GINI = {(2 * auc_score_reg - 1):.3f})" + f" of feature {feature_name}") 116 | plt.savefig(path, bbox_extra_artists=(lgd,), bbox_inches="tight") 117 | plt.close() 118 | 119 | 120 | def plot_model_weights(features, path): 121 | sns.set(style="whitegrid", font_scale=1.5) 122 | fig = plt.figure(figsize=(20, 5)) 123 | ax = fig.add_axes([0, 0, 1, 1]) 124 | ax.bar(features.index, features.values, color="g") 125 | lgd = plt.legend(bbox_to_anchor=(0.5, -0.15), loc="upper center", ncol=2) 126 | plt.title("Model coefs", fontsize=28) 127 | plt.xlabel("Features", fontsize=20) 128 | plt.ylabel("Coef values", fontsize=20) 129 | plt.xticks(fontsize=15, rotation=90) 130 | plt.yticks(fontsize=15) 131 | plt.savefig(path, bbox_extra_artists=(lgd,), bbox_inches="tight") 132 | plt.close() 133 | 134 | 135 | def plot_feature_split(feature_name, features, path): 136 | sns.set(style="whitegrid", font_scale=1.5) 137 | fig = plt.figure(figsize=(15, 5)) 138 | ax = fig.add_axes([0, 0, 1, 1]) 139 | ax.bar(features.index, features.values, color="g") 140 | lgd = plt.legend(bbox_to_anchor=(0.5, -0.15), loc="upper center", ncol=2) 141 | plt.title("Split of feature " + feature_name + " and woe values") 142 | plt.xlabel("Bins", fontsize=20) 143 | plt.ylabel("WoE values", fontsize=20) 144 | plt.xticks(fontsize=15) 145 | plt.yticks(fontsize=15) 146 | plt.savefig(path, bbox_extra_artists=(lgd,), bbox_inches="tight") 147 | plt.close() 148 | 149 | 150 | def plot_ginis(data_enc, target, path): 151 | sns.set(style="whitegrid", font_scale=1.5) 152 | feats = list(data_enc.columns) 153 | aucs = [roc_auc_score(target, -data_enc[col].values) for col in feats] 154 | ginis = [(x - 0.5) * 2 for x in aucs] 155 | ginis = pd.Series(ginis, index=feats).sort_values(ascending=True) 156 | pl = ginis.plot(kind="barh", figsize=(10, 10)) 157 | pl.get_figure().savefig(path, bbox_inches="tight") 158 | plt.close() 159 | 160 | 161 | def plot_woe_bars(train_enc, train_target, test_enc, test_target, target_name, column, path): 162 | sns.set(style="whitegrid", font_scale=1.5) 163 | names = ["train", "test"] 164 | samples = [] 165 | for df, target in zip([train_enc, test_enc], [train_target, test_target]): 166 | df_copy = df.copy().round(3) 167 | df_copy[target_name] = target 168 | samples.append(df_copy) 169 | 170 | samples = [ 171 | x[[target_name, column]].groupby(column)[target_name].agg(["mean", "count"]).reset_index() for x in samples 172 | ] 173 | 174 | for df in samples: 175 | df["count"] /= df["count"].sum() 176 | 177 | df.rename({"count": "Freq", "mean": "DefaultRate", column: "WOE: " + column}, inplace=True, axis=1) 178 | 179 | total = pd.concat(samples, axis=0, ignore_index=True) 180 | order = total["WOE: " + column].drop_duplicates().sort_values().values 181 | order = pd.Series(np.arange(order.shape[0]), index=order) 182 | 183 | total["_sample_"] = np.concatenate([[n] * x.shape[0] for (n, x) in zip(names, samples)]) 184 | 185 | plt.figure(figsize=(10, 
10)) 186 | ax = sns.barplot(x="WOE: " + column, hue="_sample_", y="Freq", data=total, palette=sns.color_palette("Accent", 7)) 187 | ax2 = ax.twinx() 188 | 189 | for df, name in zip(samples, names): 190 | df.set_index(df["WOE: " + column].map(order).values)["DefaultRate"].plot(ax=ax2, label=name, marker="x") 191 | ax2.legend(title="_sample_") 192 | 193 | plt.savefig(path, bbox_inches="tight") 194 | plt.close() 195 | 196 | 197 | def plot_backlash_check(predict_proba, data_enc, target, col, path): 198 | sns.set(style="whitegrid", font_scale=1.5) 199 | df = pd.DataFrame({"pred": predict_proba, col: data_enc[col], "Target": target}) 200 | grp = df.groupby(col)[["pred", "Target"]].mean() 201 | grp.plot(figsize=(10, 10)).get_figure().savefig(path, bbox_inches="tight") 202 | plt.close() 203 | 204 | 205 | def plot_binned(data_binned, path1, path2): 206 | sns.set(style="whitegrid", font_scale=1.5) 207 | pl = (data_binned.groupby("ScoreBin").size().sort_index() / data_binned.shape[0]).plot(figsize=(10, 10), kind="bar") 208 | pl.get_figure().savefig(path1, bbox_inches="tight") 209 | plt.close() 210 | 211 | neg = data_binned[data_binned["Target"] == 0].groupby("ScoreBin").size().sort_index() / ( 212 | data_binned.shape[0] - data_binned["Target"].sum() 213 | ) # .plot(kind='bar', cmap='Accent') 214 | 215 | pos = data_binned[data_binned["Target"] == 1].groupby("ScoreBin").size().sort_index() / ( 216 | data_binned["Target"].sum() 217 | ) # .plot(kind='bar', cmap='Accent', color='blue') 218 | 219 | pl = pd.DataFrame({"positive": pos, "negative": neg}).plot(figsize=(10, 10), kind="bar", cmap="Accent", width=1) 220 | pl.get_figure().savefig(path2, bbox_inches="tight") 221 | plt.close() 222 | 223 | 224 | def plot_binned_stats(data_binned, path): 225 | sns.set(style="whitegrid", font_scale=1.5) 226 | pl = data_binned[["ScoreBin", "P"]].boxplot(by="ScoreBin", rot=90, figsize=(10, 10)) 227 | pl.get_figure().savefig(path, bbox_inches="tight") 228 | plt.close() 229 | 230 | 231 | def plot_corr_heatmap(corr_map, path): 232 | sns.set(style="whitegrid", font_scale=1.5) 233 | plt.figure(figsize=(20, 10)) 234 | pl = sns.heatmap(corr_map, annot=True, annot_kws={"size": 8}, fmt=".1g") 235 | pl.get_figure().savefig(path, bbox_inches="tight") 236 | plt.close() 237 | 238 | 239 | def plot_mean_target(train_binned, test_binned, path): 240 | sns.set(style="whitegrid", font_scale=1.5) 241 | train_stat = train_binned.groupby("ScoreBin").agg(mean_target=("Target", "mean")) 242 | test_stat = test_binned.groupby("ScoreBin").agg(mean_target=("Target", "mean")) 243 | df = pd.DataFrame({"train_mean_target": train_stat["mean_target"], "test_mean_target": test_stat["mean_target"]}) 244 | pl = df.plot(figsize=(10, 10), kind="bar", cmap="Accent", width=1) 245 | pl.get_figure().savefig(path, bbox_inches="tight") 246 | plt.close() 247 | 248 | 249 | def plot_grouped( 250 | df: List[pd.DataFrame], 251 | group_columns: Union[str, List[str]], 252 | group_name: str = None, 253 | plot_kind: str = "point", 254 | path: str = None, 255 | ): 256 | """Построить график аггрегированных значений для тренировочных и валидационных данных. 257 | 258 | Данные датафреймов аггрегируются либо по столбцу group_column, 259 | который должен быть в каждом датафрейме, либо по последовательностям 260 | group_data_train и group_data_test для тренировочного и валидационного датафрейма соответственно. 
261 | 262 | Args: 263 | df: Данные (список датафремов) для построения графиков 264 | group_columns: Имя столбца или нескольких столбцов, по которым будет производиться аггрегация. 265 | group_name: Название оси Х на графике, вдоль которой будет производиться группировка значений. 266 | Если не задано, будут использованы названия столбцов group_columns. 267 | plot_kind: Тип графика. Возможны значения "point", "bar" и "line". 268 | path: Путь к файлу, в который будет сохранено изображение. Если не задан, то изображение не сохраняется. 269 | 270 | """ 271 | 272 | if not df: 273 | return 274 | 275 | if isinstance(group_columns, str): 276 | group_columns = [group_columns] 277 | 278 | group_name = group_name or (group_columns if isinstance(group_columns, str) else "_".join(group_columns)) 279 | 280 | mdf = pd.concat(list(map(lambda x: pd.melt(x, id_vars=group_columns), df))) 281 | mdf = mdf.sort_values(by=group_columns) 282 | mdf[group_name] = mdf[group_columns].astype(str).agg("/".join, axis=1) 283 | mdf = mdf[["variable", "value", group_name]] 284 | 285 | # bins = max(df_train[group_columns].nunique(dropna=False), df_test[group_columns].nunique(dropna=False)) 286 | # if bins > max_bins: 287 | 288 | sns.set(style="whitegrid", font_scale=1.5) 289 | if plot_kind == "point": 290 | plot = sns.catplot(x=group_name, y="value", hue="variable", kind="point", data=mdf, height=10) 291 | plot.set_xticklabels(rotation=30) 292 | elif plot_kind == "line": 293 | sns.set(rc={"figure.figsize": (10, 10)}) 294 | plot = sns.lineplot(x=group_name, y="value", hue="variable", data=mdf, sort=False) 295 | plt.xticks(rotation=30) 296 | elif plot_kind == "box": 297 | plot = sns.boxplot(x=group_name, y="value", hue="variable", data=mdf, showfliers=False) 298 | plot.set_xticklabels(rotation=30) 299 | # elif plot_kind == 'bar': 300 | # mdf = mdf.groupby(by=group_name).agg('mean') 301 | # plot = mdf.plot(figsize=(10, 10), kind='bar', cmap='Accent', width=0.8) 302 | # plt.xticks(rotation=30) 303 | else: 304 | raise ValueError(f"Invalid plot kind: {plot_kind}") 305 | 306 | if path: 307 | plot.get_figure().savefig(path, bbox_inches="tight") 308 | 309 | plt.close() 310 | -------------------------------------------------------------------------------- /autowoe/lib/utilities/sql.py: -------------------------------------------------------------------------------- 1 | """SQL-query utilities.""" 2 | 3 | from typing import Any, Dict, List, Optional, Tuple, Union 4 | 5 | from autowoe.lib.pipelines.pipeline_feature_special_values import MARK_SET, NAN_SET, SMALL_SET, is_mark_prefix 6 | from autowoe.lib.utilities.utils import TaskType 7 | from autowoe.lib.woe.woe import WoE 8 | 9 | 10 | def prepare_number( 11 | woe_dict: WoE, 12 | name: str, 13 | r_val: int = 3, 14 | round_features: int = 5, 15 | nan_pattern: str = "({0} IS NULL OR {0} = 'NaN')", 16 | preprocessing: Optional[str] = None, 17 | mark_values: Optional[Dict[str, Tuple[Any]]] = None, 18 | mark_encoding: Optional[Dict[Any, str]] = None, 19 | ): 20 | """Get encoding case when for number. 21 | 22 | Args: 23 | woe_dict: Dictionary of WoE values. 24 | name: Name of feature. 25 | r_val: Numbers after the decimal point. 26 | round_features: Numbers after the decimal point. 27 | nan_pattern: Expression for nan processing. 28 | preprocessing: Name preprocessing. 29 | mark_values: List of marked values. 30 | mark_encoding: Map marked value to code. 31 | 32 | Returns: 33 | sql query part for number. 
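Example:
    For a numeric feature the generated expression has roughly the following
    shape (the feature name, split points and WoE values below are made up for
    illustration):

        CASE
          WHEN (number_1 IS NULL OR number_1 = 'NaN') THEN 0.214
          WHEN number_1 <= 0.5 THEN -0.532
          WHEN number_1 <= 3.75 THEN 0.108
          ELSE 0.741
        END AS number_1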
34 | 35 | """ 36 | # value in case 37 | feature_mark_values = [] if mark_values is None else mark_values.get(name, []) 38 | 39 | f_val = name 40 | if preprocessing is not None: 41 | f_val = preprocessing.format(name) 42 | 43 | # search for NaN encoding 44 | for grp in woe_dict.cod_dict: 45 | if type(grp) is str and grp.startswith("__NaN_"): 46 | nan_val = round(woe_dict.cod_dict[grp], r_val) 47 | break 48 | else: 49 | raise ValueError("NaN encoding value does not exists in woe_dict") 50 | 51 | nan_case = nan_pattern.format(f_val) 52 | feature = f"""CASE\n WHEN {nan_case} THEN {nan_val}\n""" 53 | 54 | # if feature_mark_values is not None: 55 | # for grp in woe_dict.cod_dict: 56 | # if type(grp) is str and grp.startswith("__Mark_"): 57 | # mark_val = round(woe_dict.cod_dict[grp], r_val) 58 | # break 59 | 60 | # mark_case = ", ".join(str(m) for m in feature_mark_values) 61 | # feature += """ WHEN {} IN ({}) THEN {}\n""".format(f_val, mark_case, mark_val) 62 | 63 | # create regular bins 64 | for grp, val in enumerate(woe_dict.split): 65 | enc_val = round(woe_dict.cod_dict[grp], r_val) 66 | feature += f""" WHEN {f_val} <= {round(val, round_features)} THEN {enc_val}\n""" 67 | 68 | for mv in feature_mark_values: 69 | # enc = "__Mark__{}__".format(mv) 70 | enc = mark_encoding[name][mv] 71 | enc_val = round(woe_dict.cod_dict[enc], r_val) 72 | feature += f""" WHEN {f_val} == {mv} THEN {enc_val}\n""" 73 | 74 | # create last else val 75 | enc_val = round(woe_dict.cod_dict[len(woe_dict.split)], r_val) 76 | feature += f""" ELSE {enc_val}\nEND AS {name}""" 77 | 78 | return feature 79 | 80 | 81 | def check_cat_symb(x: Union[str, Any]) -> str: 82 | """Wrap to quotes. 83 | 84 | Args: 85 | x: Value. 86 | 87 | Returns: 88 | quoted string. 89 | 90 | """ 91 | if type(x) is str: 92 | x = f"'{x}'" 93 | else: 94 | x = str(x) 95 | 96 | return x 97 | 98 | 99 | def prepare_category( 100 | woe_dict, 101 | name: str, 102 | r_val: int = 3, 103 | nan_pattern: str = "({0} IS NULL OR LOWER(CAST({0} AS VARCHAR(50))) = 'nan')", 104 | preprocessing: Optional[str] = None, 105 | mark_values: Optional[Dict[str, List[Any]]] = None, 106 | mark_encoding: Optional[Dict[Any, str]] = None, 107 | ): 108 | """Get encoding case when for category. 109 | 110 | Args: 111 | woe_dict: Dictionary of WoE values. 112 | name: Name of feature. 113 | r_val: Numbers after the decimal point. 114 | nan_pattern: Expression for nan processing. 115 | preprocessing: Name preprocessing. 116 | mark_values: List of mark values. 117 | mark_encoding: Map marked value to code. 118 | 119 | Returns: 120 | sql query part for category. 
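Example:
    For a categorical feature the generated expression looks roughly like this
    (categories and WoE values are made up for illustration; rare categories
    fall into the ELSE branch with the Small-group encoding):

        CASE
          WHEN (str_0 IS NULL OR LOWER(CAST(str_0 AS VARCHAR(50))) = 'nan') THEN 0.387
          WHEN str_0 IN ('A', 'B') THEN -0.215
          ELSE -0.049
        END AS str_0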
121 | 122 | """ 123 | feature_mark_values = [] if mark_values is None else mark_values.get(name, []) 124 | 125 | # value in case 126 | f_val = name 127 | if preprocessing is not None: 128 | f_val = preprocessing.format(name) 129 | 130 | # search for Mark, NaN and Small encodings 131 | nan_val, small_val, small_grp = None, None, None 132 | for grp in woe_dict.split: 133 | if type(grp) is str: 134 | if grp.startswith("__NaN_"): 135 | nan_grp = woe_dict.split[grp] 136 | nan_val = round(woe_dict.cod_dict[nan_grp], r_val) 137 | 138 | if grp.startswith("__Small_"): 139 | small_grp = woe_dict.split[grp] 140 | small_val = round(woe_dict.cod_dict[small_grp], r_val) 141 | 142 | # if grp.startswith("__Mark_"): 143 | # mark_grp = woe_dict.split[grp] 144 | # mark_val = round(woe_dict.cod_dict[mark_grp], r_val) 145 | 146 | # search for small in cod_dict 147 | for grp in woe_dict.cod_dict: 148 | if type(grp) is str: 149 | if grp.startswith("__NaN_"): 150 | nan_val = round(woe_dict.cod_dict[grp], r_val) 151 | if grp.startswith("__Small_"): 152 | small_val = round(woe_dict.cod_dict[grp], r_val) 153 | small_grp = -1 154 | 155 | assert nan_val is not None, "NaN encoding value does not exists in woe_dict" 156 | # assert small_val is not None, "Small encoding value does not exists in woe_dict" 157 | # TODO: assert for mark val 158 | 159 | feature = """CASE\n""" 160 | if nan_val != small_val: 161 | nan_case = nan_pattern.format(f_val) 162 | feature += f""" WHEN {nan_case} THEN {nan_val}\n""" 163 | 164 | # if feature_mark_values is not None: 165 | # mark_case = [] 166 | # for m in feature_mark_values: 167 | # if isinstance(m, str): 168 | # fmt = "'{}'".format(m) 169 | # else: 170 | # fmt = str(m) 171 | # mark_case.append(fmt) 172 | # mark_case = ", ".join(mark_case) 173 | # feature += """ WHEN {} IN ({}) THEN {}\n""".format(f_val, mark_case, mark_val) 174 | 175 | # create regular bins 176 | passed = {small_grp} 177 | for grp in woe_dict.split.values(): 178 | if grp not in passed: 179 | search_vals = [ 180 | x 181 | for x in woe_dict.split 182 | if woe_dict.split[x] == grp and x not in {*SMALL_SET, *NAN_SET, *MARK_SET} and not is_mark_prefix(x) 183 | ] 184 | length = len(search_vals) 185 | search_vals = list(map(check_cat_symb, search_vals)) 186 | 187 | # filter NaN and Small cases separately 188 | enc_val = round(woe_dict.cod_dict[grp], r_val) 189 | if length > 1: 190 | feature += f""" WHEN {f_val} IN ({", ".join(search_vals)}) THEN {enc_val}\n""" 191 | elif length == 1: 192 | feature += f""" WHEN {f_val} == {search_vals[0]} THEN {enc_val}\n""" 193 | 194 | passed.add(grp) 195 | 196 | for mv in feature_mark_values: 197 | # enc = "__Mark__{}__".format(mv) 198 | enc = mark_encoding[name][mv] 199 | idx_enc = woe_dict.split[enc] 200 | enc_val = round(woe_dict.cod_dict[idx_enc], r_val) 201 | feature += f""" WHEN {f_val} == {check_cat_symb(mv)} THEN {enc_val}\n""" 202 | 203 | # create last ELSE with small 204 | feature += f""" ELSE {small_val}\nEND AS {name}""" 205 | 206 | return feature 207 | 208 | 209 | def set_indent(x: str, n: int = 2): 210 | """Indentation in spaces for a line. 211 | 212 | Args: 213 | x: String. 214 | n: Number of spaces. 215 | 216 | Returns: 217 | Shifted string. 
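Example:
    set_indent("CASE\nEND", n=2) returns "  CASE\n  END" - every line,
    including the first one, is prefixed with two spaces.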
218 | 219 | """ 220 | indent = " " * n 221 | 222 | x = indent + x 223 | x = x.replace("\n", "\n" + indent) 224 | 225 | return x 226 | 227 | 228 | def get_encoded_table( 229 | model, 230 | table_name, 231 | round_woe=3, 232 | round_features=5, 233 | nan_pattern_numbers="({0} IS NULL OR {0} = 'NaN')", 234 | nan_pattern_category="({0} IS NULL OR LOWER(CAST({0} AS VARCHAR(50))) = 'nan')", 235 | preprocessing=None, 236 | mark_values=None, 237 | mark_encoding=None, 238 | ): 239 | """Get encoding table. 240 | 241 | Args: 242 | model: Model. 243 | table_name: Feature table name. 244 | round_woe: Numbers after the decimal point. 245 | round_features: Numbers after the decimal point. 246 | nan_pattern_numbers: Expression for nan processing in number feature. 247 | nan_pattern_category: Expression for nan processing in category feature. 248 | preprocessing: Name processing. 249 | mark_values: List of mark values. 250 | mark_encoding: Map marked value to code. 251 | 252 | Returns: 253 | query. 254 | 255 | """ 256 | if preprocessing is None: 257 | preprocessing = {} 258 | 259 | query = """SELECT\n""" 260 | 261 | for n, name in enumerate(model.features_fit.index): 262 | woe_dict = model.woe_dict[name] 263 | 264 | prep = None 265 | if name in preprocessing: 266 | prep = preprocessing[name] 267 | 268 | if woe_dict.f_type == "cat": 269 | feature = prepare_category( 270 | woe_dict, name, round_woe, nan_pattern_category, prep, mark_values, mark_encoding 271 | ) 272 | else: 273 | feature = prepare_number( 274 | woe_dict, name, round_woe, round_features, nan_pattern_numbers, prep, mark_values, mark_encoding 275 | ) 276 | 277 | query += set_indent(feature) 278 | 279 | if (n + 1) != len(model.features_fit): 280 | query += "," 281 | 282 | query += "\n" 283 | 284 | query += f"""FROM {table_name}""" 285 | 286 | return query 287 | 288 | 289 | def get_weights_query(model, table_name, output_name="PROB", alias="WOE_TAB", bypass_encoded=False, round_wts=3): 290 | """Calc prob over woe table. 291 | 292 | Args: 293 | model: Model. 294 | table_name: WoE table name. 295 | output_name: Output name. 296 | alias: Alias. 297 | bypass_encoded: Add encoded features to result query. 298 | round_wts: Round. 299 | 300 | Returns: 301 | query. 
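Example:
    For a binary task the resulting query has roughly the following shape
    (feature names and coefficient values are illustrative; the inner table is
    the encoded-features query passed in as `table_name`):

        SELECT
          1 / (1 + EXP(-(
          -4.545
          -0.488*WOE_TAB.number_254
          -1.232*WOE_TAB.datetime_1__F__d
          ))) as PROB
        FROM (...) as WOE_TAB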
302 | 303 | """ 304 | if model.params["task"] == TaskType.BIN: 305 | # query = """SELECT\n 1 / (1 + EXP(-({0}\n ))) as {3}{1}\nFROM {2} as {4}""" 306 | query = """SELECT\n 1 / (1 + EXP(-({LIN_FUN}\n ))) as {OUTPUT_NAME}{WOE_VALS}\nFROM {TABLE_NAME} as {ALIAS}""" 307 | else: 308 | # query = """SELECT\n ( {0}\n ) as {3}{1}\nFROM {2} as {4}""" 309 | query = """SELECT\n ( {S} * ( {LIN_FUN}\n) + {M}\n ) as {OUTPUT_NAME}{WOE_VALS}\nFROM {TABLE_NAME} as {ALIAS}""" 310 | 311 | dot = f"\n {round(model.intercept, round_wts)}" 312 | 313 | for name, val in zip(model.features_fit.index, model.features_fit.values): 314 | sign = "" if val < 0 else "+" 315 | dot += f"""\n {sign}{round(val, round_wts)}*{alias}.{name}""" 316 | 317 | other = "" 318 | if bypass_encoded: 319 | other = f""",\n {alias}.*""" 320 | 321 | # return query.format(dot, other, table_name, output_name, alias) 322 | query_args = { 323 | "LIN_FUN": dot, 324 | "WOE_VALS": other, 325 | "TABLE_NAME": table_name, 326 | "OUTPUT_NAME": output_name, 327 | "ALIAS": alias, 328 | } 329 | if model.params["task"] == TaskType.REG: 330 | query_args["S"] = round(model._target_std, round_wts) 331 | query_args["M"] = round(model._target_mean, round_wts) 332 | 333 | return query.format(**query_args) 334 | 335 | 336 | def get_sql_inference_query( 337 | model, 338 | table_name, 339 | round_digits=3, 340 | round_features=5, 341 | output_name="PROB", 342 | alias="WOE_TAB", 343 | bypass_encoded=True, 344 | template=None, 345 | nan_pattern_numbers="({0} IS NULL OR {0} = 'NaN')", 346 | nan_pattern_category="({0} IS NULL OR LOWER(CAST({0} AS VARCHAR(50))) = 'nan')", 347 | preprocessing=None, 348 | mark_values=None, 349 | mark_encoding=None, 350 | ): 351 | """Get sql query. 352 | 353 | Args: 354 | model: Model. 355 | table_name: Feature table name. 356 | round_digits: Round digits. 357 | round_features: Round digits of features. 358 | output_name: Output name. 359 | alias: Alias. 360 | bypass_encoded: Add encoded features to result query. 361 | template: T. 362 | nan_pattern_numbers: Expression for nan processing in number feature. 363 | nan_pattern_category: Expression for nan processing in category feature. 364 | preprocessing: Name preprocessing. 365 | mark_values: List of marked values. 366 | mark_encoding: Map marked value to code. 367 | 368 | Returns: 369 | query. 
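Example:
    A minimal sketch (`model` is assumed to be a fitted AutoWoE instance and
    "FEATURES" the name of the table holding the raw features):

        query = get_sql_inference_query(model, "FEATURES", output_name="PROB")
        print(query)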
370 | 371 | """ 372 | assert template in ["td"] or template is None, "Unknown template" 373 | 374 | if template == "td": 375 | nan_pattern_numbers = "{0} IS NULL" 376 | nan_pattern_category = "{0} IS NULL" 377 | 378 | # get table with features 379 | encode_table = "({0})".format( 380 | get_encoded_table( 381 | model, 382 | table_name, 383 | round_digits, 384 | round_features, 385 | nan_pattern_numbers, 386 | nan_pattern_category, 387 | preprocessing, 388 | mark_values, 389 | mark_encoding, 390 | ) 391 | ) 392 | encode_table = """\n """ + set_indent(encode_table) 393 | 394 | # get table with weights 395 | query = get_weights_query( 396 | model, encode_table, output_name=output_name, bypass_encoded=bypass_encoded, alias=alias, round_wts=round_digits 397 | ) 398 | 399 | return query 400 | -------------------------------------------------------------------------------- /examples/Tutorial_1__Basic_usage_and_params.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from sklearn.metrics import roc_auc_score\n", 11 | "\n", 12 | "from autowoe import AutoWoE, ReportDeco" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "### Чтение и подготовка обучающей выборки" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "train = pd.read_csv(\n", 29 | " \"./data/train_demo.csv\", low_memory=False, index_col=\"line_id\", parse_dates=[\"datetime_\" + str(i) for i in range(2)]\n", 30 | ")\n", 31 | "\n", 32 | "train = train.iloc[:, 50:100]\n", 33 | "\n", 34 | "num_col = list(filter(lambda x: \"numb\" in x, train.columns))\n", 35 | "num_feature_type = dict.fromkeys(num_col, \"real\")\n", 36 | "\n", 37 | "date_col = filter(lambda x: \"datetime\" in x, train.columns)\n", 38 | "for col in date_col:\n", 39 | " train[col + \"_year\"] = train[col].map(lambda x: x.year)\n", 40 | " train[col + \"_weekday\"] = train[col].map(lambda x: x.weekday())\n", 41 | " train[col + \"_month\"] = train[col].map(lambda x: x.month)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### Чтение и подготовка тестовой выборки" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "test = pd.read_csv(\"./data/test_demo.csv\", index_col=\"line_id\", parse_dates=[\"datetime_\" + str(i) for i in range(2)])\n", 58 | "\n", 59 | "date_col = filter(lambda x: \"datetime\" in x, test.columns)\n", 60 | "for col in date_col:\n", 61 | " test[col + \"_year\"] = test[col].map(lambda x: x.year)\n", 62 | " test[col + \"_weekday\"] = test[col].map(lambda x: x.weekday())\n", 63 | " test[col + \"_month\"] = test[col].map(lambda x: x.month)\n", 64 | "\n", 65 | "test_target = pd.read_csv(\"./data/test-target_demo.csv\")[\"target\"]\n", 66 | "test[\"target\"] = test_target.values" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "### Параметры модели" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "Для обучения модели рекомендуется указать тип признаков для обучения.\n", 81 | "Поэтому создается словарь features_type с ключами: \n", 82 | "\n", 83 | "\n", 84 | "\"real\" -- вещественный признак,\n", 85 | "\n", 86 | 
"\"cat\" -- категориальный.\n", 87 | "\n", 88 | "Для признаков, которые не размечены, типы будут определены автоматом. Такой вариант будет работать, но качество порядочно просядет" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "#### features_type" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "cat_col = list(filter(lambda x: \"str\" in x, train.columns))\n", 105 | "cat_feature_type = dict.fromkeys(cat_col, \"cat\")\n", 106 | "\n", 107 | "year_col = list(filter(lambda x: \"_year\" in x, train.columns))\n", 108 | "year_feature_type = dict.fromkeys(year_col, \"cat\")\n", 109 | "\n", 110 | "weekday_col = list(filter(lambda x: \"_weekday\" in x, train.columns))\n", 111 | "weekday_feature_type = dict.fromkeys(weekday_col, \"cat\")\n", 112 | "\n", 113 | "month_col = list(filter(lambda x: \"_month\" in x, train.columns))\n", 114 | "month_feature_type = dict.fromkeys(month_col, \"cat\")" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 5, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "features = cat_col + year_col + weekday_col + month_col + num_col" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "#### Feature level constrains" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 6, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "features_type = dict(\n", 140 | " **num_feature_type, **cat_feature_type, **year_feature_type, **weekday_feature_type, **month_feature_type\n", 141 | ")" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "- `features_monotone_constraints` - также можно указать зависимость целевой переменной от признака. Если заранее известно, что при возрастании признака feature_1, то эту информацию можно учесть в модели, добавив в словарь пару {feature_1: \"1\"}. Если же зависимость признака от целевой переменной обратная, то можно указать {feature_1: \"-1\"} Если про зависимость ничего неизвестно, но хочется, чтобы она была монотонная, можно указать 'auto'. Можно указать {feature_1: \"0\"}, в случае, если установлено общее ограничение на монотонность, чтобы не распространять его на эту фичу. Если специальных условий нет, то можно не собирать этот дикт\n", 149 | "\n", 150 | "\n", 151 | "Рекомендуемое использование:\n", 152 | "\n", 153 | "1) В случае, если задано общее условие на монотонность, то можно собрать дикт {feature_1: \"0\", feature_2: \"0\"}, чтобы игнорировать это ограничение для признаков feature_1, feature_2\n", 154 | "\n", 155 | "2) В случае, если не задано общее условие на монотонность, то можно собрать дикт {feature_1: \"auto\", feature_2: \"auto\"}, чтобы установить это ограничение для признаков feature_1, feature_2" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 7, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "features_monotone_constraints = {\"number_74\": \"auto\", \"number_83\": \"auto\"}" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "- `max_bin_count` - через словарь max_bin_count можно задать число бинов для WoE кодирования, если для какого-то признака оно отлично от общего. 
" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 8, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "max_bin_count = {\"number_47\": 3, \"number_51\": 2}" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "#### Рекомендация\n", 188 | "В общем случае, в первый момент построения модели лучше не указывать специальных ограничений в features_monotone_constraints и max_bin_count. Если в результате анализа полученной модели разбиение оказалось неинтерпретируемым или нестабильным по отдельным признакам, но в целом по модели ок, то ограничить сложность разбиения отдельных призаков имеет смысл. Если разбивка большинства признаков в модели оказалась неудовлетворительная, то рекомендуется в первую очередь настраивать глобальные ограничения (см параметры модели max_bin_count, monotonic, min_bin_size и др ниже)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "#### Общие параметры модели\n", 196 | "\n", 197 | "- `interpreted_model` - требуется ли интерпретируемость модели (условие на знак в коэффициентах логистической регрессии)\n", 198 | "\n", 199 | "- `monotonic` - Глобальное условие на монотонность. Если указано True, то для всех признаков по умолчанию будут строится только монотонные разбиения. Указать специальные условия для отдельных признаков можно используя features_monotone_constraints аргумент метода .fit\n", 200 | "\n", 201 | "- `max_bin_count` - Глобальное ограничение на число бинов. Указать специальные условия для отдельных признаков можно используя max_bin_count аргумент метода .fit\n", 202 | "\n", 203 | "- `select_type` - способ ПРЕДВАРИТЕЛЬНОГО!!! (ЭТО ВАЖНО) отбора признаков. Если указать None, то будут отобраны признаки, у которых importance больше imp_th. Если указвать, например 50, то после предварительного отобра останется только 50 признаков самых важных признаков. Крайне не рекомендуется сильно ограничивать\n", 204 | "\n", 205 | "- `pearson_th` - пороговое значен для корреляции Пирсона. Используется на финальной стадии отбора признаков.\n", 206 | "Если корреляция вух признаков по модулю больше pearson_th, то будет выброшен тот, у которого \n", 207 | "информативность меньше\n", 208 | "\n", 209 | "- `auc_th` - пороговое значнеи для одномерной оценки качества признака\n", 210 | "\n", 211 | "- `vif_th` - пороговое значнеи для VIF признака\n", 212 | "\n", 213 | "- `imp_th` - порог по которому будет произведен отбор признаков, если указать select_type=None (см. ниже).\n", 214 | "\n", 215 | "- `th_const` порог по которому признак будет считаться константным. Все константные признаки в модели не учитываются. Если число валидных значений больше трешхолда, то колонка не константная (int). В случае указания float, трешхолд будет определяться как размер_выборки * th_const\n", 216 | "\n", 217 | "- `force_single_split` - иногда в силу ограничений на min_bin_size невозможно построить ниодной группировки на переменную. force_single_split=True заставит в этом случае построить единственно возмоджный сплит, в случае если при этом выделяется группа размера более чем th_const. 
218 |     "\n",
219 |     "\n",
220 |     "- `th_nan` - the threshold above which a separate category is created for missing values.\n",
221 |     "If the number of missing values is less than th_nan, the WoE value for them is set to zero.\n",
222 |     "Otherwise the missing values are put into a separate group and a WoE value\n",
223 |     "is estimated for them separately.\n",
224 |     "It also affects rare categories (below th_cat). If such categories in total amount to less than th_nan, they are handled according to the rule defined by `cat_merge_to`, otherwise they are estimated as a group\n",
225 |     "\n",
226 |     "- `th_cat` - the threshold below which infrequent categories of categorical features are merged into a separate group\n",
227 |     "\n",
228 |     "\n",
229 |     "- `woe_diff_th` - allows NaNs and rare categories to be merged with some bin if the difference in WoE is less than woe_diff_th\n",
230 |     "\n",
231 |     "\n",
232 |     "- `min_bin_size` - the minimum bin size when grouping. Either an int (number of observations) or a float (fraction of the sample)\n",
233 |     "\n",
234 |     "- `min_bin_mults` - while building the bins the candidate values min_bin_size, \n",
235 |     "min_bin_size * min_bin_mults[0], min_bin_size * min_bin_mults[1] ... are tried. Floats > 1 are expected. The default is (2, 4); usually there is no need to change it\n",
236 |     "\n",
237 |     "- `min_gains_to_split` - the candidate values of the regularizer that are tried while building the binning\n",
238 |     "\n",
239 |     "\n",
240 |     "- `auc_tol` - the AUC tolerance. We assume that auc_tol of quality relative to the maximum can be sacrificed to make the model simpler\n",
241 |     "\n",
242 |     "\n",
243 |     "- `cat_alpha` - the regularizer of the category encoder\n",
244 |     "\n",
245 |     "\n",
246 |     "\n",
247 |     "- `cat_merge_to` - the group for rare (below th_cat) categories and for categories unseen in the test set\n",
248 |     "    \"to_nan\" -- into the nan group, \n",
249 |     "    \"to_woe_0\" -- a separate group with WoE = 0,\n",
250 |     "    \"to_maxfreq\" - into the largest group,\n",
251 |     "    \"to_maxp\" - into the group with the highest event rate,\n",
252 |     "    \"to_minp\" - into the group with the lowest event rate\n",
253 |     "    \n",
254 |     "- `nan_merge_to` - the group for NaNs\n",
255 |     "    \"to_woe_0\" -- a separate group with WoE = 0,\n",
256 |     "    \"to_maxfreq\" - into the largest group,\n",
257 |     "    \"to_maxp\" - into the group with the highest event rate,\n",
258 |     "    \"to_minp\" - into the group with the lowest event rate \n",
259 |     "    \n",
260 |     "    \n",
261 |     "- `oof_woe` - if oof_woe=True, the WoE encoding is computed out of fold via cross-validation; if False, it is computed on the whole training sample at once.\n",
262 |     "\n",
263 |     "- `n_folds` - the number of folds for the internal cross-validation\n",
264 |     "\n",
265 |     "\n",
266 |     "- `n_jobs` - the number of processes the model will use \n",
267 |     "\n",
268 |     "- `l1_grid_size` - at one of its steps the model uses LASSO feature selection. l1_grid_size -- the size of the grid used to search over C\n",
269 |     "\n",
270 |     "- `l1_exp_scale` - the scale of the grid for the L1 selection. 4 corresponds to a maximum C of about 3-4. Increase it if a less regularized model is needed\n",
271 |     "\n",
272 |     "- `imp_type` - how feature importance is measured -- feature importance (\"feature_imp\" - generally a more complex model) or permutation importance (\"perm_imp\" - generally a simpler model)\n",
273 |     "\n",
274 |     "- `regularized_refit` - after feature selection the resulting model is refitted on all the data. This flag controls whether L1 regularization is applied during the refit. If not, in the interpretable mode the model is refitted iteratively until all weights become negative. If yes, the same is achieved by tightening L1. Setting it to False can be useful when a statistical model is needed, i.e. p-values for the estimates\n",
275 |     "\n",
276 |     "- `p_val` - the acceptable p-value level for the model estimates when a statistical model is fitted (regularized_refit=False)"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": 9,
282 |    "metadata": {},
283 |    "outputs": [],
284 |    "source": [
285 |     "auto_woe = AutoWoE(\n",
286 |     "    task=\"BIN\",\n",
287 |     "    interpreted_model=True,\n",
288 |     "    monotonic=False,\n",
289 |     "    max_bin_count=5,\n",
290 |     "    select_type=None,\n",
291 |     "    pearson_th=0.9,\n",
292 |     "    auc_th=0.505,\n",
293 |     "    vif_th=10.0,\n",
294 |     "    imp_th=0,\n",
295 |     "    th_const=32,\n",
296 |     "    force_single_split=True,\n",
297 |     "    th_nan=0.01,\n",
298 |     "    th_cat=0.005,\n",
299 |     "    woe_diff_th=0.01,\n",
300 |     "    min_bin_size=0.01,\n",
301 |     "    min_bin_mults=(2, 4),\n",
302 |     "    min_gains_to_split=(0.0, 0.5, 1.0),\n",
303 |     "    auc_tol=1e-4,\n",
304 |     "    cat_alpha=100,\n",
305 |     "    cat_merge_to=\"to_woe_0\",\n",
306 |     "    nan_merge_to=\"to_woe_0\",\n",
307 |     "    oof_woe=True,\n",
308 |     "    n_folds=6,\n",
309 |     "    n_jobs=4,\n",
310 |     "    l1_grid_size=20,\n",
311 |     "    l1_exp_scale=6,\n",
312 |     "    imp_type=\"feature_imp\",\n",
313 |     "    regularized_refit=False,\n",
314 |     "    p_val=0.05,\n",
315 |     "    debug=False,\n",
316 |     "    verbose=0,\n",
317 |     ")\n",
318 |     "\n",
319 |     "auto_woe = ReportDeco(auto_woe)"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "markdown",
324 |    "metadata": {},
325 |    "source": [
326 |     "- `train` - the training sample\n",
327 |     "\n",
328 |     "- `target_name` - the name of the target column\n",
329 |     "\n",
330 |     "- `features_type` - see the description of the features_type dict above. None can be passed for automatic typing, but this is not recommended\n",
331 |     "\n",
332 |     "- `group_kf` - the name of the group column for GroupKFold https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GroupKFold.html\n",
333 |     "\n",
334 |     "- `max_bin_count` - see the description of the max_bin_count dict above. It can be omitted if there are no special conditions. The setting common to all features is given in __init__\n",
335 |     "\n",
336 |     "- `features_monotone_constraints` - see the description of the features_monotone_constraints dict above. It can be omitted if there are no special conditions. The setting common to all features is given in __init__\n",
337 |     "\n",
338 |     "- `validation` - an optional validation set used when building/selecting features. It can be omitted. At the moment it is used for 1) p-value based feature selection when fitting the statistical model\n"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": 10,
344 |    "metadata": {
345 |     "scrolled": false
346 |    },
347 |    "outputs": [
348 |     {
349 |      "name": "stdout",
350 |      "output_type": "stream",
351 |      "text": [
352 |       "[LightGBM] [Info] Number of positive: 63, number of negative: 5537\n",
353 |       "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001103 seconds.\n",
354 |       "You can set `force_row_wise=true` to remove the overhead.\n",
355 |       "And if memory is not enough, you can set `force_col_wise=true`.\n",
356 |       "[LightGBM] [Info] Total Bins 379\n",
357 |       "[LightGBM] [Info] Number of data points in the train set: 5600, number of used features: 49\n",
358 |       "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.011250 -> initscore=-4.476073\n",
359 |       "[LightGBM] [Info] Start training from score -4.476073\n"
360 |      ]
361 |     }
362 |    ],
363 |    "source": [
364 |     "auto_woe.fit(\n",
365 |     "    train[features + [\"target\"]],\n",
366 |     "    target_name=\"target\",\n",
367 |     "    features_type=features_type,\n",
368 |     "    group_kf=None,\n",
369 |     "    max_bin_count=max_bin_count,\n",
370 |     "    features_monotone_constraints=features_monotone_constraints,\n",
371 |     "    validation=test,\n",
372 |     ")"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": 11,
378 |    "metadata": {
379 |     "scrolled": false
380 |    },
381 |    "outputs": [
382 |     {
383 |      "data": {
384 |       "text/plain": [
385 |        "0.7791178112786152"
386 |       ]
387 |      },
388 |      "execution_count": 11,
389 |      "metadata": {},
390 |      "output_type": "execute_result"
391 |     }
392 |    ],
393 |    "source": [
394 |     "pred = auto_woe.predict_proba(test)\n",
395 |     "roc_auc_score(test[\"target\"], pred)"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": 12,
401 |    "metadata": {},
402 |    "outputs": [
403 |     {
404 |      "data": {
405 |       "text/plain": [
406 |        "0.7791178112786152"
407 |       ]
408 |      },
409 |      "execution_count": 12,
410 |      "metadata": {},
411 |      "output_type": "execute_result"
412 |     }
413 |    ],
414 |    "source": [
415 |     "pred = auto_woe.predict_proba(test[[\"number_72\"]], report=False)\n",
416 |     "roc_auc_score(test[\"target\"], pred)"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "code",
421 |    "execution_count": 13,
422 |    "metadata": {},
423 |    "outputs": [
424 |     {
425 |      "name": "stdout",
426 |      "output_type": "stream",
427 |      "text": [
428 |       "SELECT\n",
429 |       "  1 / (1 + EXP(-(\n",
430 |       "    -4.517\n",
431 |       "    -0.946*WOE_TAB.number_72\n",
432 |       "  ))) as PROB,\n",
433 |       "  WOE_TAB.*\n",
434 |       "FROM \n",
435 |       "    (SELECT\n",
436 |       "    CASE\n",
437 |       "      WHEN (number_72 IS NULL OR number_72 = 'NaN') THEN -0.974\n",
438 |       "      WHEN number_72 <= 0.0 THEN 0.296\n",
439 |       "      ELSE -1.96\n",
440 |       "    END AS number_72\n",
441 |       "  FROM table) as WOE_TAB\n"
442 |      ]
443 |     }
444 |    ],
445 |    "source": [
446 |     "print(auto_woe.get_sql_inference_query(\"table\"))"
447 |    ]
448 |   },
449 |   {
450 |    "cell_type": "markdown",
451 |    "metadata": {},
452 |    "source": [
453 |     "### Useful methods of the model"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "markdown",
458 |    "metadata": {},
459 |    "source": [
460 |     "- `private_features_type` - the feature typing\n",
461 |     "- `get_woe` - the binning and the WoE values in each bin\n",
462 |     "- `get_split` - the split borders. Especially useful for categorical features\n",
463 |     "\n",
464 |     "\n",
465 |     "##### Note: \n",
466 |     "ReportDeco is a wrapper for report generation. It is not required for training and applying the model, but it is required for building the report (see the last cell).\n",
467 |     "To access the attributes of the model itself, use the auto_woe.model attribute of the decorator\n",
468 |     "All attributes of the model object are also available through the report object.\n",
469 |     "However, the pickle of the report object will be considerably larger, so for inference it is better to save only auto_woe.model\n"
470 |    ]
471 |   },
472 |   {
473 |    "cell_type": "markdown",
474 |    "metadata": {},
475 |    "source": [
476 |     "### Report generation"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": 14,
482 |    "metadata": {},
483 |    "outputs": [
484 |     {
485 |      "name": "stderr",
486 |      "output_type": "stream",
487 |      "text": [
488 |       "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n",
489 |       "No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n"
490 |      ]
491 |     }
492 |    ],
493 |    "source": [
494 |     "report_params = {\n",
495 |     "    \"automl_date_column\": \"report_month\",  # the date column in the params['datetimeFormat'] format\n",
496 |     "    \"output_path\": \"./AUTOWOE_REPORT_1\",  # the folder where the report and the required files will be generated\n",
497 |     "    \"report_name\": \"___REPORT NAME___\",\n",
498 |     "    \"report_version_id\": 1,\n",
499 |     "    \"city\": \"Воронеж\",\n",
500 |     "    \"model_aim\": \"___PURPOSE OF THE MODEL___\",\n",
501 |     "    \"model_name\": \"___MODEL NAME___\",\n",
502 |     "    \"zakazchik\": \"___CUSTOMER___\",\n",
503 |     "    \"high_level_department\": \"___DEPARTMENT___\",\n",
504 |     "    \"ds_name\": \"___MODEL DEVELOPER___\",\n",
505 |     "    \"target_descr\": \"___TARGET EVENT DESCRIPTION___\",\n",
506 |     "    \"non_target_descr\": \"___NON-TARGET EVENT DESCRIPTION___\",\n",
507 |     "}\n",
508 |     "\n",
509 |     "auto_woe.generate_report(report_params)"
510 |    ]
511 |   }
512 |  ],
513 |  "metadata": {
514 |   "kernelspec": {
515 |    "display_name": "Anaconda_py38",
516 |    "language": "python",
517 |    "name": "anaconda_py38"
518 |   },
519 |   "language_info": {
520 |    "codemirror_mode": {
521 |     "name": "ipython",
522 |     "version": 3
523 |    },
524 |    "file_extension": ".py",
525 |    "mimetype": "text/x-python",
526 |    "name": "python",
527 |    "nbconvert_exporter": "python",
528 |    "pygments_lexer": "ipython3",
529 |    "version": "3.8.5"
530 |   },
531 |   "stem_cell": {
532 |    "cell_type": "raw",
533 |    "metadata": {
534 |     "pycharm": {
535 |      "metadata": false
536 |     }
537 |    },
538 |    "source": ""
539 |   }
540 |  },
541 |  "nbformat": 4,
542 |  "nbformat_minor": 1
543 | }
544 | 
--------------------------------------------------------------------------------
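The note in the tutorial above recommends keeping only `auto_woe.model` when the fitted model is saved for inference. A minimal sketch of that workflow is given below; it is not part of the repository, the file name and the use of `pickle` are illustrative assumptions, and `auto_woe` and `test` refer to the objects created in the tutorial notebook.

```python
import pickle

# Persist only the inner AutoWoE model, not the ReportDeco wrapper (file name is assumed).
with open("autowoe_model.pkl", "wb") as f:
    pickle.dump(auto_woe.model, f)

# Reload the inner model later for inference.
with open("autowoe_model.pkl", "rb") as f:
    model = pickle.load(f)

scores = model.predict_proba(test)                    # scoring works without the report wrapper
sql_query = model.get_sql_inference_query("table")    # the SQL inference query is also available on the inner model
```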