├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── examples ├── model_report │ ├── auto_mining_rules │ │ ├── combiner_rules_0.png │ │ ├── combiner_rules_1.png │ │ └── combiner_rules_2.png │ ├── bin_plots │ │ ├── bin_vars_A.png │ │ ├── bin_vars_B.png │ │ ├── bin_vars_C.png │ │ ├── bin_vars_D.png │ │ └── bin_vars_时间.png │ └── 决策树组合策略挖掘.xlsx └── pdtr_samplts.ipynb ├── pdtr ├── __init__.py ├── matplot_chinese.ttf └── transforme.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | .idea/ 161 | 162 | # itlubber 163 | *.DS_Store 164 | *.pkl 165 | *.virtual_documents 166 | test.* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 itlubber 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include pdtr/*.ttf pdtr/*.xlsx 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 自动决策树规则挖掘工具包 2 | 3 | 在笔者金融风控的日常工作中,很多时候需要根据数据集内的诸多特征(有很多其他称呼,比如因子、变量、自变量、解释变量等)来挖掘一些有用的规则和组合策略,在保证通过率的基础上尽可能多的拒绝坏客户。面对成千上万的特征,如何从数据集中找到有效的规则和组合策略,一直以来都是金融风控搬砖工的日常工作。 `pdtr` 旨在帮助读者快速从高维数据中提取出有效的规则和组合策略。 4 | 5 | > 仓库地址:https://github.com/itlubber/pdtr 6 | > 7 | > 博文地址:https://itlubber.art/archives/auto-strategy-mining 8 | > 9 | > 微信公共号推文:https://mp.weixin.qq.com/s/8s785MfmVznNgQyy38YnWw 10 | > 11 | > pipy包:https://pypi.org/project/pdtr/ 12 | 13 | ## 交流 14 | 15 | | 微信 | 微信公众号 | 16 | | :---: | :----: | 17 | | itlubber.png | itlubber_art.png | 18 | | itlubber | itlubber_art | 19 | 20 | 21 | ## 背景简介 22 | 23 | 金融场景风险大致可以概括为三种:系统性风险、欺诈风险(无还款意愿)、信用风险(无还款能力),而作为一名风控搬砖工,日常工作中有大量的数据挖掘工作,如何从高维数据集中挖掘出行之有效的规则、策略及模型来防范欺诈风险和信用风险每个搬砖工的基操。本仓库由笔者基于网上开源的一系列相关知识,结合实际工作中遇到的实际需求,整理得到。旨在为诸位仁兄提供一个便捷、高效、赏心悦目的决策树组合策略挖掘报告,及一系列能够实际运用到风险控制上的策略。 24 | 25 | ## 项目结构 26 | 27 | ```bash 28 | pdtr 29 | . 30 | | README.md # 说明文档 31 | | setup.py # 打包发布文件 32 | | LICENSE # 开源协议 33 | | requirements.txt # 项目依赖包 34 | +---examples # 演示样例 35 | | | combine_rules_cache # 缓存文件 36 | | | combine_rules_cache.svg # 缓存文件 37 | | | pdtr_samplts.ipynb # 演示样例程序 38 | | \---model_report # 模型报告输出文件夹 39 | | | 决策树组合策略挖掘.xlsx # 策略挖掘报告 40 | | +---auto_mining_rules # 组合策略可视化存储文件夹 41 | | | combiner_rules_0.png # 决策树可视化图片 42 | | | ...... 43 | | \---bin_plots # 简单策略可视化存储文件夹 44 | | bin_vars_A.png # 变量分箱可视化图片 45 | | ...... 
46 | \---pdtr # PDTR 源码包 47 | template.xlsx # excel 模版文件 48 | excel_writer.py # excel写入公共方法 49 | matplot_chinese.ttf # matplotlib 中文字体 50 | transforme.py # 策略挖掘方法 51 | ``` 52 | 53 | ## 环境准备 54 | 55 | ### 创建虚拟环境(可选) 56 | 57 | + 通过`conda`创建虚拟环境 58 | 59 | ```bash 60 | >> conda create -n score python==3.8.13 61 | 62 | Collecting package metadata (current_repodata.json): done 63 | Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source. 64 | Collecting package metadata (repodata.json): done 65 | Solving environment: done 66 | 67 | 68 | ==> WARNING: A newer version of conda exists. <== 69 | current version: 4.10.3 70 | latest version: 23.3.1 71 | 72 | Please update conda by running 73 | 74 | $ conda update -n base -c defaults conda 75 | 76 | 77 | 78 | ## Package Plan ## 79 | 80 | environment location: /Users/lubberit/anaconda3/envs/score 81 | 82 | added / updated specs: 83 | - python==3.8.13 84 | 85 | 86 | The following packages will be downloaded: 87 | 88 | package | build 89 | ---------------------------|----------------- 90 | ca-certificates-2023.01.10 | hecd8cb5_0 121 KB 91 | ncurses-6.4 | hcec6c5f_0 1018 KB 92 | openssl-1.1.1t | hca72f7f_0 3.3 MB 93 | pip-23.0.1 | py38hecd8cb5_0 2.5 MB 94 | python-3.8.13 | hdfd78df_1 10.8 MB 95 | setuptools-66.0.0 | py38hecd8cb5_0 1.2 MB 96 | sqlite-3.41.2 | h6c40b1e_0 1.2 MB 97 | wheel-0.38.4 | py38hecd8cb5_0 65 KB 98 | xz-5.4.2 | h6c40b1e_0 372 KB 99 | ------------------------------------------------------------ 100 | Total: 20.5 MB 101 | 102 | The following NEW packages will be INSTALLED: 103 | 104 | ca-certificates pkgs/main/osx-64::ca-certificates-2023.01.10-hecd8cb5_0 105 | libcxx pkgs/main/osx-64::libcxx-14.0.6-h9765a3e_0 106 | libffi pkgs/main/osx-64::libffi-3.3-hb1e8313_2 107 | ncurses pkgs/main/osx-64::ncurses-6.4-hcec6c5f_0 108 | openssl pkgs/main/osx-64::openssl-1.1.1t-hca72f7f_0 109 | pip pkgs/main/osx-64::pip-23.0.1-py38hecd8cb5_0 110 | python 
pkgs/main/osx-64::python-3.8.13-hdfd78df_1 111 | readline pkgs/main/osx-64::readline-8.2-hca72f7f_0 112 | setuptools pkgs/main/osx-64::setuptools-66.0.0-py38hecd8cb5_0 113 | sqlite pkgs/main/osx-64::sqlite-3.41.2-h6c40b1e_0 114 | tk pkgs/main/osx-64::tk-8.6.12-h5d9f67b_0 115 | wheel pkgs/main/osx-64::wheel-0.38.4-py38hecd8cb5_0 116 | xz pkgs/main/osx-64::xz-5.4.2-h6c40b1e_0 117 | zlib pkgs/main/osx-64::zlib-1.2.13-h4dc903c_0 118 | 119 | 120 | Proceed ([y]/n)? y 121 | 122 | 123 | Downloading and Extracting Packages 124 | sqlite-3.41.2 | 1.2 MB | ################################################################################################### | 100% 125 | wheel-0.38.4 | 65 KB | ################################################################################################### | 100% 126 | openssl-1.1.1t | 3.3 MB | ################################################################################################### | 100% 127 | python-3.8.13 | 10.8 MB | ################################################################################################### | 100% 128 | setuptools-66.0.0 | 1.2 MB | ################################################################################################### | 100% 129 | ncurses-6.4 | 1018 KB | ################################################################################################### | 100% 130 | xz-5.4.2 | 372 KB | ################################################################################################### | 100% 131 | ca-certificates-2023 | 121 KB | ################################################################################################### | 100% 132 | pip-23.0.1 | 2.5 MB | ################################################################################################### | 100% 133 | Preparing transaction: done 134 | Verifying transaction: done 135 | Executing transaction: done 136 | # 137 | # To activate this environment, use 138 | # 139 | # $ conda activate score 140 | # 141 | # To deactivate an active 
environment, use 142 | # 143 | # $ conda deactivate 144 | ``` 145 | 146 | + 通过`pyenv`创建虚拟环境 147 | 148 | ```bash 149 | # 安装环境 150 | >> pyenv install -v 3.8.13 151 | # 启动环境 152 | >> pyenv local 3.8.13 153 | # 卸载环境 154 | >> pyenv uninstall 3.8.13 155 | ``` 156 | 157 | 158 | ### 安装项目依赖 159 | 160 | ```bash 161 | >> pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com 162 | 163 | Looking in indexes: http://mirrors.aliyun.com/pypi/simple/ 164 | ...... 165 | Installing collected packages: webencodings, six, pytz, colour, zipp, tomli, tinycss2, threadpoolctl, python-dateutil, pyparsing, pycparser, pluggy, pillow, packaging, numpy, kiwisolver, joblib, iniconfig, graphviz, fonttools, exceptiongroup, et-xmlfile, defusedxml, cycler, scipy, pytest, patsy, pandas, openpyxl, importlib-resources, cssselect2, contourpy, cffi, statsmodels, scikit-learn, matplotlib, cairocffi, dtreeviz, category-encoders, CairoSVG 166 | Successfully installed CairoSVG-2.7.0 cairocffi-1.5.1 category-encoders-2.6.0 cffi-1.15.1 colour-0.1.5 contourpy-1.0.7 cssselect2-0.7.0 cycler-0.11.0 defusedxml-0.7.1 dtreeviz-2.2.1 et-xmlfile-1.1.0 exceptiongroup-1.1.1 fonttools-4.39.4 graphviz-0.20.1 importlib-resources-5.12.0 iniconfig-2.0.0 joblib-1.2.0 kiwisolver-1.4.4 matplotlib-3.7.1 numpy-1.22.2 openpyxl-3.0.7 packaging-23.1 pandas-1.5.3 patsy-0.5.3 pillow-9.5.0 pluggy-1.0.0 pycparser-2.21 pyparsing-3.0.9 pytest-7.3.1 python-dateutil-2.8.2 pytz-2023.3 scikit-learn-1.2.2 scipy-1.10.1 six-1.11.0 statsmodels-0.14.0 threadpoolctl-3.1.0 tinycss2-1.2.1 tomli-2.0.1 webencodings-0.5.1 zipp-3.15.0 167 | ``` 168 | 169 | 170 | ### `PDTR` 安装 171 | 172 | ```bash 173 | pip install pdtr 174 | ``` 175 | 176 | ### 版本介绍 177 | 178 | + `0.1.0` 179 | 180 | 仅包含决策树策略挖掘相关工具 181 | 182 | + `0.1.1` 183 | 184 | 除版本 `0.1.0` 中的决策树挖掘相关工具以外,新增了基于 `toad` 和 `optbinning` 的单变量策略挖掘相关方法 185 | 186 | + `0.1.2` 187 | 188 | 在 `0.1.1` 的基础上增加了部分方法的文档注释 189 | 190 | 191 | ### 运行样例 192 | 193 | + 导入相关依赖 
194 | 195 | ```python 196 | import os 197 | import numpy as np 198 | import pandas as pd 199 | from sklearn.model_selection import train_test_split 200 | 201 | try: 202 | from pdtr import ParseDecisionTreeRules 203 | except ModuleNotFoundError: 204 | import sys 205 | 206 | sys.path.append("../") 207 | from pdtr import ParseDecisionTreeRules 208 | 209 | np.random.seed(1) 210 | ``` 211 | 212 | + 数据集加载 213 | 214 | ```python 215 | feature_map = {} 216 | n_samples = 10000 217 | ab = np.array(list('ABCDEFG')) 218 | 219 | data = pd.DataFrame({ 220 | 'A': np.random.randint(10, size = n_samples), 221 | 'B': ab[np.random.choice(7, n_samples)], 222 | 'C': ab[np.random.choice(2, n_samples)], 223 | 'D': np.random.random(size = n_samples), 224 | 'target': np.random.randint(2, size = n_samples) 225 | }) 226 | ``` 227 | 228 | + 数据集拆分 229 | 230 | ```python 231 | train, test = train_test_split(data, test_size=0.3, shuffle=data["target"]) 232 | ``` 233 | 234 | + 决策树自动规则挖掘 235 | 236 | ```python 237 | pdtr_instance = ParseDecisionTreeRules(target="target", max_iter=8, output="model_report/决策树组合策略挖掘.xlsx") 238 | pdtr_instance.fit(train, lift=0., max_depth=2, max_samples=1., verbose=False, max_features="auto") 239 | ``` 240 | 241 | + 规则验证 242 | 243 | ```python 244 | all_rules = pdtr_instance.insert_all_rules(test=test) 245 | ``` 246 | 247 | + 导出策略挖掘报告 248 | 249 | ```python 250 | pdtr_instance.save() 251 | ``` 252 | 253 | + 挖掘报告 254 | 255 | [`examples/决策树组合策略挖掘.xlsx`](https://github.com/itlubber/pdtr/blob/main/examples/model_report/%E5%86%B3%E7%AD%96%E6%A0%91%E7%BB%84%E5%90%88%E7%AD%96%E7%95%A5%E6%8C%96%E6%8E%98.xlsx) 256 | 257 | 258 | ## 参考 259 | 260 | > https://github.com/itlubber/LogisticRegressionPipeline 261 | > 262 | > https://github.com/itlubber/itlubber-excel-writer 263 | -------------------------------------------------------------------------------- /examples/model_report/auto_mining_rules/combiner_rules_0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/auto_mining_rules/combiner_rules_0.png -------------------------------------------------------------------------------- /examples/model_report/auto_mining_rules/combiner_rules_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/auto_mining_rules/combiner_rules_1.png -------------------------------------------------------------------------------- /examples/model_report/auto_mining_rules/combiner_rules_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/auto_mining_rules/combiner_rules_2.png -------------------------------------------------------------------------------- /examples/model_report/bin_plots/bin_vars_A.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/bin_plots/bin_vars_A.png -------------------------------------------------------------------------------- /examples/model_report/bin_plots/bin_vars_B.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/bin_plots/bin_vars_B.png -------------------------------------------------------------------------------- /examples/model_report/bin_plots/bin_vars_C.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/bin_plots/bin_vars_C.png -------------------------------------------------------------------------------- /examples/model_report/bin_plots/bin_vars_D.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/bin_plots/bin_vars_D.png -------------------------------------------------------------------------------- /examples/model_report/bin_plots/bin_vars_时间.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/bin_plots/bin_vars_时间.png -------------------------------------------------------------------------------- /examples/model_report/决策树组合策略挖掘.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/决策树组合策略挖掘.xlsx -------------------------------------------------------------------------------- /pdtr/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2023/5/15 17:55 4 | @Author : itlubber 5 | @Site : itlubber.art 6 | """ 7 | 8 | from .transforme import ParseDecisionTreeRules 9 | 10 | 11 | __version__ = "0.1.5" 12 | __all__ = ["ParseDecisionTreeRules"] 13 | -------------------------------------------------------------------------------- /pdtr/matplot_chinese.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/pdtr/matplot_chinese.ttf -------------------------------------------------------------------------------- 
# -*- coding: utf-8 -*-
"""
Decision-tree based rule/strategy mining for credit-risk feature sets.

@Time   : 2023/5/15 17:55
@Author : itlubber
@Site   : itlubber.art
"""

import warnings
import os
import re
import graphviz
import dtreeviz
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager
from openpyxl.worksheet.worksheet import Worksheet

import category_encoders as ce
from optbinning import OptimalBinning
from sklearn.tree import DecisionTreeClassifier
from scorecardpipeline import ExcelWriter, Combiner, feature_bin_stats, bin_plot, dataframe2excel


class ParseDecisionTreeRules:
    """Automatic decision-tree rule-mining toolkit.

    Repeatedly fits shallow decision trees on a (target-encoded) feature
    matrix, extracts every leaf as a combination rule with hit-rate / bad-rate
    / lift statistics, and writes a formatted Excel mining report.
    """

    def __init__(self, target="target", labels=None, feature_map=None, nan=-1., max_iter=128, writer=None, combiner=None, seed=None, theme_color="2639E9"):
        """Decision-tree automatic rule mining toolkit.

        :param target: name of the label column in the dataset, default "target"
        :param labels: display names for the two classes, a length-2 list
            [good-label, bad-label]; defaults to ["positive", "negative"]
        :param feature_map: mapping from feature name to human-readable
            description used in reports; defaults to {}
        :param nan: fill value used for missing values before tree fitting, default -1
        :param max_iter: maximum number of trees to train; after each tree the
            most important feature is dropped before the next fit, default 128
        :param writer: an existing ExcelWriter to append report content to; a
            new one is created when None
        :param combiner: a pre-trained combiner (toad.transform.Combiner or the
            scorecardpipeline Combiner); a fresh Combiner is created when falsy
        :param seed: random_state forwarded to DecisionTreeClassifier for
            reproducible mining (ignored if the caller passes random_state to fit)
        :param theme_color: hex color (no "#") used for report styling
        """
        self.seed = seed
        self.nan = nan
        self.target = target
        # NOTE: None sentinels avoid the shared-mutable-default pitfall.
        self.labels = labels if labels is not None else ["positive", "negative"]
        self.theme_color = theme_color
        self.feature_map = feature_map if feature_map is not None else {}
        self.decision_trees = []
        self.max_iter = max_iter
        # keep original truthiness semantics: any falsy combiner -> new Combiner
        self.combiner = combiner if combiner else Combiner()
        self.target_enc = None          # fitted TargetEncoder for categorical columns
        self.feature_names = None       # feature order seen at fit() time
        self.dt_rules = pd.DataFrame()  # accumulated mined rules across iterations
        self.end_row = 2                # excel cursor: next free row
        self.start_col = 2              # excel cursor: left margin column
        self.describe_columns = ["组合策略", "命中数", "命中率", "好样本数", "好样本占比", "坏样本数", "坏样本占比", "坏率", "样本整体坏率", "LIFT值"]

        self.init_setting()

        self.writer = writer if writer is not None else ExcelWriter(theme_color=self.theme_color)

    @staticmethod
    def init_setting(font_path=None):
        """Configure matplotlib style and a CJK-capable font.

        Downloads the bundled font via wget when it is missing locally.
        """
        # style name changed across matplotlib versions; pick whichever exists
        if "seaborn-ticks" in plt.style.available:
            plt.style.use('seaborn-ticks')
        else:
            plt.style.use('seaborn-v0_8-ticks')

        font_path = font_path or os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matplot_chinese.ttf')
        if not os.path.isfile(font_path):
            import wget
            font_path = wget.download("https://itlubber.art/upload/matplot_chinese.ttf", os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matplot_chinese.ttf'))

        font_manager.fontManager.addfont(font_path)
        plt.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name()
        plt.rcParams['axes.unicode_minus'] = False

    def encode_cat_features(self, X, y):
        """Target-encode categorical columns of X (fit on first call, reuse after).

        Encoded columns replace the originals under the same names; the
        per-category mapping is stored on ``self.target_enc.target_mapping``.
        """
        cat_features = list(set(X.select_dtypes(include=[object, pd.CategoricalDtype]).columns))
        cat_features_index = [i for i, f in enumerate(X.columns) if f in cat_features]

        if len(cat_features) > 0:
            if self.target_enc is None:
                # first call: fit encoder and remember category -> encoded value
                self.target_enc = ce.TargetEncoder(cols=cat_features)
                self.target_enc.fit(X[cat_features], y)
                self.target_enc.target_mapping = {}
                X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target'))
                for col in cat_features:
                    mapping = X_TE[[col, f"{col}_target"]].drop_duplicates()
                    self.target_enc.target_mapping[col] = dict(zip(mapping[col], mapping[f"{col}_target"]))
            else:
                X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target'))

            X_TE = X_TE.drop(columns=cat_features)
            return X_TE.rename(columns={f"{c}_target": c for c in cat_features})
        else:
            return X

    def get_dt_rules(self, tree, feature_names, total_bad_rate, total_count):
        """Walk a fitted sklearn tree and emit one statistics row per leaf.

        :param tree: fitted DecisionTreeClassifier
        :param feature_names: column names matching the training matrix
        :param total_bad_rate: overall bad rate of the training sample
        :param total_count: overall sample count
        :return: DataFrame of leaf rules sorted ascending by LIFT值
        """
        tree_ = tree.tree_
        left = tree.tree_.children_left
        right = tree.tree_.children_right
        # sklearn uses feature index -2 (TREE_UNDEFINED) for leaves
        feature_name = [feature_names[i] if i != -2 else "undefined!" for i in tree_.feature]
        rules = dict()
        leaves = []  # closure-local accumulator (replaces the former global res_df)

        def recurse(node, depth, parent):  # build the rule string for every node
            if tree_.feature[node] != -2:  # internal node: extend rule and descend
                name = feature_name[node]
                thd = np.round(tree_.threshold[node], 3)
                # left child: feature <= threshold (trailing space kept for
                # backward-compatible rule strings)
                s = "{} <= {} ".format(name, thd)
                if node == 0:
                    rules[node] = s
                else:
                    rules[node] = rules[parent] + ' & ' + s
                recurse(left[node], depth + 1, node)
                # right child: feature > threshold
                s = "{} > {}".format(name, thd)
                if node == 0:
                    rules[node] = s
                else:
                    rules[node] = rules[parent] + ' & ' + s
                recurse(right[node], depth + 1, node)
            else:
                # leaf: tree_.value[node][0] holds [good_count, bad_count]
                df = pd.DataFrame({'组合策略': [rules[parent]]})
                df['好样本数'] = int(tree_.value[node][0][0])
                df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate))
                df['坏样本数'] = int(tree_.value[node][0][1])
                df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate)
                df['命中数'] = df['好样本数'] + df['坏样本数']
                df['命中率'] = df['命中数'] / total_count
                df['坏率'] = df['坏样本数'] / df['命中数']
                df['样本整体坏率'] = total_bad_rate
                df['LIFT值'] = df['坏率'] / df['样本整体坏率']
                leaves.append(df)

        recurse(0, 1, 0)

        res_df = pd.concat(leaves, axis=0)
        return res_df.sort_values("LIFT值", ascending=True)[self.describe_columns].reset_index(drop=True)

    def select_dt_rules(self, decision_tree, x, y, lift=0., max_samples=1., labels=None, save=None, verbose=False, drop=False):
        """Filter leaf rules of one tree by lift / hit-rate and optionally render it.

        :param decision_tree: fitted DecisionTreeClassifier
        :param x: training features, y: training labels
        :param lift: keep rules with LIFT值 >= lift
        :param max_samples: keep rules with 命中率 <= max_samples
        :param labels: class display names for the visualization
        :param save: PNG path for the rendered tree (only when rules survive)
        :param verbose: display rules + tree inline (jupyter only)
        :param drop: when True also return the feature to drop next iteration
        :return: (rules, total_rules) or (rules, feature_to_remove, total_rules)
        """
        labels = labels if labels is not None else ["positive", "negative"]
        rules = self.get_dt_rules(decision_tree, x.columns, sum(y) / len(y), len(y))
        total_rules = len(rules)

        try:
            viz_model = dtreeviz.model(decision_tree,
                                       X_train=x,
                                       y_train=y,
                                       feature_names=x.columns,
                                       target_name=self.target,
                                       class_names=labels,
                                       )
        except AttributeError as err:
            # fix: raising a bare string is a TypeError in Python 3 —
            # wrap the original message in a proper exception instead
            raise RuntimeError("请检查 dtreeviz 版本") from err

        rules = rules.query(f"LIFT值 >= {lift} & 命中率 <= {max_samples}").reset_index(drop=True)

        if len(rules) > 0:
            font_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matplot_chinese.ttf')
            font_manager.fontManager.addfont(font_path)
            plt.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name()
            plt.rcParams['axes.unicode_minus'] = False

            decision_tree_viz = viz_model.view(
                scale=1.5,
                orientation='LR',
                colors={
                    "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]],
                    "arrow": "#2639E9",
                    'text_wedge': "#F76E6C",
                    "pie": "#2639E9",
                    "tile_alpha": 1,
                    "legend_edge": "#FFFFFF",
                },
                ticks_fontsize=10,
                label_fontsize=10,
                fontname=plt.rcParams['font.family'],
            )
            if verbose:
                from IPython.core.display_functions import display
                if self.feature_map is not None and len(self.feature_map) > 0:
                    display(rules.replace(self.feature_map, regex=True))
                else:
                    display(rules)
                display(decision_tree_viz)
            if save:
                if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
                    os.makedirs(os.path.dirname(save))

                try:
                    decision_tree_viz.save("combine_rules_cache.svg")
                except graphviz.backend.execute.ExecutableNotFound:
                    print("请确保您已安装 graphviz 程序并且正确配置了 PATH 路径。可参考: https://stackoverflow.com/questions/35064304/runtimeerror-make-sure-the-graphviz-executables-are-on-your-systems-path-aft")

                try:
                    import cairosvg
                    cairosvg.svg2png(url="combine_rules_cache.svg", write_to=save, dpi=240)
                except Exception:
                    # cairosvg unavailable/broken: best-effort fallback via svglib
                    from reportlab.graphics import renderPDF
                    from svglib.svglib import svg2rlg
                    drawing = svg2rlg("combine_rules_cache.svg")
                    renderPDF.drawToFile(drawing, save, dpi=240, fmt="PNG")

                if os.path.isfile("combine_rules_cache.svg"):
                    os.remove("combine_rules_cache.svg")

                if os.path.isfile("combine_rules_cache"):
                    os.remove("combine_rules_cache")

        if drop:
            importances = list(decision_tree.feature_importances_)
            if len(rules) > 0:
                # rules found: drop the most important feature to force variety
                remove = decision_tree.feature_names_in_[importances.index(max(importances))]
            else:
                # nothing survived the filter: drop the least useful feature
                remove = decision_tree.feature_names_in_[importances.index(min(importances))]
            return rules, remove, total_rules
        else:
            return rules, total_rules

    def query_dt_rules(self, x, y, parsed_rules=None):
        """Re-evaluate previously mined rules on a new dataset (x, y).

        :param parsed_rules: iterable of rule strings, or a DataFrame with a
            组合策略 column (its unique values are used)
        :return: DataFrame with the standard describe_columns statistics
        """
        total_count = len(y)
        total_bad_rate = y.sum() / len(y)

        rules = pd.DataFrame()

        if isinstance(parsed_rules, pd.DataFrame):
            parsed_rules = parsed_rules["组合策略"].unique()

        for rule in parsed_rules:
            select_index = x.query(rule).index
            if len(select_index) > 0:
                y_select = y[select_index]
                df = pd.Series()
                df['组合策略'] = rule
                df['好样本数'] = len(y_select) - y_select.sum()
                df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate))
                df['坏样本数'] = y_select.sum()
                df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate)
                df['命中数'] = df['好样本数'] + df['坏样本数']
                df['命中率'] = df['命中数'] / total_count
                df['坏率'] = df['坏样本数'] / df['命中数']
                df['样本整体坏率'] = total_bad_rate
                df['LIFT值'] = df['坏率'] / df['样本整体坏率']
            else:
                # rule hits nothing in this dataset: emit an all-zero row
                df = pd.Series({'组合策略': rule, '好样本数': 0, '好样本占比': 0., '坏样本数': 0, '坏样本占比': 0., '命中数': 0, '命中率': 0., '坏率': 0., '样本整体坏率': total_bad_rate, 'LIFT值': 0., })

            rules = pd.concat([rules, pd.DataFrame(df).T]).reset_index(drop=True)

        # guard: selecting describe_columns on a column-less empty frame raises
        if len(rules) == 0:
            return pd.DataFrame(columns=self.describe_columns)

        return rules[self.describe_columns]

    def insert_dt_rules(self, parsed_rules, end_row, start_col, save=None, sheet=None, figsize=(500, 350)):
        """Write a rule table (and optional tree image) into the Excel report.

        :return: (end_row, end_col) cursor after the inserted content
        """
        if isinstance(sheet, Worksheet):
            worksheet = sheet
        else:
            worksheet = self.writer.get_sheet_by_name(sheet or "决策树组合策略挖掘")

        end_row, end_col = dataframe2excel(parsed_rules, self.writer, sheet_name=worksheet, start_row=end_row + 1, start_col=start_col, percent_cols=['好样本占比', '坏样本占比', '命中率', '坏率', '样本整体坏率', 'LIFT值'], condition_cols=["坏率", "LIFT值"])

        if save is not None:
            end_row, end_col = self.writer.insert_pic2sheet(worksheet, save, (end_row + 1, start_col), figsize=figsize)

        return end_row, end_col

    def fit(self, x, y=None, max_depth=2, lift=0., max_samples=1., min_score=None, verbose=False, *args, **kwargs):
        """Mine combination strategies by iterated decision-tree fitting.

        :param x: dataset including the label column
        :param max_depth: max tree depth, i.e. max features per combination, default 2
        :param lift: minimal lift of a kept rule, default 0. (keep all)
        :param max_samples: maximal hit-rate of a kept rule, default 1.0 (keep all)
        :param min_score: minimal training AUC/accuracy; tree generation stops
            when a tree scores below it
        :param verbose: debug display, effective in jupyter only
        :param kwargs: extra DecisionTreeClassifier parameters
        :return: self
        """
        worksheet = self.writer.get_sheet_by_name("策略详情")

        y = x[self.target]
        X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y)
        X_TE = X_TE.fillna(self.nan)

        self.feature_names = list(X_TE.columns)

        # fix: self.seed was previously stored but never used; honor it unless
        # the caller explicitly supplies random_state
        if self.seed is not None:
            kwargs.setdefault("random_state", self.seed)

        for i in range(self.max_iter):
            decision_tree = DecisionTreeClassifier(max_depth=max_depth, *args, **kwargs)
            decision_tree = decision_tree.fit(X_TE, y)

            if (min_score is not None and decision_tree.score(X_TE, y) < min_score) or len(X_TE.columns) < max_depth:
                break

            try:
                parsed_rules, remove, total_rules = self.select_dt_rules(decision_tree, X_TE, y, lift=lift, max_samples=max_samples, labels=self.labels, verbose=verbose, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png", drop=True)

                if len(parsed_rules) > 0:
                    self.dt_rules = pd.concat([self.dt_rules, parsed_rules]).reset_index(drop=True)

                    if self.writer is not None:
                        if self.feature_map is not None and len(self.feature_map) > 0:
                            parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True)
                        self.end_row, _ = self.insert_dt_rules(parsed_rules, self.end_row, self.start_col, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png", figsize=(500, 100 * total_rules), sheet=worksheet)

                # drop the flagged feature so the next tree explores new splits
                X_TE = X_TE.drop(columns=remove)
                self.decision_trees.append(decision_tree)
            except Exception:
                # best-effort per-iteration: log and continue mining (was a bare except)
                import traceback
                traceback.print_exc()

        if len(self.dt_rules) <= 0:
            print(f"未挖掘到有效策略, 可以考虑适当调整预设的筛选参数, 降低 lift / 提高 max_samples, 当前筛选标准为: 提取 lift >= {lift} 且 max_samples <= {max_samples} 的策略")

        return self

    def transform(self, x, y=None):
        """Apply mined rules to a new labeled dataset and return their statistics."""
        y = x[self.target]
        X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y)
        X_TE = X_TE.fillna(self.nan)
        if self.dt_rules is not None and len(self.dt_rules) > 0:
            parsed_rules = self.query_dt_rules(X_TE, y, parsed_rules=self.dt_rules)
            if self.feature_map is not None and len(self.feature_map) > 0:
                parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True)
            return parsed_rules
        else:
            return pd.DataFrame(columns=self.describe_columns)

    def insert_all_rules(self, val=None, test=None, sheet="策略汇总"):
        """Insert mined rules for train (and optionally val/test) into the report.

        :param val: validation dataset
        :param test: test dataset
        :return: tuple of per-dataset rule-statistics DataFrames
        """
        worksheet = self.writer.get_sheet_by_name(sheet or "决策树组合策略挖掘")

        if sheet:
            self.writer.workbook.move_sheet(sheet, -1)

        parsed_rules_train = self.dt_rules.copy()
        if self.feature_map is not None and len(self.feature_map) > 0:
            parsed_rules_train["组合策略"] = parsed_rules_train["组合策略"].replace(self.feature_map, regex=True)
        self.end_row, _ = self.writer.insert_value2sheet(worksheet, (2 if sheet else self.end_row + 2, self.start_col), value="训练集决策树组合策略", style="header_middle")
        self.end_row, _ = self.insert_dt_rules(parsed_rules_train, self.end_row, self.start_col, sheet=worksheet)
        outputs = (parsed_rules_train,)

        if len(parsed_rules_train) > 0:
            if val is not None:
                parsed_rules_val = self.transform(val)
                self.end_row, _ = self.writer.insert_value2sheet(worksheet, (self.end_row + 2, self.start_col), value="验证集决策树组合策略", style="header_middle")
                self.end_row, _ = self.insert_dt_rules(parsed_rules_val, self.end_row, self.start_col, sheet=worksheet)
                outputs = outputs + (parsed_rules_val,)

            if test is not None:
                parsed_rules_test = self.transform(test)
                self.end_row, _ = self.writer.insert_value2sheet(worksheet, (self.end_row + 2, self.start_col), value="测试集决策树组合策略", style="header_middle")
                self.end_row, _ = self.insert_dt_rules(parsed_rules_test, self.end_row, self.start_col, sheet=worksheet)
                outputs = outputs + (parsed_rules_test,)
        else:
            # no rules mined: echo the (empty) train frame for each requested dataset
            if val is not None:
                outputs = outputs + (parsed_rules_train,)

            if test is not None:
                outputs = outputs + (parsed_rules_train,)

        return outputs

    def query_feature_rule(self, data, feature, desc="", plot=False, figsize=(10, 6), save=None, *args, **kwargs):
        """Single-feature simple strategy statistics (feature binning table).

        :param data: dataset
        :param feature: feature name
        :param desc: human-readable description of the feature
        :param plot: whether to draw the feature binning plot
        :param figsize: figure size
        :param save: path to save the figure
        :return: pd.DataFrame with the feature's bin statistics
        """
        feature_table = feature_bin_stats(data, feature, desc=desc, *args, **kwargs)

        if plot:
            self.bin_plot(feature_table, desc=desc, figsize=figsize, save=save)

        return feature_table

    @staticmethod
    def bin_plot(*args, **kwargs):
        """Thin passthrough to scorecardpipeline.bin_plot."""
        return bin_plot(*args, **kwargs)

    def save(self, output="model_report/决策树组合策略挖掘.xlsx"):
        """Persist the accumulated Excel report to *output*."""
        self.writer.save(output)
if __name__ == '__main__':
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    feature_map = {}
    n_samples = 10000
    ab = np.array(list('ABCDEFG'))

    # synthetic demo dataset: two numeric and two categorical features plus a random binary target
    data = pd.DataFrame({
        'A': np.random.randint(10, size=n_samples),
        'B': ab[np.random.choice(7, n_samples)],
        'C': ab[np.random.choice(2, n_samples)],
        'D': np.random.random(size=n_samples),
        'target': np.random.randint(2, size=n_samples)
    })

    # BUG FIX: ``shuffle`` expects a bool; passing the label series merely evaluated
    # truthy (shuffle=True). The intent was a label-stratified split.
    train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data["target"])

    pdtr = ParseDecisionTreeRules(target="target", feature_map=feature_map, max_iter=8)
    # ``max_features="auto"`` was removed for classifiers in scikit-learn 1.3;
    # "sqrt" is its exact equivalent and works on every supported version.
    pdtr.fit(train, lift=1., max_depth=2, max_samples=0.5, verbose=False, min_samples_split=8, min_samples_leaf=5, max_features="sqrt")
    pdtr.insert_all_rules(test=test)
    pdtr.save()


# ---------------------------------------------------------------------------
# setup.py
# ---------------------------------------------------------------------------
import os
import re
from setuptools import setup, find_packages, Extension


NAME = 'pdtr'


def get_version():
    """Extract ``__version__`` from the package's ``__init__.py``."""
    with open(f"{NAME}/__init__.py", "r", encoding="utf8") as f:
        return re.search(r'__version__ = "(.*?)"', f.read()).group(1)


def get_requirements(stage=None):
    """Collect install requirements from ``requirements[-<stage>].txt``.

    Blank lines and pip option lines (starting with ``-``) are skipped.

    :param stage: optional suffix selecting a stage-specific requirements file
    :return: list of requirement specifier strings
    """
    file_name = 'requirements'

    if stage is not None:
        file_name = f"{file_name}-{stage}"

    requirements = []
    # pin the encoding so a non-UTF-8 Windows locale cannot break parsing
    with open(f"{file_name}.txt", 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('-'):
                continue

            requirements.append(line)

    return requirements


# read the long description via a context manager so the handle is closed
# deterministically (the original bare ``open(...).read()`` leaked it until GC)
with open('README.md', encoding='utf-8') as readme:
    long_description = readme.read()


setup(
    name=NAME,
    version=get_version(),
    description='自动决策树规则挖掘工具包',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/itlubber/pdtr',
    author='itlubber',
    author_email='itlubber@qq.com',
    packages=find_packages(),
    include_package_data=True,
    python_requires='>=3.6',
    install_requires=get_requirements(),
    license='MIT',
    classifiers=[
        'Operating System :: POSIX',
        'Operating System :: Microsoft :: Windows',
        'Operating System :: MacOS :: MacOS X',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
    ],
)