├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── examples
├── model_report
│ ├── auto_mining_rules
│ │ ├── combiner_rules_0.png
│ │ ├── combiner_rules_1.png
│ │ └── combiner_rules_2.png
│ ├── bin_plots
│ │ ├── bin_vars_A.png
│ │ ├── bin_vars_B.png
│ │ ├── bin_vars_C.png
│ │ ├── bin_vars_D.png
│ │ └── bin_vars_时间.png
│ └── 决策树组合策略挖掘.xlsx
└── pdtr_samplts.ipynb
├── pdtr
├── __init__.py
├── matplot_chinese.ttf
└── transforme.py
├── requirements.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | .idea/
161 |
162 | # itlubber
163 | *.DS_Store
164 | *.pkl
165 | *.virtual_documents
166 | test.*
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 itlubber
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include pdtr/*.ttf pdtr/*.xlsx
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 自动决策树规则挖掘工具包
2 |
3 | 在笔者金融风控的日常工作中,很多时候需要根据数据集内的诸多特征(有很多其他称呼,比如因子、变量、自变量、解释变量等)来挖掘一些有用的规则和组合策略,在保证通过率的基础上尽可能多的拒绝坏客户。面对成千上万的特征,如何从数据集中找到有效的规则和组合策略,一直以来都是金融风控搬砖工的日常工作。 `pdtr` 旨在帮助读者快速从高维数据中提取出有效的规则和组合策略。
4 |
5 | > 仓库地址:https://github.com/itlubber/pdtr
6 | >
7 | > 博文地址:https://itlubber.art/archives/auto-strategy-mining
8 | >
9 | > 微信公共号推文:https://mp.weixin.qq.com/s/8s785MfmVznNgQyy38YnWw
10 | >
11 | > pypi包:https://pypi.org/project/pdtr/
12 |
13 | ## 交流
14 |
15 | | 微信 | 微信公众号 |
16 | | :---: | :----: |
17 | |
|
|
18 | | itlubber | itlubber_art |
19 |
20 |
21 | ## 背景简介
22 |
23 | 金融场景风险大致可以概括为三种:系统性风险、欺诈风险(无还款意愿)、信用风险(无还款能力),而作为一名风控搬砖工,日常工作中有大量的数据挖掘工作,如何从高维数据集中挖掘出行之有效的规则、策略及模型来防范欺诈风险和信用风险是每个搬砖工的基操。本仓库由笔者基于网上开源的一系列相关知识,结合实际工作中遇到的实际需求,整理得到。旨在为诸位仁兄提供一个便捷、高效、赏心悦目的决策树组合策略挖掘报告,及一系列能够实际运用到风险控制上的策略。
24 |
25 | ## 项目结构
26 |
27 | ```bash
28 | pdtr
29 | .
30 | | README.md # 说明文档
31 | | setup.py # 打包发布文件
32 | | LICENSE # 开源协议
33 | | requirements.txt # 项目依赖包
34 | +---examples # 演示样例
35 | | | combine_rules_cache # 缓存文件
36 | | | combine_rules_cache.svg # 缓存文件
37 | | | pdtr_samplts.ipynb # 演示样例程序
38 | | \---model_report # 模型报告输出文件夹
39 | | | 决策树组合策略挖掘.xlsx # 策略挖掘报告
40 | | +---auto_mining_rules # 组合策略可视化存储文件夹
41 | | | combiner_rules_0.png # 决策树可视化图片
42 | | | ......
43 | | \---bin_plots # 简单策略可视化存储文件夹
44 | | bin_vars_A.png # 变量分箱可视化图片
45 | | ......
46 | \---pdtr # PDTR 源码包
47 | template.xlsx # excel 模版文件
48 | excel_writer.py # excel写入公共方法
49 | matplot_chinese.ttf # matplotlib 中文字体
50 | transforme.py # 策略挖掘方法
51 | ```
52 |
53 | ## 环境准备
54 |
55 | ### 创建虚拟环境(可选)
56 |
57 | + 通过`conda`创建虚拟环境
58 |
59 | ```bash
60 | >> conda create -n score python==3.8.13
61 |
62 | Collecting package metadata (current_repodata.json): done
63 | Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
64 | Collecting package metadata (repodata.json): done
65 | Solving environment: done
66 |
67 |
68 | ==> WARNING: A newer version of conda exists. <==
69 | current version: 4.10.3
70 | latest version: 23.3.1
71 |
72 | Please update conda by running
73 |
74 | $ conda update -n base -c defaults conda
75 |
76 |
77 |
78 | ## Package Plan ##
79 |
80 | environment location: /Users/lubberit/anaconda3/envs/score
81 |
82 | added / updated specs:
83 | - python==3.8.13
84 |
85 |
86 | The following packages will be downloaded:
87 |
88 | package | build
89 | ---------------------------|-----------------
90 | ca-certificates-2023.01.10 | hecd8cb5_0 121 KB
91 | ncurses-6.4 | hcec6c5f_0 1018 KB
92 | openssl-1.1.1t | hca72f7f_0 3.3 MB
93 | pip-23.0.1 | py38hecd8cb5_0 2.5 MB
94 | python-3.8.13 | hdfd78df_1 10.8 MB
95 | setuptools-66.0.0 | py38hecd8cb5_0 1.2 MB
96 | sqlite-3.41.2 | h6c40b1e_0 1.2 MB
97 | wheel-0.38.4 | py38hecd8cb5_0 65 KB
98 | xz-5.4.2 | h6c40b1e_0 372 KB
99 | ------------------------------------------------------------
100 | Total: 20.5 MB
101 |
102 | The following NEW packages will be INSTALLED:
103 |
104 | ca-certificates pkgs/main/osx-64::ca-certificates-2023.01.10-hecd8cb5_0
105 | libcxx pkgs/main/osx-64::libcxx-14.0.6-h9765a3e_0
106 | libffi pkgs/main/osx-64::libffi-3.3-hb1e8313_2
107 | ncurses pkgs/main/osx-64::ncurses-6.4-hcec6c5f_0
108 | openssl pkgs/main/osx-64::openssl-1.1.1t-hca72f7f_0
109 | pip pkgs/main/osx-64::pip-23.0.1-py38hecd8cb5_0
110 | python pkgs/main/osx-64::python-3.8.13-hdfd78df_1
111 | readline pkgs/main/osx-64::readline-8.2-hca72f7f_0
112 | setuptools pkgs/main/osx-64::setuptools-66.0.0-py38hecd8cb5_0
113 | sqlite pkgs/main/osx-64::sqlite-3.41.2-h6c40b1e_0
114 | tk pkgs/main/osx-64::tk-8.6.12-h5d9f67b_0
115 | wheel pkgs/main/osx-64::wheel-0.38.4-py38hecd8cb5_0
116 | xz pkgs/main/osx-64::xz-5.4.2-h6c40b1e_0
117 | zlib pkgs/main/osx-64::zlib-1.2.13-h4dc903c_0
118 |
119 |
120 | Proceed ([y]/n)? y
121 |
122 |
123 | Downloading and Extracting Packages
124 | sqlite-3.41.2 | 1.2 MB | ################################################################################################### | 100%
125 | wheel-0.38.4 | 65 KB | ################################################################################################### | 100%
126 | openssl-1.1.1t | 3.3 MB | ################################################################################################### | 100%
127 | python-3.8.13 | 10.8 MB | ################################################################################################### | 100%
128 | setuptools-66.0.0 | 1.2 MB | ################################################################################################### | 100%
129 | ncurses-6.4 | 1018 KB | ################################################################################################### | 100%
130 | xz-5.4.2 | 372 KB | ################################################################################################### | 100%
131 | ca-certificates-2023 | 121 KB | ################################################################################################### | 100%
132 | pip-23.0.1 | 2.5 MB | ################################################################################################### | 100%
133 | Preparing transaction: done
134 | Verifying transaction: done
135 | Executing transaction: done
136 | #
137 | # To activate this environment, use
138 | #
139 | # $ conda activate score
140 | #
141 | # To deactivate an active environment, use
142 | #
143 | # $ conda deactivate
144 | ```
145 |
146 | + 通过`pyenv`创建虚拟环境
147 |
148 | ```bash
149 | # 安装环境
150 | >> pyenv install -v 3.8.13
151 | # 启动环境
152 | >> pyenv local 3.8.13
153 | # 卸载环境
154 | >> pyenv uninstall 3.8.13
155 | ```
156 |
157 |
158 | ### 安装项目依赖
159 |
160 | ```bash
161 | >> pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
162 |
163 | Looking in indexes: http://mirrors.aliyun.com/pypi/simple/
164 | ......
165 | Installing collected packages: webencodings, six, pytz, colour, zipp, tomli, tinycss2, threadpoolctl, python-dateutil, pyparsing, pycparser, pluggy, pillow, packaging, numpy, kiwisolver, joblib, iniconfig, graphviz, fonttools, exceptiongroup, et-xmlfile, defusedxml, cycler, scipy, pytest, patsy, pandas, openpyxl, importlib-resources, cssselect2, contourpy, cffi, statsmodels, scikit-learn, matplotlib, cairocffi, dtreeviz, category-encoders, CairoSVG
166 | Successfully installed CairoSVG-2.7.0 cairocffi-1.5.1 category-encoders-2.6.0 cffi-1.15.1 colour-0.1.5 contourpy-1.0.7 cssselect2-0.7.0 cycler-0.11.0 defusedxml-0.7.1 dtreeviz-2.2.1 et-xmlfile-1.1.0 exceptiongroup-1.1.1 fonttools-4.39.4 graphviz-0.20.1 importlib-resources-5.12.0 iniconfig-2.0.0 joblib-1.2.0 kiwisolver-1.4.4 matplotlib-3.7.1 numpy-1.22.2 openpyxl-3.0.7 packaging-23.1 pandas-1.5.3 patsy-0.5.3 pillow-9.5.0 pluggy-1.0.0 pycparser-2.21 pyparsing-3.0.9 pytest-7.3.1 python-dateutil-2.8.2 pytz-2023.3 scikit-learn-1.2.2 scipy-1.10.1 six-1.11.0 statsmodels-0.14.0 threadpoolctl-3.1.0 tinycss2-1.2.1 tomli-2.0.1 webencodings-0.5.1 zipp-3.15.0
167 | ```
168 |
169 |
170 | ### `PDTR` 安装
171 |
172 | ```bash
173 | pip install pdtr
174 | ```
175 |
176 | ### 版本介绍
177 |
178 | + `0.1.0`
179 |
180 | 仅包含决策树策略挖掘相关工具
181 |
182 | + `0.1.1`
183 |
184 | 除版本 `0.1.0` 中的决策树挖掘相关工具以外,新增了基于 `toad` 和 `optbinning` 的单变量策略挖掘相关方法
185 |
186 | + `0.1.2`
187 |
188 | 在 `0.1.1` 的基础上增加了部分方法的文档注释
189 |
190 |
191 | ### 运行样例
192 |
193 | + 导入相关依赖
194 |
195 | ```python
196 | import os
197 | import numpy as np
198 | import pandas as pd
199 | from sklearn.model_selection import train_test_split
200 |
201 | try:
202 | from pdtr import ParseDecisionTreeRules
203 | except ModuleNotFoundError:
204 | import sys
205 |
206 | sys.path.append("../")
207 | from pdtr import ParseDecisionTreeRules
208 |
209 | np.random.seed(1)
210 | ```
211 |
212 | + 数据集加载
213 |
214 | ```python
215 | feature_map = {}
216 | n_samples = 10000
217 | ab = np.array(list('ABCDEFG'))
218 |
219 | data = pd.DataFrame({
220 | 'A': np.random.randint(10, size = n_samples),
221 | 'B': ab[np.random.choice(7, n_samples)],
222 | 'C': ab[np.random.choice(2, n_samples)],
223 | 'D': np.random.random(size = n_samples),
224 | 'target': np.random.randint(2, size = n_samples)
225 | })
226 | ```
227 |
228 | + 数据集拆分
229 |
230 | ```python
231 | train, test = train_test_split(data, test_size=0.3, shuffle=data["target"])
232 | ```
233 |
234 | + 决策树自动规则挖掘
235 |
236 | ```python
237 | pdtr_instance = ParseDecisionTreeRules(target="target", max_iter=8, output="model_report/决策树组合策略挖掘.xlsx")
238 | pdtr_instance.fit(train, lift=0., max_depth=2, max_samples=1., verbose=False, max_features="auto")
239 | ```
240 |
241 | + 规则验证
242 |
243 | ```python
244 | all_rules = pdtr_instance.insert_all_rules(test=test)
245 | ```
246 |
247 | + 导出策略挖掘报告
248 |
249 | ```python
250 | pdtr_instance.save()
251 | ```
252 |
253 | + 挖掘报告
254 |
255 | [`examples/决策树组合策略挖掘.xlsx`](https://github.com/itlubber/pdtr/blob/main/examples/model_report/%E5%86%B3%E7%AD%96%E6%A0%91%E7%BB%84%E5%90%88%E7%AD%96%E7%95%A5%E6%8C%96%E6%8E%98.xlsx)
256 |
257 |
258 | ## 参考
259 |
260 | > https://github.com/itlubber/LogisticRegressionPipeline
261 | >
262 | > https://github.com/itlubber/itlubber-excel-writer
263 |
--------------------------------------------------------------------------------
/examples/model_report/auto_mining_rules/combiner_rules_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/auto_mining_rules/combiner_rules_0.png
--------------------------------------------------------------------------------
/examples/model_report/auto_mining_rules/combiner_rules_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/auto_mining_rules/combiner_rules_1.png
--------------------------------------------------------------------------------
/examples/model_report/auto_mining_rules/combiner_rules_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/auto_mining_rules/combiner_rules_2.png
--------------------------------------------------------------------------------
/examples/model_report/bin_plots/bin_vars_A.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/bin_plots/bin_vars_A.png
--------------------------------------------------------------------------------
/examples/model_report/bin_plots/bin_vars_B.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/bin_plots/bin_vars_B.png
--------------------------------------------------------------------------------
/examples/model_report/bin_plots/bin_vars_C.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/bin_plots/bin_vars_C.png
--------------------------------------------------------------------------------
/examples/model_report/bin_plots/bin_vars_D.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/bin_plots/bin_vars_D.png
--------------------------------------------------------------------------------
/examples/model_report/bin_plots/bin_vars_时间.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/bin_plots/bin_vars_时间.png
--------------------------------------------------------------------------------
/examples/model_report/决策树组合策略挖掘.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/examples/model_report/决策树组合策略挖掘.xlsx
--------------------------------------------------------------------------------
/pdtr/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""Public entry point of the ``pdtr`` package.

Exposes :class:`ParseDecisionTreeRules`, the decision-tree based rule
mining toolkit implemented in :mod:`pdtr.transforme`.

@Time : 2023/5/15 17:55
@Author : itlubber
@Site : itlubber.art
"""

from .transforme import ParseDecisionTreeRules


# Package version; bump on release.
__version__ = "0.1.5"
# Explicit public API of the package.
__all__ = ["ParseDecisionTreeRules"]
13 |
--------------------------------------------------------------------------------
/pdtr/matplot_chinese.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/pdtr/9d60f6cba7fc17473b1c26e199a24e2b5cff1d6b/pdtr/matplot_chinese.ttf
--------------------------------------------------------------------------------
/pdtr/transforme.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2023/5/15 17:55
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 |
8 | import warnings
9 | import os
10 | import re
11 | import graphviz
12 | import dtreeviz
13 | import numpy as np
14 | import pandas as pd
15 | import matplotlib.pyplot as plt
16 | from matplotlib import font_manager
17 | from openpyxl.worksheet.worksheet import Worksheet
18 |
19 | import category_encoders as ce
20 | from optbinning import OptimalBinning
21 | from sklearn.tree import DecisionTreeClassifier
22 | from scorecardpipeline import ExcelWriter, Combiner, feature_bin_stats, bin_plot, dataframe2excel
23 |
24 |
25 | class ParseDecisionTreeRules:
26 |
27 | def __init__(self, target="target", labels=["positive", "negative"], feature_map={}, nan=-1., max_iter=128, writer=None, combiner=None, seed=None, theme_color="2639E9"):
28 | """决策树自动规则挖掘工具包
29 |
30 | :param target: 数据集中好坏样本标签列名称,默认 target
31 | :param labels: 好坏样本标签名称,传入一个长度为2的列表,第0个元素为好样本标签,第1个元素为坏样本标签,默认 ["positive", "negative"]
32 | :param feature_map: 变量名称及其含义,在后续输出报告和策略信息时增加可读性,默认 {}
33 | :param nan: 在决策树策略挖掘时,默认空值填充的值,默认 -1
34 | :param max_iter: 最多支持在数据集上训练多少颗树模型,每次生成一棵树后,会剔除特征重要性最高的特征后,再生成树,默认 128
35 | :param output: excel 挖掘报告保存的路径, 默认 model_report/决策树组合策略挖掘.xlsx
36 | :param writer: 在之前程序运行时生成的 ExcelWriter,可以支持传入一个已有的writer,后续所有内容将保存至该workbook中,默认 None
37 | :param combiner: 可以传入提前训练好的 combiner,支持 toad.transform.Combiner 和 笔者重写的 Combiner
38 | """
39 | self.seed = seed
40 | self.nan = nan
41 | self.target = target
42 | self.labels = labels
43 | self.theme_color = theme_color
44 | self.feature_map = feature_map
45 | self.decision_trees = []
46 | self.max_iter = max_iter
47 | if combiner:
48 | self.combiner = combiner
49 | else:
50 | self.combiner = Combiner()
51 | self.target_enc = None
52 | self.feature_names = None
53 | self.dt_rules = pd.DataFrame()
54 | self.end_row = 2
55 | self.start_col = 2
56 | self.describe_columns = ["组合策略", "命中数", "命中率", "好样本数", "好样本占比", "坏样本数", "坏样本占比", "坏率", "样本整体坏率", "LIFT值"]
57 |
58 | self.init_setting()
59 |
60 | if writer:
61 | self.writer = writer
62 | else:
63 | self.writer = ExcelWriter(theme_color=self.theme_color)
64 |
65 | @staticmethod
66 | def init_setting(font_path=None):
67 | if "seaborn-ticks" in plt.style.available:
68 | plt.style.use('seaborn-ticks')
69 | else:
70 | plt.style.use('seaborn-v0_8-ticks')
71 |
72 | font_path = font_path or os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matplot_chinese.ttf')
73 | if not os.path.isfile(font_path):
74 | import wget
75 | font_path = wget.download("https://itlubber.art/upload/matplot_chinese.ttf", os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matplot_chinese.ttf'))
76 |
77 | font_manager.fontManager.addfont(font_path)
78 | plt.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name()
79 | plt.rcParams['axes.unicode_minus'] = False
80 |
81 | def encode_cat_features(self, X, y):
82 | cat_features = list(set(X.select_dtypes(include=[object, pd.CategoricalDtype]).columns))
83 | cat_features_index = [i for i, f in enumerate(X.columns) if f in cat_features]
84 |
85 | if len(cat_features) > 0:
86 | if self.target_enc is None:
87 | self.target_enc = ce.TargetEncoder(cols=cat_features)
88 | self.target_enc.fit(X[cat_features], y)
89 | self.target_enc.target_mapping = {}
90 | X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target'))
91 | for col in cat_features:
92 | mapping = X_TE[[col, f"{col}_target"]].drop_duplicates()
93 | self.target_enc.target_mapping[col] = dict(zip(mapping[col], mapping[f"{col}_target"]))
94 | else:
95 | X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target'))
96 |
97 | X_TE = X_TE.drop(columns=cat_features)
98 | return X_TE.rename(columns={f"{c}_target": c for c in cat_features})
99 | else:
100 | return X
101 |
102 | def get_dt_rules(self, tree, feature_names, total_bad_rate, total_count):
103 | tree_ = tree.tree_
104 | left = tree.tree_.children_left
105 | right = tree.tree_.children_right
106 | feature_name = [feature_names[i] if i != -2 else "undefined!" for i in tree_.feature]
107 | rules = dict()
108 |
109 | global res_df
110 | res_df = pd.DataFrame()
111 |
112 | def recurse(node, depth, parent): # 搜每个节点的规则
113 |
114 | if tree_.feature[node] != -2: # 非叶子节点,搜索每个节点的规则
115 | name = feature_name[node]
116 | thd = np.round(tree_.threshold[node], 3)
117 | s = "{} <= {} ".format(name, thd, node)
118 | # 左子
119 | if node == 0:
120 | rules[node] = s
121 | else:
122 | rules[node] = rules[parent] + ' & ' + s
123 | recurse(left[node], depth + 1, node)
124 | s = "{} > {}".format(name, thd)
125 | # 右子
126 | if node == 0:
127 | rules[node] = s
128 | else:
129 | rules[node] = rules[parent] + ' & ' + s
130 | recurse(right[node], depth + 1, node)
131 | else:
132 | df = pd.DataFrame()
133 | df['组合策略'] = rules[parent],
134 | df['好样本数'] = tree_.value[node][0][0].astype(int)
135 | df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate))
136 | df['坏样本数'] = tree_.value[node][0][1].astype(int)
137 | df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate)
138 | df['命中数'] = df['好样本数'] + df['坏样本数']
139 | df['命中率'] = df['命中数'] / total_count
140 | df['坏率'] = df['坏样本数'] / df['命中数']
141 | df['样本整体坏率'] = total_bad_rate
142 | df['LIFT值'] = df['坏率'] / df['样本整体坏率']
143 |
144 | global res_df
145 |
146 | res_df = pd.concat([res_df, df], axis=0)
147 |
148 | recurse(0, 1, 0)
149 |
150 | return res_df.sort_values("LIFT值", ascending=True)[self.describe_columns].reset_index(drop=True)
151 |
152 | def select_dt_rules(self, decision_tree, x, y, lift=0., max_samples=1., labels=["positive", "negative"], save=None, verbose=False, drop=False):
153 | rules = self.get_dt_rules(decision_tree, x.columns, sum(y) / len(y), len(y))
154 | total_rules = len(rules)
155 |
156 | try:
157 | viz_model = dtreeviz.model(decision_tree,
158 | X_train=x,
159 | y_train=y,
160 | feature_names=x.columns,
161 | target_name=self.target,
162 | class_names=labels,
163 | )
164 | except AttributeError:
165 | raise "请检查 dtreeviz 版本"
166 |
167 | rules = rules.query(f"LIFT值 >= {lift} & 命中率 <= {max_samples}").reset_index(drop=True)
168 |
169 | if len(rules) > 0:
170 | font_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'matplot_chinese.ttf')
171 | font_manager.fontManager.addfont(font_path)
172 | plt.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name()
173 | plt.rcParams['axes.unicode_minus'] = False
174 |
175 | decision_tree_viz = viz_model.view(
176 | scale=1.5,
177 | orientation='LR',
178 | colors={
179 | "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]],
180 | "arrow": "#2639E9",
181 | 'text_wedge': "#F76E6C",
182 | "pie": "#2639E9",
183 | "tile_alpha": 1,
184 | "legend_edge": "#FFFFFF",
185 | },
186 | ticks_fontsize=10,
187 | label_fontsize=10,
188 | fontname=plt.rcParams['font.family'],
189 | )
190 | if verbose:
191 | from IPython.core.display_functions import display
192 | if self.feature_map is not None and len(self.feature_map) > 0:
193 | display(rules.replace(self.feature_map, regex=True))
194 | else:
195 | display(rules)
196 | display(decision_tree_viz)
197 | if save:
198 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
199 | os.makedirs(os.path.dirname(save))
200 |
201 | try:
202 | decision_tree_viz.save("combine_rules_cache.svg")
203 | except graphviz.backend.execute.ExecutableNotFound:
204 | print("请确保您已安装 graphviz 程序并且正确配置了 PATH 路径。可参考: https://stackoverflow.com/questions/35064304/runtimeerror-make-sure-the-graphviz-executables-are-on-your-systems-path-aft")
205 |
206 | try:
207 | import cairosvg
208 | cairosvg.svg2png(url="combine_rules_cache.svg", write_to=save, dpi=240)
209 | except:
210 | from reportlab.graphics import renderPDF
211 | from svglib.svglib import svg2rlg
212 | drawing = svg2rlg("combine_rules_cache.svg")
213 | renderPDF.drawToFile(drawing, save, dpi=240, fmt="PNG")
214 |
215 | if os.path.isfile("combine_rules_cache.svg"):
216 | os.remove("combine_rules_cache.svg")
217 |
218 | if os.path.isfile("combine_rules_cache"):
219 | os.remove("combine_rules_cache")
220 |
221 | if drop:
222 | if len(rules) > 0:
223 | return rules, decision_tree.feature_names_in_[list(decision_tree.feature_importances_).index(max(decision_tree.feature_importances_))], total_rules
224 | else:
225 | return rules, decision_tree.feature_names_in_[list(decision_tree.feature_importances_).index(min(decision_tree.feature_importances_))], total_rules
226 | else:
227 | return rules, total_rules
228 |
229 | def query_dt_rules(self, x, y, parsed_rules=None):
230 | total_count = len(y)
231 | total_bad_rate = y.sum() / len(y)
232 |
233 | rules = pd.DataFrame()
234 |
235 | if isinstance(parsed_rules, pd.DataFrame):
236 | parsed_rules = parsed_rules["组合策略"].unique()
237 |
238 | for rule in parsed_rules:
239 | select_index = x.query(rule).index
240 | if len(select_index) > 0:
241 | y_select = y[select_index]
242 | df = pd.Series()
243 | df['组合策略'] = rule
244 | df['好样本数'] = len(y_select) - y_select.sum()
245 | df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate))
246 | df['坏样本数'] = y_select.sum()
247 | df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate)
248 | df['命中数'] = df['好样本数'] + df['坏样本数']
249 | df['命中率'] = df['命中数'] / total_count
250 | df['坏率'] = df['坏样本数'] / df['命中数']
251 | df['样本整体坏率'] = total_bad_rate
252 | df['LIFT值'] = df['坏率'] / df['样本整体坏率']
253 | else:
254 | df = pd.Series({'组合策略': rule, '好样本数': 0, '好样本占比': 0., '坏样本数': 0, '坏样本占比': 0., '命中数': 0, '命中率': 0., '坏率': 0., '样本整体坏率': total_bad_rate, 'LIFT值': 0., })
255 |
256 | rules = pd.concat([rules, pd.DataFrame(df).T]).reset_index(drop=True)
257 |
258 | return rules[self.describe_columns]
259 |
260 | def insert_dt_rules(self, parsed_rules, end_row, start_col, save=None, sheet=None, figsize=(500, 350)):
261 | if isinstance(sheet, Worksheet):
262 | worksheet = sheet
263 | else:
264 | worksheet = self.writer.get_sheet_by_name(sheet or "决策树组合策略挖掘")
265 |
266 | end_row, end_col = dataframe2excel(parsed_rules, self.writer, sheet_name=worksheet, start_row=end_row + 1, start_col=start_col, percent_cols=['好样本占比', '坏样本占比', '命中率', '坏率', '样本整体坏率', 'LIFT值'], condition_cols=["坏率", "LIFT值"])
267 |
268 | if save is not None:
269 | end_row, end_col = self.writer.insert_pic2sheet(worksheet, save, (end_row + 1, start_col), figsize=figsize)
270 |
271 | return end_row, end_col
272 |
273 | def fit(self, x, y=None, max_depth=2, lift=0., max_samples=1., min_score=None, verbose=False, *args, **kwargs):
274 | """组合策略挖掘
275 |
276 | :param x: 包含标签的数据集
277 | :param max_depth: 决策树最大深度,即最多组合的特征个数,默认 2
278 | :param lift: 组合策略最小的lift值,默认 0.,即全部组合策略
279 | :param max_samples: 每条组合策略的最大样本占比,默认 1.0,即全部组合策略
280 | :param min_score: 决策树拟合时最小的auc,如果不满足则停止后续生成决策树
281 | :param verbose: 是否调试模式,仅在 jupyter 环境有效
282 | :param kwargs: DecisionTreeClassifier 参数
283 | """
284 | worksheet = self.writer.get_sheet_by_name("策略详情")
285 |
286 | y = x[self.target]
287 | X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y)
288 | X_TE = X_TE.fillna(self.nan)
289 |
290 | self.feature_names = list(X_TE.columns)
291 |
292 | for i in range(self.max_iter):
293 | decision_tree = DecisionTreeClassifier(max_depth=max_depth, *args, **kwargs)
294 | decision_tree = decision_tree.fit(X_TE, y)
295 |
296 | if (min_score is not None and decision_tree.score(X_TE, y) < min_score) or len(X_TE.columns) < max_depth:
297 | break
298 |
299 | try:
300 | parsed_rules, remove, total_rules = self.select_dt_rules(decision_tree, X_TE, y, lift=lift, max_samples=max_samples, labels=self.labels, verbose=verbose, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png", drop=True)
301 |
302 | if len(parsed_rules) > 0:
303 | self.dt_rules = pd.concat([self.dt_rules, parsed_rules]).reset_index(drop=True)
304 |
305 | if self.writer is not None:
306 | if self.feature_map is not None and len(self.feature_map) > 0:
307 | parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True)
308 | self.end_row, _ = self.insert_dt_rules(parsed_rules, self.end_row, self.start_col, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png", figsize=(500, 100 * total_rules), sheet=worksheet)
309 |
310 | X_TE = X_TE.drop(columns=remove)
311 | self.decision_trees.append(decision_tree)
312 | except:
313 | import traceback
314 | traceback.print_exc()
315 |
316 | if len(self.dt_rules) <= 0:
317 | print(f"未挖掘到有效策略, 可以考虑适当调整预设的筛选参数, 降低 lift / 提高 max_samples, 当前筛选标准为: 提取 lift >= {lift} 且 max_samples <= {max_samples} 的策略")
318 |
319 | return self
320 |
321 | def transform(self, x, y=None):
322 | y = x[self.target]
323 | X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y)
324 | X_TE = X_TE.fillna(self.nan)
325 | if self.dt_rules is not None and len(self.dt_rules) > 0:
326 | parsed_rules = self.query_dt_rules(X_TE, y, parsed_rules=self.dt_rules)
327 | if self.feature_map is not None and len(self.feature_map) > 0:
328 | parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True)
329 | return parsed_rules
330 | else:
331 | return pd.DataFrame(columns=self.describe_columns)
332 |
    def insert_all_rules(self, val=None, test=None, sheet="策略汇总"):
        """Insert the mined rule-combination tables into the excel report.

        :param val: optional validation dataset, scored via :meth:`transform`
        :param test: optional test dataset, scored via :meth:`transform`
        :param sheet: target sheet name; falls back to "决策树组合策略挖掘" when falsy

        :return: tuple of rule-hit DataFrames, one per provided dataset
            (train first, then val/test in that order)
        """
        worksheet = self.writer.get_sheet_by_name(sheet or "决策树组合策略挖掘")

        # Move the summary sheet to the end of the workbook when a named
        # sheet is used.
        if sheet:
            self.writer.workbook.move_sheet(sheet, -1)

        # Work on a copy so the stored rules keep their technical names.
        parsed_rules_train = self.dt_rules.copy()
        if self.feature_map is not None and len(self.feature_map) > 0:
            parsed_rules_train["组合策略"] = parsed_rules_train["组合策略"].replace(self.feature_map, regex=True)
        # When writing into a dedicated sheet start at row 2, otherwise
        # continue below the content written so far (self.end_row).
        self.end_row, _ = self.writer.insert_value2sheet(worksheet, (2 if sheet else self.end_row + 2, self.start_col), value="训练集决策树组合策略", style="header_middle")
        self.end_row, _ = self.insert_dt_rules(parsed_rules_train, self.end_row, self.start_col, sheet=worksheet)
        outputs = (parsed_rules_train,)

        if len(parsed_rules_train) > 0:
            if val is not None:
                parsed_rules_val = self.transform(val)
                self.end_row, _ = self.writer.insert_value2sheet(worksheet, (self.end_row + 2, self.start_col), value="验证集决策树组合策略", style="header_middle")
                self.end_row, _ = self.insert_dt_rules(parsed_rules_val, self.end_row, self.start_col, sheet=worksheet)
                outputs = outputs + (parsed_rules_val,)

            if test is not None:
                parsed_rules_test = self.transform(test)
                self.end_row, _ = self.writer.insert_value2sheet(worksheet, (self.end_row + 2, self.start_col), value="测试集决策树组合策略", style="header_middle")
                self.end_row, _ = self.insert_dt_rules(parsed_rules_test, self.end_row, self.start_col, sheet=worksheet)
                outputs = outputs + (parsed_rules_test,)
        else:
            # No rules were mined: keep the output arity consistent by
            # repeating the (empty) train table for each requested dataset.
            if val is not None:
                outputs = outputs + (parsed_rules_train,)

            if test is not None:
                outputs = outputs + (parsed_rules_train,)

        return outputs
373 |
374 | def query_feature_rule(self, data, feature, desc="", plot=False, figsize=(10, 6), save=None, *args, **kwargs):
375 | """传入数据集和其中一个特征名称,输出简单策略挖掘统计信息
376 |
377 | :param data: 数据集
378 | :param feature: 特征名称
379 | :param desc: 特征中文含义或其他相关信息
380 | :param bin_plot: 是否可视化特征分箱图
381 | :param figsize: 图像的尺寸
382 | :param save: 图像保存的路径
383 |
384 | :return: pd.DataFrame, 特征分箱的统计信息
385 | """
386 | feature_table = feature_bin_stats(data, feature, desc=desc, *args, **kwargs)
387 |
388 | if plot:
389 | self.bin_plot(feature_table, desc=desc, figsize=figsize, save=save)
390 |
391 | return feature_table
392 |
    @staticmethod
    def bin_plot(*args, **kwargs):
        # Thin delegate to the module-level ``bin_plot`` helper so callers
        # can render bin plots through the class without importing it.
        return bin_plot(*args, **kwargs)
396 |
    def save(self, output="model_report/决策树组合策略挖掘.xlsx"):
        """Persist the accumulated excel report to ``output``."""
        self.writer.save(output)
399 |
400 |
if __name__ == '__main__':
    import numpy as np
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Build a random demo dataset: integer, categorical and continuous
    # features plus a binary target.
    feature_map = {}
    n_samples = 10000
    ab = np.array(list('ABCDEFG'))

    data = pd.DataFrame({
        'A': np.random.randint(10, size=n_samples),
        'B': ab[np.random.choice(7, n_samples)],
        'C': ab[np.random.choice(2, n_samples)],
        'D': np.random.random(size=n_samples),
        'target': np.random.randint(2, size=n_samples)
    })

    # BUG FIX: ``shuffle`` expects a bool; passing the label series here was
    # clearly intended for ``stratify`` to keep class ratios equal across
    # the split.
    train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data["target"])

    pdtr = ParseDecisionTreeRules(target="target", feature_map=feature_map, max_iter=8)
    # ``max_features="auto"`` was removed from DecisionTreeClassifier in
    # scikit-learn 1.3; ``"sqrt"`` is its historical equivalent.
    pdtr.fit(train, lift=1., max_depth=2, max_samples=0.5, verbose=False, min_samples_split=8, min_samples_leaf=5, max_features="sqrt")
    pdtr.insert_all_rules(test=test)
    pdtr.save()
424 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | wget
2 | scorecardpipeline
3 | category-encoders>=2.6.0
4 | statsmodels<0.14,>=0.13.2
5 | CairoSVG>=2.7.0
6 | graphviz>=0.20.1
7 | dtreeviz>=2.2.1
8 | reportlab
9 | svglib
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from setuptools import setup, find_packages, Extension
4 |
5 |
NAME = 'pdtr'


def get_version():
    """Read ``__version__`` out of the package's ``__init__.py``."""
    with open(f"{NAME}/__init__.py", "r", encoding="utf8") as init_file:
        contents = init_file.read()
    return re.search(r'__version__ = "(.*?)"', contents).group(1)
13 |
def get_requirements(stage = None):
    """Parse a pip requirements file into a list of requirement strings.

    :param stage: optional suffix; e.g. ``"dev"`` reads ``requirements-dev.txt``,
        ``None`` reads ``requirements.txt``

    :return: list of requirement lines with blanks, pip options and comments removed
    """
    file_name = 'requirements'

    if stage is not None:
        file_name = f"{file_name}-{stage}"

    requirements = []
    with open(f"{file_name}.txt", 'r') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines, pip options (-r/-e/--index-url ...) and
            # comment lines — previously '#' comments leaked into the
            # install_requires list.
            if not line or line.startswith(('-', '#')):
                continue

            requirements.append(line)

    return requirements
30 |
31 |
# Package metadata. Version and dependencies are derived from the source
# tree (pdtr/__init__.py and requirements.txt) so they stay in sync.
setup(
    name = NAME,
    version = get_version(),
    description = '自动决策树规则挖掘工具包',
    long_description = open('README.md', encoding = 'utf-8').read(),
    long_description_content_type = 'text/markdown',
    url = 'https://github.com/itlubber/pdtr',
    author = 'itlubber',
    author_email = 'itlubber@qq.com',
    packages = find_packages(),
    # Ship non-python assets (e.g. matplot_chinese.ttf) listed in MANIFEST.in.
    include_package_data = True,
    python_requires = '>=3.6',
    install_requires = get_requirements(),
    license = 'MIT',
    classifiers = [
        'Operating System :: POSIX',
        'Operating System :: Microsoft :: Windows',
        'Operating System :: MacOS :: MacOS X',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
    ],
)
--------------------------------------------------------------------------------