├── utils
│   ├── 报告输出模版.xlsx
│   ├── matplot_chinese.ttf
│   ├── __init__.py
│   ├── perf_eva.py
│   ├── excel_writer.py
│   └── tools.py
├── requertments.txt
├── clear_cache.sh
├── LICENSE
├── .gitignore
├── README.md
├── tree_ming.py
├── rules_auto_mining.py
├── processing.py
├── main.py
└── model.py

/utils/报告输出模版.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/LogisticRegressionPipeline/HEAD/utils/报告输出模版.xlsx
--------------------------------------------------------------------------------
/requertments.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | numpy<1.20
3 | ortools>=9.4
4 | ropwr>=0.4.0
5 | scikit-learn>=1.0.2
6 | scipy>=1.6.0
7 |
--------------------------------------------------------------------------------
/utils/matplot_chinese.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/LogisticRegressionPipeline/HEAD/utils/matplot_chinese.ttf
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2023/2/14 09:08
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 |
--------------------------------------------------------------------------------
/clear_cache.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PYSTRING="$(find . | grep -E "(__pycache__|\.pyc|\.pyo$)")"
4 | IPYNBSTRING="$(find . | grep -E "(ipynb_checkpoints|\.ipynb$)")"
5 |
6 | # 删除 __pycache__ 缓存文件
7 | if [ -n "$PYSTRING" ]; then
8 | echo "删除以下缓存文件 :"
9 | echo "-----------------------------------------------------"
10 | echo "$PYSTRING"
11 | echo "-----------------------------------------------------"
12 | find . | grep -E "(__pycache__|\.pyc|\.pyo$)" | xargs rm -rf
13 | else
14 | echo "不存在 __pycache__ 缓存文件"
15 | fi
16 |
17 | # # 删除 ipynb_checkpoints 缓存文件
18 | # if [ -n "$IPYNBSTRING" ]; then
19 | # echo "删除以下缓存文件 :"
20 | # echo "-----------------------------------------------------"
21 | # echo "$IPYNBSTRING"
22 | # echo "-----------------------------------------------------"
23 | # find . | grep -E "(ipynb_checkpoints|\.ipynb$)" | xargs rm -rf
24 | # else
25 | # echo "不存在 ipynb_checkpoints 缓存文件"
26 | # fi
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 itlubber
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 | *.ipynb
131 | *.zip
132 | .DS_store
133 | catboost_info/
134 | test.py
135 | .idea
136 | .vscode
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Logistic regression that works with `hyperparameter search` & `pipeline`
2 |
3 | ## Contact
4 |
5 |
8 | WeChat: itlubber
11 | WeChat official account: itlubber_art
35 | > `ITLubberLogisticRegression`, built on top of `sklearn`
36 |
44 | ## Usage
45 |
46 | ```python
47 | target = "creditability"
48 | data = sc.germancredit()
49 | data[target] = data[target].map({"good": 0, "bad": 1})
50 |
51 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])
52 | oot = data.copy()
53 | feature_pipeline = Pipeline([
54 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
55 | ("combiner", Combiner(target=target, min_samples=0.2)),
56 | ("transform", WOETransformer(target=target)),
57 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")),
58 | ("stepwise", StepwiseSelection(target=target)),
59 | ])
60 |
61 | feature_pipeline.fit(train)
62 |
63 | woe_train = feature_pipeline.transform(train)
64 | woe_test = feature_pipeline.transform(test)
65 | woe_oot = feature_pipeline.transform(oot)
66 |
67 | # logistic = StatsLogisticRegression(target=target)
68 | logistic = ITLubberLogisticRegression(target=target)
69 |
70 | logistic.fit(woe_train)
71 |
72 | y_pred_train = logistic.predict_proba(woe_train.drop(columns=target))[:, 1]
73 | y_pred_test = logistic.predict_proba(woe_test.drop(columns=target))[:, 1]
74 | y_pred_oot = logistic.predict_proba(woe_oot.drop(columns=target))[:, 1]
75 |
76 | # params_grid = {
77 | # # "logistic__C": [i / 1. for i in range(1, 10, 2)],
78 | # # "logistic__penalty": ["l2"],
79 | # # "logistic__class_weight": [None, "balanced"], # + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10)],
80 | # # "logistic__max_iter": [100],
81 | # # "logistic__solver": ["sag"] # ["liblinear", "sag", "lbfgs", "newton-cg"],
82 | # "logistic__intercept": [True, False],
83 | # }
84 |
85 | # clf = GridSearchCV(feature_pipeline, params_grid, cv=5, scoring='roc_auc', verbose=-1, n_jobs=2, return_train_score=True)
86 | # clf.fit(train, train[target])
87 |
88 | # y_pred_train = clf.best_estimator_.predict(train)
89 | # y_pred_test = clf.best_estimator_.predict(test)
90 |
91 | # print(clf.best_params_)
92 |
93 | # model summary
94 | # logistic.summary_save()
95 | # logistic.plot_weights(save="logistic_train.png")
96 | summary = logistic.summary().reset_index().rename(columns={"index": "Features"})
97 |
98 | train_report = logistic.report(woe_train)
99 | test_report = logistic.report(woe_test)
100 | oot_report = logistic.report(woe_oot)
101 |
102 | print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target]))
103 | print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target]))
104 | print("oot: ", toad.metrics.KS(y_pred_oot, oot[target]), toad.metrics.AUC(y_pred_oot, oot[target]))
105 |
106 | card = ScoreCard(target=target, pipeline=feature_pipeline, pretrain_lr=logistic)
107 | card.fit(woe_train)
108 |
109 | train["score"] = card.predict(train)
110 | test["score"] = card.predict(test)
111 | oot["score"] = card.predict(oot)
112 |
113 | # print(card.feature_bin_stats(train, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step"))
114 | # print(card.feature_bin_stats(train, "score", target=target, verbose=0, method="cart"))
115 |
116 | train_score_rank = card.feature_bin_stats(train, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
117 | test_score_rank = card.feature_bin_stats(test, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
118 | oot_score_rank = card.feature_bin_stats(oot, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
119 |
120 | writer = pd.ExcelWriter("评分卡结果验证表.xlsx", engine="openpyxl")
121 |
122 | summary.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=1, index=False)
123 | train_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + 5, index=False)
124 | test_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + 9, index=False)
125 | oot_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + len(test_report) + 13, index=False)
126 |
127 | worksheet = writer.sheets['逻辑回归拟合结果']
128 | worksheet.cell(row=1, column=1).value = "入模变量系数及相关统计指标"
129 | worksheet.cell(row=len(summary) + 5, column=1).value = "训练数据集模型预测报告"
130 | worksheet.cell(row=len(summary) + len(train_report) + 9, column=1).value = "测试数据集模型预测报告"
131 | worksheet.cell(row=len(summary) + len(train_report) + len(test_report) + 13, column=1).value = "跨时间验证集模型预测报告"
132 |
133 | train_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=1, index=False)
134 | test_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + 5, index=False)
135 | oot_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + len(test_score_rank) + 9, index=False)
136 |
137 | worksheet = writer.sheets['评分卡排序性']
138 |
139 | worksheet.cell(row=1, column=1).value = "训练数据集评分排序性"
140 | worksheet.cell(row=len(train_score_rank) + 5, column=1).value = "测试数据集评分排序性"
141 | worksheet.cell(row=len(train_score_rank) + len(test_score_rank) + 9, column=1).value = "跨时间验证集评分排序性"
142 |
143 | writer.close()
144 |
145 | from utils import render_excel
146 |
147 | render_excel("评分卡结果验证表.xlsx", border=False)
148 | ```
149 |
150 |
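The repository title also advertises hyperparameter search, and the commented-out `GridSearchCV` block above hints at how that is meant to work. Below is a minimal sketch of the same idea, assuming the custom steps (`FeatureSelection`, `Combiner`, `WOETransformer`, `StepwiseSelection`, `ITLubberLogisticRegression`) behave as sklearn-compatible estimators and that the parameter names in `params_grid` exist on `ITLubberLogisticRegression` (they are illustrative, taken from the commented block, not confirmed here):

```python
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# put feature engineering and the logistic regression into one pipeline so the
# final estimator's parameters can be tuned end to end
search_pipeline = Pipeline([
    ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
    ("combiner", Combiner(target=target, min_samples=0.2)),
    ("transform", WOETransformer(target=target)),
    ("stepwise", StepwiseSelection(target=target)),
    ("logistic", ITLubberLogisticRegression(target=target)),
])

# illustrative search space: adjust to whatever parameters the estimator exposes
params_grid = {
    "logistic__C": [0.2, 0.6, 1.0],
    "logistic__class_weight": [None, "balanced"],
}

clf = GridSearchCV(search_pipeline, params_grid, cv=5, scoring="roc_auc", n_jobs=2, return_train_score=True)
clf.fit(train, train[target])
print(clf.best_params_)
```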
151 | ## References
152 |
153 | > https://github.com/ing-bank/skorecard/blob/main/skorecard/linear_model/linear_model.py
154 | >
155 | > https://github.com/itlubber/openpyxl-excel-style-template/blob/main/pipeline_model.py
156 | >
--------------------------------------------------------------------------------
/tree_ming.py:
--------------------------------------------------------------------------------
1 | import os
2 | import graphviz
3 | import warnings
4 | import numpy as np
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | from matplotlib import font_manager
8 | import dtreeviz
9 |
10 | import category_encoders as ce
11 | from sklearn.preprocessing import LabelEncoder
12 | from sklearn.tree import _tree, DecisionTreeClassifier, plot_tree, export_graphviz
13 | from IPython.display import display  # the display() calls in __main__ assume an IPython / Jupyter session
14 |
15 | warnings.filterwarnings("ignore")
16 | pd.set_option('display.width', 5000)
17 | plt.style.use('seaborn-ticks')
18 | plt.rcParams["font.sans-serif"]=["SimHei"]
19 | plt.rcParams["axes.unicode_minus"]=False
20 |
21 |
22 | def get_dt_rules(tree, feature_names, total_bad_rate, total_count):
23 | tree_ = tree.tree_
24 | left = tree.tree_.children_left
25 | right = tree.tree_.children_right
26 | feature_name = [feature_names[i] if i != -2 else "undefined!" for i in tree_.feature]
27 | rules=dict()
28 |
29 | global res_df
30 | res_df = pd.DataFrame()
31 |
32 | def recurse(node, depth, parent): # 搜每个节点的规则
33 |
34 | if tree_.feature[node] != -2: # 非叶子节点,搜索每个节点的规则
35 | name = feature_name[node]
36 | thd = np.round(tree_.threshold[node],3)
37 | s= "{} <= {} ".format( name, thd, node )
38 | # 左子
39 | if node == 0:
40 | rules[node]=s
41 | else:
42 | rules[node]=rules[parent]+' & ' +s
43 | recurse(left[node], depth + 1, node)
44 | s="{} > {}".format(name, thd)
45 | # 右子
46 | if node == 0:
47 | rules[node]=s
48 | else:
49 | rules[node]=rules[parent]+' & ' +s
50 | recurse(right[node], depth + 1, node)
51 | else:
52 | df = pd.DataFrame()
53 | df['组合策略'] = rules[parent],
54 | df['好样本数'] = tree_.value[node][0][0].astype(int)
55 | df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate))
56 | df['坏样本数'] = tree_.value[node][0][1].astype(int)
57 | df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate)
58 | df['命中数'] = df['好样本数'] + df['坏样本数']
59 | df['命中率'] = df['命中数'] / total_count
60 | df['坏率'] = df['坏样本数'] / df['命中数']
61 | df['样本整体坏率'] = total_bad_rate
62 | df['LIFT值'] = df['坏率'] / df['样本整体坏率']
63 |
64 | global res_df
65 |
66 | res_df = pd.concat([res_df, df], axis=0)
67 |
68 | recurse(0, 1, 0)
69 |
70 | return res_df.sort_values("LIFT值", ascending=True).reset_index(drop=True)
71 |
72 |
73 | def dtreeviz_plot(tree, X_TE, y, target="target", save=None):
74 | viz_model = dtreeviz.model(tree,
75 | X_train=X_TE, y_train=y,
76 | feature_names=X_TE.columns,
77 | target_name=target, class_names=["GOOD", f"BAD"])
78 | viz = viz_model.view(
79 | scale=1.5,
80 | orientation='LR',
81 | colors={
82 | "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]],
83 | "arrow": "#2639E9",
84 | 'text_wedge': "#F76E6C",
85 | "pie": "#2639E9",
86 | "tile_alpha": 1,
87 | "legend_edge": "#FFFFFF",
88 | },
89 | ticks_fontsize=10,
90 | label_fontsize=10,
91 | )
92 |
93 | # viz = dtreeviz.model(
94 | # decision_tree,
95 | # X_TE,
96 | # y,
97 | # # title="DecisionTreeClassifier",
98 | # # title_fontsize=10,
99 | # ticks_fontsize=10,
100 | # label_fontsize=10,
101 | # target_name=target,
102 | # feature_names=X_TE.columns,
103 | # class_names=["good", "bad"],
104 | # orientation='LR',
105 | # scale=1.5,
106 | # colors={
107 | # "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]],
108 | # "arrow": "#2639E9",
109 | # 'text_wedge': "#F76E6C",
110 | # "pie": "#2639E9",
111 | # "tile_alpha": 1,
112 | # "legend_edge": "#FFFFFF",
113 | # },
114 | # )
115 |
116 | if save:
117 | viz.save(save)
118 |
119 | return viz
120 |
121 |
122 | if __name__ == '__main__':
123 | import scorecardpy as sc
124 |
125 | target = "creditability"
126 | data = sc.germancredit()
127 | data[target] = data[target].map({"good": 0, "bad": 1})
128 |
129 | cat_features = list(set(data.select_dtypes(include=[object, pd.CategoricalDtype]).columns) - set([target]))
130 | cat_features_index = [i for i, f in enumerate(data.columns) if f in cat_features]
131 |
132 | X = data.drop(columns=[target])
133 | y = data[target]
134 |
135 | target_enc = ce.TargetEncoder(cols=cat_features)
136 | target_enc.fit(X[cat_features], y)
137 |
138 | X_TE = X.join(target_enc.transform(X[cat_features]).add_suffix('_target'))
139 |
140 | target_enc.target_mapping = {}
141 | for col in cat_features:
142 | mapping = X_TE[[col, f"{col}_target"]].drop_duplicates()
143 | target_enc.target_mapping[col] = dict(zip(mapping[col], mapping[f"{col}_target"]))
144 |
145 | X_TE = X_TE.drop(columns=cat_features)
146 | X_TE = X_TE.rename(columns={f"{c}_target": c for c in cat_features})
147 |
148 | removes = []
149 | dt_rules = pd.DataFrame()
150 | feature_map = {}  # placeholder: optional mapping from raw feature names to readable labels
151 | for i in range(128):
152 | decision_tree = DecisionTreeClassifier(max_depth=2, min_samples_split=8, min_samples_leaf=5, max_features="sqrt")
153 | decision_tree = decision_tree.fit(X_TE, y)
154 |
155 | if decision_tree.score(X_TE, y) < 0.8:
156 | break
157 |
158 | rules = get_dt_rules(decision_tree, X_TE.columns, sum(y) / len(y), len(y))
159 | viz_model = dtreeviz.model(decision_tree,
160 | X_train=X_TE, y_train=y,
161 | feature_names=X_TE.columns,
162 | target_name=target, class_names=["DPD 0", f"DPD {dpd}+"])
163 |
164 | rules = rules.query("LIFT值 > 4 & 命中率 < 0.1")
165 |
166 | if len(rules) > 0:
167 | print("/" * 150)
168 | rules["组合策略"] = rules["组合策略"].replace(feature_map, regex=True)
169 | display(rules)
170 | c = viz_model.view(
171 | scale=1.5,
172 | orientation='LR',
173 | colors={
174 | "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]],
175 | "arrow": "#2639E9",
176 | 'text_wedge': "#F76E6C",
177 | "pie": "#2639E9",
178 | "tile_alpha": 1,
179 | "legend_edge": "#FFFFFF",
180 | },
181 | ticks_fontsize=10,
182 | label_fontsize=10,
183 | )
184 | display(c)
185 |
186 | dt_rules = pd.concat([dt_rules, rules]).reset_index(drop=True)
187 | removes.append(decision_tree.feature_names_in_[list(decision_tree.feature_importances_).index(max(decision_tree.feature_importances_))])
188 | X_TE = X_TE.drop(columns=removes[-1])
189 | print("-" * 150)
190 |
191 | pd.set_option('display.max_row', None)
192 | dt_rules.sort_values(["LIFT值", "命中率"], ascending=False)
193 |
194 | # decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
195 | # decision_tree = decision_tree.fit(X_TE, y)
196 |
197 | # rules = get_dt_rules(decision_tree, X_TE.columns, sum(y) / len(y), len(y))
198 |
199 | # dtreeviz_plot(decision_tree, X_TE, y, save="decision_tree.svg")
200 | # rules.to_excel("组合策略挖掘.xlsx")
201 |
202 | # dot_data = export_graphviz(decision_tree, feature_names=X_TE.columns, class_names=True, filled=True, rounded=False, out_file=None)
203 | # graph = graphviz.Source(dot_data)
204 |
205 | # graph.render("组合策略挖掘")
206 |
--------------------------------------------------------------------------------
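The per-leaf statistics assembled in `get_dt_rules` above are plain ratios of the leaf's class counts; a tiny worked example with made-up numbers shows what the `命中率`, `坏率` and `LIFT值` columns mean:

```python
# illustrative counts only: a leaf catches 20 good and 30 bad samples out of
# 1,000 rows with 100 bad samples overall (base bad rate 10%)
total_count, total_bad = 1000, 100
leaf_good, leaf_bad = 20, 30

hit_rate = (leaf_good + leaf_bad) / total_count      # 命中率: 0.05
bad_rate = leaf_bad / (leaf_good + leaf_bad)         # 坏率: 0.6
overall_bad_rate = total_bad / total_count           # 样本整体坏率: 0.1
lift = bad_rate / overall_bad_rate                   # LIFT值: 6.0, i.e. the leaf is 6x worse than average
print(hit_rate, bad_rate, overall_bad_rate, lift)
```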
/rules_auto_mining.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cairosvg
3 | import graphviz
4 | import dtreeviz
5 | import warnings
6 | import numpy as np
7 | import pandas as pd
8 | from openpyxl.utils import get_column_letter, column_index_from_string  # module-level import: insert_dt_rules() needs get_column_letter
9 | import category_encoders as ce
10 | from sklearn.preprocessing import LabelEncoder
11 | from sklearn.tree import _tree, DecisionTreeClassifier, plot_tree, export_graphviz
12 |
13 |
14 | warnings.filterwarnings("ignore")
15 |
16 |
17 | class ParseDecisionTreeRules:
18 |
19 | def __init__(self, target="target", labels=["positive", "negative"], feature_map={}, nan=-1., max_iter=128, output="model_report/auto_mining_rules/决策树组合策略挖掘.xlsx", writer=None):
20 | self.target = target
21 | self.labels = labels
22 | self.feature_map = feature_map
23 | self.nan = nan
24 | self.max_iter = max_iter
25 | self.output = output
26 | self.decision_trees = []
27 | self.target_enc = None
28 | self.feature_names = None
29 | self.dt_rules = pd.DataFrame()
30 | self.end_row = 2
31 | self.start_col = 2
32 | self.describe_columns = ["组合策略", "命中数", "命中率", "好样本数", "好样本占比", "坏样本数", "坏样本占比", "坏率", "样本整体坏率", "LIFT值"]
33 |
34 | if output:
35 | from utils.excel_writer import ExcelWriter
36 | from openpyxl.utils import get_column_letter, column_index_from_string
37 | # init_setting()  # not defined or imported in this module; re-enable once it is imported
38 | if writer:
39 | self.writer = writer
40 | else:
41 | self.writer = ExcelWriter(style_excel="./utils/报告输出模版.xlsx", theme_color="2639E9")
42 |
43 | self.worksheet = self.writer.get_sheet_by_name("决策树组合策略挖掘")
44 |
45 | def encode_cat_features(self, X, y):
46 | cat_features = list(set(X.select_dtypes(include=[object, pd.CategoricalDtype]).columns))
47 | cat_features_index = [i for i, f in enumerate(X.columns) if f in cat_features]
48 |
49 | if len(cat_features) > 0:
50 | if self.target_enc is None:
51 | self.target_enc = ce.TargetEncoder(cols=cat_features)
52 | self.target_enc.fit(X[cat_features], y)
53 | self.target_enc.target_mapping = {}
54 | X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target'))
55 | for col in cat_features:
56 | mapping = X_TE[[col, f"{col}_target"]].drop_duplicates()
57 | self.target_enc.target_mapping[col] = dict(zip(mapping[col], mapping[f"{col}_target"]))
58 | else:
59 | X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target'))
60 |
61 | X_TE = X_TE.drop(columns=cat_features)
62 | return X_TE.rename(columns={f"{c}_target": c for c in cat_features})
63 | else:
64 | return X
65 |
66 | def get_dt_rules(self, tree, feature_names, total_bad_rate, total_count):
67 | tree_ = tree.tree_
68 | left = tree.tree_.children_left
69 | right = tree.tree_.children_right
70 | feature_name = [feature_names[i] if i != -2 else "undefined!" for i in tree_.feature]
71 | rules=dict()
72 |
73 | global res_df
74 | res_df = pd.DataFrame()
75 |
76 | def recurse(node, depth, parent): # 搜每个节点的规则
77 |
78 | if tree_.feature[node] != -2: # 非叶子节点,搜索每个节点的规则
79 | name = feature_name[node]
80 | thd = np.round(tree_.threshold[node],3)
81 | s= "{} <= {} ".format( name, thd, node )
82 | # 左子
83 | if node == 0:
84 | rules[node]=s
85 | else:
86 | rules[node]=rules[parent]+' & ' +s
87 | recurse(left[node], depth + 1, node)
88 | s="{} > {}".format(name, thd)
89 | # 右子
90 | if node == 0:
91 | rules[node]=s
92 | else:
93 | rules[node]=rules[parent]+' & ' +s
94 | recurse(right[node], depth + 1, node)
95 | else:
96 | df = pd.DataFrame()
97 | df['组合策略'] = rules[parent],
98 | df['好样本数'] = tree_.value[node][0][0].astype(int)
99 | df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate))
100 | df['坏样本数'] = tree_.value[node][0][1].astype(int)
101 | df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate)
102 | df['命中数'] = df['好样本数'] + df['坏样本数']
103 | df['命中率'] = df['命中数'] / total_count
104 | df['坏率'] = df['坏样本数'] / df['命中数']
105 | df['样本整体坏率'] = total_bad_rate
106 | df['LIFT值'] = df['坏率'] / df['样本整体坏率']
107 |
108 | global res_df
109 |
110 | res_df = pd.concat([res_df, df], axis=0)
111 |
112 | recurse(0, 1, 0)
113 |
114 | return res_df.sort_values("LIFT值", ascending=True)[self.describe_columns].reset_index(drop=True)
115 |
116 | def select_dt_rules(self, decision_tree, x, y, lift=3., max_samples=0.05, labels=["positive", "negative"], save=None, verbose=False, drop=False):
117 | rules = self.get_dt_rules(decision_tree, x.columns, sum(y) / len(y), len(y))
118 | viz_model = dtreeviz.model(decision_tree,
119 | X_train=x,
120 | y_train=y,
121 | feature_names=x.columns,
122 | target_name=self.target,
123 | class_names=labels,
124 | )
125 | rules = rules.query(f"LIFT值 >= {lift} & 命中率 <= {max_samples}").reset_index(drop=True)
126 |
127 | if len(rules) > 0:
128 | decision_tree_viz = viz_model.view(
129 | scale=1.5,
130 | orientation='LR',
131 | colors={
132 | "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]],
133 | "arrow": "#2639E9",
134 | 'text_wedge': "#F76E6C",
135 | "pie": "#2639E9",
136 | "tile_alpha": 1,
137 | "legend_edge": "#FFFFFF",
138 | },
139 | ticks_fontsize=10,
140 | label_fontsize=10,
141 | )
142 | if verbose:
143 | if self.feature_map is not None and len(self.feature_map) > 0:
144 | display(rules.replace(self.feature_map, regex=True))
145 | else:
146 | display(rules)
147 | display(decision_tree_viz)
148 | if save:
149 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
150 | os.makedirs(os.path.dirname(save))
151 |
152 | decision_tree_viz.save("combine_rules_cache.svg")
153 | cairosvg.svg2png(url="combine_rules_cache.svg", write_to=save, dpi=240)
154 |
155 | if drop:
156 | return rules, decision_tree.feature_names_in_[list(decision_tree.feature_importances_).index(max(decision_tree.feature_importances_))]
157 | else:
158 | return rules
159 |
160 | def query_dt_rules(self, x, y, parsed_rules=None):
161 | total_count = len(y)
162 | total_bad_rate = y.sum() / len(y)
163 |
164 | rules = pd.DataFrame()
165 | for rule in parsed_rules["组合策略"].unique():
166 | select_index = x.query(rule).index
167 | if len(select_index) > 0:
168 | y_select = y[select_index]
169 | df = pd.Series()
170 | df['组合策略'] = rule
171 | df['好样本数'] = len(y_select) - y_select.sum()
172 | df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate))
173 | df['坏样本数'] = y_select.sum()
174 | df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate)
175 | df['命中数'] = df['好样本数'] + df['坏样本数']
176 | df['命中率'] = df['命中数'] / total_count
177 | df['坏率'] = df['坏样本数'] / df['命中数']
178 | df['样本整体坏率'] = total_bad_rate
179 | df['LIFT值'] = df['坏率'] / df['样本整体坏率']
180 | else:
181 | df = pd.Series({'组合策略': rule,'好样本数': 0,'好样本占比': 0.,'坏样本数': 0,'坏样本占比': 0.,'命中数': 0,'命中率': 0.,'坏率': 0.,'样本整体坏率': total_bad_rate,'LIFT值': 0.,})
182 |
183 | rules = pd.concat([rules, pd.DataFrame(df).T]).reset_index(drop=True)
184 |
185 | return rules[self.describe_columns]
186 |
187 | def insert_dt_rules(self, parsed_rules, end_row, start_col, save=None):
188 | end_row, end_col = self.writer.insert_df2sheet(self.worksheet, parsed_rules, (end_row + 2, start_col))
189 |
190 | for c in ['好样本占比', '坏样本占比', '命中率', '坏率', '样本整体坏率', 'LIFT值']:
191 | conditional_column = get_column_letter(start_col + parsed_rules.columns.get_loc(c))
192 | self.writer.set_number_format(self.worksheet, f"{conditional_column}{end_row - len(parsed_rules)}:{conditional_column}{end_row - 1}", "0.00%")
193 | for c in ["坏率", "LIFT值"]:
194 | conditional_column = get_column_letter(start_col + parsed_rules.columns.get_loc(c))
195 | self.writer.add_conditional_formatting(self.worksheet, f'{conditional_column}{end_row - len(parsed_rules)}', f'{conditional_column}{end_row - 1}')
196 |
197 | if save is not None:
198 | end_row, end_col = self.writer.insert_pic2sheet(self.worksheet, save, (end_row + 1, start_col), figsize=(400, 300))
199 |
200 | return end_row, end_col
201 |
202 | def fit(self, x, y=None, max_depth=2, lift=3, max_samples=0.2, min_score=None, verbose=False, **kwargs):
203 | y = x[self.target]
204 | X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y)
205 | X_TE = X_TE.fillna(self.nan)
206 |
207 | self.feature_names = list(X_TE.columns)
208 |
209 | for i in range(self.max_iter):
210 | decision_tree = DecisionTreeClassifier(max_depth=max_depth, **kwargs)
211 | decision_tree = decision_tree.fit(X_TE, y)
212 |
213 | if (min_score is not None and decision_tree.score(X_TE, y) < min_score) or len(X_TE.columns) < max_depth:
214 | break
215 |
216 | try:
217 | parsed_rules, remove = self.select_dt_rules(decision_tree, X_TE, y, lift=lift, max_samples=max_samples, labels=self.labels, verbose=verbose, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png", drop=True)
218 |
219 | if len(parsed_rules) > 0:
220 | self.dt_rules = pd.concat([self.dt_rules, parsed_rules]).reset_index(drop=True)
221 |
222 | if self.writer is not None:
223 | if self.feature_map is not None and len(self.feature_map) > 0:
224 | parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True)
225 | self.end_row, _ = self.insert_dt_rules(parsed_rules, self.end_row, self.start_col, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png")
226 |
227 | X_TE = X_TE.drop(columns=remove)
228 | self.decision_trees.append(decision_tree)
229 | except:
230 | pass
231 |
232 | return self
233 |
234 | def transform(self, x, y=None):
235 | y = x[self.target]
236 | X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y)
237 | X_TE = X_TE.fillna(self.nan)
238 | parsed_rules = self.query_dt_rules(X_TE, y, parsed_rules=self.dt_rules)
239 | if self.feature_map is not None and len(self.feature_map) > 0:
240 | parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True)
241 | return parsed_rules
242 |
243 | def insert_all_rules(self, val=None, test=None):
244 | parsed_rules_train = self.dt_rules.copy()
245 | if self.feature_map is not None and len(self.feature_map) > 0:
246 | parsed_rules_train["组合策略"] = parsed_rules_train["组合策略"].replace(self.feature_map, regex=True)
247 | self.end_row, _ = self.writer.insert_value2sheet(self.worksheet, (self.end_row + 2, self.start_col), value="训练集决策树组合策略")
248 | self.end_row, _ = self.insert_dt_rules(parsed_rules_train, self.end_row, self.start_col)
249 |
250 | if val is not None:
251 | parsed_rules_val = self.transform(val)
252 | self.end_row, _ = self.writer.insert_value2sheet(self.worksheet, (self.end_row + 2, self.start_col), value="验证集决策树组合策略")
253 | self.end_row, _ = self.insert_dt_rules(parsed_rules_val, self.end_row, self.start_col)
254 |
255 | if test is not None:
256 | parsed_rules_test = self.transform(test)
257 | self.end_row, _ = self.writer.insert_value2sheet(self.worksheet, (self.end_row + 2, self.start_col), value="测试集决策树组合策略")
258 | self.end_row, _ = self.insert_dt_rules(parsed_rules_test, self.end_row, self.start_col)
259 |
260 | def save(self):
261 | self.writer.save(self.output)
262 |
263 |
264 | if __name__ == '__main__':
265 | pdtr = ParseDecisionTreeRules(target=target, feature_map=feature_map, max_iter=8)
266 | pdtr.fit(train, lift=3., max_depth=2, max_samples=0.1, verbose=False, min_samples_split=8, min_samples_leaf=5, max_features="sqrt")
267 | pdtr.insert_all_rules(test=test)
268 | pdtr.save()
269 |
--------------------------------------------------------------------------------
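The `__main__` block of `rules_auto_mining.py` assumes `target`, `feature_map`, `train` and `test` already exist in the session. A minimal self-contained sketch of driving `ParseDecisionTreeRules`, with data preparation borrowed from `tree_ming.py` (the empty `feature_map` and the 70/30 split are illustrative assumptions, and the class still expects `./utils/报告输出模版.xlsx` and its output directory to be reachable from the working directory):

```python
import scorecardpy as sc
from sklearn.model_selection import train_test_split

target = "creditability"
data = sc.germancredit()
data[target] = data[target].map({"good": 0, "bad": 1})

# stratified split so train and test keep the same bad rate
train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])

pdtr = ParseDecisionTreeRules(target=target, feature_map={}, max_iter=8)
pdtr.fit(train, lift=3., max_depth=2, max_samples=0.1, verbose=False, min_samples_split=8, min_samples_leaf=5)
pdtr.insert_all_rules(test=test)
pdtr.save()
```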
/utils/perf_eva.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 | import numpy as np
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 | import warnings
7 | from pandas.api.types import is_numeric_dtype
8 |
9 |
10 | def check_y(dat, y, positive):
11 | positive = str(positive)
12 | # ncol of dt
13 | if isinstance(dat, pd.DataFrame) & (dat.shape[1] <= 1):
14 | raise Exception("Incorrect inputs; dat should be a DataFrame with at least two columns.")
15 |
16 | # y ------
17 | if isinstance(y, str):
18 | y = [y]
19 | # length of y == 1
20 | if len(y) != 1:
21 | raise Exception("Incorrect inputs; the length of y should be one")
22 |
23 | y = y[0]
24 | # y not in dat.columns
25 | if y not in dat.columns:
26 | raise Exception("Incorrect inputs; there is no \'{}\' column in dat.".format(y))
27 |
28 | # remove na in y
29 | if pd.isna(dat[y]).any():
30 | warnings.warn("There are NaNs in \'{}\' column. The rows with NaN in \'{}\' were removed from dat.".format(y,y))
31 | dat = dat.dropna(subset=[y])
32 | # dat = dat[pd.notna(dat[y])]
33 |
34 |
35 | # numeric y to int
36 | if is_numeric_dtype(dat[y]):
37 | dat.loc[:,y] = dat[y].apply(lambda x: x if pd.isnull(x) else int(x)) #dat[y].astype(int)
38 | # length of unique values in y
39 | unique_y = np.unique(dat[y].values)
40 | if len(unique_y) == 2:
41 | # if [v not in [0,1] for v in unique_y] == [True, True]:
42 | if True in [bool(re.search(positive, str(v))) for v in unique_y]:
43 | y1 = dat[y]
44 | y2 = dat[y].apply(lambda x: 1 if str(x) in re.split('\|', positive) else 0)
45 | if (y1 != y2).any():
46 | dat.loc[:,y] = y2#dat[y] = y2
47 | warnings.warn("The positive value in \"{}\" was replaced by 1 and negative value by 0.".format(y))
48 | else:
49 | raise Exception("Incorrect inputs; the positive value in \"{}\" is not specified".format(y))
50 | else:
51 | raise Exception("Incorrect inputs; the length of unique values in y column \'{}\' != 2.".format(y))
52 |
53 | return dat
54 |
55 |
56 |
57 | def eva_dfkslift(df, groupnum=None):
58 | if groupnum is None: groupnum=len(df.index)
59 | # good bad func
60 | def n0(x): return sum(x==0)
61 | def n1(x): return sum(x==1)
62 | df_kslift = df.sort_values('pred', ascending=False).reset_index(drop=True)\
63 | .assign(group=lambda x: np.ceil((x.index+1)/(len(x.index)/groupnum)))\
64 | .groupby('group')['label'].agg([n0,n1])\
65 | .reset_index().rename(columns={'n0':'good','n1':'bad'})\
66 | .assign(
67 | group=lambda x: (x.index+1)/len(x.index),
68 | good_distri=lambda x: x.good/sum(x.good),
69 | bad_distri=lambda x: x.bad/sum(x.bad),
70 | badrate=lambda x: x.bad/(x.good+x.bad),
71 | cumbadrate=lambda x: np.cumsum(x.bad)/np.cumsum(x.good+x.bad),
72 | lift=lambda x: (np.cumsum(x.bad)/np.cumsum(x.good+x.bad))/(sum(x.bad)/sum(x.good+x.bad)),
73 | cumgood=lambda x: np.cumsum(x.good)/sum(x.good),
74 | cumbad=lambda x: np.cumsum(x.bad)/sum(x.bad)
75 | ).assign(ks=lambda x:abs(x.cumbad-x.cumgood))
76 | # bind 0
77 | df_kslift=pd.concat([
78 | pd.DataFrame({'group':0, 'good':0, 'bad':0, 'good_distri':0, 'bad_distri':0, 'badrate':0, 'cumbadrate':np.nan, 'cumgood':0, 'cumbad':0, 'ks':0, 'lift':np.nan}, index=np.arange(1)),
79 | df_kslift
80 | ], ignore_index=True)
81 | # return
82 | return df_kslift
83 | # plot ks
84 | def eva_pks(dfkslift, title):
85 | dfks = dfkslift.loc[lambda x: x.ks==max(x.ks)].sort_values('group').iloc[0]
86 | ###### plot ######
87 | # fig, ax = plt.subplots()
88 | # ks, cumbad, cumgood
89 | plt.plot(dfkslift.group, dfkslift.ks, 'b-',
90 | dfkslift.group, dfkslift.cumgood, 'k-',
91 | dfkslift.group, dfkslift.cumbad, 'k-')
92 | # ks vline
93 | plt.plot([dfks['group'], dfks['group']], [0, dfks['ks']], 'r--')
94 | # set xylabel
95 | plt.gca().set(title=title+'K-S',
96 | xlabel='% of population', ylabel='% of total Good/Bad',
97 | xlim=[0,1], ylim=[0,1], aspect='equal')
98 | # text
99 | # plt.text(0.5,0.96,'K-S', fontsize=15,horizontalalignment='center')
100 | plt.text(0.2,0.8,'Bad',horizontalalignment='center')
101 | plt.text(0.8,0.55,'Good',horizontalalignment='center')
102 | plt.text(dfks['group'], dfks['ks'], 'KS:'+ str(round(dfks['ks'],4)), horizontalalignment='center',color='b')
103 | # plt.grid()
104 | # plt.show()
105 | # return fig
106 | # plot lift
107 | def eva_plift(dfkslift, title):
108 | badrate_avg = sum(dfkslift.bad)/sum(dfkslift.good+dfkslift.bad)
109 | ###### plot ######
110 | # fig, ax = plt.subplots()
111 | # ks, cumbad, cumgood
112 | plt.plot(dfkslift.group, dfkslift.cumbadrate, 'k-')
113 | # ks vline
114 | plt.plot([0, 1], [badrate_avg, badrate_avg], 'r--')
115 | # set xylabel
116 | plt.gca().set(title=title+'Lift',
117 | xlabel='% of population', ylabel='% of Bad',
118 | xlim=[0,1], ylim=[0,1], aspect='equal')
119 | # text
120 | # plt.text(0.5,0.96,'Lift', fontsize=15,horizontalalignment='center')
121 | plt.text(0.7,np.mean(dfkslift.cumbadrate),'cumulate badrate',horizontalalignment='center')
122 | plt.text(0.7,badrate_avg,'average badrate',horizontalalignment='center')
123 | # plt.grid()
124 | # plt.show()
125 | # return fig
126 |
127 | def eva_dfrocpr(df):
128 | def n0(x): return sum(x==0)
129 | def n1(x): return sum(x==1)
130 | dfrocpr = df.sort_values('pred')\
131 | .groupby('pred')['label'].agg([n0,n1,len])\
132 | .reset_index().rename(columns={'n0':'countN','n1':'countP','len':'countpred'})\
133 | .assign(
134 | FN = lambda x: np.cumsum(x.countP),
135 | TN = lambda x: np.cumsum(x.countN)
136 | ).assign(
137 | TP = lambda x: sum(x.countP) - x.FN,
138 | FP = lambda x: sum(x.countN) - x.TN
139 | ).assign(
140 | TPR = lambda x: x.TP/(x.TP+x.FN),
141 | FPR = lambda x: x.FP/(x.TN+x.FP),
142 | precision = lambda x: x.TP/(x.TP+x.FP),
143 | recall = lambda x: x.TP/(x.TP+x.FN)
144 | ).assign(
145 | F1 = lambda x: 2*x.precision*x.recall/(x.precision+x.recall)
146 | )
147 | return dfrocpr
148 | # plot roc
149 | def eva_proc(dfrocpr, title):
150 | dfrocpr = pd.concat(
151 | [dfrocpr[['FPR','TPR']], pd.DataFrame({'FPR':[0,1], 'TPR':[0,1]})],
152 | ignore_index=True).sort_values(['FPR','TPR'])
153 | auc = dfrocpr.sort_values(['FPR','TPR'])\
154 | .assign(
155 | TPR_lag=lambda x: x['TPR'].shift(1), FPR_lag=lambda x: x['FPR'].shift(1)
156 | ).assign(
157 | auc=lambda x: (x.TPR+x.TPR_lag)*(x.FPR-x.FPR_lag)/2
158 | )['auc'].sum()
159 | ###### plot ######
160 | # fig, ax = plt.subplots()
161 | # ks, cumbad, cumgood
162 | plt.plot(dfrocpr.FPR, dfrocpr.TPR, 'k-')
163 | # ks vline
164 | x=np.array(np.arange(0,1.1,0.1))
165 | plt.plot(x, x, 'r--')
166 | # fill
167 | plt.fill_between(dfrocpr.FPR, 0, dfrocpr.TPR, color='blue', alpha=0.1)
168 | # set xylabel
169 | plt.gca().set(title=title+'ROC',
170 | xlabel='FPR', ylabel='TPR',
171 | xlim=[0,1], ylim=[0,1], aspect='equal')
172 | # text
173 | # plt.text(0.5,0.96, 'ROC', fontsize=15, horizontalalignment='center')
174 | plt.text(0.55,0.45, 'AUC:'+str(round(auc,4)), horizontalalignment='center', color='b')
175 | # plt.grid()
176 | # plt.show()
177 | # return fig
178 | # plot ppr
179 | def eva_ppr(dfrocpr, title):
180 | ###### plot ######
181 | # fig, ax = plt.subplots()
182 | # ks, cumbad, cumgood
183 | plt.plot(dfrocpr.recall, dfrocpr.precision, 'k-')
184 | # ks vline
185 | x=np.array(np.arange(0,1.1,0.1))
186 | plt.plot(x, x, 'r--')
187 | # set xylabel
188 | plt.gca().set(title=title+'P-R',
189 | xlabel='Recall', ylabel='Precision',
190 | xlim=[0,1], ylim=[0,1], aspect='equal')
191 | # text
192 | # plt.text(0.5,0.96, 'P-R', fontsize=15, horizontalalignment='center')
193 | # plt.grid()
194 | # plt.show()
195 | # return fig
196 | # plot f1
197 | def eva_pf1(dfrocpr, title):
198 | dfrocpr=dfrocpr.assign(pop=lambda x: np.cumsum(x.countpred)/sum(x.countpred))
199 | ###### plot ######
200 | # fig, ax = plt.subplots()
201 | # ks, cumbad, cumgood
202 | plt.plot(dfrocpr['pop'], dfrocpr['F1'], 'k-')
203 | # ks vline
204 | F1max_pop = dfrocpr.loc[dfrocpr['F1'].idxmax(),'pop']
205 | F1max_F1 = dfrocpr.loc[dfrocpr['F1'].idxmax(),'F1']
206 | plt.plot([F1max_pop,F1max_pop], [0,F1max_F1], 'r--')
207 | # set xylabel
208 | plt.gca().set(title=title+'F1',
209 | xlabel='% of population', ylabel='F1',
210 | xlim=[0,1], ylim=[0,1], aspect='equal')
211 | # pred text
212 | pred_0=dfrocpr.loc[dfrocpr['pred'].idxmin(),'pred']
213 | pred_F1max=dfrocpr.loc[dfrocpr['F1'].idxmax(),'pred']
214 | pred_1=dfrocpr.loc[dfrocpr['pred'].idxmax(),'pred']
215 | if np.mean(dfrocpr.pred) < 0 or np.mean(dfrocpr.pred) > 1:
216 | pred_0 = -pred_0
217 | pred_F1max = -pred_F1max
218 | pred_1 = -pred_1
219 | plt.text(0, 0, 'pred \n'+str(round(pred_0,4)), horizontalalignment='left',color='b')
220 | plt.text(F1max_pop, 0, 'pred \n'+str(round(pred_F1max,4)), horizontalalignment='center',color='b')
221 | plt.text(1, 0, 'pred \n'+str(round(pred_1,4)), horizontalalignment='right',color='b')
222 | # title F1
223 | plt.text(F1max_pop, F1max_F1, 'F1 max: \n'+ str(round(F1max_F1,4)), horizontalalignment='center',color='b')
224 | # plt.grid()
225 | # plt.show()
226 | # return fig
227 |
228 |
229 |
230 | def perf_eva(label, pred, title=None, groupnum=None, plot_type=["ks", "roc"], show_plot=True, positive="bad|1", seed=186):
231 |
232 | # inputs checking
233 | if len(label) != len(pred):
234 | warnings.warn('Incorrect inputs; label and pred should be list with the same length.')
235 | # if pred is score
236 | if np.mean(pred) < 0 or np.mean(pred) > 1:
237 | warnings.warn('Since the average of pred is not in [0,1], it is treated as predicted score but not probability.')
238 | pred = -pred
239 | # random sort datatable
240 | df = pd.DataFrame({'label':label, 'pred':pred}).sample(frac=1, random_state=seed)
241 | # remove NAs
242 | if any(np.unique(df.isna())):
243 | warnings.warn('The NANs in \'label\' or \'pred\' were removed.')
244 | df = df.dropna()
245 | # check label
246 | df = check_y(df, 'label', positive)
247 | # title
248 | title='' if title is None else str(title)+': '
249 |
250 | ### data ###
251 | # dfkslift ------
252 | if any([i in plot_type for i in ['ks', 'lift']]):
253 | dfkslift = eva_dfkslift(df, groupnum)
254 | if 'ks' in plot_type: df_ks = dfkslift
255 | if 'lift' in plot_type: df_lift = dfkslift
256 | # dfrocpr ------
257 | if any([i in plot_type for i in ["roc","pr",'f1']]):
258 | dfrocpr = eva_dfrocpr(df)
259 | if 'roc' in plot_type: df_roc = dfrocpr
260 | if 'pr' in plot_type: df_pr = dfrocpr
261 | if 'f1' in plot_type: df_f1 = dfrocpr
262 | ### return list ###
263 | rt = {}
264 | # plot, KS ------
265 | if 'ks' in plot_type:
266 | rt['KS'] = round(dfkslift.loc[lambda x: x.ks==max(x.ks),'ks'].iloc[0],4)
267 | # plot, ROC ------
268 | if 'roc' in plot_type:
269 | auc = pd.concat(
270 | [dfrocpr[['FPR','TPR']], pd.DataFrame({'FPR':[0,1], 'TPR':[0,1]})],
271 | ignore_index=True).sort_values(['FPR','TPR'])\
272 | .assign(
273 | TPR_lag=lambda x: x['TPR'].shift(1), FPR_lag=lambda x: x['FPR'].shift(1)
274 | ).assign(
275 | auc=lambda x: (x.TPR+x.TPR_lag)*(x.FPR-x.FPR_lag)/2
276 | )['auc'].sum()
277 | ###
278 | rt['AUC'] = round(auc, 4)
279 | rt['Gini'] = round(2*auc-1, 4)
280 |
281 | ### export plot ###
282 | if show_plot:
283 | plist = ["eva_p"+i+'(df_'+i+',title)' for i in plot_type]
284 | subplot_nrows = np.ceil(len(plist)/2)
285 | subplot_ncols = np.ceil(len(plist)/subplot_nrows)
286 |
287 | fig = plt.figure()
288 | for i in np.arange(len(plist)):
289 | plt.subplot(int(subplot_nrows),int(subplot_ncols),i+1)
290 | eval(plist[i])
291 |
292 | rt['pic'] = fig
293 |
294 | return rt
295 |
296 |
297 |
298 | def perf_psi(score, label=None, title=None, x_limits=None, x_tick_break=50, show_plot=True, seed=186, return_distr_dat=False):
299 |
300 | # inputs checking
301 | ## score
302 | if not isinstance(score, dict) or len(score) != 2:
303 | raise Exception("Incorrect inputs; score should be a dictionary with two elements.")
304 | else:
305 | if any([not isinstance(i, pd.DataFrame) for i in score.values()]):
306 | raise Exception("Incorrect inputs; score is a dictionary of two dataframes.")
307 | score_columns = [list(i.columns) for i in score.values()]
308 | if set(score_columns[0]) != set(score_columns[1]):
309 | raise Exception("Incorrect inputs; the column names of two dataframes in score should be the same.")
310 | ## label
311 | if label is not None:
312 | if not isinstance(label, dict) or len(label) != 2:
313 | raise Exception("Incorrect inputs; label should be a dictionary with two elements.")
314 | else:
315 | if set(score.keys()) != set(label.keys()):
316 | raise Exception("Incorrect inputs; the keys of score and label should be the same. ")
317 | for i in label.keys():
318 | if isinstance(label[i], pd.DataFrame):
319 | if len(label[i].columns) == 1:
320 | label[i] = label[i].iloc[:,0]
321 | else:
322 | raise Exception("Incorrect inputs; the number of columns in label should be 1.")
323 | # score dataframe column names
324 | score_names = score[list(score.keys())[0]].columns
325 | # merge label with score
326 | for i in score.keys():
327 | score[i] = score[i].copy(deep=True)
328 | if label is not None:
329 | score[i].loc[:,'y'] = label[i]
330 | else:
331 | score[i].loc[:,'y'] = np.nan
332 | # dateset of score and label
333 | dt_sl = pd.concat(score, names=['ae', 'rowid']).reset_index()\
334 | .sample(frac=1, random_state=seed)
335 | # ae refers to 'Actual & Expected'
336 |
337 | # PSI function
338 | def psi(dat):
339 | dt_bae = dat.groupby(['ae','bin']).size().reset_index(name='N')\
340 | .pivot_table(values='N', index='bin', columns='ae').fillna(0.9)\
341 | .agg(lambda x: x/sum(x))
342 | dt_bae.columns = ['A','E']
343 | psi_dt = dt_bae.assign(
344 | AE = lambda x: x.A-x.E,
345 | logAE = lambda x: np.log(x.A/x.E)
346 | ).assign(
347 | bin_PSI=lambda x: x.AE*x.logAE
348 | )['bin_PSI'].sum()
349 | return psi_dt
350 |
351 | # return psi and pic
352 | rt_psi = {}
353 | rt_pic = {}
354 | rt_dat = {}
355 | rt = {}
356 | for sn in score_names:
357 | # dataframe with columns of ae y sn
358 | dat = dt_sl[['ae', 'y', sn]]
359 | if len(dt_sl[sn].unique()) > 10:
360 | # breakpoints
361 | if x_limits is None:
362 | x_limits = dat[sn].quantile([0.02, 0.98])
363 | x_limits = round(x_limits/x_tick_break)*x_tick_break
364 | x_limits = list(x_limits)
365 |
366 | brkp = np.unique([np.floor(min(dt_sl[sn])/x_tick_break)*x_tick_break]+\
367 | list(np.arange(x_limits[0], x_limits[1], x_tick_break))+\
368 | [np.ceil(max(dt_sl[sn])/x_tick_break)*x_tick_break])
369 | # cut
370 | labels = ['[{},{})'.format(int(brkp[i]), int(brkp[i+1])) for i in range(len(brkp)-1)]
371 | dat.loc[:,'bin'] = pd.cut(dat[sn], brkp, right=False, labels=labels)
372 | else:
373 | dat.loc[:,'bin'] = dat[sn]
374 | # psi ------
375 | rt_psi[sn] = pd.DataFrame({'PSI':psi(dat)},index=np.arange(1))
376 |
377 | # distribution of scorecard probability
378 | def good(x): return sum(x==0)
379 | def bad(x): return sum(x==1)
380 | distr_prob = dat.groupby(['ae', 'bin'])\
381 | ['y'].agg([good, bad])\
382 | .assign(N=lambda x: x.good+x.bad,
383 | badprob=lambda x: x.bad/(x.good+x.bad)
384 | ).reset_index()
385 | distr_prob.loc[:,'distr'] = distr_prob.groupby('ae')['N'].transform(lambda x:x/sum(x))
386 | # pivot table
387 | distr_prob = distr_prob.pivot_table(values=['N','badprob', 'distr'], index='bin', columns='ae')
388 |
389 | # plot ------
390 | if show_plot:
391 | ###### param ######
392 | ind = np.arange(len(distr_prob.index)) # the x locations for the groups
393 | width = 0.35 # the width of the bars: can also be len(x) sequence
394 | ###### plot ######
395 | fig, ax1 = plt.subplots()
396 | ax2 = ax1.twinx()
397 | title_string = sn+'_PSI: '+str(round(psi(dat),4))
398 | title_string = title_string if title is None else str(title)+' '+title_string
399 | # ax1
400 | p1 = ax1.bar(ind, distr_prob.distr.iloc[:,0], width, color=(24/254, 192/254, 196/254), alpha=0.6)
401 | p2 = ax1.bar(ind+width, distr_prob.distr.iloc[:,1], width, color=(246/254, 115/254, 109/254), alpha=0.6)
402 | # ax2
403 | p3 = ax2.plot(ind+width/2, distr_prob.badprob.iloc[:,0], color=(24/254, 192/254, 196/254))
404 | ax2.scatter(ind+width/2, distr_prob.badprob.iloc[:,0], facecolors='w', edgecolors=(24/254, 192/254, 196/254))
405 | p4 = ax2.plot(ind+width/2, distr_prob.badprob.iloc[:,1], color=(246/254, 115/254, 109/254))
406 | ax2.scatter(ind+width/2, distr_prob.badprob.iloc[:,1], facecolors='w', edgecolors=(246/254, 115/254, 109/254))
407 | # settings
408 | ax1.set_ylabel('Score distribution')
409 | ax2.set_ylabel('Bad probability')#, color='blue')
410 | # ax2.tick_params(axis='y', colors='blue')
411 | # ax1.set_yticks(np.arange(0, np.nanmax(distr_prob['distr'].values), 0.2))
412 | # ax2.set_yticks(np.arange(0, 1+0.2, 0.2))
413 | ax1.set_ylim([0,np.ceil(np.nanmax(distr_prob['distr'].values)*10)/10])
414 | ax2.set_ylim([0,1])
415 | plt.xticks(ind+width/2, distr_prob.index)
416 | plt.title(title_string, loc='left')
417 | ax1.legend((p1[0], p2[0]), list(distr_prob.columns.levels[1]), loc='upper left')
418 | ax2.legend((p3[0], p4[0]), list(distr_prob.columns.levels[1]), loc='upper right')
419 | # show plot
420 | plt.show()
421 |
422 | # return of pic
423 | rt_pic[sn] = fig
424 |
425 | # return distr_dat ------
426 | if return_distr_dat:
427 | rt_dat[sn] = distr_prob[['N','badprob']].reset_index()
428 | # return rt
429 | rt['psi'] = pd.concat(rt_psi).reset_index().rename(columns={'level_0':'variable'})[['variable', 'PSI']]
430 | rt['pic'] = rt_pic
431 | if return_distr_dat: rt['dat'] = rt_dat
432 | return rt
433 |
--------------------------------------------------------------------------------
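`perf_eva` and `perf_psi` above follow `scorecardpy`'s interfaces; a minimal sketch of calling them on the predictions and scores produced in the README example (`train`, `oot`, `y_pred_train`, `target` and the `score` column are assumed to come from that example):

```python
from utils.perf_eva import perf_eva, perf_psi

# KS / ROC curves plus the headline metrics for the training predictions
train_perf = perf_eva(train[target], y_pred_train, title="train", plot_type=["ks", "roc"])
print(train_perf["KS"], train_perf["AUC"], train_perf["Gini"])

# PSI between the train and OOT score distributions (after card.predict filled "score")
psi_result = perf_psi(
    score={"train": train[["score"]], "oot": oot[["score"]]},
    label={"train": train[target], "oot": oot[target]},
)
print(psi_result["psi"])
```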
/utils/excel_writer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2023/2/14 16:23
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 | import re
8 | import os
9 |
10 | import matplotlib.pyplot as plt
11 | import numpy as np
12 | import pandas as pd
13 |
14 | from openpyxl.cell.cell import Cell
15 | from openpyxl.drawing.image import Image
16 | from openpyxl import load_workbook, Workbook
17 | from openpyxl.formatting.rule import DataBarRule
18 | from openpyxl.utils.dataframe import dataframe_to_rows
19 | from openpyxl.utils import get_column_letter, column_index_from_string
20 | from openpyxl.styles import NamedStyle, Border, Side, Alignment, PatternFill, Font
21 |
22 |
23 | class ExcelWriter:
24 |
25 | def __init__(self, style_excel='报告输出模版.xlsx', style_sheet_name="初始化", fontsize=10, font='楷体', theme_color='8E8BFE'):
26 | """
27 | excel 文件内容写入公共方法
28 |
29 | :param style_excel: 样式模版文件,默认当前路径下的 报告输出模版.xlsx ,如果项目路径调整需要进行相应的调整
30 | :param style_sheet_name: 模版文件内初始样式sheet名称,默认即可
31 | :param fontsize: 插入excel文件中内容的字体大小,默认 10
32 | :param font: 插入excel文件中内容的字体,默认 楷体
33 | :param theme_color: 主题色,默认 8E8BFE,注意不包含 #
34 | """
35 | # english_width,chinese_width
36 | self.english_width = 0.12
37 | self.chinese_width = 0.21
38 | self.theme_color = theme_color
39 | self.fontsize = fontsize
40 | self.font = font
41 |
42 | self.workbook = load_workbook(style_excel)
43 | self.style_sheet = self.workbook[style_sheet_name]
44 |
45 | self.name_styles = []
46 | self.init_style(font, fontsize, theme_color)
47 | for style in self.name_styles:
48 | if style.name not in self.workbook.style_names:
49 | self.workbook.add_named_style(style)
50 |
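# Illustrative usage only: the flow used by rules_auto_mining.py is to build a
# writer from the bundled template, fetch a sheet, insert a DataFrame and save;
# the sheet and output file names below are made up.
#
#   writer = ExcelWriter(style_excel="./utils/报告输出模版.xlsx", theme_color="2639E9")
#   sheet = writer.get_sheet_by_name("模型报告")
#   end_row, end_col = writer.insert_df2sheet(sheet, df, (2, 2))
#   writer.save("模型报告.xlsx")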
51 | def add_conditional_formatting(self, worksheet, start_space, end_space):
52 | """
53 | 设置条件格式
54 |
55 | :param worksheet: 当前选择设置条件格式的sheet
56 | :param start_space: 开始单元格位置
57 | :param end_space: 结束单元格位置
58 | """
59 | worksheet.conditional_formatting.add(f'{start_space}:{end_space}', DataBarRule(start_type='min', end_type='max', color=self.theme_color))
60 |
61 | @staticmethod
62 | def set_column_width(worksheet, column, width):
63 | """
64 | 调整excel列宽
65 |
66 | :param worksheet: 当前选择调整列宽的sheet
67 | :param column: 列,可以直接输入 index 或者 字母
68 | :param width: 设置列的宽度
69 | """
70 | worksheet.column_dimensions[column if isinstance(column, str) else get_column_letter(column)].width = width
71 |
72 | @staticmethod
73 | def set_number_format(worksheet, space, _format):
74 | """
75 | 设置数值显示格式
76 |
77 | :param worksheet: 当前选择调整数值显示格式的sheet
78 | :param space: 单元格范围
79 | :param _format: 显示格式,参考 openpyxl
80 | """
81 | cells = worksheet[space]
82 | if isinstance(cells, Cell):
83 | cells = [cells]
84 |
85 | for cell in cells:
86 | if isinstance(cell, tuple):
87 | for c in cell:
88 | c.number_format = _format
89 | else:
90 | cell.number_format = _format
91 |
92 | def get_sheet_by_name(self, name):
93 | """
94 | 获取sheet名称为name的工作簿,如果不存在,则从初始模版文件中拷贝一个名称为name的sheet
95 |
96 | :param name: 需要获取的工作簿名称
97 | """
98 | if name not in self.workbook.sheetnames:
99 | worksheet = self.workbook.copy_worksheet(self.style_sheet)
100 | worksheet.title = name
101 | else:
102 | worksheet = self.workbook[name]
103 |
104 | return worksheet
105 |
106 | def insert_value2sheet(self, worksheet, insert_space, value="", style="content", auto_width=False):
107 | """
108 | 向sheet中的某个单元格插入某种样式的内容
109 |
110 | :param worksheet: 需要插入内容的sheet
111 | :param insert_space: 内容插入的单元格位置,可以是 "B2" 或者 (2, 2) 任意一种形式
112 | :param value: 需要插入的内容
113 | :param style: 渲染的样式,参考 init_style 中初始设置的样式
114 | :param auto_width: 是否开启自动调整列宽
115 | :return 返回插入元素最后一列之后、最后一行之后的位置
116 | """
117 | if isinstance(insert_space, str):
118 | worksheet[insert_space] = value
119 | cell = worksheet[insert_space]
120 | start_col = re.findall('\D+', insert_space)[0]
121 | start_row = int(re.findall("\d+", insert_space)[0])
122 | else:
123 | cell = worksheet.cell(insert_space[0], insert_space[1], value)
124 | start_col = get_column_letter(insert_space[1])
125 | start_row = insert_space[0]
126 | cell.style = style
127 |
128 | if auto_width:
129 | curr_width = worksheet.column_dimensions[start_col].width
130 | auto_width = min(max([(self.check_contain_chinese(value)[1] * self.english_width + self.check_contain_chinese(value)[2] * self.chinese_width) * self.fontsize, 10, curr_width]), 50)
131 | worksheet.column_dimensions[start_col].width = auto_width
132 |
133 | return start_row + 1, column_index_from_string(start_col) + 1
134 |
135 | def insert_pic2sheet(self, worksheet, fig, insert_space, figsize=(600, 250)):
136 | """
137 | 向excel中插入图片内容
138 |
139 | :param worksheet: 需要插入内容的sheet
140 | :param fig: 需要插入的图片路径
141 | :param insert_space: 插入图片的起始单元格
142 | :param figsize: 图片大小设置
143 | :return 返回插入元素最后一列之后、最后一行之后的位置
144 | """
145 | if isinstance(insert_space, str):
146 | start_row = int(re.findall("\d+", insert_space)[0])
147 | start_col = re.findall('\D+', insert_space)[0]
148 | else:
149 | start_row, start_col = insert_space
150 | start_col = get_column_letter(start_col)
151 |
152 | image = Image(fig)
153 | image.width, image.height = figsize
154 | worksheet.add_image(image, f"{start_col}{start_row}")
155 |
156 | return start_row + int(figsize[1] / 17.5), column_index_from_string(start_col) + 8
157 |
158 | def insert_rows(self, worksheet, row, row_index, col_index, merge_rows=None, style="", auto_width=False):
159 | curr_col = column_index_from_string(col_index)
160 | for j, v in enumerate(row):
161 | if merge_rows is not None and row_index + 1 not in merge_rows:
162 | if j == 0:
163 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style="merge_left", auto_width=auto_width)
164 | elif j == len(row) - 1:
165 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style="merge_right", auto_width=auto_width)
166 | else:
167 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style="merge_middle", auto_width=auto_width)
168 | else:
169 | if j == 0:
170 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style=f"{style}_left" if style else "left", auto_width=auto_width)
171 | elif j == len(row) - 1:
172 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style=f"{style}_right" if style else "right", auto_width=auto_width)
173 | else:
174 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style=f"{style}_middle" if style else "middle", auto_width=auto_width)
175 |
176 | def insert_df2sheet(self, worksheet, data, insert_space, merge_column=None, header=True, index=False, auto_width=False):
177 | """
178 | 向excel文件中插入制定样式的dataframe数据
179 |
180 | :param worksheet: 需要插入内容的sheet
181 | :param data: 需要插入的dataframe
182 | :param insert_space: 插入内容的起始单元格位置
183 | :param merge_column: 需要分组显示的列,index或者列明
184 | :param header: 是否存储dataframe的header,暂不支持多级表头
185 | :param index: 是否存储dataframe的index
186 | :param auto_width: 是否自动调整列宽
187 | :return 返回插入元素最后一列之后、最后一行之后的位置
188 | """
189 | df = data.copy()
190 |
191 | if isinstance(insert_space, str):
192 | start_row = int(re.findall("\d+", insert_space)[0])
193 | start_col = re.findall('\D+', insert_space)[0]
194 | else:
195 | start_row, start_col = insert_space
196 | start_col = get_column_letter(start_col)
197 |
198 | if merge_column:
199 | if isinstance(merge_column, str):
200 | merge_column = [merge_column]
201 |
202 | if isinstance(merge_column[0], (int, float)):
203 | # numeric entries are assumed to be column positions; map them to names so the groupby below works
204 | merge_column = [df.columns[int(c)] for c in merge_column]
205 |
206 | merge_cols = [get_column_letter(df.columns.get_loc(col) + column_index_from_string(start_col)) for col in merge_column]
207 | df = df.sort_values(merge_column)
208 | merge_rows = list(np.cumsum(df.groupby(merge_column)[merge_column].count().values[:, 0]) + start_row + 1)
209 |
210 | for i, row in enumerate(dataframe_to_rows(df, header=header, index=index)):
211 | if i == 0:
212 | if header:
213 | self.insert_rows(worksheet, row, start_row + i, start_col, style="header", auto_width=auto_width)
214 | else:
215 | self.insert_rows(worksheet, row, start_row + i, start_col, style="first", auto_width=auto_width)
216 | elif (header and i == len(df)) or (not header and i + 1 == len(df)):
217 | self.insert_rows(worksheet, row, start_row + i, start_col, style="last", auto_width=auto_width)
218 | else:
219 | self.insert_rows(worksheet, row, start_row + i, start_col, auto_width=auto_width, merge_rows=merge_rows if merge_column else None)
220 |
221 | # if merge_column and merge_cols is not None:
222 | # merge_rows = [start_row + 2] + merge_rows
223 | # for s, e in zip(merge_rows[:-1], merge_rows[1:]):
224 | # if e - s > 1:
225 | # for merge_col in merge_cols:
226 | # worksheet.merge_cells(f"{merge_col}{s-1}:{merge_col}{e-1}")
227 |
228 | end_row = start_row + len(data) + 1 if header else start_row + len(data)
229 |
230 | return (end_row, column_index_from_string(start_col) + len(data.columns))
231 |
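    # 逐字符判断是否为中文字符,返回 (标记列表, 非中文字符数, 中文字符数)
    # 例: check_contain_chinese("abc中") ===> ([False, False, False, True], 3, 1)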
232 | @staticmethod
233 | def check_contain_chinese(check_str):
234 | out = []
235 | for ch in str(check_str).encode('utf-8').decode('utf-8'):
236 | if u'\u4e00' <= ch <= u'\u9fff':
237 | out.append(True)
238 | else:
239 | out.append(False)
240 | return out, len(out) - sum(out), sum(out)
241 |
242 | @staticmethod
243 | def astype_insertvalue(value, decimal_point=4):
244 | if re.search('tuple|list|numpy.dtype|bool|str|numpy.ndarray|Interval|Categorical', str(type(value))):
245 | value = str(value)
246 | elif re.search('int', str(type(value))):
247 | value = value
248 | elif re.search('float', str(type(value))):
249 | value = round(float(value), decimal_point)
250 | else:
251 | value = 'nan'
252 |
253 | return value
254 |
255 | @staticmethod
256 | def calc_continuous_cnt(list_, index_=0):
257 | """
258 |         Calc continuous cnt: 返回 (元素值, 起始下标, 从该下标起连续相同元素的个数)
259 |
260 |         Examples:
261 | list_ = ['A','A','A','A','B','C','C','D','D','D']
262 | (1) calc_continuous_cnt(list_, 0) ===>('A', 0, 4)
263 | (2) calc_continuous_cnt(list_, 4) ===>('B', 4, 1)
264 | (3) calc_continuous_cnt(list_, 6) ===>('C', 6, 1)
265 | """
266 | if index_ >= len(list_):
267 | return None, None, None
268 |
269 | else:
270 | cnt, str_ = 0, list_[index_]
271 | for i in range(index_, len(list_), 1):
272 | if list_[i] == str_:
273 | cnt = cnt + 1
274 | else:
275 | break
276 | return str_, index_, cnt
277 |
278 | @staticmethod
279 | def itlubber_border(border, color):
280 | if len(border) == 3:
281 | return Border(left=Side(border_style=border[0], color=color[0]), right=Side(border_style=border[1], color=color[1]), bottom=Side(border_style=border[2], color=color[2]),)
282 | else:
283 | return Border(left=Side(border_style=border[0], color=color[0]), right=Side(border_style=border[1], color=color[1]), bottom=Side(border_style=border[2], color=color[2]), top=Side(border_style=border[3], color=color[3]),)
284 |
285 | @staticmethod
286 | def get_cell_space(space):
287 | if isinstance(space, str):
288 | start_row = int(re.findall("\d+", space)[0])
289 | start_col = re.findall('\D+', space)[0]
290 | return start_row, column_index_from_string(start_col)
291 |         else:
292 |             start_row = space[0]
293 |             if isinstance(space[1], int):
294 |                 start_col = space[1]
295 |             else:
296 |                 start_col = column_index_from_string(space[1])
297 |             return start_row, start_col
298 |
299 | def init_style(self, font, fontsize, theme_color):
300 | header_style, header_left_style, header_middle_style, header_right_style = NamedStyle(name="header"), NamedStyle(name="header_left"), NamedStyle(name="header_middle"), NamedStyle(name="header_right")
301 | last_style, last_left_style, last_middle_style, last_right_style = NamedStyle(name="last"), NamedStyle(name="last_left"), NamedStyle(name="last_middle"), NamedStyle(name="last_right")
302 | content_style, left_style, middle_style, right_style = NamedStyle(name="content"), NamedStyle(name="left"), NamedStyle(name="middle"), NamedStyle(name="right")
303 | merge_style, merge_left_style, merge_middle_style, merge_right_style = NamedStyle(name="merge"), NamedStyle(name="merge_left"), NamedStyle(name="merge_middle"), NamedStyle(name="merge_right")
304 | first_style, first_left_style, first_middle_style, first_right_style = NamedStyle(name="first"), NamedStyle(name="first_left"), NamedStyle(name="first_middle"), NamedStyle(name="first_right")
305 |
306 | header_font = Font(size=fontsize, name=font, color="FFFFFF", bold=True)
307 | header_fill = PatternFill(fill_type="solid", start_color=theme_color)
308 | alignment = Alignment(horizontal='center', vertical='center', wrap_text=False)
309 | content_fill = PatternFill(fill_type="solid", start_color="FFFFFF")
310 | content_font = Font(size=fontsize, name=font, color="000000")
311 |
312 | header_style.font, header_left_style.font, header_middle_style.font, header_right_style.font = header_font, header_font, header_font, header_font
313 | header_style.fill, header_left_style.fill, header_middle_style.fill, header_right_style.fill = header_fill, header_fill, header_fill, header_fill
314 | header_style.alignment, header_left_style.alignment, header_middle_style.alignment, header_right_style.alignment = Alignment(horizontal='left', vertical='center', wrap_text=True), alignment, alignment, alignment
315 |
316 | header_style.border = self.itlubber_border(["medium", "medium", "medium", "medium"], [theme_color, theme_color, theme_color, theme_color])
317 | header_left_style.border = self.itlubber_border(["medium", "thin", "medium", "medium"], [theme_color, "FFFFFF", theme_color, theme_color])
318 | header_middle_style.border = self.itlubber_border(["thin", "thin", "medium", "medium"], ["FFFFFF", "FFFFFF", theme_color, theme_color])
319 | header_right_style.border = self.itlubber_border(["thin", "medium", "medium", "medium"], ["FFFFFF", theme_color, theme_color, theme_color])
320 |
321 | last_style.font, last_left_style.font, last_middle_style.font, last_right_style.font = content_font, content_font, content_font, content_font
322 | last_style.fill, last_left_style.fill, last_middle_style.fill, last_right_style.fill = content_fill, content_fill, content_fill, content_fill
323 | last_style.alignment, last_left_style.alignment, last_middle_style.alignment, last_right_style.alignment = alignment, alignment, alignment, alignment
324 |
325 | last_style.border = self.itlubber_border(["medium", "medium", "medium"], [theme_color, theme_color, theme_color])
326 | last_left_style.border = self.itlubber_border(["medium", "thin", "medium"], [theme_color, "FFFFFF", theme_color])
327 | last_middle_style.border = self.itlubber_border(["thin", "thin", "medium"], ["FFFFFF", "FFFFFF", theme_color])
328 | last_right_style.border = self.itlubber_border(["thin", "medium", "medium"], ["FFFFFF", theme_color, theme_color])
329 |
330 | content_style.font, left_style.font, middle_style.font, right_style.font = content_font, content_font, content_font, content_font
331 | content_style.fill, left_style.fill, middle_style.fill, right_style.fill = content_fill, content_fill, content_fill, content_fill
332 | content_style.alignment, left_style.alignment, middle_style.alignment, right_style.alignment = alignment, alignment, alignment, alignment
333 |
334 | content_style.border = self.itlubber_border(["medium", "medium", "thin"], [theme_color, theme_color, theme_color])
335 | left_style.border = self.itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", theme_color])
336 | middle_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", "FFFFFF", theme_color])
337 | right_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, theme_color])
338 |
339 | merge_style.font, merge_left_style.font, merge_middle_style.font, merge_right_style.font = content_font, content_font, content_font, content_font
340 | merge_style.fill, merge_left_style.fill, merge_middle_style.fill, merge_right_style.fill = content_fill, content_fill, content_fill, content_fill
341 | merge_style.alignment, merge_left_style.alignment, merge_middle_style.alignment, merge_right_style.alignment = alignment, alignment, alignment, alignment
342 |
343 | merge_style.border = self.itlubber_border(["medium", "medium", "thin"], ["FFFFFF", "FFFFFF", "FFFFFF"])
344 | merge_left_style.border = self.itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", "FFFFFF"])
345 | merge_middle_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", "FFFFFF", "FFFFFF"])
346 | merge_right_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, "FFFFFF"])
347 |
348 | first_style.font, first_left_style.font, first_middle_style.font, first_right_style.font = content_font, content_font, content_font, content_font
349 | first_style.fill, first_left_style.fill, first_middle_style.fill, first_right_style.fill = content_fill, content_fill, content_fill, content_fill
350 | first_style.alignment, first_left_style.alignment, first_middle_style.alignment, first_right_style.alignment = alignment, alignment, alignment, alignment
351 |
352 | first_style.border = self.itlubber_border(["medium", "medium", "thin", "medium"], [theme_color, theme_color, theme_color, theme_color])
353 | first_left_style.border = self.itlubber_border(["medium", "thin", "thin", "medium"], [theme_color, "FFFFFF", theme_color, theme_color])
354 | first_middle_style.border = self.itlubber_border(["thin", "thin", "thin", "medium"], ["FFFFFF", "FFFFFF", theme_color, theme_color])
355 | first_right_style.border = self.itlubber_border(["thin", "medium", "thin", "medium"], ["FFFFFF", theme_color, theme_color, theme_color])
356 |
357 | self.name_styles.extend([
358 | header_style, header_left_style, header_middle_style, header_right_style,
359 | last_style, last_left_style, last_middle_style, last_right_style,
360 | content_style, left_style, middle_style, right_style,
361 | merge_style, merge_left_style, merge_middle_style, merge_right_style,
362 | first_style, first_left_style, first_middle_style, first_right_style
363 | ])
364 |
365 | def save(self, filename):
366 | """
367 | 保存excel文件
368 |
369 | :param filename: 需要保存 excel 文件的路径
370 | """
371 | self.workbook.remove(self.style_sheet)
372 | self.workbook.save(filename)
373 | self.workbook.close()
374 |
375 |
376 | if __name__ == '__main__':
377 | writer = ExcelWriter(style_excel="/Users/lubberit/Desktop/金融/兴业银行贷中行为评分/utils/报告输出模版.xlsx")
378 | worksheet = writer.get_sheet_by_name("模型报告")
379 | writer.insert_value2sheet(worksheet, "B2", value="模型报告", style="header")
380 | writer.insert_value2sheet(worksheet, "B3", value="当前模型主要为评分卡模型", style="content", auto_width=True)
381 | end_row = writer.insert_pic2sheet(worksheet, "/Users/lubberit/Desktop/金融/兴业银行贷中行为评分/tests/mypic.png", "B5")
382 | end_row = writer.insert_pic2sheet(worksheet, "/Users/lubberit/Desktop/金融/兴业银行贷中行为评分/tests/mypic.png", "H5")
383 | sample = pd.DataFrame(np.concatenate([np.random.random_sample((10, 10)) * 40, np.random.randint(0, 3, (10, 2))], axis=1), columns=[f"B{i}" for i in range(10)] + ["target", "type"])
384 | end_row, end_col = writer.insert_df2sheet(worksheet, sample, (end_row + 2, column_index_from_string("B")))
385 | end_row, end_col = writer.insert_df2sheet(worksheet, sample, (end_row + 2, column_index_from_string("B")), merge_column="target")
386 | end_row, end_col = writer.insert_df2sheet(worksheet, sample, (end_row + 2, column_index_from_string("B")), merge_column=["target", "type"])
387 | writer.save("test.xlsx")
388 |
--------------------------------------------------------------------------------
/utils/tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2022/8/23 13:12
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 |
8 | import os
9 | import six
10 | import toad
11 | import joblib
12 | import warnings
13 | import numpy as np
14 | import pandas as pd
15 | from tqdm import tqdm
16 | import scorecardpy as sc
17 | from datetime import datetime
18 | import matplotlib.pyplot as plt
19 | from optbinning import OptimalBinning
20 | from sklearn.metrics import make_scorer
21 | from sklearn.model_selection import train_test_split
22 |
23 | from openpyxl import load_workbook, Workbook
24 | from openpyxl.formatting.rule import DataBarRule
25 | from openpyxl.styles import Border, Side, Alignment, PatternFill, Font
26 |
27 |
28 | def init_setting(font_path="./utils/matplot_chinese.ttf"):
29 | import warnings
30 | import matplotlib
31 | from matplotlib import font_manager
32 | warnings.filterwarnings("ignore")
33 | pd.options.display.float_format = '{:.4f}'.format
34 | pd.set_option('display.max_colwidth', 300)
35 | plt.style.use('seaborn-ticks')
36 | matplotlib.font_manager.fontManager.addfont(font_path)
37 | matplotlib.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name()
38 | matplotlib.rcParams['axes.unicode_minus'] = False
39 |
40 |
41 | # warnings.filterwarnings("ignore")
42 | # pd.set_option('display.width', 5000)
43 | # plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体
44 | # plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题
45 |
46 |
47 | try:
48 | feature_describe = pd.read_excel("变量字典及字段解释.xlsx", sheet_name="数据字段表", header=0, engine="openpyxl", usecols=[0, 1])
49 | feature_describe = feature_describe.drop_duplicates(subset=["变量名称"], keep="last")
50 | feature_dict = dict(zip(feature_describe["变量名称"], feature_describe["含义"]))
51 | except:
52 | feature_dict = {}
53 |
54 |
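# 基于 toad.KS 计算模型 KS 值,并通过 make_scorer 封装为 sklearn 可用的评分器(needs_proba=True 时传入预测概率)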
55 | def ks_score(y, y_pred):
56 | return toad.KS(y_pred[:, 1], y)
57 |
58 |
59 | ks_score = make_scorer(ks_score, needs_proba=True)
60 |
61 |
62 | def round_float(num):
63 |     if not pd.isnull(num) and isinstance(num, float):
64 | return float(str(num).split(".")[0] + "." + str(num).split(".")[1][:4])
65 | else:
66 | return num
67 |
68 |
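# 将分箱割点转换为可读标签,返回 {分箱编号: 分箱标签} 字典,缺失值单独编号
# 用法示意: feature_bins(np.array([5.0, 10.0, np.nan]))
# ===> {0: '[负无穷 , 5.0)', 1: '[5.0 , 10.0)', 2: '[10.0 , 正无穷)', 3: '缺失值'}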
69 | def feature_bins(bins):
70 | if isinstance(bins, list): bins = np.array(bins)
71 | EMPTYBINS = len(bins) if not isinstance(bins[0], (set, list, np.ndarray)) else -1
72 |
73 | l = []
74 | if np.issubdtype(bins.dtype, np.number):
75 | has_empty = len(bins) > 0 and np.isnan(bins[-1])
76 | if has_empty: bins = bins[:-1]
77 | sp_l = ["负无穷"] + [round_float(b) for b in bins.tolist()] + ["正无穷"]
78 | for i in range(len(sp_l) - 1): l.append('['+str(sp_l[i])+' , '+str(sp_l[i+1])+')')
79 | if has_empty: l.append('缺失值')
80 | else:
81 | for keys in bins:
82 | keys_update = set()
83 | for key in keys:
84 | if pd.isnull(key) or key == "nan":
85 | keys_update.add("缺失值")
86 | elif key.strip() == "":
87 | keys_update.add("空字符串")
88 | else:
89 | keys_update.add(key)
90 | label = ','.join(keys_update)
91 | l.append(label)
92 |
93 | return {i if b != "缺失值" else EMPTYBINS: b for i, b in enumerate(l)}
94 |
95 |
96 | def feature_bin_stats(data, feature, combiner=None, target="target", rules={}, empty_separate=True, method='cart', min_samples=0.15, max_n_bins=3, gamma=0.01, monotonic_trend="auto_asc_desc", feature_dict={}):
97 | # if combiner is None:
98 | # combiner = toad.transform.Combiner()
99 | # combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, min_samples=min_samples)
100 | if feature not in rules:
101 | if data[feature].nunique(dropna=True) < 3:
102 | splits = []
103 | for v in data[feature].unique():
104 | if not pd.isnull(v):
105 | splits.append(v)
106 |
107 | if str(data[feature].dtypes) in ["object", "string", "category"]:
108 | rule = {feature: [[s] for s in splits]}
109 | rule[feature].append([[np.nan]])
110 | else:
111 | rule = {feature: sorted(splits) + [np.nan]}
112 | else:
113 | try:
114 | y = data[target]
115 | if str(data[feature].dtypes) in ["object", "string", "category"]:
116 | dtype = "categorical"
117 | x = data[feature].astype("category").values
118 | else:
119 | dtype = "numerical"
120 | x = data[feature].values
121 | _combiner = OptimalBinning(feature, dtype=dtype, max_n_bins=max_n_bins, monotonic_trend=monotonic_trend, gamma=gamma).fit(x, y)
122 | if _combiner.status == "OPTIMAL":
123 | rule = {feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.splits] + [[np.nan] if dtype == "categorical" else np.nan]}
124 | else:
125 |                 raise Exception("OptimalBinning error")
126 | except Exception as e:
127 | if method not in ["dt", "chi", ]:
128 | method = "chi"
129 | _combiner = toad.transform.Combiner()
130 | _combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, min_samples=min_samples)
131 | rule = _combiner.export()
132 |
133 | if combiner is None:
134 | combiner = toad.transform.Combiner()
135 |
136 | combiner.update(rule)
137 |
138 | if rules and isinstance(rules, list): rules = {feature: rules}
139 | if rules and isinstance(rules, dict): combiner.update(rules)
140 |
141 | # feature_bin = combiner.export()[feature]
142 | # feature_bin_dict = format_bins(np.array(feature_bin))
143 |
144 | df_bin = combiner.transform(data[[feature, target]], labels=False)
145 |
146 | table = df_bin[[feature, target]].groupby([feature, target]).agg(len).unstack()
147 | table.columns.name = None
148 | table = table.rename(columns = {0 : '好样本数', 1 : '坏样本数'}).fillna(0)
149 | table["指标名称"] = feature
150 | table["指标含义"] = feature_dict.get(feature, "")
151 | table = table.reset_index().rename(columns={feature: "分箱"})
152 | # table["分箱"] = table["分箱"].map(feature_bin_dict)
153 |
154 | table['样本总数'] = table['好样本数'] + table['坏样本数']
155 | table['样本占比'] = table['样本总数'] / table['样本总数'].sum()
156 | table['好样本占比'] = table['好样本数'] / table['好样本数'].sum()
157 | table['坏样本占比'] = table['坏样本数'] / table['坏样本数'].sum()
158 | table['坏样本率'] = table['坏样本数'] / table['样本总数']
159 |
160 | table = table.fillna(0.)
161 |
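    # 分档WOE = ln(坏样本占比 / 好样本占比),分档IV = (坏样本占比 - 好样本占比) * 分档WOE,指标IV = 各分档IV之和(分母加 1e-6 防止除零)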
162 | table['分档WOE值'] = table.apply(lambda x : np.log(x['坏样本占比'] / (x['好样本占比'] + 1e-6)),axis=1)
163 | table['分档IV值'] = table.apply(lambda x : (x['坏样本占比'] - x['好样本占比']) * np.log(x['坏样本占比'] / (x['好样本占比'] + 1e-6)), axis=1)
164 | table['指标IV值'] = table['分档IV值'].sum()
165 |
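    # LIFT = 分箱坏样本率 / 整体坏样本率,大于 1 表示该分箱坏样本相对更集中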
166 | table["LIFT值"] = table['坏样本率'] / (table["坏样本数"].sum() / table["样本总数"].sum())
167 | table["累积LIFT值"] = table["LIFT值"].cumsum()
168 |
169 | return table[['指标名称', "指标含义", '分箱', '样本总数', '样本占比', '好样本数', '好样本占比', '坏样本数', '坏样本占比', '坏样本率', '分档WOE值', '分档IV值', '指标IV值', 'LIFT值', '累积LIFT值']]
170 |
171 |
172 | def plot_bin(binx, title="", show_iv=True, show_na=True, colors=["#2639E9", "#a29bfe", "#ff7675"], figsize=(10, 8)):
173 | if not show_na:
174 | binx = binx[binx["分箱"] != "缺失值"].reset_index(drop=True)
175 | # y_right_max
176 | y_right_max = np.ceil(binx['坏样本率'].max()*10)
177 | if y_right_max % 2 == 1: y_right_max=y_right_max+1
178 | if y_right_max - binx['坏样本率'].max()*10 <= 0.3: y_right_max = y_right_max+2
179 | y_right_max = y_right_max/10
180 | if y_right_max>1 or y_right_max<=0 or y_right_max is np.nan or y_right_max is None: y_right_max=1
181 | ## y_left_max
182 | y_left_max = np.ceil(binx['样本占比'].max()*10)/10
183 | if y_left_max>1 or y_left_max<=0 or y_left_max is np.nan or y_left_max is None: y_left_max=1
184 | # title
185 | title_string = binx.loc[0,'指标名称']+" (iv:"+str(round(binx['分档IV值'].sum(),4))+")" if show_iv else binx.loc[0,'指标名称']
186 | title_string = title + '-' + title_string if title else title_string
187 | # param
188 | ind = np.arange(len(binx.index)) # the x locations for the groups
189 | width = 0.35 # the width of the bars: can also be len(x) sequence
190 | ###### plot ######
191 | fig, ax1 = plt.subplots(figsize=figsize)
192 | ax2 = ax1.twinx()
193 | # ax1
194 | p1 = ax1.bar(ind, binx['好样本占比'], width, color=colors[1])
195 | p2 = ax1.bar(ind, binx['坏样本占比'], width, bottom=binx['好样本占比'], color=colors[2])
196 | for i in ind:
197 | ax1.text(i, binx.loc[i,'样本占比']*1.02, str(round(binx.loc[i,'样本占比']*100,1))+'%, '+str(binx.loc[i,'样本总数']), ha='center')
198 | # ax2
199 | ax2.plot(ind, binx['坏样本率'], marker='o', color=colors[0])
200 | for i in ind:
201 | ax2.text(i, binx.loc[i,'坏样本率']*1.02, str(round(binx.loc[i,'坏样本率']*100,1))+'%', color=colors[0], ha='center')
202 | # settings
203 | ax1.set_ylabel('样本分布情况')
204 | ax2.set_ylabel('坏样本率', color=colors[0])
205 | ax1.set_yticks(np.arange(0, y_left_max+0.2, 0.2))
206 | ax2.set_yticks(np.arange(0, y_right_max+0.2, 0.2))
207 | ax2.tick_params(axis='y', colors=colors[0])
208 | plt.xticks(ind, binx['分箱'], fontsize=12)
209 | plt.title(title_string, loc='center')
210 |     plt.legend((p2[0], p1[0]), ('坏样本', '好样本'), loc='upper right')
211 |
212 |
213 | # def bin_plot(feature_table, feature="", desc="", figsize=(8, 6), colors=['#8E8BFE', '#FEA3A2', '#9394E7'], max_len=35, save=None):
214 | # feature_table = feature_table.copy()
215 | #
216 | # feature_table["分箱"] = feature_table["分箱"].apply(lambda x: x if re.match("^\[.*\)$", x) else str(x)[:max_len] + "..")
217 | #
218 | # # 绘制好坏样本分布情况
219 | # fig, ax1 = plt.subplots(figsize=figsize)
220 | # ax1.barh(feature_table['分箱'], feature_table['好样本数'], color=colors[0], label='好样本')
221 | # ax1.barh(feature_table['分箱'], feature_table['坏样本数'], left=feature_table['好样本数'], color=colors[1], label='坏样本')
222 | # ax1.set_xlabel('样本数')
223 | #
224 | # # 绘制坏样本率的分布情况
225 | # ax2 = ax1.twiny()
226 | # ax2.plot(feature_table['坏样本率'], feature_table['分箱'], colors[2], label='坏样本率', linestyle='-.')
227 | # ax2.set_xlabel('坏样本率: 坏样本数 / 样本总数')
228 | #
229 | # for i, rate in enumerate(feature_table['坏样本率']):
230 | # ax2.scatter(rate, i, color=colors[2], s=3)
231 | #
232 | # # 在图像对应位置显示样本总数和坏样本率
233 | # for i, v in feature_table[['样本总数', '好样本数', '坏样本数', '坏样本率', '样本占比']].iterrows():
234 | # ax1.text(v['样本总数'] / 2, i + len(feature_table) / 60, f"{int(v['好样本数'])}:{int(v['坏样本数'])}:{v['样本占比']:.1%}:{v['坏样本率']:.1%}")
235 | #
236 | # # 逆转y轴顺序
237 | # ax1.invert_yaxis()
238 | #
239 | # desc = desc if desc else feature
240 | #
241 | # # 添加一个标题
242 | # fig.suptitle(f'变量 {desc} 分箱图\n\n')
243 | #
244 | # # 合并图例
245 | # handles1, labels1 = ax1.get_legend_handles_labels()
246 | # handles2, labels2 = ax2.get_legend_handles_labels()
247 | # fig.legend(handles1 + handles2, labels1 + labels2, loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.95), frameon=False)
248 | #
249 | # # 调整布局,使分箱信息能够完全显示
250 | # plt.tight_layout()
251 | #
252 | # if save:
253 | # if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
254 | # os.makedirs(os.path.dirname(save))
255 | #
256 | # fig.savefig(save, dpi=240, format="png", bbox_inches="tight")
257 |
258 |
259 | def cal_psi(train, test, feature, combiner=None):
260 | # feature_bin = combiner.export()[feature]
261 | # feature_bin_dict = format_bins(np.array(feature_bin))
262 | try:
263 | A = (combiner.transform(train[[feature]]).value_counts() / len(train[[feature]])).reset_index().rename(columns={feature: "分箱", 0: "A"})
264 | E = (combiner.transform(test[[feature]]).value_counts() / len(test[[feature]])).reset_index().rename(columns={feature: "分箱", 0: "E"})
265 | except:
266 | A = (combiner.transform(train[[feature]])[feature].value_counts() / len(train)).reset_index().rename(columns={"index": "分箱", feature: "A"})
267 | E = (combiner.transform(test[[feature]])[feature].value_counts() / len(test)).reset_index().rename(columns={"index": "分箱", feature: "E"})
268 | df_psi = A.merge(E, on="分箱", how="outer").fillna(0.)
269 | # df_psi["分箱"] = df_psi["分箱"].map(feature_bin_dict)
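    # PSI = Σ (A - E) * ln(A / E),其中 A 为 train 各分箱样本占比、E 为 test 各分箱样本占比(分母加 1e-6 防止除零)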
270 | df_psi["分档PSI"] = (df_psi["A"] - df_psi["E"]) * np.log(df_psi["A"] / (df_psi["E"] + 1e-6))
271 | df_psi["指标PSI"] = df_psi["分档PSI"].replace(np.inf, 0).sum()
272 |
273 | return df_psi[["分箱", "分档PSI", "指标PSI"]]
274 |
275 |
276 | def itlubber_border(border, color):
277 | if len(border) == 3:
278 | return Border(
279 | left=Side(border_style=border[0], color=color[0]),
280 | right=Side(border_style=border[1], color=color[1]),
281 | bottom=Side(border_style=border[2], color=color[2]),
282 | )
283 | else:
284 | return Border(
285 | left=Side(border_style=border[0], color=color[0]),
286 | right=Side(border_style=border[1], color=color[1]),
287 | bottom=Side(border_style=border[2], color=color[2]),
288 | top=Side(border_style=border[3], color=color[3]),
289 | )
290 |
291 |
292 | def render_excel(excel_name, sheet_name=None, conditional_columns=[], freeze=None, merge_rows=[], percent_columns=[], theme_color="2639E9", conditional_color="9980FA", font="楷体", fontsize=10, max_column_width=50, header=True, start_row=0, n_jobs=4, bar=True, border=True):
293 | workbook = load_workbook(excel_name)
294 |
295 | if sheet_name and isinstance(sheet_name, str):
296 | sheet_names = [sheet_name]
297 | else:
298 | sheet_names = workbook.get_sheet_names()
299 |
300 | merge_rows = [i + start_row if header else i + start_row - 1 for i in merge_rows]
301 |
302 | for sheet_name in sheet_names:
303 | worksheet = workbook.get_sheet_by_name(sheet_name)
304 |
305 | def add_conditional_formatting(column, theme_color="FDA7DF"):
306 | worksheet.conditional_formatting.add(f'{column}2:{column}{worksheet.max_row}', DataBarRule(start_type='min', end_type='max', color=theme_color))
307 |
308 | for conditional_column in conditional_columns:
309 | add_conditional_formatting(f"{conditional_column}", theme_color=conditional_color)
310 |
311 | def render_cell(row_index, row):
312 | if row_index > start_row:
313 | if header and row_index == start_row + 1:
314 | for col_index, cell in enumerate(row, start=1):
315 | cell.font = Font(size=fontsize, name=font, color="FFFFFF", bold=True)
316 | cell.fill = PatternFill(fill_type="solid", start_color=theme_color)
317 | cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=False)
318 |
319 | if col_index == 1:
320 | cell.border = itlubber_border(["medium", "thin", "medium", "medium"], [theme_color, "FFFFFF", theme_color, theme_color])
321 | elif col_index == len(row):
322 | cell.border = itlubber_border(["thin", "medium", "medium", "medium"], ["FFFFFF", theme_color, theme_color, theme_color])
323 | else:
324 | cell.border = itlubber_border(["thin", "thin", "medium", "medium"], ["FFFFFF", "FFFFFF", theme_color, theme_color])
325 | else:
326 | for col_index, cell in enumerate(row, start=1):
327 | cell.font = Font(size=fontsize, name=font, color="000000")
328 | cell.fill = PatternFill(fill_type="solid", start_color="FFFFFF")
329 | cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=False)
330 |
331 | if col_index in percent_columns:
332 | # cell.alignment = Alignment(horizontal='right', vertical='center', wrap_text=False)
333 | cell.number_format = "0.00%"
334 | else:
335 | pass
336 | # cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=False)
337 |
338 | if row_index == worksheet.max_row:
339 | if col_index == 1:
340 | cell.border = itlubber_border(["medium", "thin", "medium"], [theme_color, "FFFFFF", theme_color])
341 | elif col_index == len(row):
342 | cell.border = itlubber_border(["thin", "medium", "medium"], ["FFFFFF", theme_color, theme_color])
343 | else:
344 | cell.border = itlubber_border(["thin", "thin", "medium"], ["FFFFFF", "FFFFFF", theme_color])
345 | else:
346 | if merge_rows in [[], None] or (row_index - 1 in merge_rows):
347 | if col_index == 1:
348 | cell.border = itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", theme_color])
349 | elif col_index == len(row):
350 | cell.border = itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, theme_color])
351 | else:
352 | cell.border = itlubber_border(["thin", "thin", "thin"], ["FFFFFF", "FFFFFF", theme_color])
353 | else:
354 | if col_index == 1:
355 | cell.border = itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", "FFFFFF"])
356 | elif col_index == len(row):
357 | cell.border = itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, "FFFFFF"])
358 | else:
359 | cell.border = itlubber_border(["thin", "thin", "thin"], ["FFFFFF", "FFFFFF", "FFFFFF"])
360 |
361 | if border:
362 | iterrows = tqdm(enumerate(worksheet.rows, start=1), total=worksheet.max_row - 1) if bar else enumerate(worksheet.rows, start=1)
363 | if n_jobs > 0:
364 |             joblib.Parallel(n_jobs=n_jobs, prefer="threads")(joblib.delayed(render_cell)(row_index, row) for row_index, row in iterrows)  # 使用线程,保证单元格样式修改在当前工作簿上生效
365 | else:
366 | for row_index, row in iterrows:
367 | render_cell(row_index, row)
368 |
369 | feature_table = pd.read_excel(
370 | excel_name, sheet_name=sheet_name, engine="openpyxl"
371 | )
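        # 列宽估算: 中文等全角字符按 2 个半角字符宽度计,(utf-8 字节数 - 字符数) / 2 + 字符数 即近似显示宽度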
372 | feature_table_len_max = feature_table.apply(lambda x: [(len(str(i).encode('utf-8')) - len(str(i))) / 2 + len(str(i)) for i in x]).max()
373 | for i in feature_table.columns:
374 | # 列的字母
375 | j = list(feature_table.columns)
376 | column_letter = [chr(j.index(i) + 65) if j.index(i) <= 25 else 'A' + chr(j.index(i) - 26 + 65) ][0]
377 | # 列的宽度
378 | columns_length = (len(str(i).encode('utf-8')) - len(str(i)))/2 + len(str(i))
379 | data_max_length = feature_table_len_max[i]
380 | column_width = [data_max_length if columns_length < data_max_length else columns_length][0]
381 | column_width = [column_width if column_width <= max_column_width else max_column_width][0] + 3
382 | # 更改列的宽度
383 | worksheet.column_dimensions['{}'.format(column_letter)].width = column_width
384 |
385 | if freeze:
386 | worksheet.freeze_panes = freeze
387 |
388 | workbook.save(excel_name)
389 | workbook.close()
390 |
391 |
392 | def run_feature_table(feature, train=None, feature_dict=None, rules={}, combiner=None, target="target", return_feature=False):
393 | table = feature_bin_stats(train, feature, feature_dict=feature_dict, rules=rules, combiner=combiner)
394 | df_psi = cal_psi(train[[feature, target]], test[[feature, target]], feature, combiner=combiner)
395 |
396 | table = table.merge(df_psi, on="分箱", how="left")
397 |
398 | feature_bin = combiner.export()[feature]
399 |     feature_bin_dict = feature_bins(np.array(feature_bin))
400 | table["分箱"] = table["分箱"].map(feature_bin_dict)
401 |
402 | if return_feature:
403 | return feature, table
404 | else:
405 | return table
406 |
407 |
408 | def render_dataframe(df, row_height=0.4, font_size=14,
409 | header_color='#2639E9', row_colors=['#dae3f3', 'w'], edge_color='w',
410 | bbox=[0, 0, 1, 1], header_columns=0,
411 | ax=None, save=None, **kwargs):
412 | data = df.copy()
413 | for col in data.select_dtypes('datetime'):
414 | data[col] = data[col].dt.strftime("%Y-%m-%d")
415 |
416 | for col in data.select_dtypes('float'):
417 | data[col] = data[col].apply(lambda x: np.nan if pd.isnull(x) else round(x, 4))
418 |
419 | cols_width = [max(data[col].apply(lambda x:len(str(x).encode())).max(), len(str(col).encode())) / 8. for col in data.columns]
420 |
421 | if ax is None:
422 | size = (sum(cols_width), (len(data) + 1) * row_height)
423 | fig, ax = plt.subplots(figsize=size)
424 | ax.axis('off')
425 |
426 | mpl_table = ax.table(cellText=data.values, colWidths=cols_width, bbox=bbox, colLabels=data.columns, **kwargs)
427 |
428 | mpl_table.auto_set_font_size(False)
429 | mpl_table.set_fontsize(font_size)
430 |
431 | for k, cell in six.iteritems(mpl_table._cells):
432 | cell.set_edgecolor(edge_color)
433 | if k[0] == 0 or k[1] < header_columns:
434 | cell.set_text_props(weight='bold', color='w')
435 | cell.set_facecolor(header_color)
436 | else:
437 | cell.set_facecolor(row_colors[k[0]%len(row_colors)])
438 |
439 | if save:
440 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
441 | os.makedirs(os.path.dirname(save))
442 |
443 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight")
444 |
445 | return fig
446 |
447 |
448 | if __name__ == '__main__':
449 | from functools import partial
450 | from multiprocessing import Pool
451 | data = sc.germancredit()
452 |
453 | # 测试数据
454 | data["target"] = data["creditability"].replace({'good':0,'bad':1})
455 | data["credit.amount"].loc[0] = np.nan
456 | data["status.of.existing.checking.account"].loc[0] = np.nan
457 | data["test_a"] = 0.
458 | data["test_a"].loc[0] = np.nan
459 | data["test_b"] = ""
460 | data["test_b"].loc[0] = np.nan
461 | data["test_c"] = np.nan
462 |
463 | # data = data.replace("", np.nan)
464 |
465 | train, test = train_test_split(data, test_size=0.3,)
466 |
467 | target = "target"
468 | cols = ["test_a", "test_b", "test_c", "status.of.existing.checking.account", "credit.amount"]
469 |
470 | combiner = toad.transform.Combiner()
471 | # combiner.fit(data[cols + [target]], target, empty_separate=True, method="chi", min_samples=0.2)
472 |
473 | # 保存结果至 EXCEL 文件
474 | output_excel_name = f"指标有效性验证-{datetime.now().strftime('%Y-%m-%d')}.xlsx"
475 | output_sheet_name = "指标有效性"
476 | tables = {}
477 | merge_row_number = []
478 |
479 | # _run_feature_table = partial(run_feature_table, train=train, feature_dict=feature_dict, rules={}, combiner=combiner, target=target, return_feature=True)
480 | # all_feature_tables = joblib.Parallel(n_jobs=4)(joblib.delayed(_run_feature_table)(feature) for feature in cols)
481 |
482 | # for feature, table in all_feature_tables:
483 | # merge_row_number.append(len(table))
484 | # tables[feature] = table
485 |
486 | for feature in cols:
487 | table = feature_bin_stats(train, feature, feature_dict=feature_dict, rules={}, combiner=combiner)
488 | print(train.shape)
489 | df_psi = cal_psi(train[[feature, target]], test[[feature, target]], feature, combiner=combiner)
490 |
491 | table = table.merge(df_psi, on="分箱", how="left")
492 |
493 | feature_bin = combiner.export()[feature]
494 |         feature_bin_dict = feature_bins(np.array(feature_bin))
495 | table["分箱"] = table["分箱"].map(feature_bin_dict)
496 |
497 |         # table = run_feature_table(feature)  # 等价的封装调用,需显式传入 train、combiner 等参数后再启用
498 | # plot_bin(table, show_na=True)
499 | merge_row_number.append(len(table))
500 | tables[feature] = table
501 |
502 | merge_row_number = np.cumsum(merge_row_number).tolist()
503 | feature_table = pd.concat(tables, ignore_index=True).round(6)
504 | feature_table["分档WOE值"] = feature_table["分档WOE值"].fillna(np.inf)
505 |
506 | workbook = load_workbook(output_excel_name) if os.path.exists(output_excel_name) else None
507 | writer = pd.ExcelWriter(output_excel_name, engine="openpyxl")
508 |
509 | if workbook:
510 | writer.book = workbook
511 | writer.sheets = {ws.title: ws for ws in workbook.worksheets}
512 | start_row = writer.book.get_sheet_by_name(output_sheet_name).max_row
513 | else:
514 | start_row = 0
515 |
516 | feature_table.to_excel(writer, sheet_name=output_sheet_name, index=False, header=True, startcol=0, startrow=start_row)
517 |
518 | writer.close()
519 |
520 | render_excel(output_excel_name, sheet_name=output_sheet_name, conditional_columns=["J", "N"], freeze="D2", merge_rows=merge_row_number, percent_columns=[5, 7, 9, 10], start_row=start_row, header=False if start_row > 0 else True)
521 | # render_excel("变量字典及字段解释.xlsx")
522 | combiner.export(to_json=f"rules_{datetime.now().strftime('%Y-%m-%d')}.json")
523 |
--------------------------------------------------------------------------------
/processing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2022/8/23 13:12
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 |
8 | import re
9 | import os
10 | import toad
11 | import scipy
12 | import warnings
13 | import numpy as np
14 | import pandas as pd
15 | import scorecardpy as sc
16 | import statsmodels.api as sm
17 | from functools import partial
18 | import matplotlib.pyplot as plt
19 | import plotly.graph_objects as go
20 | from IPython.display import Image
21 | from openpyxl import load_workbook
22 | # from joblib import Parallel, delayed
23 | from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
24 | from openpyxl.styles import Alignment
25 | from optbinning import OptimalBinning
26 | from sklearn.decomposition import PCA
27 | from sklearn.pipeline import Pipeline
28 | from sklearn.linear_model import LogisticRegression
29 | from sklearn.utils.validation import check_is_fitted
30 | from sklearn.model_selection import train_test_split, GridSearchCV
31 | from sklearn.ensemble import GradientBoostingClassifier
32 | from toad.plot import bin_plot, proportion_plot, corr_plot, badrate_plot
33 | from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
34 | from statsmodels.stats.outliers_influence import variance_inflation_factor
35 |
36 |
37 | warnings.filterwarnings("ignore")
38 | pd.set_option('display.width', 5000)
39 | # plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体
40 | # plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题
41 |
42 |
43 | def drop_identical(frame, threshold = 0.95, return_drop = False, exclude = None, target = None):
44 | """drop columns by identical
45 | Args:
46 | frame (DataFrame): dataframe that will be used
47 | threshold (number): drop the features whose identical num is greater than threshold. if threshold is float, it will be use as percentage
48 | return_drop (bool): if need to return features' name who has been dropped
49 | exclude (array-like): list of feature names that will not be dropped
50 | target (str): target's name in dataframe
51 | Returns:
52 | DataFrame: selected dataframe
53 | array: list of feature names that has been dropped
54 | """
55 | cols = frame.columns.copy()
56 |
57 | if target is not None:
58 |         cols = cols.drop(target)
59 |
60 | if exclude is not None:
61 | cols = cols.drop(exclude)
62 |
63 | if threshold < 1:
64 | threshold = len(frame) * threshold
65 |
66 | drop_list = []
67 | for col in cols:
68 | n = frame[col].value_counts().max()
69 |
70 | if n > threshold:
71 | drop_list.append(col)
72 |
73 | r = frame.drop(columns = drop_list)
74 |
75 | res = (r,)
76 | if return_drop:
77 | res += (np.array(drop_list),)
78 |
79 | return toad.utils.unpack_tuple(res)
80 |
81 |
82 | def select(frame, target = 'target', empty = 0.9, iv = 0.02, corr = 0.7,
83 | identical=0.95, return_drop = False, exclude = None):
84 | """select features by rate of empty, iv and correlation
85 | Args:
86 | frame (DataFrame)
87 | target (str): target's name in dataframe
88 | empty (number): drop the features which empty num is greater than threshold. if threshold is less than `1`, it will be use as percentage
89 | identical (number): drop the features which identical num is greater than threshold. if threshold is less than `1`, it will be use as percentage
90 | iv (float): drop the features whose IV is less than threshold
91 | corr (float): drop features that has the smallest IV in each groups which correlation is greater than threshold
92 | return_drop (bool): if need to return features' name who has been dropped
93 | exclude (array-like): list of feature name that will not be dropped
94 | Returns:
95 | DataFrame: selected dataframe
96 | dict: list of dropped feature names in each step
97 | """
98 | empty_drop, iv_drop, corr_drop, identical_drop = None, None, None, None
99 |
100 | if empty is not False:
101 | frame, empty_drop = toad.selection.drop_empty(frame, threshold = empty, return_drop = True, exclude = exclude)
102 |
103 | if identical is not False:
104 | frame, identical_drop = drop_identical(frame, threshold = identical, return_drop = True, exclude = exclude, target = target)
105 |
106 | if iv is not False:
107 | frame, iv_drop, iv_list = toad.selection.drop_iv(frame, target = target, threshold = iv, return_drop = True, return_iv = True, exclude = exclude)
108 |
109 | if corr is not False:
110 | weights = 'IV'
111 |
112 | if iv is not False:
113 | weights = iv_list
114 |
115 | frame, corr_drop = toad.selection.drop_corr(frame, target = target, threshold = corr, by = weights, return_drop = True, exclude = exclude)
116 |
117 | res = (frame,)
118 | if return_drop:
119 | d = {
120 | 'empty': empty_drop,
121 | 'identical': identical_drop,
122 | 'iv': iv_drop,
123 | 'corr': corr_drop,
124 | }
125 | res += (d,)
126 |
127 | return toad.utils.unpack_tuple(res)
128 |
129 |
130 | class FeatureSelection(TransformerMixin, BaseEstimator):
131 |
132 | def __init__(self, target="target", empty=0.95, iv=0.02, corr=0.7, exclude=None, return_drop=True, identical=0.95, remove=None, engine="scorecardpy", target_rm=False):
133 | """
134 | ITLUBBER提供的特征筛选方法
135 |
136 | Args:
137 | target: 数据集中标签名称,默认 target
138 | empty: 空值率,默认 0.95, 即空值占比超过 95% 的特征会被剔除
139 | iv: IV值,默认 0.02,即iv值小于 0.02 时特征会被剔除
140 | corr: 相关性,默认 0.7,即特征之间相关性大于 0.7 时会剔除iv较小的特征
141 | identical: 唯一值占比,默认 0.95,即当特征的某个值占比超过 95% 时,特征会被剔除
142 | engine: 特征筛选使用的引擎,可选 "toad", "scorecardpy" 两种,默认 scorecardpy
143 | remove: 引擎使用 scorecardpy 时,可以传入需要强制删除的变量
144 | return_drop: 是否返回删除信息,默认 True,即默认返回删除特征信息
145 | target_rm: 是否剔除标签,默认 False,即不剔除
146 | exclude: 是否需要强制保留某些特征
147 | """
148 | self.engine = engine
149 | self.target = target
150 | self.empty = empty
151 | self.identical = identical
152 | self.iv = iv
153 | self.corr = corr
154 | self.exclude = exclude
155 | self.remove = remove
156 | self.return_drop = return_drop
157 | self.target_rm = target_rm
158 | self.select_columns = None
159 | self.dropped = None
160 |
161 | def fit(self, x, y=None):
162 | if self.engine == "toad":
163 | selected = select(x, target=self.target, empty=self.empty, identical=self.identical, iv=self.iv, corr=self.corr, exclude=self.exclude, return_drop=self.return_drop)
164 | else:
165 | selected = sc.var_filter(x, y=self.target, iv_limit=self.iv, missing_limit=self.empty, identical_limit=self.identical, var_rm=self.remove, var_kp=self.exclude, return_rm_reason=self.return_drop)
166 |
167 | if self.return_drop and isinstance(selected, dict):
168 | self.dropped = selected["rm"]
169 | self.select_columns = list(selected["dt"].columns)
170 | elif self.return_drop and isinstance(selected, (tuple, list)):
171 | self.dropped = pd.DataFrame([(feature, reason) for reason, features in selected[1].items() for feature in features], columns=["variable", "rm_reason"])
172 | self.select_columns = list(selected[0].columns)
173 | else:
174 | self.select_columns = list(selected.columns)
175 |
176 | if self.target_rm and self.target in self.select_columns:
177 | self.select_columns.remove(self.target)
178 |
179 | return self
180 |
181 | def transform(self, x, y=None):
182 | # if self.engine == "toad":
183 | # selected = toad.selection.select(x, target=self.target, empty=self.empty, iv=self.iv, corr=self.corr, exclude=self.exclude, return_drop=self.return_drop)
184 | # else:
185 | # selected = sc.var_filter(x, y=self.target, iv_limit=self.iv, missing_limit=self.empty, identical_limit=self.identical, var_rm=self.remove, var_kp=self.exclude, return_rm_reason=self.return_drop)
186 |
187 | # if self.return_drop and isinstance(selected, dict):
188 | # self.dropped = selected["rm"]
189 | # return selected["dt"]
190 | # elif self.return_drop and isinstance(selected, (tuple, list)):
191 | # self.dropped = pd.DataFrame([(feature, reason) for reason, features in selected[1].items() for feature in features], columns=["variable", "rm_reason"])
192 | # return selected[0]
193 | # else:
194 | # return selected
195 | return x[[col for col in self.select_columns if col in x.columns]]
196 |
197 |
198 | class FeatureImportanceSelector(BaseEstimator, TransformerMixin):
199 |
200 | def __init__(self, top_k=126, target="target", selector="catboost", params=None, max_iv=None):
201 | """
202 | 基于特征重要性的特征筛选方法
203 |
204 | Args:
205 | target: 数据集中标签名称,默认 target
206 | top_k: 依据特征重要性进行排序,筛选最重要的 top_k 个特征
207 | max_iv: 是否需要删除 IV 过高的特征,建议设置为 1.0
208 | selector: 特征选择器,目前只支持 catboost ,可以支持数据集中包含字符串的数据
209 | params: selector 的参数,不传使用默认参数
210 | """
211 | self.target = target
212 | self.top_k = top_k
213 | self.max_iv = max_iv
214 | self.selector = selector
215 | self.params = params
216 | self.feature_names_ = None
217 | self.high_iv_feature_names_ = None
218 | self.low_importance_feature_names_ = None
219 | self.select_columns = None
220 | self.dropped = None
221 |
222 | def fit(self, x, y=None):
223 | x = x.copy()
224 |
225 | if self.max_iv is not None:
226 |             self.high_iv_feature_names_ = list(toad.quality(x, target=self.target, cpu_cores=-1, iv_only=True).query(f"iv > {self.max_iv}").index)
227 | x = x[[c for c in x.columns if c not in self.high_iv_feature_names_]]
228 |
229 | X = x.drop(columns=self.target)
230 | Y = x[self.target]
231 |
232 | self.feature_names_ = list(X.columns)
233 | cat_features_index = [i for i in range(len(self.feature_names_)) if self.feature_names_[i] not in X.select_dtypes("number").columns]
234 |
235 | if self.selector == "catboost":
236 | self.catboost_selector(x=X, y=Y, cat_features=cat_features_index)
237 | else:
238 | pass
239 |
240 | return self
241 |
242 | def transform(self, x, y=None):
243 | return x[self.select_columns + [self.target]]
244 |
245 |
246 | def catboost_selector(self, x, y, cat_features=None):
247 | from catboost import Pool, cv, metrics, CatBoostClassifier
248 |
249 | cat_data = Pool(data=x, label=y, cat_features=cat_features)
250 |
251 | if self.params is None:
252 | self.params = {
253 | "iterations": 256,
254 | "objective": "CrossEntropy",
255 | "eval_metric": "AUC",
256 | "learning_rate": 1e-2,
257 | "colsample_bylevel": 0.1,
258 | "depth": 4,
259 | "boosting_type": "Ordered",
260 | "bootstrap_type": "Bernoulli",
261 | "subsample": 0.8,
262 | "random_seed": 1024,
263 | "early_stopping_rounds": 10,
264 | "verbose": 0,
265 | }
266 |
267 | cat_model = CatBoostClassifier(**self.params)
268 | cat_model.fit(cat_data, eval_set=[cat_data])
269 |
270 | self.select_columns = [name for score, name in sorted(zip(cat_model.feature_importances_, cat_model.feature_names_), reverse=True)][:self.top_k]
271 | self.low_importance_feature_names_ = [c for c in x.columns if c not in self.select_columns]
272 |
273 |
274 | class FeatureDecomposition(BaseEstimator, TransformerMixin):
275 |
276 | def __init__(self, freq, app, key_words=None, combin_features=None, combiner=PCA, n_components=1):
277 | """
278 | 同一类型 + 同一周期 + 新增数/安装数/活跃天数/卸载数 的特征通过降维方法转换为 n_components 个特征
279 |
280 | freq: 周期,例如 90天
281 | app: 类型,例如 银行类
282 | key_words: 不同类型的指标,例如 ["活跃款数", "新增款数", "活跃频次", "活跃天数"]
283 |         combin_features: 手工指定需要进行降维的特征,传入 app、freq、key_words 时不需要传入
284 | combiner: 降维的方法,默认 PCA,参考 sklearn.decomposition 中相关方法的使用
285 | n_components: 降维后的特征数量,默认 1
286 | """
287 | self.freq = freq
288 | self.app = app
289 | self.key_words = key_words
290 | self.combin_features = combin_features
291 | self.n_components = n_components
292 | self.combiner = combiner(n_components=self.n_components)
293 |
294 | def fit(self, x, y=None):
295 | x = x.copy()
296 |
297 | if self.combin_features:
298 | self.combin_features = [c for c in self.combin_features if c in x.columns]
299 | else:
300 | if self.key_words:
301 | if isinstance(self.key_words, str):
302 | self.key_words = [self.key_words]
303 | pattern = re.compile(f"(?=.*{self.freq})(?=.*{self.app})(?=.*(?:{'|'.join(self.key_words)})).+")
304 | else:
305 | pattern = re.compile(f"{self.app}")
306 |
307 | self.combin_features = [c for c in x.columns if pattern.match(c)]
308 |
309 | if len(self.combin_features) > 0 and len(self.combin_features) > self.n_components:
310 | x = x[self.combin_features]
311 | self.combiner.fit(x, y=y)
312 |
313 | else:
314 | raise Exception("组合特征不在数据中。")
315 |
316 | return self
317 |
318 | def transform(self, x, y=None):
319 | x = x[self.combin_features].copy()
320 | return self.combiner.transform(x)
321 |
322 | def inverse_transform(self, x, y=None):
323 | return self.combiner.inverse_transform(x)
324 |
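# FeatureDecomposition 用法示意(特征命名仅为假设,需与数据中的实际列名匹配):
# decomposer = FeatureDecomposition(freq="90天", app="银行类", key_words=["活跃天数", "新增款数"])
# x_reduced = decomposer.fit_transform(train)   # 返回降维后的 n_components 列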
325 |
326 | class Combiner(TransformerMixin, BaseEstimator):
327 |
328 | def __init__(self, target="target", method='chi', engine="toad", empty_separate=False, min_samples=0.05, min_n_bins=2, max_n_bins=3, max_n_prebins=10, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc", rules={}, n_jobs=1):
329 | """
330 | 特征分箱封装方法
331 |
332 | Args:
333 | target: 数据集中标签名称,默认 target
334 | method: 特征分箱方法,可选 "chi", "dt", "quantile", "step", "kmeans", "cart", "mdlp", "uniform", 参考 toad.Combiner & optbinning.OptimalBinning
335 | engine: 分箱引擎,可选 "optbinning", "toad"
336 | empty_separate: 是否空值单独一箱, 默认 False,推荐设置为 True
337 | min_samples: 最小叶子结点样本占比,参考对应文档进行设置,默认 5%
338 | min_n_bins: 最小分箱数,默认 2,即最小拆分2箱
339 |             max_n_bins: 最大分箱数,默认 3,即最大拆分3箱,推荐设置 3 ~ 5,不宜过多,使用 optbinning 时偶尔不生效
340 | max_n_prebins: 使用 optbinning 时预分箱数量
341 | min_prebin_size: 使用 optbinning 时预分箱叶子结点(或者每箱)样本占比,默认 2%
342 | min_bin_size: 使用 optbinning 正式分箱叶子结点(或者每箱)最小样本占比,默认 5%
343 | max_bin_size: 使用 optbinning 正式分箱叶子结点(或者每箱)最大样本占比,默认 None
344 |             gamma: 使用 optbinning 分箱时限制过拟合的正则化参数,值越大惩罚越多,默认 0.01
345 |             monotonic_trend: 使用 optbinning 正式分箱时的坏率策略,默认 auto_asc_desc,可选 "auto", "auto_heuristic", "auto_asc_desc", "ascending", "descending", "convex", "concave", "peak", "valley", "peak_heuristic", "valley_heuristic"
346 | rules: 自定义分箱规则,toad.Combiner 能够接收的形式
347 | n_jobs: 使用多进程加速的worker数量,默认单进程
348 | """
349 | self.combiner = toad.transform.Combiner()
350 | self.method = method
351 | self.empty_separate = empty_separate
352 | self.target = target
353 | self.min_samples = min_samples
354 | self.max_n_bins = max_n_bins
355 | self.min_n_bins = min_n_bins
356 | self.min_bin_size = min_bin_size
357 | self.max_bin_size = max_bin_size
358 | self.max_n_prebins = max_n_prebins
359 | self.min_prebin_size = min_prebin_size
360 | self.gamma = gamma
361 | self.monotonic_trend = monotonic_trend
362 | self.rules = rules
363 | self.engine = engine
364 | self.n_jobs = n_jobs
365 |
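    # 用法示意:
    # combiner = Combiner(target="target", engine="optbinning", max_n_bins=5)
    # combiner.fit(train)                       # train 需包含 target 列
    # train_bins = combiner.transform(train)    # labels=True 时返回分箱标签而非编号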
366 | def optbinning_bins(self, feature, data=None, target="target", min_n_bins=2, max_n_bins=3, max_n_prebins=10, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc"):
367 | if data[feature].dropna().nunique() <= min_n_bins:
368 | splits = []
369 | for v in data[feature].dropna().unique():
370 | splits.append(v)
371 |
372 | if str(data[feature].dtypes) in ["object", "string", "category"]:
373 | rule = {feature: [[s] for s in splits]}
374 | rule[feature].append([[np.nan]])
375 | else:
376 | rule = {feature: sorted(splits) + [np.nan]}
377 | else:
378 | try:
379 | y = data[target]
380 | if str(data[feature].dtypes) in ["object", "string", "category"]:
381 | dtype = "categorical"
382 | x = data[feature].astype("category").values
383 | else:
384 | dtype = "numerical"
385 | x = data[feature].values
386 |
387 | _combiner = OptimalBinning(feature, dtype=dtype, min_n_bins=min_n_bins, max_n_bins=max_n_bins, max_n_prebins=max_n_prebins, min_prebin_size=min_prebin_size, min_bin_size=min_bin_size, max_bin_size=max_bin_size, monotonic_trend=monotonic_trend, gamma=gamma).fit(x, y)
388 | if _combiner.status == "OPTIMAL":
389 | rule = {feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.splits] + [[np.nan] if dtype == "categorical" else np.nan]}
390 | else:
391 | raise Exception("optimalBinning error")
392 |
393 | except Exception as e:
394 | _combiner = toad.transform.Combiner()
395 | _combiner.fit(data[[feature, target]].dropna(), target, method="chi", min_samples=self.min_samples, n_bins=self.max_n_bins)
396 | rule = {feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.export()[feature]] + [[np.nan] if dtype == "categorical" else np.nan]}
397 |
398 | self.combiner.update(rule)
399 |
400 | def fit(self, x, y=None):
401 | if self.engine == "optbinning":
402 | feature_optbinning_bins = partial(self.optbinning_bins, data=x, target=self.target, min_n_bins=self.min_n_bins, max_n_bins=self.max_n_bins, max_n_prebins=self.max_n_prebins, min_prebin_size=self.min_prebin_size, min_bin_size=self.min_bin_size, max_bin_size=self.max_bin_size, gamma=self.gamma, monotonic_trend=self.monotonic_trend)
403 | if self.n_jobs > 1:
404 |                 with ThreadPoolExecutor(max_workers=self.n_jobs) as executor:  # 线程池: optbinning_bins 原地更新 self.combiner,需在当前进程内执行
405 |                     [executor.submit(feature_optbinning_bins, feature) for feature in x.columns.drop(self.target)]
406 | else:
407 | for feature in x.drop(columns=[self.target]):
408 | self.optbinning_bins(feature, data=x, target=self.target, min_n_bins=self.min_n_bins, max_n_bins=self.max_n_bins, max_n_prebins=self.max_n_prebins, min_prebin_size=self.min_prebin_size, min_bin_size=self.min_bin_size, max_bin_size=self.max_bin_size, gamma=self.gamma, monotonic_trend=self.monotonic_trend)
409 | # feature_optbinning_bins(feature)
410 | else:
411 | self.combiner.fit(x, y=self.target, method=self.method, min_samples=self.min_samples, n_bins=self.max_n_bins)
412 |
413 | self.update(self.rules)
414 |
415 | return self
416 |
417 | def transform(self, x, y=None, labels=False):
418 | return self.combiner.transform(x, labels=labels)
419 |
420 | def update(self, rules):
421 | if isinstance(rules, dict):
422 | self.combiner.update(rules)
423 |
424 | def export(self, to_json=None):
425 | return self.combiner.export(to_json=to_json)
426 |
427 | def load(self, from_json=None):
428 | self.combiner.load(from_json=from_json)
429 | return self
430 |
431 | def bin_plot(self, data, x, rule=None, labels=True, result=False, save=None):
432 | if rule:
433 | if isinstance(rule, list):
434 | rule = {x: rule}
435 | self.combiner.update(rule)
436 |
437 | bin_plot(self.combiner.transform(data, labels=labels), x=x, target=self.target)
438 |
439 | if save:
440 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
441 | os.makedirs(os.path.dirname(save))
442 |
443 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight")
444 |
445 | if result:
446 | return self.combiner.export()[x]
447 |
448 | def proportion_plot(self, x, transform=False, labels=False):
449 | if transform:
450 | x = self.combiner.transform(x, labels=labels)
451 | proportion_plot(x)
452 |
453 | def corr_plot(self, data, transform=False, figure_size=(20, 15)):
454 | if transform:
455 | data = self.combiner.transform(data, labels=False)
456 |
457 | corr_plot(data, figure_size=figure_size)
458 |
459 | def badrate_plot(self, data, date_column, feature, labels=True):
460 | badrate_plot(self.combiner.transform(data[[date_column, feature, self.target]], labels=labels), target=self.target, x=date_column, by=feature)
461 |
462 | @property
463 | def rules(self):
464 | return self.combiner._rules
465 |
466 | @rules.setter
467 | def rules(self, value):
468 | self.combiner._rules = value
469 |
470 | def __len__(self):
471 | return len(self.combiner._rules.keys())
472 |
473 | def __contains__(self, key):
474 | return key in self.combiner._rules
475 |
476 | def __getitem__(self, key):
477 | return self.combiner._rules[key]
478 |
479 | def __setitem__(self, key, value):
480 | self.combiner._rules[key] = value
481 |
482 | def __iter__(self):
483 | return iter(self.combiner._rules)
484 |
485 |
486 | class WOETransformer(TransformerMixin, BaseEstimator):
487 |
488 | def __init__(self, target="target", exclude=None):
489 | """
490 | WOE转换器
491 |
492 | Args:
493 | target: 数据集中标签名称,默认 target
494 | exclude: 不需要转换 woe 的列
495 | """
496 | self.target = target
497 | self.exclude = exclude if isinstance(exclude, list) else [exclude] if exclude else []
498 | self.transformer = toad.transform.WOETransformer()
499 |
500 | def fit(self, x, y=None):
501 | self.transformer.fit(x.drop(columns=self.exclude + [self.target]), x[self.target])
502 | return self
503 |
504 | def transform(self, x, y=None):
505 | return self.transformer.transform(x)
506 |
507 | @property
508 | def rules(self):
509 | return self.transformer._rules
510 |
511 | @rules.setter
512 | def rules(self, value):
513 | self.transformer._rules = value
514 |
515 | def __len__(self):
516 | return len(self.transformer._rules.keys())
517 |
518 | def __contains__(self, key):
519 | return key in self.transformer._rules
520 |
521 | def __getitem__(self, key):
522 | return self.transformer._rules[key]
523 |
524 | def __setitem__(self, key, value):
525 | self.transformer._rules[key] = value
526 |
527 | def __iter__(self):
528 | return iter(self.transformer._rules)
529 |
530 |
531 | class StepwiseSelection(TransformerMixin, BaseEstimator):
532 |
533 | def __init__(self, target="target", estimator="ols", direction="both", criterion="aic", max_iter=None, return_drop=True, exclude=None, intercept=True, p_value_enter=0.2, p_remove=0.01, p_enter=0.01, target_rm=False):
534 | """
535 | 逐步回归筛选方法
536 |
537 | Args:
538 | target: 数据集中标签名称,默认 target
539 | estimator: 预估器,默认 ols,可选 "ols", "lr", "lasso", "ridge",通常默认即可
540 | direction: 逐步回归方向,默认both,可选 "forward", "backward", "both",通常默认即可
541 | criterion: 评价指标,默认 aic,可选 "aic", "bic",通常默认即可
542 | max_iter: 最大迭代次数,sklearn中使用的参数,默认为 None
543 | return_drop: 是否返回特征剔除信息,默认 True
544 | exclude: 强制保留的某些特征
545 | intercept: 是否包含截距,默认为 True
546 | p_value_enter: 特征进入的 p 值,用于前向筛选时决定特征是否进入模型
547 | p_remove: 特征剔除的 p 值,用于后向剔除时决定特征是否要剔除
548 | p_enter: 特征 p 值,用于判断双向逐步回归是否剔除或者准入特征
549 |             target_rm: 是否剔除数据集中的标签,默认为 False,即不剔除数据集中的标签
550 | """
551 | self.target = target
552 | self.intercept = intercept
553 | self.p_value_enter = p_value_enter
554 | self.p_remove = p_remove
555 | self.p_enter = p_enter
556 | self.estimator = estimator
557 | self.direction = direction
558 | self.criterion = criterion
559 | self.max_iter = max_iter
560 | self.return_drop = return_drop
561 | self.target_rm = target_rm
562 | self.exclude = exclude
563 | self.select_columns = None
564 | self.dropped = None
565 |
566 | def fit(self, x, y=None):
567 | selected = toad.selection.stepwise(x, target=self.target, estimator=self.estimator, direction=self.direction, criterion=self.criterion, exclude=self.exclude, intercept=self.intercept, p_value_enter=self.p_value_enter,
568 | p_remove=self.p_remove, p_enter=self.p_enter, return_drop=self.return_drop)
569 | if self.return_drop:
570 | self.dropped = pd.DataFrame([(col, "stepwise") for col in selected[1]], columns=["variable", "rm_reason"])
571 | selected = selected[0]
572 |
573 | self.select_columns = list(selected.columns)
574 |
575 | if self.target_rm and self.target in self.select_columns:
576 | self.select_columns.remove(self.target)
577 |
578 | return self
579 |
580 | def transform(self, x, y=None):
581 | return x[[col for col in self.select_columns if col in x.columns]]
582 |
583 |
584 | if __name__ == "__main__":
585 | from model import ITLubberLogisticRegression, StatsLogisticRegression, ScoreCard
586 |
587 | target = "creditability"
588 | data = sc.germancredit()
589 | data[target] = data[target].map({"good": 0, "bad": 1})
590 |
591 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])
592 |
593 | # selection = FeatureSelection(target=target, engine="toad", return_drop=True, corr=0.9, iv=0.01)
594 | # train = selection.fit_transform(train)
595 |
596 | # combiner = Combiner(min_samples=0.2, empty_separate=True, target=target)
597 | # combiner.fit(train)
598 | # train = combiner.transform(train)
599 |
600 | # transformer = WOETransformer(target=target)
601 | # train = transformer.fit_transform(train)
602 |
603 | # stepwise = StepwiseSelection(target=target)
604 | # train = stepwise.fit_transform(train)
605 |
606 | feature_pipeline = Pipeline([
607 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
608 | ("combiner", Combiner(target=target, min_samples=0.2)),
609 | ("transformer", WOETransformer(target=target)),
610 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")),
611 | ("stepwise", StepwiseSelection(target=target, target_rm=False)),
612 | # ("logistic", StatsLogisticRegression(target=target)),
613 | ("logistic", ITLubberLogisticRegression(target=target)),
614 | ])
615 |
616 | # feature_pipeline.fit(train)
617 | # y_pred_train = feature_pipeline.predict(train.drop(columns=target))
618 | # y_pred_test = feature_pipeline.predict(test.drop(columns=target))
619 |
620 | params_grid = {
621 | "logistic__C": [i / 1. for i in range(1, 10, 2)],
622 | "logistic__penalty": ["l2"],
623 | "logistic__class_weight": [None, "balanced"], # + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10)],
624 | "logistic__max_iter": [100],
625 | "logistic__solver": ["sag"] # ["liblinear", "sag", "lbfgs", "newton-cg"],
626 | "logistic__intercept": [True, False],
627 | }
628 |
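# Note: grid keys follow sklearn's "<step name>__<parameter>" convention, e.g. "logistic__C" tunes the C
# parameter of the pipeline step registered as "logistic"; a misspelled step or parameter name raises an
# error when GridSearchCV is fitted.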
629 | clf = GridSearchCV(feature_pipeline, params_grid, cv=5, scoring='roc_auc', verbose=-1, n_jobs=2, return_train_score=True)
630 | clf.fit(train, train[target])
631 |
632 | y_pred_train = clf.best_estimator_.predict(train)
633 | y_pred_test = clf.best_estimator_.predict(test)
634 |
635 | print(clf.best_params_)
636 |
637 | # statmodels methods
638 | # feature_pipeline.named_steps['logistic'].summary_save()
639 |
640 | # print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target]))
641 | # print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target]))
642 |
643 | woe_train = feature_pipeline[:-1].fit_transform(train)    # slice off the final logistic step so only the feature-processing steps are (re)fitted
644 | woe_test = feature_pipeline[:-1].transform(test)
645 |
646 | # lr = StatsLogisticRegression(target=target)
647 | # lr.fit(woe_train)
648 | # lr.summary_save()
649 |
650 | # cols = list(filter(lambda x: x != target, feature_pipeline.named_steps['preprocessing_select'].select_columns))
651 |
652 | combiner = feature_pipeline.named_steps['combiner'].combiner
653 | transformer = feature_pipeline.named_steps['transformer'].transformer
654 |
655 | score_card = ScoreCard(target=target, combiner=combiner, transer=transformer, )
656 | score_card.fit(woe_train)
657 |
658 |
659 | data["score"] = score_card.transform(data)
660 |
661 | print(score_card.KS_bucket(data["score"], data[target]))
662 | pt = score_card.perf_eva(data["score"], data[target], title="train")
663 |
664 | hist = score_card.score_hist(data["score"], data[target])
665 |
666 | print(score_card.KS(data["score"], data[target]), score_card.AUC(data["score"], data[target]))
667 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2023/2/15 17:55
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 | import math
8 | import sys
9 | import re
10 | import matplotlib
11 | import matplotlib.font_manager as font_manager
12 | import matplotlib.pyplot as plt
13 | import pandas as pd
14 | from openpyxl.formatting.rule import Rule
15 | from openpyxl.formatting.rule import ColorScaleRule
16 | from openpyxl.utils.dataframe import dataframe_to_rows
17 | from openpyxl.utils import get_column_letter, column_index_from_string
18 |
19 |
20 | from model import *
21 | from utils.excel_writer import ExcelWriter
22 |
23 |
24 | plt.style.use('seaborn-ticks')
25 | # plt.style.use('seaborn-white')
26 | # plt.rcParams.update({'font.size': 14})
27 |
28 |
29 | def pyplot_chinese(font_path='utils/matplot_chinese.ttf'):
30 | # matplotlib.rcParams['font.size'] = 20
31 | matplotlib.font_manager.fontManager.addfont(font_path)
32 | matplotlib.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name()
33 | matplotlib.rcParams['axes.unicode_minus']=False
34 |
35 |
36 | pyplot_chinese()
37 |
38 |
39 | target = "creditability"
40 | data = sc.germancredit()
41 | data[target] = data[target].map({"good": 0, "bad": 1})
42 |
43 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])
44 | oot = data.copy()
45 |
46 | feature_pipeline = Pipeline([
47 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
48 | ("combiner", Combiner(target=target, min_samples=0.2)),
49 | ("transform", WOETransformer(target=target)),
50 | # ("processing_select", FeatureSelection(target=target, engine="scorecardpy")),
51 | ("stepwise", StepwiseSelection(target=target)),
52 | ])
53 |
54 | feature_pipeline.fit(train)
55 |
56 | woe_train = feature_pipeline.transform(train)
57 | woe_test = feature_pipeline.transform(test)
58 | woe_oot = feature_pipeline.transform(oot)
59 |
60 | # # save all bin_plot
61 | # _combiner = feature_pipeline.named_steps["combiner"]
62 | # for col in woe_train.columns:
63 | # if col != target:
64 | # _combiner.bin_plot(train, col, labels=True, save=f"model_report/bin_plots/train_{col}.png")
65 | # _combiner.bin_plot(test, col, labels=True, save=f"model_report/bin_plots/test_{col}.png")
66 | # _combiner.bin_plot(oot, col, labels=True, save=f"model_report/bin_plots/oot_{col}.png")
67 |
68 | # logistic = StatsLogisticRegression(target=target)
69 | logistic = ITLubberLogisticRegression(target=target)
70 |
71 | logistic.fit(woe_train)
72 |
73 | y_pred_train = logistic.predict_proba(woe_train.drop(columns=target))[:, 1]
74 | y_pred_test = logistic.predict_proba(woe_test.drop(columns=target))[:, 1]
75 | y_pred_oot = logistic.predict_proba(woe_oot.drop(columns=target))[:, 1]
76 |
77 | ScoreCard.ks_plot(y_pred_train, train[target], save="model_report/lr_ksplot_train.png", figsize=(10, 5))
78 | ScoreCard.ks_plot(y_pred_test, test[target], save="model_report/lr_ksplot_test.png", figsize=(10, 5))
79 | ScoreCard.ks_plot(y_pred_oot, oot[target], save="model_report/lr_ksplot_oot.png", figsize=(10, 5))
80 |
81 | summary = logistic.summary().reset_index().rename(columns={"index": "Features"})
82 |
83 | train_corr = logistic.corr(woe_train, save="model_report/train_corr.png")
84 | test_corr = logistic.corr(woe_test, save="model_report/test_corr.png")
85 | oot_corr = logistic.corr(woe_oot, save="model_report/oot_corr.png")
86 |
87 | train_report = logistic.report(woe_train)
88 | test_report = logistic.report(woe_test)
89 | oot_report = logistic.report(woe_oot)
90 |
91 | print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target]))
92 | print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target]))
93 | print("oot: ", toad.metrics.KS(y_pred_oot, oot[target]), toad.metrics.AUC(y_pred_oot, oot[target]))
94 |
95 |
96 | card = ScoreCard(target=target, pipeline=feature_pipeline, pretrain_lr=logistic)
97 | card.fit(woe_train)
98 |
99 | train["score"] = card.predict(train)
100 | test["score"] = card.predict(test)
101 | oot["score"] = card.predict(oot)
102 |
103 |
104 | def sample_distribution(df, date="date", target="target", user_count="count", save="model_report/sample_time_count.png", figsize=(10, 6), colors=["#2639E9", "#F76E6C", "#FE7715"]):
105 | temp = df.set_index(date).assign(
106 | 好样本=lambda x: (x[target] == 0).astype(int),
107 | 坏样本=lambda x: (x[target] == 1).astype(int),
108 | ).resample("W").agg({"好样本": sum, "坏样本": sum})
109 | temp.index = [i.strftime("%Y-%m-%d") for i in temp.index]
110 |
111 | fig, ax1 = plt.subplots(1, 1, figsize=figsize)
112 | temp.plot(kind='bar', stacked=True, ax=ax1, color=colors[:2], hatch="/", legend=False)
113 | ax1.tick_params(axis='x', labelrotation=-90)
114 | ax1.set(xlabel=None)
115 | ax1.set_ylabel('样本数')
116 | ax1.set_title('不同时点数据集样本分布情况\n\n')
117 |
118 | ax2 = plt.twinx()
119 | (temp["坏样本"] / temp.sum(axis=1)).plot(ax=ax2, color=colors[-1], marker=".", linewidth=2, label="坏样本率")
120 | # sns.despine()
121 |
122 | # 合并图例
123 | handles1, labels1 = ax1.get_legend_handles_labels()
124 | handles2, labels2 = ax2.get_legend_handles_labels()
125 | fig.legend(handles1 + handles2, labels1 + labels2, loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.94), frameon=False)
126 | # ax1.legend(frameon=False, labels=["good", "bad"], loc='upper right')
127 | # ax2.legend(loc='upper left', frameon=False, labels=["bad rate"])
128 |
129 | plt.tight_layout()
130 |
131 | if save:
132 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
133 | os.makedirs(os.path.dirname(save))
134 |
135 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight")
136 |
137 | temp = temp.reset_index().rename(columns={date: "日期", "index": "日期", 0: "好样本", 1: "坏样本"})
138 | temp["样本总数"] = temp["坏样本"] + temp["好样本"]
139 | temp["样本占比"] = temp["样本总数"] / temp["样本总数"].sum()
140 | temp["好样本占比"] = temp["好样本"] / temp["好样本"].sum()
141 | temp["坏样本占比"] = temp["坏样本"] / temp["坏样本"].sum()
142 | temp["坏样本率"] = temp["坏样本"] / temp["样本总数"]
143 |
144 | return temp[["日期", "样本总数", "样本占比", "好样本", "好样本占比", "坏样本", "坏样本占比", "坏样本率"]]
145 |
146 |
147 | def bin_plot(feature_table, feature="", figsize=(15, 8), colors=['#8E8BFE', '#FEA3A2', '#9394E7'], max_len=35, save=None):
148 | feature_table = feature_table.copy()
149 |
150 | feature_table["分箱"] = feature_table["分箱"].apply(lambda x: x if re.match("^\[.*\)$", x) else str(x)[:max_len] + "..")
151 |
152 | # 绘制好坏样本分布情况
153 | fig, ax1 = plt.subplots(figsize=figsize)
154 | ax1.barh(feature_table['分箱'], feature_table['好样本数'], color=colors[0], label='好样本', hatch="/")
155 | ax1.barh(feature_table['分箱'], feature_table['坏样本数'], left=feature_table['好样本数'], color=colors[1], label='坏样本', hatch="\\")
156 | ax1.set_xlabel('样本数')
157 |
158 | # 绘制坏样本率的分布情况
159 | ax2 = ax1.twiny()
160 | ax2.plot(feature_table['坏样本率'], feature_table['分箱'], colors[2], label='坏样本率', linestyle='-.')
161 | ax2.set_xlabel('坏样本率: 坏样本数 / 样本总数')
162 |
163 | for i, rate in enumerate(feature_table['坏样本率']):
164 | ax2.scatter(rate, i, color=colors[2])
165 |
166 | # 在图像对应位置显示样本总数和坏样本率
167 | for i, v in feature_table[['样本总数', '好样本数', '坏样本数', '坏样本率']].iterrows():
168 | ax1.text(v['样本总数'] / 2, i + len(feature_table) / 60, f"{int(v['好样本数'])}:{int(v['坏样本数'])}:{v['坏样本率']:.2%}")
169 |
170 | # 逆转y轴顺序
171 | ax1.invert_yaxis()
172 |
173 | # 添加一个标题
174 | fig.suptitle(f'变量 {feature} 分箱图\n\n')
175 |
176 | # 合并图例
177 | handles1, labels1 = ax1.get_legend_handles_labels()
178 | handles2, labels2 = ax2.get_legend_handles_labels()
179 | fig.legend(handles1 + handles2, labels1 + labels2, loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.925), frameon=False)
180 |
181 | # 调整布局,使分箱信息能够完全显示
182 | plt.tight_layout()
183 |
184 | if save:
185 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
186 | os.makedirs(os.path.dirname(save))
187 |
188 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight")
189 |
190 |
191 | writer = ExcelWriter(style_excel="./utils/报告输出模版.xlsx", theme_color="8E8BFE")
192 |
193 |
194 | # ////////////////////////////////////// 样本说明 ///////////////////////////////////// #
195 | df = pd.DataFrame({
196 | "date": pd.date_range(start="2021-01-01", end="2022-06-30"),
197 | "target": np.random.randint(0, 2, 546),
198 | "count": np.random.randint(0, 100, 546),
199 | })
200 |
201 | total_count = len(data)
202 | dataset_summary = pd.DataFrame(
203 | [
204 | ["建模样本", "2022-01-01", "2023-01-31", len(data), len(data) / total_count, data[target].sum(), data[target].sum() / len(data), ""],
205 | ["训练集", "2022-01-01", "2023-12-31", len(train), len(train) / total_count, train[target].sum(), train[target].sum() / len(train), ""],
206 | ["测试集", "2022-01-01", "2023-12-31", len(test), len(test) / total_count, test[target].sum(), test[target].sum() / len(test), ""],
207 | ["跨时间验证集", "2023-01-01", "2023-01-31", len(oot), len(oot) / total_count, oot[target].sum(), oot[target].sum() / len(oot), ""],
208 | ],
209 | columns=["数据集", "开始时间", "结束时间", "样本总数", "样本占比", "坏客户数", "坏客户占比", "备注"],
210 | )
211 |
212 | worksheet = writer.get_sheet_by_name("汇总信息")
213 |
214 | # 样本总体分布情况
215 | start_row, start_col = 2, 2
216 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="样本总体分布情况", style="header")
217 | end_row, end_col = writer.insert_df2sheet(worksheet, dataset_summary, (end_row + 1, start_col), header=True)
218 |
219 | writer.set_number_format(worksheet, f"{get_column_letter(end_col - 2)}{end_row - len(dataset_summary)}:{get_column_letter(end_col - 2)}{end_row}", "0.00%")
220 | writer.set_number_format(worksheet, f"{get_column_letter(end_col - 4)}{end_row - len(dataset_summary)}:{get_column_letter(end_col - 4)}{end_row}", "0.00%")
221 |
222 | # 建模样本时间分布情况
223 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/all_sample_time_count.png")
224 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="建模样本时间分布情况", style="header")
225 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/all_sample_time_count.png", (end_row, start_col), figsize=(720, 370))
226 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False)
227 |
228 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%")
229 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", "0.00%")
230 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%")
231 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%")
232 |
233 | # 训练集样本时间分布情况
234 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/train_sample_time_count.png")
235 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练集样本时间分布情况", style="header")
236 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_sample_time_count.png", (end_row, start_col), figsize=(720, 370))
237 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False)
238 |
239 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%")
240 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", "0.00%")
241 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%")
242 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%")
243 |
244 | # 测试集样本时间分布情况
245 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/test_sample_time_count.png")
246 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试集样本时间分布情况", style="header")
247 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_sample_time_count.png", (end_row, start_col), figsize=(720, 370))
248 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False)
249 |
250 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%")
251 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", "0.00%")
252 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%")
253 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%")
254 |
255 | # 跨时间验证集样本时间分布情况
256 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/oot_sample_time_count.png")
257 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="跨时间验证集样本时间分布情况", style="header")
258 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/oot_sample_time_count.png", (end_row, start_col), figsize=(720, 370))
259 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False)
260 |
261 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%")
262 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", "0.00%")
263 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%")
264 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%")
265 |
266 |
267 | # ////////////////////////////////////// 模型报告 ///////////////////////////////////// #
268 |
269 | # 逻辑回归拟合情况
270 | worksheet = writer.get_sheet_by_name("逻辑回归拟合结果")
271 | start_row, start_col = 2, 2
272 |
273 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="逻辑回归拟合效果", style="header")
274 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{start_row}:{get_column_letter(start_col + len(summary.columns) - 1)}{start_row}")
275 | # worksheet[f"{get_column_letter(start_col)}{start_row}:{get_column_letter(start_col + len(summary.columns) - 1)}{start_row}"].style = "header"
276 | logistic.plot_weights(save="model_report/logistic_train.png")
277 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/logistic_train.png", (end_row + 2, start_col))
278 | end_row, end_col = writer.insert_df2sheet(worksheet, summary, (end_row + 1, start_col))
279 |
280 | conditional_column = get_column_letter(start_col + summary.columns.get_loc("Coef."))
281 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(summary)}', f'{conditional_column}{end_row}')
282 |
283 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{end_row + 2}:{get_column_letter(start_col + len(train_report.columns) - 1)}{end_row + 2}")
284 | # worksheet[f"{get_column_letter(start_col)}{end_row + 2}"].style = "header"
285 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练数据集拟合报告", style="header")
286 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/lr_ksplot_train.png", (end_row, start_col), figsize=(480, 270))
287 | end_row, end_col = writer.insert_df2sheet(worksheet, train_report, (end_row + 1, start_col))
288 |
289 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{end_row + 2}:{get_column_letter(start_col + len(test_report.columns) - 1)}{end_row + 2}")
290 | # worksheet[f"{get_column_letter(start_col)}{end_row + 2}"].style = "header"
291 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试数据集拟合报告", style="header")
292 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/lr_ksplot_test.png", (end_row, start_col), figsize=(480, 270))
293 | end_row, end_col = writer.insert_df2sheet(worksheet, test_report, (end_row + 1, start_col))
294 |
295 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{end_row + 2}:{get_column_letter(start_col + len(oot_report.columns) - 1)}{end_row + 2}")
296 | # worksheet[f"{get_column_letter(start_col)}{end_row + 2}"].style = "header"
297 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="跨时间验证集拟合报告", style="header")
298 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/lr_ksplot_oot.png", (end_row, start_col), figsize=(480, 270))
299 | end_row, end_col = writer.insert_df2sheet(worksheet, oot_report, (end_row + 1, start_col))
300 |
301 |
302 | # ////////////////////////////////////// 特征概述 ///////////////////////////////////// #
303 |
304 | # 模型变量概览
305 | feature_describe = pd.DataFrame([
306 | ["status_account", "支票账户状态"], ["duration", "借款周期"], ["credit_histor", "历史信用"], ["purpose", "借款目的"], ["amount", "信用额度"], ["svaing_account", "储蓄账户状态"], ["present_emp", "当前就业状态"], ["income_rate", "分期付款占可支配收入百分比"], ["personal_status", "性别与婚姻状态"], ["other_debtors", "他人担保信息"], ["residence_info", "现居住地"], ["property", "财产状态"], ["age", "年龄"], ["inst_plans", "其他分期情况"], ["housing", "房产状态"], ["num_credits", "信用卡数量"], ["job", "工作状态"], ["dependents", "赡养人数"], ["telephone", "电话号码注册情况"], ["foreign_worke", "是否有海外工作经历"],
307 | ], columns=["变量名称", "变量含义"])
308 |
309 | worksheet = writer.get_sheet_by_name("模型变量信息")
310 | start_row, start_col = 2, 2
311 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="入模变量信息", style="header")
312 | end_row, end_col = writer.insert_df2sheet(worksheet, feature_describe.reset_index().rename(columns={"index": "序号"}), (end_row + 1, start_col))
313 |
314 | # 变量分布情况
315 | data_info = toad.detect(data[card.rules.keys()]).reset_index().rename(columns={"index": "变量名称", "type": "变量类型", "size": "样本个数", "missing": "缺失值", "unique": "唯一值个数"})
316 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量分布情况", style="header")
317 | end_row, end_col = writer.insert_df2sheet(worksheet, data_info, (end_row + 1, start_col))
318 |
319 | # 变量相关性
320 | data_corr = logistic.corr(feature_pipeline.transform(train), save="model_report/data_corr.png", annot=False)
321 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量相关性", style="header")
322 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/data_corr.png", (end_row + 1, start_col), figsize=(700, 500))
323 | end_row, end_col = writer.insert_df2sheet(worksheet, data_corr.reset_index().rename(columns={"index": ""}), (end_row + 1, start_col))
324 |
325 | conditional_column = f"{get_column_letter(start_col + 1)}{end_row - len(data_corr)}:{get_column_letter(end_col - 1)}{end_row - 1}"
326 | worksheet.conditional_formatting.add(conditional_column, ColorScaleRule(start_type='num', start_value=-1.0, start_color='8E8BFE', mid_type='num', mid_value=0., mid_color='FFFFFF', end_type='num', end_value=1.0, end_color='8E8BFE'))
327 |
328 |
329 | # 变量分箱信息
330 | _combiner = feature_pipeline.named_steps["combiner"]
331 |
332 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量分箱信息", style="header")
333 | for col in card.rules.keys():
334 | feature_table = card.feature_bin_stats(data, col, target=target, desc="逻辑回归入模变量", combiner=card.combiner)
335 | # _combiner.bin_plot(data, col, labels=True, save=f"model_report/bin_plots/data_{col}.png")
336 | bin_plot(feature_table, feature=col, save=f"model_report/bin_plots/data_{col}.png")
337 | end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/bin_plots/data_{col}.png", (end_row + 1, start_col), figsize=(700, 400))
338 | end_row, end_col = writer.insert_df2sheet(worksheet, feature_table, (end_row, start_col))
339 |
340 | for c in ["坏样本率", "LIFT值"]:
341 | conditional_column = get_column_letter(start_col + feature_table.columns.get_loc(c))
342 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(feature_table)}', f'{conditional_column}{end_row}')
343 | # conditional_column = get_column_letter(start_col + feature_table.columns.get_loc("LIFT值"))
344 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(feature_table)}', f'{conditional_column}{end_row}')
345 |
346 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值"]:
347 | conditional_column = get_column_letter(start_col + feature_table.columns.get_loc(c))
348 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(feature_table)}:{conditional_column}{end_row}", "0.00%")
349 |
350 |
351 | # ////////////////////////////////////// 评分卡说明 ///////////////////////////////////// #
352 |
353 | # 评分卡刻度
354 | scorecard_kedu = pd.DataFrame(
355 | [
356 | ["base_odds", card.base_odds, "根据业务经验设置的基础比率(违约概率/正常概率),估算方法:(1-样本坏客户占比)/坏客户占比"],
357 | ["base_score", card.base_score, "基础ODDS对应的分数"],
358 | ["rate", card.rate, "设置分数的倍率"],
359 | ["pdo", card.pdo, "表示分数增长PDO时,ODDS值增长到RATE倍"],
360 | ["B", card.offset, "补偿值,计算方式:pdo / ln(rate)"],
361 | ["A", card.factor, "刻度,计算方式:base_score - B * ln(base_odds)"],
362 | ],
363 | columns=["刻度项", "刻度值", "备注"],
364 | )
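# Illustrative note (assuming the conventional score = A + B·ln(odds) scaling implied by the table above):
# with pdo=60, rate=2, base_odds=35 and base_score=750, B = 60 / ln(2) ≈ 86.56 and
# A = 750 - B·ln(35) ≈ 442.3, so a sample at the base odds scores 750 and the score moves by
# 60 points whenever the odds double or halve.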
365 |
366 | worksheet = writer.get_sheet_by_name("评分卡结果")
367 | start_row, start_col = 2, 2
368 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="评分卡刻度", style="header")
369 | end_row, end_col = writer.insert_df2sheet(worksheet, scorecard_kedu, (end_row + 1, start_col))
370 |
371 | # 评分卡对应分数
372 | card_points = card.export(to_frame=True).rename(columns={"name": "变量名称", "value": "变量分箱", "score": "对应分数"})
373 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡分数", style="header")
374 | end_row, end_col = writer.insert_df2sheet(worksheet, card_points, (end_row + 1, start_col), merge_column="变量名称")
375 |
376 | # 评分效果
377 | clip = 50
378 | clip_start = max(math.ceil(train["score"].min() / clip) * clip, math.ceil(train["score"].quantile(0.01) / clip) * clip)
379 | clip_end = min(math.ceil(train["score"].max() / clip) * clip, math.ceil(train["score"].quantile(0.99) / clip) * clip)
380 | score_clip = [i for i in range(clip_start, clip_end, clip)]
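# The cut points above bucket scores into fixed steps of `clip` points, with the start/end rounded to
# multiples of `clip` and limited to roughly the 1%~99% quantile range so that extreme scores do not
# create near-empty bins.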
381 |
382 | train_score_rank = card.feature_bin_stats(train, "score", target=target, rules=score_clip, verbose=0, method="step", ks=True)
383 | test_score_rank = card.feature_bin_stats(test, "score", target=target, rules=score_clip, verbose=0, method="step", ks=True)
384 | oot_score_rank = card.feature_bin_stats(oot, "score", target=target, rules=score_clip, verbose=0, method="step", ks=True)
385 |
386 | card.ks_plot(train["score"], train[target], title="Train Dataset", save="model_report/train_ksplot.png")
387 | card.ks_plot(test["score"], test[target], title="Test Dataset", save="model_report/test_ksplot.png")
388 | card.ks_plot(oot["score"], oot[target], title="OOT Dataset", save="model_report/oot_ksplot.png")
389 |
390 | card.score_hist(train["score"], train[target], save="model_report/train_scorehist.png", bins=30, figsize=(13, 10))
391 | card.score_hist(test["score"], test[target], save="model_report/test_scorehist.png", bins=30, figsize=(13, 10))
392 | card.score_hist(oot["score"], oot[target], save="model_report/oot_scorehist.png", bins=30, figsize=(13, 10))
393 |
394 |
395 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练数据集评分模型效果", style="header")
396 | ks_row = end_row
397 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_ksplot.png", (ks_row, start_col))
398 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_scorehist.png", (ks_row, end_col))
399 | end_row, end_col = writer.insert_df2sheet(worksheet, train_score_rank, (end_row + 1, start_col))
400 |
401 | for c in ["坏样本率", "LIFT值", "分档KS值"]:
402 | conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc(c))
403 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(train_score_rank)}', f'{conditional_column}{end_row}')
404 |
405 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"]:
406 | conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc(c))
407 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(train_score_rank)}:{conditional_column}{end_row}", "0.00%")
408 |
409 | # conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc("坏样本率"))
410 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_score_rank)}', f'{conditional_column}{end_row}')
411 | # conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc("LIFT值"))
412 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_score_rank)}', f'{conditional_column}{end_row}')
413 | # conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc("分档KS值"))
414 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_score_rank)}', f'{conditional_column}{end_row}')
415 |
416 |
417 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试数据集评分模型效果", style="header")
418 | ks_row = end_row
419 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_ksplot.png", (ks_row, start_col))
420 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_scorehist.png", (ks_row, end_col))
421 | end_row, end_col = writer.insert_df2sheet(worksheet, test_score_rank, (end_row + 1, start_col))
422 |
423 | for c in ["坏样本率", "LIFT值", "分档KS值"]:
424 | conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc(c))
425 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(test_score_rank)}', f'{conditional_column}{end_row}')
426 |
427 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"]:
428 | conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc(c))
429 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(test_score_rank)}:{conditional_column}{end_row}", "0.00%")
430 |
431 | # conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc("坏样本率"))
432 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_score_rank)}', f'{conditional_column}{end_row}')
433 | # conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc("LIFT值"))
434 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_score_rank)}', f'{conditional_column}{end_row}')
435 | # conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc("分档KS值"))
436 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_score_rank)}', f'{conditional_column}{end_row}')
437 |
438 |
439 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="跨时间验证集评分模型效果", style="header")
440 | ks_row = end_row
441 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/oot_ksplot.png", (ks_row, start_col))
442 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/oot_scorehist.png", (ks_row, end_col))
443 | end_row, end_col = writer.insert_df2sheet(worksheet, oot_score_rank, (end_row + 1, start_col))
444 |
445 | for c in ["坏样本率", "LIFT值", "分档KS值"]:
446 | conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc(c))
447 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(oot_score_rank)}', f'{conditional_column}{end_row}')
448 |
449 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"]:
450 | conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc(c))
451 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(oot_score_rank)}:{conditional_column}{end_row}", "0.00%")
452 |
453 | # conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc("坏样本率"))
454 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(oot_score_rank)}', f'{conditional_column}{end_row}')
455 | # conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc("LIFT值"))
456 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(oot_score_rank)}', f'{conditional_column}{end_row}')
457 | # conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc("分档KS值"))
458 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(oot_score_rank)}', f'{conditional_column}{end_row}')
459 |
460 |
461 | def score_psi(expected, actual, labels=["预期", "实际"], save=None, colors=['#8E8BFE', '#FEA3A2', '#9394E7'], figsize=(15, 8)):
462 | expected = expected.rename(columns={"分箱": "评分区间", "样本总数": f"{labels[0]}样本数", "样本占比": f"{labels[0]}样本占比", "坏样本率": f"{labels[0]}坏样本率"})
463 | actual = actual.rename(columns={"分箱": "评分区间", "样本总数": f"{labels[1]}样本数", "样本占比": f"{labels[1]}样本占比", "坏样本率": f"{labels[1]}坏样本率"})
464 | df_psi = expected.merge(actual, on="评分区间", how="outer").replace(np.nan, 0)
465 | df_psi[f"{labels[1]}% - {labels[0]}%"] = df_psi[f"{labels[1]}样本占比"] - df_psi[f"{labels[0]}样本占比"]
466 | df_psi[f"ln({labels[1]}% / {labels[0]}%)"] = np.log(df_psi[f"{labels[1]}样本占比"] / df_psi[f"{labels[0]}样本占比"])
467 | df_psi["分档PSI值"] = (df_psi[f"{labels[1]}% - {labels[0]}%"] * df_psi[f"ln({labels[1]}% / {labels[0]}%)"])
468 | df_psi = df_psi.fillna(0).replace(np.inf, 0).replace(-np.inf, 0)
469 | df_psi["总体PSI值"] = df_psi["分档PSI值"].sum()
470 |
471 | if save:
472 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
473 | os.makedirs(os.path.dirname(save))
474 |
475 | x = df_psi['评分区间']
476 | width = 0.35
477 | x_indexes = np.arange(len(x))
478 | fig, ax1 = plt.subplots(figsize=figsize)
479 |
480 | ax1.bar(x_indexes - width / 2, df_psi[f'{labels[0]}样本占比'], width, label=f'{labels[0]}样本占比', color=colors[0], hatch="/")
481 | ax1.bar(x_indexes + width / 2, df_psi[f'{labels[1]}样本占比'], width, label=f'{labels[1]}样本占比', color=colors[1], hatch="\\")
482 |
483 | ax1.set_ylabel('样本占比: 评分区间内样本数 / 样本总数')
484 | ax1.set_xticks(x_indexes)
485 | ax1.set_xticklabels(x)
486 | ax1.tick_params(axis='x', labelrotation=90)
487 |
488 | ax2 = ax1.twinx()
489 | ax2.plot(df_psi["评分区间"], df_psi[f"{labels[0]}坏样本率"], color=colors[0], label=f"{labels[0]}坏样本率", linestyle=(5, (10, 3)))
490 | ax2.plot(df_psi["评分区间"], df_psi[f"{labels[1]}坏样本率"], color=colors[1], label=f"{labels[1]}坏样本率", linestyle=(5, (10, 3)))
491 |
492 | ax2.scatter(df_psi["评分区间"], df_psi[f"{labels[0]}坏样本率"], marker=".")
493 | ax2.scatter(df_psi["评分区间"], df_psi[f"{labels[1]}坏样本率"], marker=".")
494 |
495 | ax2.set_ylabel('坏样本率: 坏样本数 / 样本总数')
496 |
497 | handles1, labels1 = ax1.get_legend_handles_labels()
498 | handles2, labels2 = ax2.get_legend_handles_labels()
499 | fig.legend(handles1 + handles2, labels1 + labels2, loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.94), frameon=False)
500 |
501 | fig.suptitle(f"{labels[0]} vs {labels[1]} 群体稳定性指数(PSI): {df_psi['分档PSI值'].sum():.4f}\n\n")
502 |
503 | fig.tight_layout()
504 |
505 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight")
506 |
507 | return df_psi[["评分区间", f"{labels[0]}样本数", f"{labels[0]}样本占比", f"{labels[0]}坏样本率", f"{labels[1]}样本数", f"{labels[1]}样本占比", f"{labels[1]}坏样本率", f"{labels[1]}% - {labels[0]}%", f"ln({labels[1]}% / {labels[0]}%)", "分档PSI值", "总体PSI值"]]
508 |
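# Note on the function above: the Population Stability Index sums (actual% - expected%) · ln(actual% / expected%)
# over score bins; e.g. a bin moving from 20% to 25% of the population contributes (0.25 - 0.20) · ln(0.25 / 0.20) ≈ 0.011.
# A common industry rule of thumb (not defined in this repo): PSI < 0.1 stable, 0.1 ~ 0.25 worth attention, > 0.25 unstable.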
509 |
510 | train_test_score_psi = score_psi(train_score_rank, test_score_rank, labels=["训练数据集", "测试数据集"], save="model_report/train_test_psiplot.png")
511 | train_oot_score_psi = score_psi(train_score_rank, oot_score_rank, labels=["训练数据集", "跨时间验证集"], save="model_report/train_oot_psiplot.png")
512 | test_oot_score_psi = score_psi(test_score_rank, oot_score_rank, labels=["测试数据集", "跨时间验证集"], save="model_report/test_oot_psiplot.png")
513 |
514 |
515 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡模型稳定性评估: 训练数据集 vs 测试数据集", style="header")
516 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_test_psiplot.png", (end_row, start_col), figsize=(1000, 400))
517 | end_row, end_col = writer.insert_df2sheet(worksheet, train_test_score_psi, (end_row + 1, start_col))
518 |
519 | conditional_column = get_column_letter(start_col + train_test_score_psi.columns.get_loc("分档PSI值"))
520 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_test_score_psi)}', f'{conditional_column}{end_row}')
521 |
522 | for c in ["训练数据集样本占比", "训练数据集坏样本率", "测试数据集样本占比", "测试数据集坏样本率"]:
523 | conditional_column = get_column_letter(start_col + train_test_score_psi.columns.get_loc(c))
524 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(train_test_score_psi)}:{conditional_column}{end_row}", "0.00%")
525 |
526 |
527 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡模型稳定性评估: 训练数据集 vs 跨时间验证集", style="header")
528 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_oot_psiplot.png", (end_row, start_col), figsize=(1000, 400))
529 | end_row, end_col = writer.insert_df2sheet(worksheet, train_oot_score_psi, (end_row + 1, start_col))
530 |
531 | conditional_column = get_column_letter(start_col + train_oot_score_psi.columns.get_loc("分档PSI值"))
532 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_oot_score_psi)}', f'{conditional_column}{end_row}')
533 |
534 | for c in ["训练数据集样本占比", "训练数据集坏样本率", "跨时间验证集样本占比", "跨时间验证集坏样本率"]:
535 | conditional_column = get_column_letter(start_col + train_oot_score_psi.columns.get_loc(c))
536 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(train_oot_score_psi)}:{conditional_column}{end_row}", "0.00%")
537 |
538 |
539 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡模型稳定性评估: 测试数据集 vs 跨时间验证集", style="header")
540 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_oot_psiplot.png", (end_row, start_col), figsize=(1000, 400))
541 | end_row, end_col = writer.insert_df2sheet(worksheet, test_oot_score_psi, (end_row + 1, start_col))
542 |
543 | conditional_column = get_column_letter(start_col + test_oot_score_psi.columns.get_loc("分档PSI值"))
544 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_oot_score_psi)}', f'{conditional_column}{end_row}')
545 |
546 | for c in ["跨时间验证集样本占比", "跨时间验证集坏样本率", "测试数据集样本占比", "测试数据集坏样本率"]:
547 | conditional_column = get_column_letter(start_col + test_oot_score_psi.columns.get_loc(c))
548 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(test_oot_score_psi)}:{conditional_column}{end_row}", "0.00%")
549 |
550 |
551 | # ////////////////////////////////////// 模型稳定性 ///////////////////////////////////// #
552 | #
553 | # worksheet = writer.get_sheet_by_name("模型稳定性")
554 | # start_row, start_col = 2, 2
555 | #
556 | # # 变量 CSI 表
557 | # end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="入模变量稳定性指标 (Characteristic Stability Index, CSI)", style="header")
558 | #
559 | # # train vs test
560 | #
561 | # # 评分分布稳定性
562 | # end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="模型评分稳定性指标 (Population Stability Index, PSI)", style="header")
563 |
564 |
565 | writer.save("model_report/评分卡模型报告.xlsx")
566 |
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2022/8/23 13:12
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 |
8 | import os
9 | import toad
10 | import warnings
11 | import numpy as np
12 | import pandas as pd
13 | import scorecardpy as sc
14 | from scorecardpy.perf import eva_pks, eva_proc
15 | from optbinning import OptimalBinning
16 | import matplotlib.pyplot as plt
17 | from matplotlib import font_manager
18 | import seaborn as sns
19 | # import plotly.graph_objects as go
20 | # from plotly.io import write_image
21 | from openpyxl import load_workbook
22 | from openpyxl.styles import Alignment, PatternFill
23 |
24 | import scipy
25 | import statsmodels.api as sm
26 | from statsmodels.stats.outliers_influence import variance_inflation_factor
27 |
28 | from sklearn.pipeline import Pipeline
29 | from sklearn.metrics import roc_curve, auc
30 | from sklearn.metrics import classification_report
31 | from sklearn.linear_model import LogisticRegression
32 | from sklearn.model_selection import train_test_split
33 | from sklearn.utils.validation import check_is_fitted
34 | from sklearn.ensemble import GradientBoostingClassifier
35 | from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
36 |
37 | from processing import FeatureSelection, Combiner, WOETransformer, StepwiseSelection
38 |
39 |
40 | warnings.filterwarnings("ignore")
41 | pd.set_option('display.width', 5000)
42 | # plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体
43 | # plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题
44 |
45 |
46 | def pyplot_chinese(font_path='utils/matplot_chinese.ttf'):
47 | font_manager.fontManager.addfont(font_path)
48 | plt.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name()
49 | plt.rcParams['axes.unicode_minus']=False
50 |
51 |
52 | class StatsLogisticRegression(TransformerMixin, BaseEstimator):
53 |
54 | def __init__(self, target="target", intercept=True):
55 | """
56 | 基于statsmodels的逻辑回归方法
57 |
58 | Args:
59 | target: 数据集中标签名称,默认 target
60 | intercept: 是否包含截距,默认 True,即包含截距
61 | """
62 | self.intercept = intercept
63 | self.target = target
64 | self.classifier = None
65 | self.corr = None
66 | self.vif = None
67 | self.coef_normalization = None
68 | self.feature_names_ = None
69 | self.feature_importances_ = None
70 |
71 | def fit(self, x, y=None, vif=True, corr=True, normalization=True):
72 | self.feature_names_ = list(x.drop(columns=[self.target]).columns)
73 | self.feature_importances_ = self.feature_importances(x)
74 |
75 | if vif:
76 | self.vif = self.VIF(x)
77 |
78 | if normalization:
79 | _x = x.drop(columns=[self.target]).apply(lambda x: (x - np.mean(x)) / np.std(x))
80 | _y = x[self.target]
81 | lr_normalization = sm.Logit(_y, sm.add_constant(_x) if self.intercept else _x).fit()
82 | self.coef_normalization = pd.DataFrame(lr_normalization.params, columns=["coef_normalization"])
83 |
84 | if corr:
85 | self.corr = x.drop(columns=[self.target]).corr()
86 |
87 | if self.intercept:
88 | x = sm.add_constant(x)
89 |
90 | self.classes_ = x[self.target].unique()
91 | self.classifier = sm.Logit(x[self.target], x.drop(columns=[self.target])).fit()
92 |
93 | return self
94 |
95 | def transform(self, x):
96 | if self.intercept:
97 | x = sm.add_constant(x)
98 |
99 | return self.classifier.predict(x)
100 |
101 | def predict(self, x):
102 | return self.transform(x)
103 |
104 | def summary(self):
105 | describe = self.classifier.summary2()
106 | return describe
107 |
108 | def feature_importances(self, x):
109 | params = {
110 | "n_estimators": 256,
111 | "max_depth": 4,
112 | "min_samples_split": 5,
113 | "learning_rate": 1e-3,
114 | "loss": "deviance",
115 | "subsample": 0.9,
116 | }
117 | feature_importances_ = GradientBoostingClassifier(**params).fit(x.drop(columns=[self.target]), x[self.target]).feature_importances_
118 | return pd.DataFrame(feature_importances_, index=self.feature_names_, columns=["feature_importances"])
119 |
120 | def VIF(self, x):
121 | if self.intercept:
122 | x = sm.add_constant(x)
123 |
124 | x = x.drop(columns=[self.target])
125 | columns = x.columns
126 | vif = pd.DataFrame({"VIF": [variance_inflation_factor(np.asarray(x), i) for i in range(len(columns))]}, index=columns)
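# Each VIF_j equals 1 / (1 - R_j^2), where R_j^2 comes from regressing column j on the remaining design
# columns; values above roughly 5~10 are usually read as strong collinearity (general rule of thumb).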
127 |
128 | return vif
129 |
130 | def WALD(self):
131 | return self.classifier.wald_test_terms().table[["statistic", "pvalue"]].rename(columns={"pvalue": "wald_test_pvalue", "statistic": "wald_test_statistic"})
132 |
133 | def report(self):
134 | return self.classifier.summary2().tables[1].join([self.coef_normalization, self.WALD(), self.vif, self.feature_importances_]), self.classifier.summary2().tables[0], self.corr
135 |
136 | def summary_save(self, excel_name="逻辑回归模型拟合效果.xlsx", sheet_name="逻辑回归拟合效果"):
137 | writer = pd.ExcelWriter(excel_name, engine='openpyxl')
138 |
139 | coef_report, summary_report, corr_report = self.report()
140 | summary_report.columns = ["逻辑回归模型拟合效果"] * summary_report.shape[1]
141 | summary_report.to_excel(writer, sheet_name=sheet_name, index=False, header=False, startcol=0, startrow=2)
142 | coef_report.reset_index().rename(columns={"index": "variable"}).to_excel(writer, sheet_name=sheet_name, index=False, header=True, startcol=0, startrow=summary_report.shape[0] + 4)
143 | corr_report.to_excel(writer, sheet_name=sheet_name, index=True, header=True, startcol=0, startrow=summary_report.shape[0] + coef_report.shape[0] + 7)
144 |
145 | writer.save()
146 | writer.close()
147 |
148 | if os.path.exists(excel_name):
149 | workbook = load_workbook(excel_name)
150 | worksheet = workbook.get_sheet_by_name(sheet_name)
151 | worksheet["A1"].value = "逻辑回归模型报告"
152 | worksheet["A1"].alignment = Alignment(horizontal='center', vertical='center')
153 | worksheet.merge_cells(f"A1:L1")
154 |
155 | workbook.save(excel_name)
156 | workbook.close()
157 |
158 | try:
159 | from processing import render_excel # From: https://github.com/itlubber/openpyxl-excel-style-template/blob/main/feature_bins.py
160 | render_excel(excel_name, sheet_name=sheet_name, max_column_width=25, merge_rows=np.cumsum([1, len(summary_report), 2, len(coef_report) + 1, 2, len(corr_report) + 1]).tolist())
161 | except:
162 | pass
163 |
164 |
165 | class ITLubberLogisticRegression(LogisticRegression):
166 | """
167 | Extended Logistic Regression.
168 | Extends [sklearn.linear_model.LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).
169 | This class provides the following extra statistics, calculated on `.fit()` and accessible via `.summary()`:
170 | - `cov_matrix_`: covariance matrix for the estimated parameters.
171 | - `std_err_intercept_`: estimated uncertainty for the intercept
172 | - `std_err_coef_`: estimated uncertainty for the coefficients
173 | - `z_intercept_`: estimated z-statistic for the intercept
174 | - `z_coef_`: estimated z-statistic for the coefficients
175 | - `p_value_intercept_`: estimated p-value for the intercept
176 | - `p_value_coef_`: estimated p-value for the coefficients
177 |
178 | Example:
179 | ```python
180 | feature_pipeline = Pipeline([
181 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
182 | ("combiner", Combiner(target=target, min_samples=0.2)),
183 | ("transform", WOETransformer(target=target)),
184 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")),
185 | ("stepwise", StepwiseSelection(target=target)),
186 | # ("logistic", LogisticClassifier(target=target)),
187 | ("logistic", ITLubberLogisticRegression(target=target)),
188 | ])
189 |
190 | feature_pipeline.fit(train)
191 | summary = feature_pipeline.named_steps['logistic'].summary()
192 | ```
193 |
194 | An example output of `.summary()`:
195 |
196 | | | Coef. | Std.Err | z | P>|z| | [ 0.025 | 0.975 ] | VIF |
197 | |:------------------|----------:|----------:|---------:|------------:|-----------:|----------:|--------:|
198 | | const | -0.844037 | 0.0965117 | -8.74544 | 2.22148e-18 | -1.0332 | -0.654874 | 1.05318 |
199 | | duration.in.month | 0.847445 | 0.248873 | 3.40513 | 0.000661323 | 0.359654 | 1.33524 | 1.14522 |
200 | """
201 |
202 | def __init__(self, target="target", penalty="l2", calculate_stats=True, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver="lbfgs", max_iter=100, multi_class="auto", verbose=0, warm_start=False, n_jobs=None, l1_ratio=None,):
203 | """
204 | Extends [sklearn.linear_model.LogisticRegression.fit()](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).
205 |
206 | Args:
207 | target (str): your dataset's target name
208 | calculate_stats (bool): If true, calculate statistics like standard error during fit, accessible with .summary()
209 | """
210 | super().__init__(penalty=penalty, dual=dual, tol=tol, C=C, fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight=class_weight, random_state=random_state, solver=solver, max_iter=max_iter, multi_class=multi_class, verbose=verbose, warm_start=warm_start, n_jobs=n_jobs, l1_ratio=l1_ratio,)
211 | self.target = target
212 | self.calculate_stats = calculate_stats
213 |
214 | def fit(self, x, sample_weight=None, **kwargs):
215 | y = x[self.target]
216 | x = x.drop(columns=[self.target])
217 |
218 | if not self.calculate_stats:
219 | return super().fit(x, y, sample_weight=sample_weight, **kwargs)
220 |
221 | x = self.convert_sparse_matrix(x)
222 |
223 | if isinstance(x, pd.DataFrame):
224 | self.names_ = ["const"] + [f for f in x.columns]
225 | else:
226 | self.names_ = ["const"] + [f"x{i}" for i in range(x.shape[1])]
227 |
228 | lr = super().fit(x, y, sample_weight=sample_weight, **kwargs)
229 |
230 | predProbs = self.predict_proba(x)
231 |
232 | # Design matrix -- add column of 1's at the beginning of your x matrix
233 | if lr.fit_intercept:
234 | x_design = np.hstack([np.ones((x.shape[0], 1)), x])
235 | else:
236 | x_design = x
237 |
238 | self.vif = [variance_inflation_factor(np.asarray(x_design), i) for i in range(x_design.shape[-1])]
239 | p = np.prod(predProbs, axis=1)
240 | self.cov_matrix_ = np.linalg.inv((x_design * p[..., np.newaxis]).T @ x_design)
241 | std_err = np.sqrt(np.diag(self.cov_matrix_)).reshape(1, -1)
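# The matrix above is the inverse observed Fisher information (X' W X)^-1 with W = diag(p_i · (1 - p_i));
# its diagonal square roots are the standard errors used for the Wald z statistics and the two-sided
# normal-approximation p-values computed below.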
242 |
243 | # In case fit_intercept is set to True, then in the std_error array
244 | # Index 0 corresponds to the intercept, from index 1 onwards it relates to the coefficients
245 | # If fit intercept is False, then all the values are related to the coefficients
246 | if lr.fit_intercept:
247 |
248 | self.std_err_intercept_ = std_err[:, 0]
249 | self.std_err_coef_ = std_err[:, 1:][0]
250 |
251 | self.z_intercept_ = self.intercept_ / self.std_err_intercept_
252 |
253 | # Get p-values under the gaussian assumption
254 | self.p_val_intercept_ = scipy.stats.norm.sf(abs(self.z_intercept_)) * 2
255 |
256 | else:
257 | self.std_err_intercept_ = np.array([np.nan])
258 | self.std_err_coef_ = std_err[0]
259 |
260 | self.z_intercept_ = np.array([np.nan])
261 |
262 | # Get p-values under the gaussian assumption
263 | self.p_val_intercept_ = np.array([np.nan])
264 |
265 | self.z_coef_ = self.coef_ / self.std_err_coef_
266 | self.p_val_coef_ = scipy.stats.norm.sf(abs(self.z_coef_)) * 2
267 |
268 | return self
269 |
270 | def corr(self, data, save=None, annot=True):
271 | corr = data.drop(columns=[self.target]).corr()
272 |
273 | if save:
274 | self.corr_plot(data.drop(columns=[self.target]), save=save, annot=annot)
275 |
276 | return corr
277 |
278 | @staticmethod
279 | def corr_plot(data, figure_size=(16, 8), fontsize=14, color=["#2639E9", "#F76E6C", "#FE7715"], mask=False, save=None, annot=True):
280 | corr = data.corr()
281 | corr_mask = np.zeros_like(corr, dtype=bool)
282 | corr_mask[np.triu_indices_from(corr_mask)] = True
283 |
284 | map_plot = toad.tadpole.tadpole.heatmap(
285 | corr,
286 | mask = corr_mask if mask else None,
287 | cmap = sns.diverging_palette(267, 267, n=10, s=100, l=40),
288 | vmax = 1,
289 | vmin = -1,
290 | center = 0,
291 | square = True,
292 | linewidths = .1,
293 | annot = annot,
294 | fmt = '.2f',
295 | figure_size = figure_size,
296 | )
297 |
298 | map_plot.tick_params(axis='x', labelrotation=270, labelsize=fontsize)
299 | map_plot.tick_params(axis='y', labelrotation=0, labelsize=fontsize)
300 |
301 | if save:
302 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
303 | os.makedirs(os.path.dirname(save))
304 |
305 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight")
306 |
307 | return map_plot
308 |
309 | def report(self, data):
310 | report_dict = classification_report(data[self.target], self.predict(data.drop(columns=self.target)), output_dict=True, target_names=["好客户", "坏客户"])
311 | accuracy = report_dict.pop("accuracy")
312 | _report = pd.DataFrame(report_dict).T.reset_index().rename(columns={"index": "desc"})
313 | _report.loc[len(_report)] = ['accuracy', '', '', accuracy, len(data)]
314 | return _report
315 |
316 | def summary(self):
317 | """
318 | Puts the summary statistics of the fit() function into a pandas DataFrame.
319 | Returns:
320 | data (pandas DataFrame): The statistics dataframe, indexed by the column name
321 | """
322 | check_is_fitted(self)
323 |
324 | if not hasattr(self, "std_err_coef_"):
325 | msg = "Summary statistics were not calculated on .fit(). Options to fix:\n"
326 | msg += "\t- Re-fit using .fit(X, y, calculate_stats=True)\n"
327 | msg += "\t- Re-inititialize using LogisticRegression(calculate_stats=True)"
328 | raise AssertionError(msg)
329 |
330 | data = {
331 | "Coef.": (self.intercept_.tolist() + self.coef_.tolist()[0]),
332 | "Std.Err": (self.std_err_intercept_.tolist() + self.std_err_coef_.tolist()),
333 | "z": (self.z_intercept_.tolist() + self.z_coef_.tolist()[0]),
334 | "P>|z|": (self.p_val_intercept_.tolist() + self.p_val_coef_.tolist()[0]),
335 | }
336 |
337 | stats = pd.DataFrame(data, index=self.names_)
338 | stats["[ 0.025"] = stats["Coef."] - 1.96 * stats["Std.Err"]
339 | stats["0.975 ]"] = stats["Coef."] + 1.96 * stats["Std.Err"]
340 |
341 | stats["VIF"] = self.vif
342 |
343 | return stats
344 |
345 | @staticmethod
346 | def convert_sparse_matrix(x):
347 | """
348 | Converts a sparse matrix to a numpy array.
349 | This can prevent problems arising from, e.g. OneHotEncoder.
350 | Args:
351 | x: numpy array, sparse matrix
352 | Returns:
353 | numpy array of x
354 | """
355 | if scipy.sparse.issparse(x):
356 | return x.toarray()
357 | else:
358 | return x
359 |
360 | def plot_weights(self, save=None, figsize=(15, 8), fontsize=14, color=["#2639E9", "#F76E6C", "#FE7715"]):
361 | summary = self.summary()
362 |
363 | x = summary["Coef."]
364 | y = summary.index
365 | lower_error = summary["Coef."] - summary["[ 0.025"]
366 | upper_error = summary["0.975 ]"] - summary["Coef."]
367 |
368 | fig, ax = plt.subplots(1, 1, figsize=figsize)
369 | ax.errorbar(x, y, xerr=[lower_error, upper_error], fmt="o", ecolor=color[0], elinewidth=2, capthick=2, capsize=4, ms=6, mfc=color[0], mec=color[0])
370 | # ax.tick_params(axis='x', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize)
371 | # ax.tick_params(axis='y', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize)
372 | ax.axvline(0, color=color[0], linestyle='--', ymax=len(y), alpha=0.5)
373 | ax.spines['top'].set_color(color[0])
374 | ax.spines['bottom'].set_color(color[0])
375 | ax.spines['right'].set_color(color[0])
376 | ax.spines['left'].set_color(color[0])
377 | ax.spines['top'].set_visible(False)
378 | ax.spines['right'].set_visible(False)
379 |
380 | ax.set_title("Regression Meta Analysis - Weight Plot", fontsize=fontsize, fontweight="bold")
381 | ax.set_xlabel("Weight Estimates", fontsize=fontsize, weight="bold")
382 | ax.set_ylabel("Variable", fontsize=fontsize, weight="bold")
383 |
384 | if save:
385 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
386 | os.makedirs(os.path.dirname(save))
387 |
388 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight")
389 |
390 | return fig
391 |
392 | # def plot_weights(self, save=None):
393 | # """
394 | # Generates a weight plot(plotly chart) from `stats`
395 | # Example:
396 | # ```
397 | # pipeline = Pipeline([
398 | # ('clf', LogisticRegression(calculate_stats=True))
399 | # ])
400 | # pipeline.fit(X, y)
401 | # stats = pipeline.named_steps['clf'].plot_weights()
402 | # ```
403 | # Args:
404 | # stats: The statistics to display
405 | # format: The format of the image, such as 'png'. The default None returns a plotly image.
406 | # scale: If format is specified, the scale of the image
407 | # width: If format is specified, the width of the image
408 |     #         height: If format is specified, the height of the image
409 | # """
410 | # stats = self.summary()
411 |
412 | # fig = go.Figure()
413 |
414 | # fig.add_trace(
415 | # go.Scatter(
416 | # x=stats['Coef.'],
417 | # y=stats['Coef.'].index,
418 | # line=dict(color='#2639E9', width=2),
419 | # mode='markers',
420 |
421 | # error_x=dict(
422 | # type='data',
423 | # symmetric=False,
424 | # array=stats['0.975 ]'] - stats['Coef.'],
425 | # arrayminus=stats['Coef.'] - stats['[ 0.025'],
426 | # color='#2639E9')
427 | # )
428 | # )
429 |
430 | # fig.add_shape(type="line",
431 | # x0=0, y0=0, x1=0, y1=len(stats),
432 | # line=dict(color="#a29bfe", width=3, dash='dash')
433 | # )
434 |
435 | # fig.update_layout(
436 | # title='Regression Meta Analysis - Weight Plot',
437 | # xaxis_title='Weight Estimates',
438 | # yaxis_title='Variable',
439 | # xaxis_showgrid=False,
440 | # yaxis_showgrid=False
441 | # )
442 |
443 | # fig.update_layout(template="simple_white")
444 |
445 | # if save:
446 | # write_image(fig, save)
447 |
448 | # return fig
449 |
450 |
451 | class ScoreCard(toad.ScoreCard, TransformerMixin):
452 |
453 | def __init__(self, target="target", pdo=60, rate=2, base_odds=35, base_score=750, combiner={}, transer=None, pretrain_lr=None, pipeline=None, **kwargs):
454 | """
455 | 评分卡模型转换
456 |
457 | Args:
458 | target: 数据集中标签名称,默认 target
459 | pdo: odds 每增加 rate 倍时减少 pdo 分,默认 60
460 | rate: 倍率
461 | base_odds: 基础 odds,通常根据业务经验设置的基础比率(违约概率/正常概率),估算方法:(1-样本坏客户占比)/坏客户占比,默认 35,即 35:1 => 0.972 => 坏样本率 2.8%
462 | base_score: 基础 odds 对应的分数,默认 750
463 | combiner: 分箱转换器,传入 pipeline 时可以为None
464 | transer: woe转换器,传入 pipeline 时可以为None
465 | pretrain_lr: 预训练好的逻辑回归模型,可以不传
466 | pipeline: 训练好的 pipeline,必须包含 Combiner 和 WOETransformer
467 | **kwargs: 其他相关参数,具体参考 toad.ScoreCard
468 | """
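        # Illustrative scaling sketch (comment only; assumes toad's convention factor = pdo / ln(rate),
        # offset = base_score - factor * ln(base_odds), score = offset + factor * ln(good:bad odds)):
        #   with pdo=60, rate=2, base_odds=35, base_score=750
        #   factor ≈ 60 / 0.693 ≈ 86.56, offset ≈ 750 - 86.56 * ln(35) ≈ 442.3
        #   doubling the odds to 70 gives ≈ 442.3 + 86.56 * ln(70) ≈ 810 = 750 + pdo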
469 | if pipeline:
470 | combiner = self.class_steps(pipeline, Combiner)[0]
471 | transer = self.class_steps(pipeline, WOETransformer)[0]
472 |
473 | if self.class_steps(pipeline, (ITLubberLogisticRegression, LogisticRegression)):
474 | pretrain_lr = self.class_steps(pipeline, (ITLubberLogisticRegression, LogisticRegression))[0]
475 |
476 | super().__init__(
477 | combiner=combiner.combiner if isinstance(combiner, Combiner) else combiner, transer=transer.transformer if isinstance(transer, WOETransformer) else transer,
478 | pdo=pdo, rate=rate, base_odds=base_odds, base_score=base_score, **kwargs
479 | )
480 |
481 | self.target = target
482 | self.pipeline = pipeline
483 | self.pretrain_lr = pretrain_lr
484 |
485 | def fit(self, x):
486 | y = x[self.target]
487 | x = x.drop(columns=[self.target])
488 |
489 | self._feature_names = x.columns.tolist()
490 |
491 | for f in self.features_:
492 | if f not in self.transer:
493 | raise Exception('column \'{f}\' is not in transer'.format(f = f))
494 |
495 | if self.pretrain_lr:
496 | self.model = self.pretrain_lr
497 | else:
498 | self.model.fit(x, y)
499 |
500 | self.rules = self._generate_rules()
501 |
502 | sub_score = self.woe_to_score(x)
503 | self.base_effect = pd.Series(np.median(sub_score, axis=0), index = self.features_)
504 |
505 | return self
506 |
507 | def transform(self, x):
508 | return self.predict(x)
509 |
510 | def scorecard_scale(self):
511 | scorecard_kedu = pd.DataFrame(
512 | [
513 | ["base_odds", self.base_odds, "根据业务经验设置的基础比率(违约概率/正常概率),估算方法:(1-样本坏客户占比)/坏客户占比"],
514 | ["base_score", self.base_score, "基础ODDS对应的分数"],
515 | ["rate", self.rate, "设置分数的倍率"],
516 | ["pdo", self.pdo, "表示分数增长PDO时,ODDS值增长到RATE倍"],
517 |                 ["B", self.factor, "刻度,计算方式:pdo / ln(rate)"],
518 |                 ["A", self.offset, "补偿值,计算方式:base_score - B * ln(base_odds)"],
519 | ],
520 | columns=["刻度项", "刻度值", "备注"],
521 | )
522 | return scorecard_kedu
523 |
524 | @staticmethod
525 | def KS_bucket(y_pred, y_true, bucket=10, method="quantile"):
526 | return toad.metrics.KS_bucket(y_pred, y_true, bucket=bucket, method=method)
527 |
528 | @staticmethod
529 | def KS(y_pred, y_true):
530 | return toad.metrics.KS(y_pred, y_true)
531 |
532 | @staticmethod
533 | def AUC(y_pred, y_true):
534 | return toad.metrics.AUC(y_pred, y_true)
535 |
536 | @staticmethod
537 | def perf_eva(y_pred, y_true, title="", plot_type=["ks", "roc"], save=None, figsize=(14, 6)):
538 | # plt.figure(figsize=figsize)
539 | rt = sc.perf_eva(y_true, y_pred, title=title, plot_type=plot_type, show_plot=True)
540 |
541 | if save:
542 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
543 | os.makedirs(os.path.dirname(save))
544 |
545 | rt["pic"].savefig(save, dpi=240, format="png", bbox_inches="tight")
546 |
547 | return rt
548 |
549 | @staticmethod
550 | def ks_plot(score, target, title="", fontsize=14, figsize=(16, 8), save=None, colors=["#2639E9", "#F76E6C", "#FE7715"]):
551 | if np.mean(score) < 0 or np.mean(score) > 1:
552 |             warnings.warn('Since the average of pred is not in [0,1], it is treated as a predicted score rather than a probability.')
553 | score = -score
554 |
555 | df = pd.DataFrame({'label': target, 'pred': score})
556 | def n0(x): return sum(x==0)
557 | def n1(x): return sum(x==1)
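        # KS table: sort by predicted value (descending), then track the cumulative share of
        # good and bad samples captured; KS is the largest absolute gap between the two curves.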
558 | df_ks = df.sort_values('pred', ascending=False).reset_index(drop=True) \
559 | .assign(group=lambda x: np.ceil((x.index+1)/(len(x.index)/len(df.index)))) \
560 | .groupby('group')['label'].agg([n0, n1]) \
561 | .reset_index().rename(columns={'n0':'good','n1':'bad'}) \
562 | .assign(
563 | group=lambda x: (x.index+1)/len(x.index),
564 | cumgood=lambda x: np.cumsum(x.good)/sum(x.good),
565 | cumbad=lambda x: np.cumsum(x.bad)/sum(x.bad)
566 | ).assign(ks=lambda x:abs(x.cumbad-x.cumgood))
567 |
568 | fig, ax = plt.subplots(1, 2, figsize = figsize)
569 |
570 | # KS曲线
571 | dfks = df_ks.loc[lambda x: x.ks==max(x.ks)].sort_values('group').iloc[0]
572 |
573 | ax[0].plot(df_ks.group, df_ks.ks, color=colors[0], label="KS曲线")
574 | ax[0].plot(df_ks.group, df_ks.cumgood, color=colors[1], label="累积好客户占比")
575 | ax[0].plot(df_ks.group, df_ks.cumbad, color=colors[2], label="累积坏客户占比")
576 | ax[0].fill_between(df_ks.group, df_ks.cumbad, df_ks.cumgood, color=colors[0], alpha=0.25)
577 |
578 | ax[0].plot([dfks['group'], dfks['group']], [0, dfks['ks']], 'r--')
579 | ax[0].text(dfks['group'], dfks['ks'], f"KS: {round(dfks['ks'],4)} at: {dfks.group:.2%}", horizontalalignment='center', fontsize=fontsize)
580 |
581 | ax[0].spines['top'].set_color(colors[0])
582 | ax[0].spines['bottom'].set_color(colors[0])
583 | ax[0].spines['right'].set_color(colors[0])
584 | ax[0].spines['left'].set_color(colors[0])
585 | ax[0].set_xlabel('% of Population', fontsize=fontsize)
586 | ax[0].set_ylabel('% of Total Bad / Good', fontsize=fontsize)
587 |
588 | ax[0].set_xlim((0, 1))
589 | ax[0].set_ylim((0, 1))
590 |
591 | handles1, labels1 = ax[0].get_legend_handles_labels()
592 |
593 | ax[0].legend(loc='upper center', ncol=len(labels1), bbox_to_anchor=(0.5, 1.1), frameon=False)
594 |
595 | # ROC 曲线
596 | fpr, tpr, thresholds = roc_curve(target, score)
597 | auc_value = toad.metrics.AUC(score, target)
598 |
599 | ax[1].plot(fpr, tpr, color=colors[0], label="ROC Curve")
600 | ax[1].stackplot(fpr, tpr, color=colors[0], alpha=0.25)
601 | ax[1].plot([0, 1], [0, 1], color=colors[1], lw=2, linestyle=':')
602 | # ax[1].tick_params(axis='x', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize)
603 | # ax[1].tick_params(axis='y', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize)
604 | ax[1].text(0.5, 0.5, f"AUC: {auc_value:.4f}", fontsize=fontsize, horizontalalignment="center", transform=ax[1].transAxes)
605 |
606 | ax[1].spines['top'].set_color(colors[0])
607 | ax[1].spines['bottom'].set_color(colors[0])
608 | ax[1].spines['right'].set_color(colors[0])
609 | ax[1].spines['left'].set_color(colors[0])
610 | ax[1].set_xlabel("False Positive Rate", fontsize=fontsize)
611 | ax[1].set_ylabel('True Positive Rate', fontsize=fontsize)
612 |
613 | ax[1].set_xlim((0, 1))
614 | ax[1].set_ylim((0, 1))
615 |
616 | ax[1].yaxis.tick_right()
617 | ax[1].yaxis.set_label_position("right")
618 |
619 | handles2, labels2 = ax[1].get_legend_handles_labels()
620 |
621 | ax[1].legend(loc='upper center', ncol=len(labels2), bbox_to_anchor=(0.5, 1.1), frameon=False)
622 |
623 | if title: title += " "
624 | fig.suptitle(f"{title}K-S & ROC CURVE\n", fontsize=fontsize, fontweight="bold")
625 |
626 | plt.tight_layout()
627 |
628 | if save:
629 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
630 | os.makedirs(os.path.dirname(save))
631 |
632 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight")
633 |
634 | return fig
635 |
636 | @staticmethod
637 | def PSI(y_pred_train, y_pred_oot):
638 | return toad.metrics.PSI(y_pred_train, y_pred_oot)
639 |
640 | @staticmethod
641 | def perf_psi(y_pred_train, y_pred_oot, y_true_train, y_true_oot, keys=["train", "test"], x_limits=None, x_tick_break=50, show_plot=True, return_distr_dat=False):
642 | return sc.perf_psi(
643 | score = {keys[0]: y_pred_train, keys[1]: y_pred_oot},
644 | label = {keys[0]: y_true_train, keys[1]: y_true_oot},
645 | x_limits = x_limits,
646 | x_tick_break = x_tick_break,
647 | show_plot = show_plot,
648 | return_distr_dat = return_distr_dat,
649 | )
650 |
651 | @staticmethod
652 | def score_hist(score, y_true, figsize=(15, 10), bins=20, alpha=1, save=None):
653 | fig, ax = plt.subplots(1, 1, figsize = figsize)
654 | palette = sns.diverging_palette(340, 267, n=2, s=100, l=40)
655 |
656 | sns.histplot(
657 | x=score, hue=y_true.replace({0: "good", 1: "bad"}), element="step", stat="density", bins=bins, common_bins=True, common_norm=True, palette=palette, ax=ax
658 | )
659 |
660 | sns.despine()
661 |
662 | ax.spines['top'].set_color("#2639E9")
663 | ax.spines['bottom'].set_color("#2639E9")
664 | ax.spines['right'].set_color("#2639E9")
665 | ax.spines['left'].set_color("#2639E9")
666 |
667 | ax.set_xlabel("score")
668 | ax.set_ylabel("density")
669 |
670 | ax.legend(["坏样本", "好样本"], loc='upper center', ncol=len(y_true.unique()), bbox_to_anchor=(0.5, 1.05), frameon=False, fontsize=14)
671 |
672 | fig.tight_layout()
673 |
674 | if save:
675 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
676 | os.makedirs(os.path.dirname(save))
677 |
678 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight")
679 |
680 | return fig
681 |
682 | def _format_rule(self, rule, decimal = 2, **kwargs):
683 | bins = self.format_bins(rule['bins'])
684 | scores = np.around(rule['scores'], decimals = decimal).tolist()
685 |
686 | return dict(zip(bins, scores))
687 |
688 | @staticmethod
689 | def class_steps(pipeline, query):
690 | return [v for k, v in pipeline.named_steps.items() if isinstance(v, query)]
691 |
692 | @staticmethod
693 | def round_float(num, decimal = 4):
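        # Truncates (rather than rounds) a float to `decimal` places,
        # e.g. round_float(3.141592) -> 3.1415; non-float / NaN inputs are returned unchanged.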
694 |         if not pd.isnull(num) and isinstance(num, float):
695 | return float(str(num).split(".")[0] + "." + str(num).split(".")[1][:decimal])
696 | else:
697 | return num
698 |
699 | def feature_bins(self, bins, decimal = 4):
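        # Maps split points to readable bin labels, e.g. (illustrative):
        # feature_bins(np.array([0.5, 1.5, np.nan])) ->
        #   {0: '[负无穷 , 0.5)', 1: '[0.5 , 1.5)', 2: '[1.5 , 正无穷)', 3: '缺失值'}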
700 | if isinstance(bins, list): bins = np.array(bins)
701 | EMPTYBINS = len(bins) if not isinstance(bins[0], (set, list, np.ndarray)) else -1
702 |
703 | l = []
704 | if np.issubdtype(bins.dtype, np.number):
705 | has_empty = len(bins) > 0 and np.isnan(bins[-1])
706 | if has_empty: bins = bins[:-1]
707 | sp_l = ["负无穷"] + [self.round_float(b, decimal=decimal) for b in bins.tolist()] + ["正无穷"]
708 | for i in range(len(sp_l) - 1): l.append('['+str(sp_l[i])+' , '+str(sp_l[i+1])+')')
709 | if has_empty: l.append('缺失值')
710 | else:
711 | for keys in bins:
712 | keys_update = set()
713 | for key in keys:
714 | if pd.isnull(key) or key == "nan":
715 | keys_update.add("缺失值")
716 | elif key.strip() == "":
717 | keys_update.add("空字符串")
718 | else:
719 | keys_update.add(key)
720 | label = ','.join(keys_update)
721 | l.append(label)
722 |
723 | return {i if b != "缺失值" else EMPTYBINS: b for i, b in enumerate(l)}
724 |
725 | def feature_bin_stats(self, data, feature, target="target", rules={}, empty_separate=True, method='step', max_n_bins=10, clip_v=None, desc="评分卡分数", verbose=0, combiner=None, ks=False):
726 | if method not in ['dt', 'chi', 'quantile', 'step', 'kmeans', 'cart']:
727 |             raise ValueError("method must be one of ['dt', 'chi', 'quantile', 'step', 'kmeans', 'cart']")
728 |
729 | if combiner is None:
730 | combiner = toad.transform.Combiner()
731 |
732 | if method == "cart":
733 | x = data[feature].values
734 | y = data[target]
735 | _combiner = OptimalBinning(feature, dtype="numerical", max_n_bins=max_n_bins, monotonic_trend="auto_asc_desc", gamma=0.01).fit(x, y)
736 | if _combiner.status == "OPTIMAL":
737 | rules.update({feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.splits] + [np.nan]})
738 | else:
739 | if method == "step":
740 | combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, n_bins=max_n_bins, clip_v=clip_v)
741 | else:
742 | combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, n_bins=max_n_bins)
743 |
744 | if verbose > 0:
745 | print(data[feature].describe())
746 |
747 | if rules and isinstance(rules, list): rules = {feature: rules}
748 | if rules and isinstance(rules, dict): combiner.update(rules)
749 |
750 | feature_bin = combiner.export()[feature]
751 | feature_bin_dict = self.feature_bins(np.array(feature_bin))
752 |
753 | df_bin = combiner.transform(data[[feature, target]], labels=False)
754 |
755 | table = df_bin[[feature, target]].groupby([feature, target]).agg(len).unstack()
756 | table.columns.name = None
757 | table = table.rename(columns = {0 : '好样本数', 1 : '坏样本数'}).fillna(0)
758 | if "好样本数" not in table.columns:
759 | table["好样本数"] = 0
760 | if "坏样本数" not in table.columns:
761 | table["坏样本数"] = 0
762 |
763 | table["指标名称"] = feature
764 | table["指标含义"] = desc
765 | table = table.reset_index().rename(columns={feature: "分箱"})
766 |
767 | table['样本总数'] = table['好样本数'] + table['坏样本数']
768 | table['样本占比'] = table['样本总数'] / table['样本总数'].sum()
769 | table['好样本占比'] = table['好样本数'] / table['好样本数'].sum()
770 | table['坏样本占比'] = table['坏样本数'] / table['坏样本数'].sum()
771 | table['坏样本率'] = table['坏样本数'] / table['样本总数']
772 |
773 | table = table.fillna(0.)
774 |
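        # Per-bin WOE and IV (a 1e-6 epsilon guards against division by zero):
        #   WOE_i = ln(good%_i / bad%_i),  IV = Σ_i (good%_i - bad%_i) · WOE_i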
775 | table['分档WOE值'] = table.apply(lambda x : np.log(x['好样本占比'] / (x['坏样本占比'] + 1e-6)),axis=1)
776 | table['分档IV值'] = table.apply(lambda x : (x['好样本占比'] - x['坏样本占比']) * np.log(x['好样本占比'] / (x['坏样本占比'] + 1e-6)), axis=1)
777 |
778 | table = table.replace(np.inf, 0).replace(-np.inf, 0)
779 |
780 | table['指标IV值'] = table['分档IV值'].sum()
781 |
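        # LIFT_i = bin bad rate / overall bad rate; the cumulative LIFT uses counts
        # accumulated from the first bin down to the current one.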
782 | table["LIFT值"] = table['坏样本率'] / (table["坏样本数"].sum() / table["样本总数"].sum())
783 | table["累积LIFT值"] = (table['坏样本数'].cumsum() / table['样本总数'].cumsum()) / (table["坏样本数"].sum() / table["样本总数"].sum())
784 | # table["累积LIFT值"] = table["LIFT值"].cumsum()
785 |
786 | if ks:
787 | table = table.sort_values("分箱")
788 | table["累积好样本数"] = table["好样本数"].cumsum()
789 | table["累积坏样本数"] = table["坏样本数"].cumsum()
790 | table["分档KS值"] = table["累积坏样本数"] / table['坏样本数'].sum() - table["累积好样本数"] / table['好样本数'].sum()
791 |
792 | table["分箱"] = table["分箱"].map(feature_bin_dict)
793 | table = table.set_index(['指标名称', '指标含义', '分箱']).reindex([(feature, desc, b) for b in feature_bin_dict.values()]).fillna(0).reset_index()
794 |
795 | if ks:
796 | return table[['指标名称', "指标含义", '分箱', '样本总数', '样本占比', '好样本数', '好样本占比', '坏样本数', '坏样本占比', '坏样本率', '分档WOE值', '分档IV值', '指标IV值', 'LIFT值', '累积LIFT值', '累积好样本数', '累积坏样本数', '分档KS值']]
797 | else:
798 | return table[['指标名称', "指标含义", '分箱', '样本总数', '样本占比', '好样本数', '好样本占比', '坏样本数', '坏样本占比', '坏样本率', '分档WOE值', '分档IV值', '指标IV值', 'LIFT值', '累积LIFT值']]
799 |
800 |
801 | if __name__ == '__main__':
802 | # https://github.com/itlubber/openpyxl-excel-style-template/blob/main/pipeline_model.py
803 | plt.ion()
804 |
805 | target = "creditability"
806 | data = sc.germancredit()
807 | data[target] = data[target].map({"good": 0, "bad": 1})
808 |
809 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])
810 | oot = data.copy()
811 | feature_pipeline = Pipeline([
812 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
813 | ("combiner", Combiner(target=target, min_samples=0.2)),
814 | ("transform", WOETransformer(target=target)),
815 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")),
816 | ("stepwise", StepwiseSelection(target=target)),
817 | ])
818 |
819 | feature_pipeline.fit(train)
820 |
821 | woe_train = feature_pipeline.transform(train)
822 | woe_test = feature_pipeline.transform(test)
823 | woe_oot = feature_pipeline.transform(oot)
824 |
825 | # save all bin_plot
826 | _combiner = feature_pipeline.named_steps["combiner"]
827 | for col in woe_train.columns:
828 | if col != target:
829 | _combiner.bin_plot(train, col, labels=True, save=f"outputs/bin_plots/train_{col}.png")
830 | _combiner.bin_plot(test, col, labels=True, save=f"outputs/bin_plots/test_{col}.png")
831 | _combiner.bin_plot(oot, col, labels=True, save=f"outputs/bin_plots/oot_{col}.png")
832 |
833 | # logistic = StatsLogisticRegression(target=target)
834 | logistic = ITLubberLogisticRegression(target=target)
835 |
836 | logistic.fit(woe_train)
837 |
838 | y_pred_train = logistic.predict_proba(woe_train.drop(columns=target))[:, 1]
839 | y_pred_test = logistic.predict_proba(woe_test.drop(columns=target))[:, 1]
840 | y_pred_oot = logistic.predict_proba(woe_oot.drop(columns=target))[:, 1]
841 |
842 | # params_grid = {
843 | # # "logistic__C": [i / 1. for i in range(1, 10, 2)],
844 | # # "logistic__penalty": ["l2"],
845 | # # "logistic__class_weight": [None, "balanced"], # + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10)],
846 | # # "logistic__max_iter": [100],
847 | # # "logistic__solver": ["sag"] # ["liblinear", "sag", "lbfgs", "newton-cg"],
848 | # "logistic__intercept": [True, False],
849 | # }
850 |
851 | # clf = GridSearchCV(feature_pipeline, params_grid, cv=5, scoring='roc_auc', verbose=-1, n_jobs=2, return_train_score=True)
852 | # clf.fit(train, train[target])
853 |
854 | # y_pred_train = clf.best_estimator_.predict(train)
855 | # y_pred_test = clf.best_estimator_.predict(test)
856 |
857 | # print(clf.best_params_)
858 |
859 | # model summary
860 | # logistic.summary_save()
861 |
862 | logistic.plot_weights(save="outputs/logistic_train.png")
863 |
864 | summary = logistic.summary().reset_index().rename(columns={"index": "Features"})
865 |
866 | train_corr = logistic.corr(woe_train, save="outputs/train_corr.png")
867 | test_corr = logistic.corr(woe_test, save="outputs/test_corr.png")
868 | oot_corr = logistic.corr(woe_oot, save="outputs/oot_corr.png")
869 |
870 | train_report = logistic.report(woe_train)
871 | test_report = logistic.report(woe_test)
872 | oot_report = logistic.report(woe_oot)
873 |
874 | print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target]))
875 | print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target]))
876 | print("oot: ", toad.metrics.KS(y_pred_oot, oot[target]), toad.metrics.AUC(y_pred_oot, oot[target]))
877 |
878 | card = ScoreCard(target=target, pipeline=feature_pipeline, pretrain_lr=logistic)
879 | card.fit(woe_train)
880 |
881 | train["score"] = card.predict(train)
882 | test["score"] = card.predict(test)
883 | oot["score"] = card.predict(oot)
884 |
885 | card.perf_eva(train["score"], train[target], title="Train Dataset", save="outputs/train_ksplot.png")
886 | card.perf_eva(test["score"], test[target], title="Test Dataset", save="outputs/test_ksplot.png")
887 | card.perf_eva(oot["score"], oot[target], title="OOT Dataset", save="outputs/oot_ksplot.png")
888 |
889 | card.score_hist(train["score"], train[target], save="outputs/train_scorehist.png")
890 | card.score_hist(test["score"], test[target], save="outputs/test_scorehist.png")
891 | card.score_hist(oot["score"], oot[target], save="outputs/oot_scorehist.png")
892 |
893 | train_score_rank = card.feature_bin_stats(train, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
894 | test_score_rank = card.feature_bin_stats(test, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
895 | oot_score_rank = card.feature_bin_stats(oot, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
896 |
897 | card_points = card.export(to_frame=True)
898 |
899 | writer = pd.ExcelWriter("outputs/评分卡结果验证表.xlsx", engine="openpyxl")
900 |
901 | summary.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=1, index=False)
902 | train_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + 5, index=False)
903 | test_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + 9, index=False)
904 | oot_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + len(test_report) + 13, index=False)
905 |
906 | worksheet = writer.sheets['逻辑回归拟合结果']
907 | worksheet.cell(row=1, column=1).value = "入模变量系数及相关统计指标"
908 | worksheet.cell(row=len(summary) + 5, column=1).value = "训练数据集模型预测报告"
909 | worksheet.cell(row=len(summary) + len(train_report) + 9, column=1).value = "测试数据集模型预测报告"
910 | worksheet.cell(row=len(summary) + len(train_report) + len(test_report) + 13, column=1).value = "跨时间验证集模型预测报告"
911 |
912 | train_corr.to_excel(writer, sheet_name="入模变量相关性", startrow=1, index=True)
913 | test_corr.to_excel(writer, sheet_name="入模变量相关性", startrow=len(train_corr) + 5, index=True)
914 | oot_corr.to_excel(writer, sheet_name="入模变量相关性", startrow=len(train_corr) + len(test_corr) + 9, index=True)
915 |
916 | worksheet = writer.sheets['入模变量相关性']
917 | worksheet.cell(row=2, column=1).value = "训练数据集入模变量相关性"
918 | worksheet.cell(row=len(train_corr) + 6, column=1).value = "测试数据集入模变量相关性"
919 | worksheet.cell(row=len(train_corr) + len(test_corr) + 10, column=1).value = "跨时间验证集入模变量相关性"
920 |
921 | card_points.to_excel(writer, sheet_name="评分卡", index=False)
922 |
923 | train_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=1, index=False)
924 | test_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + 5, index=False)
925 | oot_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + len(test_score_rank) + 9, index=False)
926 |
927 | worksheet = writer.sheets['评分卡排序性']
928 |
929 | worksheet.cell(row=1, column=1).value = "训练数据集评分排序性"
930 | worksheet.cell(row=len(train_score_rank) + 5, column=1).value = "测试数据集评分排序性"
931 | worksheet.cell(row=len(train_score_rank) + len(test_score_rank) + 9, column=1).value = "跨时间验证集评分排序性"
932 |
933 | writer.close()
934 |
935 | from utils.tools import render_excel
936 |
937 | render_excel("outputs/评分卡结果验证表.xlsx", border=False)
938 |
939 |
--------------------------------------------------------------------------------