├── utils
│   ├── 报告输出模版.xlsx
│   ├── matplot_chinese.ttf
│   ├── __init__.py
│   ├── perf_eva.py
│   ├── excel_writer.py
│   └── tools.py
├── requertments.txt
├── clear_cache.sh
├── LICENSE
├── .gitignore
├── README.md
├── tree_ming.py
├── rules_auto_mining.py
├── processing.py
├── main.py
└── model.py

/utils/报告输出模版.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/LogisticRegressionPipeline/HEAD/utils/报告输出模版.xlsx
--------------------------------------------------------------------------------
/requertments.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | numpy<1.20
3 | ortools>=9.4
4 | ropwr>=0.4.0
5 | scikit-learn>=1.0.2
6 | scipy>=1.6.0
7 |
--------------------------------------------------------------------------------
/utils/matplot_chinese.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itlubber/LogisticRegressionPipeline/HEAD/utils/matplot_chinese.ttf
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2023/2/14 09:08
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 |
--------------------------------------------------------------------------------
/clear_cache.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PYSTRING="$(find . | grep -E "(__pycache__|\.pyc|\.pyo$)")"
4 | IPYNBSTRING="$(find . | grep -E "(ipynb_checkpoints|\.ipynb$)")"
5 |
6 | # 删除 __pycache__ 缓存文件
7 | if [ -n "$PYSTRING" ]; then
8 | echo "删除以下缓存文件 :"
9 | echo "-----------------------------------------------------"
10 | echo "$PYSTRING"
11 | echo "-----------------------------------------------------"
12 | find . | grep -E "(__pycache__|\.pyc|\.pyo$)" | xargs rm -rf
13 | else
14 | echo "不存在 __pycache__ 缓存文件"
15 | fi
16 |
17 | # # 删除 ipynb_checkpoints 缓存文件
18 | # if [ -n "$IPYNBSTRING" ]; then
19 | # echo "删除以下缓存文件 :"
20 | # echo "-----------------------------------------------------"
21 | # echo "$IPYNBSTRING"
22 | # echo "-----------------------------------------------------"
23 | # find . | grep -E "(ipynb_checkpoints|\.ipynb$)" | xargs rm -rf
24 | # else
25 | # echo "不存在 ipynb_checkpoints 缓存文件"
26 | # fi
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 itlubber
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 | *.ipynb
131 | *.zip
132 | .DS_store
133 | catboost_info/
134 | test.py
135 | .idea
136 | .vscode
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Logistic regression that works with `hyperparameter search` & `pipeline`
2 |
3 | ## Contact
4 |
5 |
8 | WeChat: itlubber
11 | WeChat official account: itlubber_art
35 | > `ITLubberLogisticRegression`, built on top of `sklearn`
36 |
44 | ## Usage
45 |
46 | ```python
47 | target = "creditability"
48 | data = sc.germancredit()
49 | data[target] = data[target].map({"good": 0, "bad": 1})
50 |
51 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])
52 | oot = data.copy()
53 | feature_pipeline = Pipeline([
54 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
55 | ("combiner", Combiner(target=target, min_samples=0.2)),
56 | ("transform", WOETransformer(target=target)),
57 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")),
58 | ("stepwise", StepwiseSelection(target=target)),
59 | ])
60 |
61 | feature_pipeline.fit(train)
62 |
63 | woe_train = feature_pipeline.transform(train)
64 | woe_test = feature_pipeline.transform(test)
65 | woe_oot = feature_pipeline.transform(oot)
66 |
67 | # logistic = StatsLogisticRegression(target=target)
68 | logistic = ITLubberLogisticRegression(target=target)
69 |
70 | logistic.fit(woe_train)
71 |
72 | y_pred_train = logistic.predict_proba(woe_train.drop(columns=target))[:, 1]
73 | y_pred_test = logistic.predict_proba(woe_test.drop(columns=target))[:, 1]
74 | y_pred_oot = logistic.predict_proba(woe_oot.drop(columns=target))[:, 1]
75 |
76 | # params_grid = {
77 | # # "logistic__C": [i / 1. for i in range(1, 10, 2)],
78 | # # "logistic__penalty": ["l2"],
79 | # # "logistic__class_weight": [None, "balanced"], # + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10)],
80 | # # "logistic__max_iter": [100],
81 | # # "logistic__solver": ["sag"] # ["liblinear", "sag", "lbfgs", "newton-cg"],
82 | # "logistic__intercept": [True, False],
83 | # }
84 |
85 | # clf = GridSearchCV(feature_pipeline, params_grid, cv=5, scoring='roc_auc', verbose=-1, n_jobs=2, return_train_score=True)
86 | # clf.fit(train, train[target])
87 |
88 | # y_pred_train = clf.best_estimator_.predict(train)
89 | # y_pred_test = clf.best_estimator_.predict(test)
90 |
91 | # print(clf.best_params_)
92 |
93 | # model summary
94 | # logistic.summary_save()
95 | # logistic.plot_weights(save="logistic_train.png")
96 | summary = logistic.summary().reset_index().rename(columns={"index": "Features"})
97 |
98 | train_report = logistic.report(woe_train)
99 | test_report = logistic.report(woe_test)
100 | oot_report = logistic.report(woe_oot)
101 |
102 | print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target]))
103 | print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target]))
104 | print("oot: ", toad.metrics.KS(y_pred_oot, oot[target]), toad.metrics.AUC(y_pred_oot, oot[target]))
105 |
106 | card = ScoreCard(target=target, pipeline=feature_pipeline, pretrain_lr=logistic)
107 | card.fit(woe_train)
108 |
109 | train["score"] = card.predict(train)
110 | test["score"] = card.predict(test)
111 | oot["score"] = card.predict(oot)
112 |
113 | # print(card.feature_bin_stats(train, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step"))
114 | # print(card.feature_bin_stats(train, "score", target=target, verbose=0, method="cart"))
115 |
116 | train_score_rank = card.feature_bin_stats(train, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
117 | test_score_rank = card.feature_bin_stats(test, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
118 | oot_score_rank = card.feature_bin_stats(oot, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
119 |
120 | writer = pd.ExcelWriter("评分卡结果验证表.xlsx", engine="openpyxl")
121 |
122 | summary.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=1, index=False)
123 | train_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + 5, index=False)
124 | test_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + 9, index=False)
125 | oot_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + len(test_report) + 13, index=False)
126 |
127 | worksheet = writer.sheets['逻辑回归拟合结果']
128 | worksheet.cell(row=1, column=1).value = "入模变量系数及相关统计指标"
129 | worksheet.cell(row=len(summary) + 5, column=1).value = "训练数据集模型预测报告"
130 | worksheet.cell(row=len(summary) + len(train_report) + 9, column=1).value = "测试数据集模型预测报告"
131 | worksheet.cell(row=len(summary) + len(train_report) + len(test_report) + 13, column=1).value = "跨时间验证集模型预测报告"
132 |
133 | train_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=1, index=False)
134 | test_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + 5, index=False)
135 | oot_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + len(test_score_rank) + 9, index=False)
136 |
137 | worksheet = writer.sheets['评分卡排序性']
138 |
139 | worksheet.cell(row=1, column=1).value = "训练数据集评分排序性"
140 | worksheet.cell(row=len(train_score_rank) + 5, column=1).value = "测试数据集评分排序性"
141 | worksheet.cell(row=len(train_score_rank) + len(test_score_rank) + 9, column=1).value = "跨时间验证集评分排序性"
142 |
143 | writer.close()
144 |
145 | from utils import render_excel
146 |
147 | render_excel("评分卡结果验证表.xlsx", border=False)
148 | ```
149 |
150 |
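The repository title also advertises hyperparameter search, and the commented-out `GridSearchCV` block above hints at how that is meant to work. Below is a minimal sketch of the same idea, assuming the custom steps (`FeatureSelection`, `Combiner`, `WOETransformer`, `StepwiseSelection`, `ITLubberLogisticRegression`) behave as sklearn-compatible estimators and that the parameter names in `params_grid` exist on `ITLubberLogisticRegression` (they are illustrative, taken from the commented block, not confirmed here):

```python
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# put feature engineering and the logistic regression into one pipeline so the
# final estimator's parameters can be tuned end to end
search_pipeline = Pipeline([
    ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
    ("combiner", Combiner(target=target, min_samples=0.2)),
    ("transform", WOETransformer(target=target)),
    ("stepwise", StepwiseSelection(target=target)),
    ("logistic", ITLubberLogisticRegression(target=target)),
])

# illustrative search space: adjust to whatever parameters the estimator exposes
params_grid = {
    "logistic__C": [0.2, 0.6, 1.0],
    "logistic__class_weight": [None, "balanced"],
}

clf = GridSearchCV(search_pipeline, params_grid, cv=5, scoring="roc_auc", n_jobs=2, return_train_score=True)
clf.fit(train, train[target])
print(clf.best_params_)
```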
151 | ## References
152 |
153 | > https://github.com/ing-bank/skorecard/blob/main/skorecard/linear_model/linear_model.py
154 | >
155 | > https://github.com/itlubber/openpyxl-excel-style-template/blob/main/pipeline_model.py
156 | >
--------------------------------------------------------------------------------
/tree_ming.py:
--------------------------------------------------------------------------------
1 | import os
2 | import graphviz
3 | import warnings
4 | import numpy as np
5 | import pandas as pd
6 | import matplotlib.pyplot as plt
7 | from matplotlib import font_manager
8 | import dtreeviz
9 |
10 | import category_encoders as ce
11 | from sklearn.preprocessing import LabelEncoder
12 | from sklearn.tree import _tree, DecisionTreeClassifier, plot_tree, export_graphviz
13 | from IPython.display import display  # the display() calls in __main__ assume an IPython / Jupyter session
14 |
15 | warnings.filterwarnings("ignore")
16 | pd.set_option('display.width', 5000)
17 | plt.style.use('seaborn-ticks')
18 | plt.rcParams["font.sans-serif"]=["SimHei"]
19 | plt.rcParams["axes.unicode_minus"]=False
20 |
21 |
22 | def get_dt_rules(tree, feature_names, total_bad_rate, total_count):
23 | tree_ = tree.tree_
24 | left = tree.tree_.children_left
25 | right = tree.tree_.children_right
26 | feature_name = [feature_names[i] if i != -2 else "undefined!" for i in tree_.feature]
27 | rules=dict()
28 |
29 | global res_df
30 | res_df = pd.DataFrame()
31 |
32 | def recurse(node, depth, parent): # 搜每个节点的规则
33 |
34 | if tree_.feature[node] != -2: # 非叶子节点,搜索每个节点的规则
35 | name = feature_name[node]
36 | thd = np.round(tree_.threshold[node],3)
37 | s= "{} <= {} ".format( name, thd, node )
38 | # 左子
39 | if node == 0:
40 | rules[node]=s
41 | else:
42 | rules[node]=rules[parent]+' & ' +s
43 | recurse(left[node], depth + 1, node)
44 | s="{} > {}".format(name, thd)
45 | # 右子
46 | if node == 0:
47 | rules[node]=s
48 | else:
49 | rules[node]=rules[parent]+' & ' +s
50 | recurse(right[node], depth + 1, node)
51 | else:
52 | df = pd.DataFrame()
53 | df['组合策略'] = rules[parent],
54 | df['好样本数'] = tree_.value[node][0][0].astype(int)
55 | df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate))
56 | df['坏样本数'] = tree_.value[node][0][1].astype(int)
57 | df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate)
58 | df['命中数'] = df['好样本数'] + df['坏样本数']
59 | df['命中率'] = df['命中数'] / total_count
60 | df['坏率'] = df['坏样本数'] / df['命中数']
61 | df['样本整体坏率'] = total_bad_rate
62 | df['LIFT值'] = df['坏率'] / df['样本整体坏率']
63 |
64 | global res_df
65 |
66 | res_df = pd.concat([res_df, df], axis=0)
67 |
68 | recurse(0, 1, 0)
69 |
70 | return res_df.sort_values("LIFT值", ascending=True).reset_index(drop=True)
71 |
72 |
73 | def dtreeviz_plot(tree, X_TE, y, target="target", save=None):
74 | viz_model = dtreeviz.model(tree,
75 | X_train=X_TE, y_train=y,
76 | feature_names=X_TE.columns,
77 | target_name=target, class_names=["GOOD", f"BAD"])
78 | viz = viz_model.view(
79 | scale=1.5,
80 | orientation='LR',
81 | colors={
82 | "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]],
83 | "arrow": "#2639E9",
84 | 'text_wedge': "#F76E6C",
85 | "pie": "#2639E9",
86 | "tile_alpha": 1,
87 | "legend_edge": "#FFFFFF",
88 | },
89 | ticks_fontsize=10,
90 | label_fontsize=10,
91 | )
92 |
93 | # viz = dtreeviz.model(
94 | # decision_tree,
95 | # X_TE,
96 | # y,
97 | # # title="DecisionTreeClassifier",
98 | # # title_fontsize=10,
99 | # ticks_fontsize=10,
100 | # label_fontsize=10,
101 | # target_name=target,
102 | # feature_names=X_TE.columns,
103 | # class_names=["good", "bad"],
104 | # orientation='LR',
105 | # scale=1.5,
106 | # colors={
107 | # "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]],
108 | # "arrow": "#2639E9",
109 | # 'text_wedge': "#F76E6C",
110 | # "pie": "#2639E9",
111 | # "tile_alpha": 1,
112 | # "legend_edge": "#FFFFFF",
113 | # },
114 | # )
115 |
116 | if save:
117 | viz.save(save)
118 |
119 | return viz
120 |
121 |
122 | if __name__ == '__main__':
123 | import scorecardpy as sc
124 |
125 | target = "creditability"
126 | data = sc.germancredit()
127 | data[target] = data[target].map({"good": 0, "bad": 1})
128 |
129 | cat_features = list(set(data.select_dtypes(include=[object, pd.CategoricalDtype]).columns) - set([target]))
130 | cat_features_index = [i for i, f in enumerate(data.columns) if f in cat_features]
131 |
132 | X = data.drop(columns=[target])
133 | y = data[target]
134 |
135 | target_enc = ce.TargetEncoder(cols=cat_features)
136 | target_enc.fit(X[cat_features], y)
137 |
138 | X_TE = X.join(target_enc.transform(X[cat_features]).add_suffix('_target'))
139 |
140 | target_enc.target_mapping = {}
141 | for col in cat_features:
142 | mapping = X_TE[[col, f"{col}_target"]].drop_duplicates()
143 | target_enc.target_mapping[col] = dict(zip(mapping[col], mapping[f"{col}_target"]))
144 |
145 | X_TE = X_TE.drop(columns=cat_features)
146 | X_TE = X_TE.rename(columns={f"{c}_target": c for c in cat_features})
147 |
148 | removes = []
149 | dt_rules = pd.DataFrame()
150 | feature_map = {}  # placeholder: optional mapping from raw feature names to readable labels
151 | for i in range(128):
152 | decision_tree = DecisionTreeClassifier(max_depth=2, min_samples_split=8, min_samples_leaf=5, max_features="sqrt")
153 | decision_tree = decision_tree.fit(X_TE, y)
154 |
155 | if decision_tree.score(X_TE, y) < 0.8:
156 | break
157 |
158 | rules = get_dt_rules(decision_tree, X_TE.columns, sum(y) / len(y), len(y))
159 | viz_model = dtreeviz.model(decision_tree,
160 | X_train=X_TE, y_train=y,
161 | feature_names=X_TE.columns,
162 | target_name=target, class_names=["DPD 0", f"DPD {dpd}+"])
163 |
164 | rules = rules.query("LIFT值 > 4 & 命中率 < 0.1")
165 |
166 | if len(rules) > 0:
167 | print("/" * 150)
168 | rules["组合策略"] = rules["组合策略"].replace(feature_map, regex=True)
169 | display(rules)
170 | c = viz_model.view(
171 | scale=1.5,
172 | orientation='LR',
173 | colors={
174 | "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]],
175 | "arrow": "#2639E9",
176 | 'text_wedge': "#F76E6C",
177 | "pie": "#2639E9",
178 | "tile_alpha": 1,
179 | "legend_edge": "#FFFFFF",
180 | },
181 | ticks_fontsize=10,
182 | label_fontsize=10,
183 | )
184 | display(c)
185 |
186 | dt_rules = pd.concat([dt_rules, rules]).reset_index(drop=True)
187 | removes.append(decision_tree.feature_names_in_[list(decision_tree.feature_importances_).index(max(decision_tree.feature_importances_))])
188 | X_TE = X_TE.drop(columns=removes[-1])
189 | print("-" * 150)
190 |
191 | pd.set_option('display.max_row', None)
192 | dt_rules.sort_values(["LIFT值", "命中率"], ascending=False)
193 |
194 | # decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2)
195 | # decision_tree = decision_tree.fit(X_TE, y)
196 |
197 | # rules = get_dt_rules(decision_tree, X_TE.columns, sum(y) / len(y), len(y))
198 |
199 | # dtreeviz_plot(decision_tree, X_TE, y, save="decision_tree.svg")
200 | # rules.to_excel("组合策略挖掘.xlsx")
201 |
202 | # dot_data = export_graphviz(decision_tree, feature_names=X_TE.columns, class_names=True, filled=True, rounded=False, out_file=None)
203 | # graph = graphviz.Source(dot_data)
204 |
205 | # graph.render("组合策略挖掘")
206 |
--------------------------------------------------------------------------------
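The per-leaf statistics assembled in `get_dt_rules` above are plain ratios of the leaf's class counts; a tiny worked example with made-up numbers shows what the `命中率`, `坏率` and `LIFT值` columns mean:

```python
# illustrative counts only: a leaf catches 20 good and 30 bad samples out of
# 1,000 rows with 100 bad samples overall (base bad rate 10%)
total_count, total_bad = 1000, 100
leaf_good, leaf_bad = 20, 30

hit_rate = (leaf_good + leaf_bad) / total_count      # 命中率: 0.05
bad_rate = leaf_bad / (leaf_good + leaf_bad)         # 坏率: 0.6
overall_bad_rate = total_bad / total_count           # 样本整体坏率: 0.1
lift = bad_rate / overall_bad_rate                   # LIFT值: 6.0, i.e. the leaf is 6x worse than average
print(hit_rate, bad_rate, overall_bad_rate, lift)
```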
/rules_auto_mining.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cairosvg
3 | import graphviz
4 | import dtreeviz
5 | import warnings
6 | import numpy as np
7 | import pandas as pd
8 | from openpyxl.utils import get_column_letter, column_index_from_string  # module-level import: insert_dt_rules() needs get_column_letter
9 | import category_encoders as ce
10 | from sklearn.preprocessing import LabelEncoder
11 | from sklearn.tree import _tree, DecisionTreeClassifier, plot_tree, export_graphviz
12 |
13 |
14 | warnings.filterwarnings("ignore")
15 |
16 |
17 | class ParseDecisionTreeRules:
18 |
19 | def __init__(self, target="target", labels=["positive", "negative"], feature_map={}, nan=-1., max_iter=128, output="model_report/auto_mining_rules/决策树组合策略挖掘.xlsx", writer=None):
20 | self.target = target
21 | self.labels = labels
22 | self.feature_map = feature_map
23 | self.nan = nan
24 | self.max_iter = max_iter
25 | self.output = output
26 | self.decision_trees = []
27 | self.target_enc = None
28 | self.feature_names = None
29 | self.dt_rules = pd.DataFrame()
30 | self.end_row = 2
31 | self.start_col = 2
32 | self.describe_columns = ["组合策略", "命中数", "命中率", "好样本数", "好样本占比", "坏样本数", "坏样本占比", "坏率", "样本整体坏率", "LIFT值"]
33 |
34 | if output:
35 | from utils.excel_writer import ExcelWriter
36 | from openpyxl.utils import get_column_letter, column_index_from_string
37 | # init_setting()  # not defined or imported in this module; re-enable once it is imported
38 | if writer:
39 | self.writer = writer
40 | else:
41 | self.writer = ExcelWriter(style_excel="./utils/报告输出模版.xlsx", theme_color="2639E9")
42 |
43 | self.worksheet = self.writer.get_sheet_by_name("决策树组合策略挖掘")
44 |
45 | def encode_cat_features(self, X, y):
46 | cat_features = list(set(X.select_dtypes(include=[object, pd.CategoricalDtype]).columns))
47 | cat_features_index = [i for i, f in enumerate(X.columns) if f in cat_features]
48 |
49 | if len(cat_features) > 0:
50 | if self.target_enc is None:
51 | self.target_enc = ce.TargetEncoder(cols=cat_features)
52 | self.target_enc.fit(X[cat_features], y)
53 | self.target_enc.target_mapping = {}
54 | X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target'))
55 | for col in cat_features:
56 | mapping = X_TE[[col, f"{col}_target"]].drop_duplicates()
57 | self.target_enc.target_mapping[col] = dict(zip(mapping[col], mapping[f"{col}_target"]))
58 | else:
59 | X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target'))
60 |
61 | X_TE = X_TE.drop(columns=cat_features)
62 | return X_TE.rename(columns={f"{c}_target": c for c in cat_features})
63 | else:
64 | return X
65 |
66 | def get_dt_rules(self, tree, feature_names, total_bad_rate, total_count):
67 | tree_ = tree.tree_
68 | left = tree.tree_.children_left
69 | right = tree.tree_.children_right
70 | feature_name = [feature_names[i] if i != -2 else "undefined!" for i in tree_.feature]
71 | rules=dict()
72 |
73 | global res_df
74 | res_df = pd.DataFrame()
75 |
76 | def recurse(node, depth, parent): # 搜每个节点的规则
77 |
78 | if tree_.feature[node] != -2: # 非叶子节点,搜索每个节点的规则
79 | name = feature_name[node]
80 | thd = np.round(tree_.threshold[node],3)
81 | s= "{} <= {} ".format( name, thd, node )
82 | # 左子
83 | if node == 0:
84 | rules[node]=s
85 | else:
86 | rules[node]=rules[parent]+' & ' +s
87 | recurse(left[node], depth + 1, node)
88 | s="{} > {}".format(name, thd)
89 | # 右子
90 | if node == 0:
91 | rules[node]=s
92 | else:
93 | rules[node]=rules[parent]+' & ' +s
94 | recurse(right[node], depth + 1, node)
95 | else:
96 | df = pd.DataFrame()
97 | df['组合策略'] = rules[parent],
98 | df['好样本数'] = tree_.value[node][0][0].astype(int)
99 | df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate))
100 | df['坏样本数'] = tree_.value[node][0][1].astype(int)
101 | df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate)
102 | df['命中数'] = df['好样本数'] + df['坏样本数']
103 | df['命中率'] = df['命中数'] / total_count
104 | df['坏率'] = df['坏样本数'] / df['命中数']
105 | df['样本整体坏率'] = total_bad_rate
106 | df['LIFT值'] = df['坏率'] / df['样本整体坏率']
107 |
108 | global res_df
109 |
110 | res_df = pd.concat([res_df, df], axis=0)
111 |
112 | recurse(0, 1, 0)
113 |
114 | return res_df.sort_values("LIFT值", ascending=True)[self.describe_columns].reset_index(drop=True)
115 |
116 | def select_dt_rules(self, decision_tree, x, y, lift=3., max_samples=0.05, labels=["positive", "negative"], save=None, verbose=False, drop=False):
117 | rules = self.get_dt_rules(decision_tree, x.columns, sum(y) / len(y), len(y))
118 | viz_model = dtreeviz.model(decision_tree,
119 | X_train=x,
120 | y_train=y,
121 | feature_names=x.columns,
122 | target_name=self.target,
123 | class_names=labels,
124 | )
125 | rules = rules.query(f"LIFT值 >= {lift} & 命中率 <= {max_samples}").reset_index(drop=True)
126 |
127 | if len(rules) > 0:
128 | decision_tree_viz = viz_model.view(
129 | scale=1.5,
130 | orientation='LR',
131 | colors={
132 | "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]],
133 | "arrow": "#2639E9",
134 | 'text_wedge': "#F76E6C",
135 | "pie": "#2639E9",
136 | "tile_alpha": 1,
137 | "legend_edge": "#FFFFFF",
138 | },
139 | ticks_fontsize=10,
140 | label_fontsize=10,
141 | )
142 | if verbose:
143 | if self.feature_map is not None and len(self.feature_map) > 0:
144 | display(rules.replace(self.feature_map, regex=True))
145 | else:
146 | display(rules)
147 | display(decision_tree_viz)
148 | if save:
149 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
150 | os.makedirs(os.path.dirname(save))
151 |
152 | decision_tree_viz.save("combine_rules_cache.svg")
153 | cairosvg.svg2png(url="combine_rules_cache.svg", write_to=save, dpi=240)
154 |
155 | if drop:
156 | return rules, decision_tree.feature_names_in_[list(decision_tree.feature_importances_).index(max(decision_tree.feature_importances_))]
157 | else:
158 | return rules
159 |
160 | def query_dt_rules(self, x, y, parsed_rules=None):
161 | total_count = len(y)
162 | total_bad_rate = y.sum() / len(y)
163 |
164 | rules = pd.DataFrame()
165 | for rule in parsed_rules["组合策略"].unique():
166 | select_index = x.query(rule).index
167 | if len(select_index) > 0:
168 | y_select = y[select_index]
169 | df = pd.Series()
170 | df['组合策略'] = rule
171 | df['好样本数'] = len(y_select) - y_select.sum()
172 | df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate))
173 | df['坏样本数'] = y_select.sum()
174 | df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate)
175 | df['命中数'] = df['好样本数'] + df['坏样本数']
176 | df['命中率'] = df['命中数'] / total_count
177 | df['坏率'] = df['坏样本数'] / df['命中数']
178 | df['样本整体坏率'] = total_bad_rate
179 | df['LIFT值'] = df['坏率'] / df['样本整体坏率']
180 | else:
181 | df = pd.Series({'组合策略': rule,'好样本数': 0,'好样本占比': 0.,'坏样本数': 0,'坏样本占比': 0.,'命中数': 0,'命中率': 0.,'坏率': 0.,'样本整体坏率': total_bad_rate,'LIFT值': 0.,})
182 |
183 | rules = pd.concat([rules, pd.DataFrame(df).T]).reset_index(drop=True)
184 |
185 | return rules[self.describe_columns]
186 |
187 | def insert_dt_rules(self, parsed_rules, end_row, start_col, save=None):
188 | end_row, end_col = self.writer.insert_df2sheet(self.worksheet, parsed_rules, (end_row + 2, start_col))
189 |
190 | for c in ['好样本占比', '坏样本占比', '命中率', '坏率', '样本整体坏率', 'LIFT值']:
191 | conditional_column = get_column_letter(start_col + parsed_rules.columns.get_loc(c))
192 | self.writer.set_number_format(self.worksheet, f"{conditional_column}{end_row - len(parsed_rules)}:{conditional_column}{end_row - 1}", "0.00%")
193 | for c in ["坏率", "LIFT值"]:
194 | conditional_column = get_column_letter(start_col + parsed_rules.columns.get_loc(c))
195 | self.writer.add_conditional_formatting(self.worksheet, f'{conditional_column}{end_row - len(parsed_rules)}', f'{conditional_column}{end_row - 1}')
196 |
197 | if save is not None:
198 | end_row, end_col = self.writer.insert_pic2sheet(self.worksheet, save, (end_row + 1, start_col), figsize=(400, 300))
199 |
200 | return end_row, end_col
201 |
202 | def fit(self, x, y=None, max_depth=2, lift=3, max_samples=0.2, min_score=None, verbose=False, **kwargs):
203 | y = x[self.target]
204 | X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y)
205 | X_TE = X_TE.fillna(self.nan)
206 |
207 | self.feature_names = list(X_TE.columns)
208 |
209 | for i in range(self.max_iter):
210 | decision_tree = DecisionTreeClassifier(max_depth=max_depth, **kwargs)
211 | decision_tree = decision_tree.fit(X_TE, y)
212 |
213 | if (min_score is not None and decision_tree.score(X_TE, y) < min_score) or len(X_TE.columns) < max_depth:
214 | break
215 |
216 | try:
217 | parsed_rules, remove = self.select_dt_rules(decision_tree, X_TE, y, lift=lift, max_samples=max_samples, labels=self.labels, verbose=verbose, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png", drop=True)
218 |
219 | if len(parsed_rules) > 0:
220 | self.dt_rules = pd.concat([self.dt_rules, parsed_rules]).reset_index(drop=True)
221 |
222 | if self.writer is not None:
223 | if self.feature_map is not None and len(self.feature_map) > 0:
224 | parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True)
225 | self.end_row, _ = self.insert_dt_rules(parsed_rules, self.end_row, self.start_col, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png")
226 |
227 | X_TE = X_TE.drop(columns=remove)
228 | self.decision_trees.append(decision_tree)
229 | except:
230 | pass
231 |
232 | return self
233 |
234 | def transform(self, x, y=None):
235 | y = x[self.target]
236 | X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y)
237 | X_TE = X_TE.fillna(self.nan)
238 | parsed_rules = self.query_dt_rules(X_TE, y, parsed_rules=self.dt_rules)
239 | if self.feature_map is not None and len(self.feature_map) > 0:
240 | parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True)
241 | return parsed_rules
242 |
243 | def insert_all_rules(self, val=None, test=None):
244 | parsed_rules_train = self.dt_rules.copy()
245 | if self.feature_map is not None and len(self.feature_map) > 0:
246 | parsed_rules_train["组合策略"] = parsed_rules_train["组合策略"].replace(self.feature_map, regex=True)
247 | self.end_row, _ = self.writer.insert_value2sheet(self.worksheet, (self.end_row + 2, self.start_col), value="训练集决策树组合策略")
248 | self.end_row, _ = self.insert_dt_rules(parsed_rules_train, self.end_row, self.start_col)
249 |
250 | if val is not None:
251 | parsed_rules_val = self.transform(val)
252 | self.end_row, _ = self.writer.insert_value2sheet(self.worksheet, (self.end_row + 2, self.start_col), value="验证集决策树组合策略")
253 | self.end_row, _ = self.insert_dt_rules(parsed_rules_val, self.end_row, self.start_col)
254 |
255 | if test is not None:
256 | parsed_rules_test = self.transform(test)
257 | self.end_row, _ = self.writer.insert_value2sheet(self.worksheet, (self.end_row + 2, self.start_col), value="测试集决策树组合策略")
258 | self.end_row, _ = self.insert_dt_rules(parsed_rules_test, self.end_row, self.start_col)
259 |
260 | def save(self):
261 | self.writer.save(self.output)
262 |
263 |
264 | if __name__ == '__main__':
265 | pdtr = ParseDecisionTreeRules(target=target, feature_map=feature_map, max_iter=8)
266 | pdtr.fit(train, lift=3., max_depth=2, max_samples=0.1, verbose=False, min_samples_split=8, min_samples_leaf=5, max_features="sqrt")
267 | pdtr.insert_all_rules(test=test)
268 | pdtr.save()
269 |
--------------------------------------------------------------------------------
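The `__main__` block of `rules_auto_mining.py` assumes `target`, `feature_map`, `train` and `test` already exist in the session. A minimal self-contained sketch of driving `ParseDecisionTreeRules`, with data preparation borrowed from `tree_ming.py` (the empty `feature_map` and the 70/30 split are illustrative assumptions, and the class still expects `./utils/报告输出模版.xlsx` and its output directory to be reachable from the working directory):

```python
import scorecardpy as sc
from sklearn.model_selection import train_test_split

target = "creditability"
data = sc.germancredit()
data[target] = data[target].map({"good": 0, "bad": 1})

# stratified split so train and test keep the same bad rate
train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])

pdtr = ParseDecisionTreeRules(target=target, feature_map={}, max_iter=8)
pdtr.fit(train, lift=3., max_depth=2, max_samples=0.1, verbose=False, min_samples_split=8, min_samples_leaf=5)
pdtr.insert_all_rules(test=test)
pdtr.save()
```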
/utils/perf_eva.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import re
3 | import numpy as np
4 | import pandas as pd
5 | import matplotlib.pyplot as plt
6 | import warnings
7 | from pandas.api.types import is_numeric_dtype
8 |
9 |
10 | def check_y(dat, y, positive):
11 | positive = str(positive)
12 | # ncol of dt
13 | if isinstance(dat, pd.DataFrame) & (dat.shape[1] <= 1):
14 | raise Exception("Incorrect inputs; dat should be a DataFrame with at least two columns.")
15 |
16 | # y ------
17 | if isinstance(y, str):
18 | y = [y]
19 | # length of y == 1
20 | if len(y) != 1:
21 | raise Exception("Incorrect inputs; the length of y should be one")
22 |
23 | y = y[0]
24 | # y not in dat.columns
25 | if y not in dat.columns:
26 | raise Exception("Incorrect inputs; there is no \'{}\' column in dat.".format(y))
27 |
28 | # remove na in y
29 | if pd.isna(dat[y]).any():
30 | warnings.warn("There are NaNs in \'{}\' column. The rows with NaN in \'{}\' were removed from dat.".format(y,y))
31 | dat = dat.dropna(subset=[y])
32 | # dat = dat[pd.notna(dat[y])]
33 |
34 |
35 | # numeric y to int
36 | if is_numeric_dtype(dat[y]):
37 | dat.loc[:,y] = dat[y].apply(lambda x: x if pd.isnull(x) else int(x)) #dat[y].astype(int)
38 | # length of unique values in y
39 | unique_y = np.unique(dat[y].values)
40 | if len(unique_y) == 2:
41 | # if [v not in [0,1] for v in unique_y] == [True, True]:
42 | if True in [bool(re.search(positive, str(v))) for v in unique_y]:
43 | y1 = dat[y]
44 | y2 = dat[y].apply(lambda x: 1 if str(x) in re.split('\|', positive) else 0)
45 | if (y1 != y2).any():
46 | dat.loc[:,y] = y2#dat[y] = y2
47 | warnings.warn("The positive value in \"{}\" was replaced by 1 and negative value by 0.".format(y))
48 | else:
49 | raise Exception("Incorrect inputs; the positive value in \"{}\" is not specified".format(y))
50 | else:
51 | raise Exception("Incorrect inputs; the length of unique values in y column \'{}\' != 2.".format(y))
52 |
53 | return dat
54 |
55 |
56 |
57 | def eva_dfkslift(df, groupnum=None):
58 | if groupnum is None: groupnum=len(df.index)
59 | # good bad func
60 | def n0(x): return sum(x==0)
61 | def n1(x): return sum(x==1)
62 | df_kslift = df.sort_values('pred', ascending=False).reset_index(drop=True)\
63 | .assign(group=lambda x: np.ceil((x.index+1)/(len(x.index)/groupnum)))\
64 | .groupby('group')['label'].agg([n0,n1])\
65 | .reset_index().rename(columns={'n0':'good','n1':'bad'})\
66 | .assign(
67 | group=lambda x: (x.index+1)/len(x.index),
68 | good_distri=lambda x: x.good/sum(x.good),
69 | bad_distri=lambda x: x.bad/sum(x.bad),
70 | badrate=lambda x: x.bad/(x.good+x.bad),
71 | cumbadrate=lambda x: np.cumsum(x.bad)/np.cumsum(x.good+x.bad),
72 | lift=lambda x: (np.cumsum(x.bad)/np.cumsum(x.good+x.bad))/(sum(x.bad)/sum(x.good+x.bad)),
73 | cumgood=lambda x: np.cumsum(x.good)/sum(x.good),
74 | cumbad=lambda x: np.cumsum(x.bad)/sum(x.bad)
75 | ).assign(ks=lambda x:abs(x.cumbad-x.cumgood))
76 | # bind 0
77 | df_kslift=pd.concat([
78 | pd.DataFrame({'group':0, 'good':0, 'bad':0, 'good_distri':0, 'bad_distri':0, 'badrate':0, 'cumbadrate':np.nan, 'cumgood':0, 'cumbad':0, 'ks':0, 'lift':np.nan}, index=np.arange(1)),
79 | df_kslift
80 | ], ignore_index=True)
81 | # return
82 | return df_kslift
83 | # plot ks
84 | def eva_pks(dfkslift, title):
85 | dfks = dfkslift.loc[lambda x: x.ks==max(x.ks)].sort_values('group').iloc[0]
86 | ###### plot ######
87 | # fig, ax = plt.subplots()
88 | # ks, cumbad, cumgood
89 | plt.plot(dfkslift.group, dfkslift.ks, 'b-',
90 | dfkslift.group, dfkslift.cumgood, 'k-',
91 | dfkslift.group, dfkslift.cumbad, 'k-')
92 | # ks vline
93 | plt.plot([dfks['group'], dfks['group']], [0, dfks['ks']], 'r--')
94 | # set xylabel
95 | plt.gca().set(title=title+'K-S',
96 | xlabel='% of population', ylabel='% of total Good/Bad',
97 | xlim=[0,1], ylim=[0,1], aspect='equal')
98 | # text
99 | # plt.text(0.5,0.96,'K-S', fontsize=15,horizontalalignment='center')
100 | plt.text(0.2,0.8,'Bad',horizontalalignment='center')
101 | plt.text(0.8,0.55,'Good',horizontalalignment='center')
102 | plt.text(dfks['group'], dfks['ks'], 'KS:'+ str(round(dfks['ks'],4)), horizontalalignment='center',color='b')
103 | # plt.grid()
104 | # plt.show()
105 | # return fig
106 | # plot lift
107 | def eva_plift(dfkslift, title):
108 | badrate_avg = sum(dfkslift.bad)/sum(dfkslift.good+dfkslift.bad)
109 | ###### plot ######
110 | # fig, ax = plt.subplots()
111 | # ks, cumbad, cumgood
112 | plt.plot(dfkslift.group, dfkslift.cumbadrate, 'k-')
113 | # ks vline
114 | plt.plot([0, 1], [badrate_avg, badrate_avg], 'r--')
115 | # set xylabel
116 | plt.gca().set(title=title+'Lift',
117 | xlabel='% of population', ylabel='% of Bad',
118 | xlim=[0,1], ylim=[0,1], aspect='equal')
119 | # text
120 | # plt.text(0.5,0.96,'Lift', fontsize=15,horizontalalignment='center')
121 | plt.text(0.7,np.mean(dfkslift.cumbadrate),'cumulate badrate',horizontalalignment='center')
122 | plt.text(0.7,badrate_avg,'average badrate',horizontalalignment='center')
123 | # plt.grid()
124 | # plt.show()
125 | # return fig
126 |
127 | def eva_dfrocpr(df):
128 | def n0(x): return sum(x==0)
129 | def n1(x): return sum(x==1)
130 | dfrocpr = df.sort_values('pred')\
131 | .groupby('pred')['label'].agg([n0,n1,len])\
132 | .reset_index().rename(columns={'n0':'countN','n1':'countP','len':'countpred'})\
133 | .assign(
134 | FN = lambda x: np.cumsum(x.countP),
135 | TN = lambda x: np.cumsum(x.countN)
136 | ).assign(
137 | TP = lambda x: sum(x.countP) - x.FN,
138 | FP = lambda x: sum(x.countN) - x.TN
139 | ).assign(
140 | TPR = lambda x: x.TP/(x.TP+x.FN),
141 | FPR = lambda x: x.FP/(x.TN+x.FP),
142 | precision = lambda x: x.TP/(x.TP+x.FP),
143 | recall = lambda x: x.TP/(x.TP+x.FN)
144 | ).assign(
145 | F1 = lambda x: 2*x.precision*x.recall/(x.precision+x.recall)
146 | )
147 | return dfrocpr
148 | # plot roc
149 | def eva_proc(dfrocpr, title):
150 | dfrocpr = pd.concat(
151 | [dfrocpr[['FPR','TPR']], pd.DataFrame({'FPR':[0,1], 'TPR':[0,1]})],
152 | ignore_index=True).sort_values(['FPR','TPR'])
153 | auc = dfrocpr.sort_values(['FPR','TPR'])\
154 | .assign(
155 | TPR_lag=lambda x: x['TPR'].shift(1), FPR_lag=lambda x: x['FPR'].shift(1)
156 | ).assign(
157 | auc=lambda x: (x.TPR+x.TPR_lag)*(x.FPR-x.FPR_lag)/2
158 | )['auc'].sum()
159 | ###### plot ######
160 | # fig, ax = plt.subplots()
161 | # ks, cumbad, cumgood
162 | plt.plot(dfrocpr.FPR, dfrocpr.TPR, 'k-')
163 | # ks vline
164 | x=np.array(np.arange(0,1.1,0.1))
165 | plt.plot(x, x, 'r--')
166 | # fill
167 | plt.fill_between(dfrocpr.FPR, 0, dfrocpr.TPR, color='blue', alpha=0.1)
168 | # set xylabel
169 | plt.gca().set(title=title+'ROC',
170 | xlabel='FPR', ylabel='TPR',
171 | xlim=[0,1], ylim=[0,1], aspect='equal')
172 | # text
173 | # plt.text(0.5,0.96, 'ROC', fontsize=15, horizontalalignment='center')
174 | plt.text(0.55,0.45, 'AUC:'+str(round(auc,4)), horizontalalignment='center', color='b')
175 | # plt.grid()
176 | # plt.show()
177 | # return fig
178 | # plot ppr
179 | def eva_ppr(dfrocpr, title):
180 | ###### plot ######
181 | # fig, ax = plt.subplots()
182 | # ks, cumbad, cumgood
183 | plt.plot(dfrocpr.recall, dfrocpr.precision, 'k-')
184 | # ks vline
185 | x=np.array(np.arange(0,1.1,0.1))
186 | plt.plot(x, x, 'r--')
187 | # set xylabel
188 | plt.gca().set(title=title+'P-R',
189 | xlabel='Recall', ylabel='Precision',
190 | xlim=[0,1], ylim=[0,1], aspect='equal')
191 | # text
192 | # plt.text(0.5,0.96, 'P-R', fontsize=15, horizontalalignment='center')
193 | # plt.grid()
194 | # plt.show()
195 | # return fig
196 | # plot f1
197 | def eva_pf1(dfrocpr, title):
198 | dfrocpr=dfrocpr.assign(pop=lambda x: np.cumsum(x.countpred)/sum(x.countpred))
199 | ###### plot ######
200 | # fig, ax = plt.subplots()
201 | # ks, cumbad, cumgood
202 | plt.plot(dfrocpr['pop'], dfrocpr['F1'], 'k-')
203 | # ks vline
204 | F1max_pop = dfrocpr.loc[dfrocpr['F1'].idxmax(),'pop']
205 | F1max_F1 = dfrocpr.loc[dfrocpr['F1'].idxmax(),'F1']
206 | plt.plot([F1max_pop,F1max_pop], [0,F1max_F1], 'r--')
207 | # set xylabel
208 | plt.gca().set(title=title+'F1',
209 | xlabel='% of population', ylabel='F1',
210 | xlim=[0,1], ylim=[0,1], aspect='equal')
211 | # pred text
212 | pred_0=dfrocpr.loc[dfrocpr['pred'].idxmin(),'pred']
213 | pred_F1max=dfrocpr.loc[dfrocpr['F1'].idxmax(),'pred']
214 | pred_1=dfrocpr.loc[dfrocpr['pred'].idxmax(),'pred']
215 | if np.mean(dfrocpr.pred) < 0 or np.mean(dfrocpr.pred) > 1:
216 | pred_0 = -pred_0
217 | pred_F1max = -pred_F1max
218 | pred_1 = -pred_1
219 | plt.text(0, 0, 'pred \n'+str(round(pred_0,4)), horizontalalignment='left',color='b')
220 | plt.text(F1max_pop, 0, 'pred \n'+str(round(pred_F1max,4)), horizontalalignment='center',color='b')
221 | plt.text(1, 0, 'pred \n'+str(round(pred_1,4)), horizontalalignment='right',color='b')
222 | # title F1
223 | plt.text(F1max_pop, F1max_F1, 'F1 max: \n'+ str(round(F1max_F1,4)), horizontalalignment='center',color='b')
224 | # plt.grid()
225 | # plt.show()
226 | # return fig
227 |
228 |
229 |
230 | def perf_eva(label, pred, title=None, groupnum=None, plot_type=["ks", "roc"], show_plot=True, positive="bad|1", seed=186):
231 |
232 | # inputs checking
233 | if len(label) != len(pred):
234 | warnings.warn('Incorrect inputs; label and pred should be list with the same length.')
235 | # if pred is score
236 | if np.mean(pred) < 0 or np.mean(pred) > 1:
237 | warnings.warn('Since the average of pred is not in [0,1], it is treated as predicted score but not probability.')
238 | pred = -pred
239 | # random sort datatable
240 | df = pd.DataFrame({'label':label, 'pred':pred}).sample(frac=1, random_state=seed)
241 | # remove NAs
242 | if any(np.unique(df.isna())):
243 | warnings.warn('The NANs in \'label\' or \'pred\' were removed.')
244 | df = df.dropna()
245 | # check label
246 | df = check_y(df, 'label', positive)
247 | # title
248 | title='' if title is None else str(title)+': '
249 |
250 | ### data ###
251 | # dfkslift ------
252 | if any([i in plot_type for i in ['ks', 'lift']]):
253 | dfkslift = eva_dfkslift(df, groupnum)
254 | if 'ks' in plot_type: df_ks = dfkslift
255 | if 'lift' in plot_type: df_lift = dfkslift
256 | # dfrocpr ------
257 | if any([i in plot_type for i in ["roc","pr",'f1']]):
258 | dfrocpr = eva_dfrocpr(df)
259 | if 'roc' in plot_type: df_roc = dfrocpr
260 | if 'pr' in plot_type: df_pr = dfrocpr
261 | if 'f1' in plot_type: df_f1 = dfrocpr
262 | ### return list ###
263 | rt = {}
264 | # plot, KS ------
265 | if 'ks' in plot_type:
266 | rt['KS'] = round(dfkslift.loc[lambda x: x.ks==max(x.ks),'ks'].iloc[0],4)
267 | # plot, ROC ------
268 | if 'roc' in plot_type:
269 | auc = pd.concat(
270 | [dfrocpr[['FPR','TPR']], pd.DataFrame({'FPR':[0,1], 'TPR':[0,1]})],
271 | ignore_index=True).sort_values(['FPR','TPR'])\
272 | .assign(
273 | TPR_lag=lambda x: x['TPR'].shift(1), FPR_lag=lambda x: x['FPR'].shift(1)
274 | ).assign(
275 | auc=lambda x: (x.TPR+x.TPR_lag)*(x.FPR-x.FPR_lag)/2
276 | )['auc'].sum()
277 | ###
278 | rt['AUC'] = round(auc, 4)
279 | rt['Gini'] = round(2*auc-1, 4)
280 |
281 | ### export plot ###
282 | if show_plot:
283 | plist = ["eva_p"+i+'(df_'+i+',title)' for i in plot_type]
284 | subplot_nrows = np.ceil(len(plist)/2)
285 | subplot_ncols = np.ceil(len(plist)/subplot_nrows)
286 |
287 | fig = plt.figure()
288 | for i in np.arange(len(plist)):
289 | plt.subplot(int(subplot_nrows),int(subplot_ncols),i+1)
290 | eval(plist[i])
291 |
292 | rt['pic'] = fig
293 |
294 | return rt
295 |
296 |
297 |
298 | def perf_psi(score, label=None, title=None, x_limits=None, x_tick_break=50, show_plot=True, seed=186, return_distr_dat=False):
299 |
300 | # inputs checking
301 | ## score
302 | if not isinstance(score, dict) or len(score) != 2:
303 | raise Exception("Incorrect inputs; score should be a dictionary with two elements.")
304 | else:
305 | if any([not isinstance(i, pd.DataFrame) for i in score.values()]):
306 | raise Exception("Incorrect inputs; score is a dictionary of two dataframes.")
307 | score_columns = [list(i.columns) for i in score.values()]
308 | if set(score_columns[0]) != set(score_columns[1]):
309 | raise Exception("Incorrect inputs; the column names of two dataframes in score should be the same.")
310 | ## label
311 | if label is not None:
312 | if not isinstance(label, dict) or len(label) != 2:
313 | raise Exception("Incorrect inputs; label should be a dictionary with two elements.")
314 | else:
315 | if set(score.keys()) != set(label.keys()):
316 | raise Exception("Incorrect inputs; the keys of score and label should be the same. ")
317 | for i in label.keys():
318 | if isinstance(label[i], pd.DataFrame):
319 | if len(label[i].columns) == 1:
320 | label[i] = label[i].iloc[:,0]
321 | else:
322 | raise Exception("Incorrect inputs; the number of columns in label should be 1.")
323 | # score dataframe column names
324 | score_names = score[list(score.keys())[0]].columns
325 | # merge label with score
326 | for i in score.keys():
327 | score[i] = score[i].copy(deep=True)
328 | if label is not None:
329 | score[i].loc[:,'y'] = label[i]
330 | else:
331 | score[i].loc[:,'y'] = np.nan
332 | # dateset of score and label
333 | dt_sl = pd.concat(score, names=['ae', 'rowid']).reset_index()\
334 | .sample(frac=1, random_state=seed)
335 | # ae refers to 'Actual & Expected'
336 |
337 | # PSI function
338 | def psi(dat):
339 | dt_bae = dat.groupby(['ae','bin']).size().reset_index(name='N')\
340 | .pivot_table(values='N', index='bin', columns='ae').fillna(0.9)\
341 | .agg(lambda x: x/sum(x))
342 | dt_bae.columns = ['A','E']
343 | psi_dt = dt_bae.assign(
344 | AE = lambda x: x.A-x.E,
345 | logAE = lambda x: np.log(x.A/x.E)
346 | ).assign(
347 | bin_PSI=lambda x: x.AE*x.logAE
348 | )['bin_PSI'].sum()
349 | return psi_dt
350 |
351 | # return psi and pic
352 | rt_psi = {}
353 | rt_pic = {}
354 | rt_dat = {}
355 | rt = {}
356 | for sn in score_names:
357 | # dataframe with columns of ae y sn
358 | dat = dt_sl[['ae', 'y', sn]]
359 | if len(dt_sl[sn].unique()) > 10:
360 | # breakpoints
361 | if x_limits is None:
362 | x_limits = dat[sn].quantile([0.02, 0.98])
363 | x_limits = round(x_limits/x_tick_break)*x_tick_break
364 | x_limits = list(x_limits)
365 |
366 | brkp = np.unique([np.floor(min(dt_sl[sn])/x_tick_break)*x_tick_break]+\
367 | list(np.arange(x_limits[0], x_limits[1], x_tick_break))+\
368 | [np.ceil(max(dt_sl[sn])/x_tick_break)*x_tick_break])
369 | # cut
370 | labels = ['[{},{})'.format(int(brkp[i]), int(brkp[i+1])) for i in range(len(brkp)-1)]
371 | dat.loc[:,'bin'] = pd.cut(dat[sn], brkp, right=False, labels=labels)
372 | else:
373 | dat.loc[:,'bin'] = dat[sn]
374 | # psi ------
375 | rt_psi[sn] = pd.DataFrame({'PSI':psi(dat)},index=np.arange(1))
376 |
377 | # distribution of scorecard probability
378 | def good(x): return sum(x==0)
379 | def bad(x): return sum(x==1)
380 | distr_prob = dat.groupby(['ae', 'bin'])\
381 | ['y'].agg([good, bad])\
382 | .assign(N=lambda x: x.good+x.bad,
383 | badprob=lambda x: x.bad/(x.good+x.bad)
384 | ).reset_index()
385 | distr_prob.loc[:,'distr'] = distr_prob.groupby('ae')['N'].transform(lambda x:x/sum(x))
386 | # pivot table
387 | distr_prob = distr_prob.pivot_table(values=['N','badprob', 'distr'], index='bin', columns='ae')
388 |
389 | # plot ------
390 | if show_plot:
391 | ###### param ######
392 | ind = np.arange(len(distr_prob.index)) # the x locations for the groups
393 | width = 0.35 # the width of the bars: can also be len(x) sequence
394 | ###### plot ######
395 | fig, ax1 = plt.subplots()
396 | ax2 = ax1.twinx()
397 | title_string = sn+'_PSI: '+str(round(psi(dat),4))
398 | title_string = title_string if title is None else str(title)+' '+title_string
399 | # ax1
400 | p1 = ax1.bar(ind, distr_prob.distr.iloc[:,0], width, color=(24/254, 192/254, 196/254), alpha=0.6)
401 | p2 = ax1.bar(ind+width, distr_prob.distr.iloc[:,1], width, color=(246/254, 115/254, 109/254), alpha=0.6)
402 | # ax2
403 | p3 = ax2.plot(ind+width/2, distr_prob.badprob.iloc[:,0], color=(24/254, 192/254, 196/254))
404 | ax2.scatter(ind+width/2, distr_prob.badprob.iloc[:,0], facecolors='w', edgecolors=(24/254, 192/254, 196/254))
405 | p4 = ax2.plot(ind+width/2, distr_prob.badprob.iloc[:,1], color=(246/254, 115/254, 109/254))
406 | ax2.scatter(ind+width/2, distr_prob.badprob.iloc[:,1], facecolors='w', edgecolors=(246/254, 115/254, 109/254))
407 | # settings
408 | ax1.set_ylabel('Score distribution')
409 | ax2.set_ylabel('Bad probability')#, color='blue')
410 | # ax2.tick_params(axis='y', colors='blue')
411 | # ax1.set_yticks(np.arange(0, np.nanmax(distr_prob['distr'].values), 0.2))
412 | # ax2.set_yticks(np.arange(0, 1+0.2, 0.2))
413 | ax1.set_ylim([0,np.ceil(np.nanmax(distr_prob['distr'].values)*10)/10])
414 | ax2.set_ylim([0,1])
415 | plt.xticks(ind+width/2, distr_prob.index)
416 | plt.title(title_string, loc='left')
417 | ax1.legend((p1[0], p2[0]), list(distr_prob.columns.levels[1]), loc='upper left')
418 | ax2.legend((p3[0], p4[0]), list(distr_prob.columns.levels[1]), loc='upper right')
419 | # show plot
420 | plt.show()
421 |
422 | # return of pic
423 | rt_pic[sn] = fig
424 |
425 | # return distr_dat ------
426 | if return_distr_dat:
427 | rt_dat[sn] = distr_prob[['N','badprob']].reset_index()
428 | # return rt
429 | rt['psi'] = pd.concat(rt_psi).reset_index().rename(columns={'level_0':'variable'})[['variable', 'PSI']]
430 | rt['pic'] = rt_pic
431 | if return_distr_dat: rt['dat'] = rt_dat
432 | return rt
433 |
--------------------------------------------------------------------------------
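`perf_eva` and `perf_psi` above follow `scorecardpy`'s interfaces; a minimal sketch of calling them on the predictions and scores produced in the README example (`train`, `oot`, `y_pred_train`, `target` and the `score` column are assumed to come from that example):

```python
from utils.perf_eva import perf_eva, perf_psi

# KS / ROC curves plus the headline metrics for the training predictions
train_perf = perf_eva(train[target], y_pred_train, title="train", plot_type=["ks", "roc"])
print(train_perf["KS"], train_perf["AUC"], train_perf["Gini"])

# PSI between the train and OOT score distributions (after card.predict filled "score")
psi_result = perf_psi(
    score={"train": train[["score"]], "oot": oot[["score"]]},
    label={"train": train[target], "oot": oot[target]},
)
print(psi_result["psi"])
```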
/utils/excel_writer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2023/2/14 16:23
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 | import re
8 | import os
9 |
10 | import matplotlib.pyplot as plt
11 | import numpy as np
12 | import pandas as pd
13 |
14 | from openpyxl.cell.cell import Cell
15 | from openpyxl.drawing.image import Image
16 | from openpyxl import load_workbook, Workbook
17 | from openpyxl.formatting.rule import DataBarRule
18 | from openpyxl.utils.dataframe import dataframe_to_rows
19 | from openpyxl.utils import get_column_letter, column_index_from_string
20 | from openpyxl.styles import NamedStyle, Border, Side, Alignment, PatternFill, Font
21 |
22 |
23 | class ExcelWriter:
24 |
25 | def __init__(self, style_excel='报告输出模版.xlsx', style_sheet_name="初始化", fontsize=10, font='楷体', theme_color='8E8BFE'):
26 | """
27 | excel 文件内容写入公共方法
28 |
29 | :param style_excel: 样式模版文件,默认当前路径下的 报告输出模版.xlsx ,如果项目路径调整需要进行相应的调整
30 | :param style_sheet_name: 模版文件内初始样式sheet名称,默认即可
31 | :param fontsize: 插入excel文件中内容的字体大小,默认 10
32 | :param font: 插入excel文件中内容的字体,默认 楷体
33 | :param theme_color: 主题色,默认 8E8BFE,注意不包含 #
34 | """
35 | # english_width,chinese_width
36 | self.english_width = 0.12
37 | self.chinese_width = 0.21
38 | self.theme_color = theme_color
39 | self.fontsize = fontsize
40 | self.font = font
41 |
42 | self.workbook = load_workbook(style_excel)
43 | self.style_sheet = self.workbook[style_sheet_name]
44 |
45 | self.name_styles = []
46 | self.init_style(font, fontsize, theme_color)
47 | for style in self.name_styles:
48 | if style.name not in self.workbook.style_names:
49 | self.workbook.add_named_style(style)
50 |
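# Illustrative usage only: the flow used by rules_auto_mining.py is to build a
# writer from the bundled template, fetch a sheet, insert a DataFrame and save;
# the sheet and output file names below are made up.
#
#   writer = ExcelWriter(style_excel="./utils/报告输出模版.xlsx", theme_color="2639E9")
#   sheet = writer.get_sheet_by_name("模型报告")
#   end_row, end_col = writer.insert_df2sheet(sheet, df, (2, 2))
#   writer.save("模型报告.xlsx")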
51 | def add_conditional_formatting(self, worksheet, start_space, end_space):
52 | """
53 | 设置条件格式
54 |
55 | :param worksheet: 当前选择设置条件格式的sheet
56 | :param start_space: 开始单元格位置
57 | :param end_space: 结束单元格位置
58 | """
59 | worksheet.conditional_formatting.add(f'{start_space}:{end_space}', DataBarRule(start_type='min', end_type='max', color=self.theme_color))
60 |
61 | @staticmethod
62 | def set_column_width(worksheet, column, width):
63 | """
64 | 调整excel列宽
65 |
66 | :param worksheet: 当前选择调整列宽的sheet
67 | :param column: 列,可以直接输入 index 或者 字母
68 | :param width: 设置列的宽度
69 | """
70 | worksheet.column_dimensions[column if isinstance(column, str) else get_column_letter(column)].width = width
71 |
72 | @staticmethod
73 | def set_number_format(worksheet, space, _format):
74 | """
75 | 设置数值显示格式
76 |
77 | :param worksheet: 当前选择调整数值显示格式的sheet
78 | :param space: 单元格范围
79 | :param _format: 显示格式,参考 openpyxl
80 | """
81 | cells = worksheet[space]
82 | if isinstance(cells, Cell):
83 | cells = [cells]
84 |
85 | for cell in cells:
86 | if isinstance(cell, tuple):
87 | for c in cell:
88 | c.number_format = _format
89 | else:
90 | cell.number_format = _format
91 |
92 | def get_sheet_by_name(self, name):
93 | """
94 | 获取sheet名称为name的工作簿,如果不存在,则从初始模版文件中拷贝一个名称为name的sheet
95 |
96 | :param name: 需要获取的工作簿名称
97 | """
98 | if name not in self.workbook.sheetnames:
99 | worksheet = self.workbook.copy_worksheet(self.style_sheet)
100 | worksheet.title = name
101 | else:
102 | worksheet = self.workbook[name]
103 |
104 | return worksheet
105 |
106 | def insert_value2sheet(self, worksheet, insert_space, value="", style="content", auto_width=False):
107 | """
108 | 向sheet中的某个单元格插入某种样式的内容
109 |
110 | :param worksheet: 需要插入内容的sheet
111 | :param insert_space: 内容插入的单元格位置,可以是 "B2" 或者 (2, 2) 任意一种形式
112 | :param value: 需要插入的内容
113 | :param style: 渲染的样式,参考 init_style 中初始设置的样式
114 | :param auto_width: 是否开启自动调整列宽
115 | :return 返回插入元素最后一列之后、最后一行之后的位置
116 | """
117 | if isinstance(insert_space, str):
118 | worksheet[insert_space] = value
119 | cell = worksheet[insert_space]
120 | start_col = re.findall('\D+', insert_space)[0]
121 | start_row = int(re.findall("\d+", insert_space)[0])
122 | else:
123 | cell = worksheet.cell(insert_space[0], insert_space[1], value)
124 | start_col = get_column_letter(insert_space[1])
125 | start_row = insert_space[0]
126 | cell.style = style
127 |
128 | if auto_width:
129 | curr_width = worksheet.column_dimensions[start_col].width
130 | auto_width = min(max([(self.check_contain_chinese(value)[1] * self.english_width + self.check_contain_chinese(value)[2] * self.chinese_width) * self.fontsize, 10, curr_width]), 50)
131 | worksheet.column_dimensions[start_col].width = auto_width
132 |
133 | return start_row + 1, column_index_from_string(start_col) + 1
134 |
135 | def insert_pic2sheet(self, worksheet, fig, insert_space, figsize=(600, 250)):
136 | """
137 | 向excel中插入图片内容
138 |
139 | :param worksheet: 需要插入内容的sheet
140 | :param fig: 需要插入的图片路径
141 | :param insert_space: 插入图片的起始单元格
142 | :param figsize: 图片大小设置
143 | :return 返回插入元素最后一列之后、最后一行之后的位置
144 | """
145 | if isinstance(insert_space, str):
146 | start_row = int(re.findall("\d+", insert_space)[0])
147 | start_col = re.findall('\D+', insert_space)[0]
148 | else:
149 | start_row, start_col = insert_space
150 | start_col = get_column_letter(start_col)
151 |
152 | image = Image(fig)
153 | image.width, image.height = figsize
154 | worksheet.add_image(image, f"{start_col}{start_row}")
155 |
156 | return start_row + int(figsize[1] / 17.5), column_index_from_string(start_col) + 8
157 |
158 | def insert_rows(self, worksheet, row, row_index, col_index, merge_rows=None, style="", auto_width=False):
159 | curr_col = column_index_from_string(col_index)
160 | for j, v in enumerate(row):
161 | if merge_rows is not None and row_index + 1 not in merge_rows:
162 | if j == 0:
163 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style="merge_left", auto_width=auto_width)
164 | elif j == len(row) - 1:
165 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style="merge_right", auto_width=auto_width)
166 | else:
167 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style="merge_middle", auto_width=auto_width)
168 | else:
169 | if j == 0:
170 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style=f"{style}_left" if style else "left", auto_width=auto_width)
171 | elif j == len(row) - 1:
172 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style=f"{style}_right" if style else "right", auto_width=auto_width)
173 | else:
174 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style=f"{style}_middle" if style else "middle", auto_width=auto_width)
175 |
176 | def insert_df2sheet(self, worksheet, data, insert_space, merge_column=None, header=True, index=False, auto_width=False):
177 | """
178 | 向excel文件中插入制定样式的dataframe数据
179 |
180 | :param worksheet: 需要插入内容的sheet
181 | :param data: 需要插入的dataframe
182 | :param insert_space: 插入内容的起始单元格位置
183 | :param merge_column: 需要分组显示的列,index或者列明
184 | :param header: 是否存储dataframe的header,暂不支持多级表头
185 | :param index: 是否存储dataframe的index
186 | :param auto_width: 是否自动调整列宽
187 | :return 返回插入元素最后一列之后、最后一行之后的位置
188 | """
189 | df = data.copy()
190 |
191 | if isinstance(insert_space, str):
192 | start_row = int(re.findall("\d+", insert_space)[0])
193 | start_col = re.findall('\D+', insert_space)[0]
194 | else:
195 | start_row, start_col = insert_space
196 | start_col = get_column_letter(start_col)
197 |
198 | if merge_column:
199 | if isinstance(merge_column, str):
200 | merge_column = [merge_column]
201 |
202 | if isinstance(merge_column[0], (int, float)):
203 | # numeric entries are assumed to be column positions; map them to names so the groupby below works
204 | merge_column = [df.columns[int(c)] for c in merge_column]
205 |
206 | merge_cols = [get_column_letter(df.columns.get_loc(col) + column_index_from_string(start_col)) for col in merge_column]
207 | df = df.sort_values(merge_column)
208 | merge_rows = list(np.cumsum(df.groupby(merge_column)[merge_column].count().values[:, 0]) + start_row + 1)
209 |
210 | for i, row in enumerate(dataframe_to_rows(df, header=header, index=index)):
211 | if i == 0:
212 | if header:
213 | self.insert_rows(worksheet, row, start_row + i, start_col, style="header", auto_width=auto_width)
214 | else:
215 | self.insert_rows(worksheet, row, start_row + i, start_col, style="first", auto_width=auto_width)
216 | elif (header and i == len(df)) or (not header and i + 1 == len(df)):
217 | self.insert_rows(worksheet, row, start_row + i, start_col, style="last", auto_width=auto_width)
218 | else:
219 | self.insert_rows(worksheet, row, start_row + i, start_col, auto_width=auto_width, merge_rows=merge_rows if merge_column else None)
220 |
221 | # if merge_column and merge_cols is not None:
222 | # merge_rows = [start_row + 2] + merge_rows
223 | # for s, e in zip(merge_rows[:-1], merge_rows[1:]):
224 | # if e - s > 1:
225 | # for merge_col in merge_cols:
226 | # worksheet.merge_cells(f"{merge_col}{s-1}:{merge_col}{e-1}")
227 |
228 | end_row = start_row + len(data) + 1 if header else start_row + len(data)
229 |
230 | return (end_row, column_index_from_string(start_col) + len(data.columns))
231 |
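    # 逐字符判断是否为中文字符,返回 (标记列表, 非中文字符数, 中文字符数)
    # 例: check_contain_chinese("abc中") ===> ([False, False, False, True], 3, 1)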
232 | @staticmethod
233 | def check_contain_chinese(check_str):
234 | out = []
235 | for ch in str(check_str).encode('utf-8').decode('utf-8'):
236 | if u'\u4e00' <= ch <= u'\u9fff':
237 | out.append(True)
238 | else:
239 | out.append(False)
240 | return out, len(out) - sum(out), sum(out)
241 |
242 | @staticmethod
243 | def astype_insertvalue(value, decimal_point=4):
244 | if re.search('tuple|list|numpy.dtype|bool|str|numpy.ndarray|Interval|Categorical', str(type(value))):
245 | value = str(value)
246 | elif re.search('int', str(type(value))):
247 | value = value
248 | elif re.search('float', str(type(value))):
249 | value = round(float(value), decimal_point)
250 | else:
251 | value = 'nan'
252 |
253 | return value
254 |
255 | @staticmethod
256 | def calc_continuous_cnt(list_, index_=0):
257 | """
258 |         Calc continuous cnt: 返回 (元素值, 起始下标, 从该下标起连续相同元素的个数)
259 |
260 |         Examples:
261 | list_ = ['A','A','A','A','B','C','C','D','D','D']
262 | (1) calc_continuous_cnt(list_, 0) ===>('A', 0, 4)
263 | (2) calc_continuous_cnt(list_, 4) ===>('B', 4, 1)
264 | (3) calc_continuous_cnt(list_, 6) ===>('C', 6, 1)
265 | """
266 | if index_ >= len(list_):
267 | return None, None, None
268 |
269 | else:
270 | cnt, str_ = 0, list_[index_]
271 | for i in range(index_, len(list_), 1):
272 | if list_[i] == str_:
273 | cnt = cnt + 1
274 | else:
275 | break
276 | return str_, index_, cnt
277 |
278 | @staticmethod
279 | def itlubber_border(border, color):
280 | if len(border) == 3:
281 | return Border(left=Side(border_style=border[0], color=color[0]), right=Side(border_style=border[1], color=color[1]), bottom=Side(border_style=border[2], color=color[2]),)
282 | else:
283 | return Border(left=Side(border_style=border[0], color=color[0]), right=Side(border_style=border[1], color=color[1]), bottom=Side(border_style=border[2], color=color[2]), top=Side(border_style=border[3], color=color[3]),)
284 |
285 | @staticmethod
286 | def get_cell_space(space):
287 | if isinstance(space, str):
288 | start_row = int(re.findall("\d+", space)[0])
289 | start_col = re.findall('\D+', space)[0]
290 | return start_row, column_index_from_string(start_col)
291 |         else:
292 |             start_row = space[0]
293 |             if isinstance(space[1], int):
294 |                 start_col = space[1]
295 |             else:
296 |                 start_col = column_index_from_string(space[1])
297 |             return start_row, start_col
298 |
299 | def init_style(self, font, fontsize, theme_color):
300 | header_style, header_left_style, header_middle_style, header_right_style = NamedStyle(name="header"), NamedStyle(name="header_left"), NamedStyle(name="header_middle"), NamedStyle(name="header_right")
301 | last_style, last_left_style, last_middle_style, last_right_style = NamedStyle(name="last"), NamedStyle(name="last_left"), NamedStyle(name="last_middle"), NamedStyle(name="last_right")
302 | content_style, left_style, middle_style, right_style = NamedStyle(name="content"), NamedStyle(name="left"), NamedStyle(name="middle"), NamedStyle(name="right")
303 | merge_style, merge_left_style, merge_middle_style, merge_right_style = NamedStyle(name="merge"), NamedStyle(name="merge_left"), NamedStyle(name="merge_middle"), NamedStyle(name="merge_right")
304 | first_style, first_left_style, first_middle_style, first_right_style = NamedStyle(name="first"), NamedStyle(name="first_left"), NamedStyle(name="first_middle"), NamedStyle(name="first_right")
305 |
306 | header_font = Font(size=fontsize, name=font, color="FFFFFF", bold=True)
307 | header_fill = PatternFill(fill_type="solid", start_color=theme_color)
308 | alignment = Alignment(horizontal='center', vertical='center', wrap_text=False)
309 | content_fill = PatternFill(fill_type="solid", start_color="FFFFFF")
310 | content_font = Font(size=fontsize, name=font, color="000000")
311 |
312 | header_style.font, header_left_style.font, header_middle_style.font, header_right_style.font = header_font, header_font, header_font, header_font
313 | header_style.fill, header_left_style.fill, header_middle_style.fill, header_right_style.fill = header_fill, header_fill, header_fill, header_fill
314 | header_style.alignment, header_left_style.alignment, header_middle_style.alignment, header_right_style.alignment = Alignment(horizontal='left', vertical='center', wrap_text=True), alignment, alignment, alignment
315 |
316 | header_style.border = self.itlubber_border(["medium", "medium", "medium", "medium"], [theme_color, theme_color, theme_color, theme_color])
317 | header_left_style.border = self.itlubber_border(["medium", "thin", "medium", "medium"], [theme_color, "FFFFFF", theme_color, theme_color])
318 | header_middle_style.border = self.itlubber_border(["thin", "thin", "medium", "medium"], ["FFFFFF", "FFFFFF", theme_color, theme_color])
319 | header_right_style.border = self.itlubber_border(["thin", "medium", "medium", "medium"], ["FFFFFF", theme_color, theme_color, theme_color])
320 |
321 | last_style.font, last_left_style.font, last_middle_style.font, last_right_style.font = content_font, content_font, content_font, content_font
322 | last_style.fill, last_left_style.fill, last_middle_style.fill, last_right_style.fill = content_fill, content_fill, content_fill, content_fill
323 | last_style.alignment, last_left_style.alignment, last_middle_style.alignment, last_right_style.alignment = alignment, alignment, alignment, alignment
324 |
325 | last_style.border = self.itlubber_border(["medium", "medium", "medium"], [theme_color, theme_color, theme_color])
326 | last_left_style.border = self.itlubber_border(["medium", "thin", "medium"], [theme_color, "FFFFFF", theme_color])
327 | last_middle_style.border = self.itlubber_border(["thin", "thin", "medium"], ["FFFFFF", "FFFFFF", theme_color])
328 | last_right_style.border = self.itlubber_border(["thin", "medium", "medium"], ["FFFFFF", theme_color, theme_color])
329 |
330 | content_style.font, left_style.font, middle_style.font, right_style.font = content_font, content_font, content_font, content_font
331 | content_style.fill, left_style.fill, middle_style.fill, right_style.fill = content_fill, content_fill, content_fill, content_fill
332 | content_style.alignment, left_style.alignment, middle_style.alignment, right_style.alignment = alignment, alignment, alignment, alignment
333 |
334 | content_style.border = self.itlubber_border(["medium", "medium", "thin"], [theme_color, theme_color, theme_color])
335 | left_style.border = self.itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", theme_color])
336 | middle_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", "FFFFFF", theme_color])
337 | right_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, theme_color])
338 |
339 | merge_style.font, merge_left_style.font, merge_middle_style.font, merge_right_style.font = content_font, content_font, content_font, content_font
340 | merge_style.fill, merge_left_style.fill, merge_middle_style.fill, merge_right_style.fill = content_fill, content_fill, content_fill, content_fill
341 | merge_style.alignment, merge_left_style.alignment, merge_middle_style.alignment, merge_right_style.alignment = alignment, alignment, alignment, alignment
342 |
343 | merge_style.border = self.itlubber_border(["medium", "medium", "thin"], ["FFFFFF", "FFFFFF", "FFFFFF"])
344 | merge_left_style.border = self.itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", "FFFFFF"])
345 | merge_middle_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", "FFFFFF", "FFFFFF"])
346 | merge_right_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, "FFFFFF"])
347 |
348 | first_style.font, first_left_style.font, first_middle_style.font, first_right_style.font = content_font, content_font, content_font, content_font
349 | first_style.fill, first_left_style.fill, first_middle_style.fill, first_right_style.fill = content_fill, content_fill, content_fill, content_fill
350 | first_style.alignment, first_left_style.alignment, first_middle_style.alignment, first_right_style.alignment = alignment, alignment, alignment, alignment
351 |
352 | first_style.border = self.itlubber_border(["medium", "medium", "thin", "medium"], [theme_color, theme_color, theme_color, theme_color])
353 | first_left_style.border = self.itlubber_border(["medium", "thin", "thin", "medium"], [theme_color, "FFFFFF", theme_color, theme_color])
354 | first_middle_style.border = self.itlubber_border(["thin", "thin", "thin", "medium"], ["FFFFFF", "FFFFFF", theme_color, theme_color])
355 | first_right_style.border = self.itlubber_border(["thin", "medium", "thin", "medium"], ["FFFFFF", theme_color, theme_color, theme_color])
356 |
357 | self.name_styles.extend([
358 | header_style, header_left_style, header_middle_style, header_right_style,
359 | last_style, last_left_style, last_middle_style, last_right_style,
360 | content_style, left_style, middle_style, right_style,
361 | merge_style, merge_left_style, merge_middle_style, merge_right_style,
362 | first_style, first_left_style, first_middle_style, first_right_style
363 | ])
364 |
365 | def save(self, filename):
366 | """
367 | 保存excel文件
368 |
369 | :param filename: 需要保存 excel 文件的路径
370 | """
371 | self.workbook.remove(self.style_sheet)
372 | self.workbook.save(filename)
373 | self.workbook.close()
374 |
375 |
376 | if __name__ == '__main__':
377 | writer = ExcelWriter(style_excel="/Users/lubberit/Desktop/金融/兴业银行贷中行为评分/utils/报告输出模版.xlsx")
378 | worksheet = writer.get_sheet_by_name("模型报告")
379 | writer.insert_value2sheet(worksheet, "B2", value="模型报告", style="header")
380 | writer.insert_value2sheet(worksheet, "B3", value="当前模型主要为评分卡模型", style="content", auto_width=True)
381 | end_row = writer.insert_pic2sheet(worksheet, "/Users/lubberit/Desktop/金融/兴业银行贷中行为评分/tests/mypic.png", "B5")
382 | end_row = writer.insert_pic2sheet(worksheet, "/Users/lubberit/Desktop/金融/兴业银行贷中行为评分/tests/mypic.png", "H5")
383 | sample = pd.DataFrame(np.concatenate([np.random.random_sample((10, 10)) * 40, np.random.randint(0, 3, (10, 2))], axis=1), columns=[f"B{i}" for i in range(10)] + ["target", "type"])
384 | end_row, end_col = writer.insert_df2sheet(worksheet, sample, (end_row + 2, column_index_from_string("B")))
385 | end_row, end_col = writer.insert_df2sheet(worksheet, sample, (end_row + 2, column_index_from_string("B")), merge_column="target")
386 | end_row, end_col = writer.insert_df2sheet(worksheet, sample, (end_row + 2, column_index_from_string("B")), merge_column=["target", "type"])
387 | writer.save("test.xlsx")
388 |
--------------------------------------------------------------------------------
/utils/tools.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2022/8/23 13:12
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 |
8 | import os
9 | import six
10 | import toad
11 | import joblib
12 | import warnings
13 | import numpy as np
14 | import pandas as pd
15 | from tqdm import tqdm
16 | import scorecardpy as sc
17 | from datetime import datetime
18 | import matplotlib.pyplot as plt
19 | from optbinning import OptimalBinning
20 | from sklearn.metrics import make_scorer
21 | from sklearn.model_selection import train_test_split
22 |
23 | from openpyxl import load_workbook, Workbook
24 | from openpyxl.formatting.rule import DataBarRule
25 | from openpyxl.styles import Border, Side, Alignment, PatternFill, Font
26 |
27 |
28 | def init_setting(font_path="./utils/matplot_chinese.ttf"):
29 | import warnings
30 | import matplotlib
31 | from matplotlib import font_manager
32 | warnings.filterwarnings("ignore")
33 | pd.options.display.float_format = '{:.4f}'.format
34 | pd.set_option('display.max_colwidth', 300)
35 | plt.style.use('seaborn-ticks')
36 | matplotlib.font_manager.fontManager.addfont(font_path)
37 | matplotlib.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name()
38 | matplotlib.rcParams['axes.unicode_minus'] = False
39 |
40 |
41 | # warnings.filterwarnings("ignore")
42 | # pd.set_option('display.width', 5000)
43 | # plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体
44 | # plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题
45 |
46 |
47 | try:
48 | feature_describe = pd.read_excel("变量字典及字段解释.xlsx", sheet_name="数据字段表", header=0, engine="openpyxl", usecols=[0, 1])
49 | feature_describe = feature_describe.drop_duplicates(subset=["变量名称"], keep="last")
50 | feature_dict = dict(zip(feature_describe["变量名称"], feature_describe["含义"]))
51 | except:
52 | feature_dict = {}
53 |
54 |
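# 基于 toad.KS 计算模型 KS 值,并通过 make_scorer 封装为 sklearn 可用的评分器(needs_proba=True 时传入预测概率)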
55 | def ks_score(y, y_pred):
56 | return toad.KS(y_pred[:, 1], y)
57 |
58 |
59 | ks_score = make_scorer(ks_score, needs_proba=True)
60 |
61 |
62 | def round_float(num):
63 |     if not pd.isnull(num) and isinstance(num, float):
64 | return float(str(num).split(".")[0] + "." + str(num).split(".")[1][:4])
65 | else:
66 | return num
67 |
68 |
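# 将分箱割点转换为可读标签,返回 {分箱编号: 分箱标签} 字典,缺失值单独编号
# 用法示意: feature_bins(np.array([5.0, 10.0, np.nan]))
# ===> {0: '[负无穷 , 5.0)', 1: '[5.0 , 10.0)', 2: '[10.0 , 正无穷)', 3: '缺失值'}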
69 | def feature_bins(bins):
70 | if isinstance(bins, list): bins = np.array(bins)
71 | EMPTYBINS = len(bins) if not isinstance(bins[0], (set, list, np.ndarray)) else -1
72 |
73 | l = []
74 | if np.issubdtype(bins.dtype, np.number):
75 | has_empty = len(bins) > 0 and np.isnan(bins[-1])
76 | if has_empty: bins = bins[:-1]
77 | sp_l = ["负无穷"] + [round_float(b) for b in bins.tolist()] + ["正无穷"]
78 | for i in range(len(sp_l) - 1): l.append('['+str(sp_l[i])+' , '+str(sp_l[i+1])+')')
79 | if has_empty: l.append('缺失值')
80 | else:
81 | for keys in bins:
82 | keys_update = set()
83 | for key in keys:
84 | if pd.isnull(key) or key == "nan":
85 | keys_update.add("缺失值")
86 | elif key.strip() == "":
87 | keys_update.add("空字符串")
88 | else:
89 | keys_update.add(key)
90 | label = ','.join(keys_update)
91 | l.append(label)
92 |
93 | return {i if b != "缺失值" else EMPTYBINS: b for i, b in enumerate(l)}
94 |
95 |
96 | def feature_bin_stats(data, feature, combiner=None, target="target", rules={}, empty_separate=True, method='cart', min_samples=0.15, max_n_bins=3, gamma=0.01, monotonic_trend="auto_asc_desc", feature_dict={}):
97 | # if combiner is None:
98 | # combiner = toad.transform.Combiner()
99 | # combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, min_samples=min_samples)
100 | if feature not in rules:
101 | if data[feature].nunique(dropna=True) < 3:
102 | splits = []
103 | for v in data[feature].unique():
104 | if not pd.isnull(v):
105 | splits.append(v)
106 |
107 | if str(data[feature].dtypes) in ["object", "string", "category"]:
108 | rule = {feature: [[s] for s in splits]}
109 | rule[feature].append([[np.nan]])
110 | else:
111 | rule = {feature: sorted(splits) + [np.nan]}
112 | else:
113 | try:
114 | y = data[target]
115 | if str(data[feature].dtypes) in ["object", "string", "category"]:
116 | dtype = "categorical"
117 | x = data[feature].astype("category").values
118 | else:
119 | dtype = "numerical"
120 | x = data[feature].values
121 | _combiner = OptimalBinning(feature, dtype=dtype, max_n_bins=max_n_bins, monotonic_trend=monotonic_trend, gamma=gamma).fit(x, y)
122 | if _combiner.status == "OPTIMAL":
123 | rule = {feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.splits] + [[np.nan] if dtype == "categorical" else np.nan]}
124 | else:
125 |                 raise Exception("OptimalBinning error")
126 | except Exception as e:
127 | if method not in ["dt", "chi", ]:
128 | method = "chi"
129 | _combiner = toad.transform.Combiner()
130 | _combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, min_samples=min_samples)
131 | rule = _combiner.export()
132 |
133 | if combiner is None:
134 | combiner = toad.transform.Combiner()
135 |
136 | combiner.update(rule)
137 |
138 | if rules and isinstance(rules, list): rules = {feature: rules}
139 | if rules and isinstance(rules, dict): combiner.update(rules)
140 |
141 | # feature_bin = combiner.export()[feature]
142 | # feature_bin_dict = format_bins(np.array(feature_bin))
143 |
144 | df_bin = combiner.transform(data[[feature, target]], labels=False)
145 |
146 | table = df_bin[[feature, target]].groupby([feature, target]).agg(len).unstack()
147 | table.columns.name = None
148 | table = table.rename(columns = {0 : '好样本数', 1 : '坏样本数'}).fillna(0)
149 | table["指标名称"] = feature
150 | table["指标含义"] = feature_dict.get(feature, "")
151 | table = table.reset_index().rename(columns={feature: "分箱"})
152 | # table["分箱"] = table["分箱"].map(feature_bin_dict)
153 |
154 | table['样本总数'] = table['好样本数'] + table['坏样本数']
155 | table['样本占比'] = table['样本总数'] / table['样本总数'].sum()
156 | table['好样本占比'] = table['好样本数'] / table['好样本数'].sum()
157 | table['坏样本占比'] = table['坏样本数'] / table['坏样本数'].sum()
158 | table['坏样本率'] = table['坏样本数'] / table['样本总数']
159 |
160 | table = table.fillna(0.)
161 |
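    # 分档WOE = ln(坏样本占比 / 好样本占比),分档IV = (坏样本占比 - 好样本占比) * 分档WOE,指标IV = 各分档IV之和(分母加 1e-6 防止除零)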
162 | table['分档WOE值'] = table.apply(lambda x : np.log(x['坏样本占比'] / (x['好样本占比'] + 1e-6)),axis=1)
163 | table['分档IV值'] = table.apply(lambda x : (x['坏样本占比'] - x['好样本占比']) * np.log(x['坏样本占比'] / (x['好样本占比'] + 1e-6)), axis=1)
164 | table['指标IV值'] = table['分档IV值'].sum()
165 |
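    # LIFT = 分箱坏样本率 / 整体坏样本率,大于 1 表示该分箱坏样本相对更集中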
166 | table["LIFT值"] = table['坏样本率'] / (table["坏样本数"].sum() / table["样本总数"].sum())
167 | table["累积LIFT值"] = table["LIFT值"].cumsum()
168 |
169 | return table[['指标名称', "指标含义", '分箱', '样本总数', '样本占比', '好样本数', '好样本占比', '坏样本数', '坏样本占比', '坏样本率', '分档WOE值', '分档IV值', '指标IV值', 'LIFT值', '累积LIFT值']]
170 |
171 |
172 | def plot_bin(binx, title="", show_iv=True, show_na=True, colors=["#2639E9", "#a29bfe", "#ff7675"], figsize=(10, 8)):
173 | if not show_na:
174 | binx = binx[binx["分箱"] != "缺失值"].reset_index(drop=True)
175 | # y_right_max
176 | y_right_max = np.ceil(binx['坏样本率'].max()*10)
177 | if y_right_max % 2 == 1: y_right_max=y_right_max+1
178 | if y_right_max - binx['坏样本率'].max()*10 <= 0.3: y_right_max = y_right_max+2
179 | y_right_max = y_right_max/10
180 | if y_right_max>1 or y_right_max<=0 or y_right_max is np.nan or y_right_max is None: y_right_max=1
181 | ## y_left_max
182 | y_left_max = np.ceil(binx['样本占比'].max()*10)/10
183 | if y_left_max>1 or y_left_max<=0 or y_left_max is np.nan or y_left_max is None: y_left_max=1
184 | # title
185 | title_string = binx.loc[0,'指标名称']+" (iv:"+str(round(binx['分档IV值'].sum(),4))+")" if show_iv else binx.loc[0,'指标名称']
186 | title_string = title + '-' + title_string if title else title_string
187 | # param
188 | ind = np.arange(len(binx.index)) # the x locations for the groups
189 | width = 0.35 # the width of the bars: can also be len(x) sequence
190 | ###### plot ######
191 | fig, ax1 = plt.subplots(figsize=figsize)
192 | ax2 = ax1.twinx()
193 | # ax1
194 | p1 = ax1.bar(ind, binx['好样本占比'], width, color=colors[1])
195 | p2 = ax1.bar(ind, binx['坏样本占比'], width, bottom=binx['好样本占比'], color=colors[2])
196 | for i in ind:
197 | ax1.text(i, binx.loc[i,'样本占比']*1.02, str(round(binx.loc[i,'样本占比']*100,1))+'%, '+str(binx.loc[i,'样本总数']), ha='center')
198 | # ax2
199 | ax2.plot(ind, binx['坏样本率'], marker='o', color=colors[0])
200 | for i in ind:
201 | ax2.text(i, binx.loc[i,'坏样本率']*1.02, str(round(binx.loc[i,'坏样本率']*100,1))+'%', color=colors[0], ha='center')
202 | # settings
203 | ax1.set_ylabel('样本分布情况')
204 | ax2.set_ylabel('坏样本率', color=colors[0])
205 | ax1.set_yticks(np.arange(0, y_left_max+0.2, 0.2))
206 | ax2.set_yticks(np.arange(0, y_right_max+0.2, 0.2))
207 | ax2.tick_params(axis='y', colors=colors[0])
208 | plt.xticks(ind, binx['分箱'], fontsize=12)
209 | plt.title(title_string, loc='center')
210 |     plt.legend((p2[0], p1[0]), ('坏样本', '好样本'), loc='upper right')
211 |
212 |
213 | # def bin_plot(feature_table, feature="", desc="", figsize=(8, 6), colors=['#8E8BFE', '#FEA3A2', '#9394E7'], max_len=35, save=None):
214 | # feature_table = feature_table.copy()
215 | #
216 | # feature_table["分箱"] = feature_table["分箱"].apply(lambda x: x if re.match("^\[.*\)$", x) else str(x)[:max_len] + "..")
217 | #
218 | # # 绘制好坏样本分布情况
219 | # fig, ax1 = plt.subplots(figsize=figsize)
220 | # ax1.barh(feature_table['分箱'], feature_table['好样本数'], color=colors[0], label='好样本')
221 | # ax1.barh(feature_table['分箱'], feature_table['坏样本数'], left=feature_table['好样本数'], color=colors[1], label='坏样本')
222 | # ax1.set_xlabel('样本数')
223 | #
224 | # # 绘制坏样本率的分布情况
225 | # ax2 = ax1.twiny()
226 | # ax2.plot(feature_table['坏样本率'], feature_table['分箱'], colors[2], label='坏样本率', linestyle='-.')
227 | # ax2.set_xlabel('坏样本率: 坏样本数 / 样本总数')
228 | #
229 | # for i, rate in enumerate(feature_table['坏样本率']):
230 | # ax2.scatter(rate, i, color=colors[2], s=3)
231 | #
232 | # # 在图像对应位置显示样本总数和坏样本率
233 | # for i, v in feature_table[['样本总数', '好样本数', '坏样本数', '坏样本率', '样本占比']].iterrows():
234 | # ax1.text(v['样本总数'] / 2, i + len(feature_table) / 60, f"{int(v['好样本数'])}:{int(v['坏样本数'])}:{v['样本占比']:.1%}:{v['坏样本率']:.1%}")
235 | #
236 | # # 逆转y轴顺序
237 | # ax1.invert_yaxis()
238 | #
239 | # desc = desc if desc else feature
240 | #
241 | # # 添加一个标题
242 | # fig.suptitle(f'变量 {desc} 分箱图\n\n')
243 | #
244 | # # 合并图例
245 | # handles1, labels1 = ax1.get_legend_handles_labels()
246 | # handles2, labels2 = ax2.get_legend_handles_labels()
247 | # fig.legend(handles1 + handles2, labels1 + labels2, loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.95), frameon=False)
248 | #
249 | # # 调整布局,使分箱信息能够完全显示
250 | # plt.tight_layout()
251 | #
252 | # if save:
253 | # if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
254 | # os.makedirs(os.path.dirname(save))
255 | #
256 | # fig.savefig(save, dpi=240, format="png", bbox_inches="tight")
257 |
258 |
259 | def cal_psi(train, test, feature, combiner=None):
260 | # feature_bin = combiner.export()[feature]
261 | # feature_bin_dict = format_bins(np.array(feature_bin))
262 | try:
263 | A = (combiner.transform(train[[feature]]).value_counts() / len(train[[feature]])).reset_index().rename(columns={feature: "分箱", 0: "A"})
264 | E = (combiner.transform(test[[feature]]).value_counts() / len(test[[feature]])).reset_index().rename(columns={feature: "分箱", 0: "E"})
265 | except:
266 | A = (combiner.transform(train[[feature]])[feature].value_counts() / len(train)).reset_index().rename(columns={"index": "分箱", feature: "A"})
267 | E = (combiner.transform(test[[feature]])[feature].value_counts() / len(test)).reset_index().rename(columns={"index": "分箱", feature: "E"})
268 | df_psi = A.merge(E, on="分箱", how="outer").fillna(0.)
269 | # df_psi["分箱"] = df_psi["分箱"].map(feature_bin_dict)
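    # PSI = Σ (A - E) * ln(A / E),其中 A 为 train 各分箱样本占比、E 为 test 各分箱样本占比(分母加 1e-6 防止除零)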
270 | df_psi["分档PSI"] = (df_psi["A"] - df_psi["E"]) * np.log(df_psi["A"] / (df_psi["E"] + 1e-6))
271 | df_psi["指标PSI"] = df_psi["分档PSI"].replace(np.inf, 0).sum()
272 |
273 | return df_psi[["分箱", "分档PSI", "指标PSI"]]
274 |
275 |
276 | def itlubber_border(border, color):
277 | if len(border) == 3:
278 | return Border(
279 | left=Side(border_style=border[0], color=color[0]),
280 | right=Side(border_style=border[1], color=color[1]),
281 | bottom=Side(border_style=border[2], color=color[2]),
282 | )
283 | else:
284 | return Border(
285 | left=Side(border_style=border[0], color=color[0]),
286 | right=Side(border_style=border[1], color=color[1]),
287 | bottom=Side(border_style=border[2], color=color[2]),
288 | top=Side(border_style=border[3], color=color[3]),
289 | )
290 |
291 |
292 | def render_excel(excel_name, sheet_name=None, conditional_columns=[], freeze=None, merge_rows=[], percent_columns=[], theme_color="2639E9", conditional_color="9980FA", font="楷体", fontsize=10, max_column_width=50, header=True, start_row=0, n_jobs=4, bar=True, border=True):
293 | workbook = load_workbook(excel_name)
294 |
295 | if sheet_name and isinstance(sheet_name, str):
296 | sheet_names = [sheet_name]
297 | else:
298 | sheet_names = workbook.get_sheet_names()
299 |
300 | merge_rows = [i + start_row if header else i + start_row - 1 for i in merge_rows]
301 |
302 | for sheet_name in sheet_names:
303 | worksheet = workbook.get_sheet_by_name(sheet_name)
304 |
305 | def add_conditional_formatting(column, theme_color="FDA7DF"):
306 | worksheet.conditional_formatting.add(f'{column}2:{column}{worksheet.max_row}', DataBarRule(start_type='min', end_type='max', color=theme_color))
307 |
308 | for conditional_column in conditional_columns:
309 | add_conditional_formatting(f"{conditional_column}", theme_color=conditional_color)
310 |
311 | def render_cell(row_index, row):
312 | if row_index > start_row:
313 | if header and row_index == start_row + 1:
314 | for col_index, cell in enumerate(row, start=1):
315 | cell.font = Font(size=fontsize, name=font, color="FFFFFF", bold=True)
316 | cell.fill = PatternFill(fill_type="solid", start_color=theme_color)
317 | cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=False)
318 |
319 | if col_index == 1:
320 | cell.border = itlubber_border(["medium", "thin", "medium", "medium"], [theme_color, "FFFFFF", theme_color, theme_color])
321 | elif col_index == len(row):
322 | cell.border = itlubber_border(["thin", "medium", "medium", "medium"], ["FFFFFF", theme_color, theme_color, theme_color])
323 | else:
324 | cell.border = itlubber_border(["thin", "thin", "medium", "medium"], ["FFFFFF", "FFFFFF", theme_color, theme_color])
325 | else:
326 | for col_index, cell in enumerate(row, start=1):
327 | cell.font = Font(size=fontsize, name=font, color="000000")
328 | cell.fill = PatternFill(fill_type="solid", start_color="FFFFFF")
329 | cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=False)
330 |
331 | if col_index in percent_columns:
332 | # cell.alignment = Alignment(horizontal='right', vertical='center', wrap_text=False)
333 | cell.number_format = "0.00%"
334 | else:
335 | pass
336 | # cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=False)
337 |
338 | if row_index == worksheet.max_row:
339 | if col_index == 1:
340 | cell.border = itlubber_border(["medium", "thin", "medium"], [theme_color, "FFFFFF", theme_color])
341 | elif col_index == len(row):
342 | cell.border = itlubber_border(["thin", "medium", "medium"], ["FFFFFF", theme_color, theme_color])
343 | else:
344 | cell.border = itlubber_border(["thin", "thin", "medium"], ["FFFFFF", "FFFFFF", theme_color])
345 | else:
346 | if merge_rows in [[], None] or (row_index - 1 in merge_rows):
347 | if col_index == 1:
348 | cell.border = itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", theme_color])
349 | elif col_index == len(row):
350 | cell.border = itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, theme_color])
351 | else:
352 | cell.border = itlubber_border(["thin", "thin", "thin"], ["FFFFFF", "FFFFFF", theme_color])
353 | else:
354 | if col_index == 1:
355 | cell.border = itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", "FFFFFF"])
356 | elif col_index == len(row):
357 | cell.border = itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, "FFFFFF"])
358 | else:
359 | cell.border = itlubber_border(["thin", "thin", "thin"], ["FFFFFF", "FFFFFF", "FFFFFF"])
360 |
361 | if border:
362 | iterrows = tqdm(enumerate(worksheet.rows, start=1), total=worksheet.max_row - 1) if bar else enumerate(worksheet.rows, start=1)
363 | if n_jobs > 0:
364 |             joblib.Parallel(n_jobs=n_jobs, prefer="threads")(joblib.delayed(render_cell)(row_index, row) for row_index, row in iterrows)  # 使用线程,保证单元格样式修改在当前工作簿上生效
365 | else:
366 | for row_index, row in iterrows:
367 | render_cell(row_index, row)
368 |
369 | feature_table = pd.read_excel(
370 | excel_name, sheet_name=sheet_name, engine="openpyxl"
371 | )
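        # 列宽估算: 中文等全角字符按 2 个半角字符宽度计,(utf-8 字节数 - 字符数) / 2 + 字符数 即近似显示宽度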
372 | feature_table_len_max = feature_table.apply(lambda x: [(len(str(i).encode('utf-8')) - len(str(i))) / 2 + len(str(i)) for i in x]).max()
373 | for i in feature_table.columns:
374 | # 列的字母
375 | j = list(feature_table.columns)
376 | column_letter = [chr(j.index(i) + 65) if j.index(i) <= 25 else 'A' + chr(j.index(i) - 26 + 65) ][0]
377 | # 列的宽度
378 | columns_length = (len(str(i).encode('utf-8')) - len(str(i)))/2 + len(str(i))
379 | data_max_length = feature_table_len_max[i]
380 | column_width = [data_max_length if columns_length < data_max_length else columns_length][0]
381 | column_width = [column_width if column_width <= max_column_width else max_column_width][0] + 3
382 | # 更改列的宽度
383 | worksheet.column_dimensions['{}'.format(column_letter)].width = column_width
384 |
385 | if freeze:
386 | worksheet.freeze_panes = freeze
387 |
388 | workbook.save(excel_name)
389 | workbook.close()
390 |
391 |
392 | def run_feature_table(feature, train=None, feature_dict=None, rules={}, combiner=None, target="target", return_feature=False):
393 | table = feature_bin_stats(train, feature, feature_dict=feature_dict, rules=rules, combiner=combiner)
394 | df_psi = cal_psi(train[[feature, target]], test[[feature, target]], feature, combiner=combiner)
395 |
396 | table = table.merge(df_psi, on="分箱", how="left")
397 |
398 | feature_bin = combiner.export()[feature]
399 |     feature_bin_dict = feature_bins(np.array(feature_bin))
400 | table["分箱"] = table["分箱"].map(feature_bin_dict)
401 |
402 | if return_feature:
403 | return feature, table
404 | else:
405 | return table
406 |
407 |
408 | def render_dataframe(df, row_height=0.4, font_size=14,
409 | header_color='#2639E9', row_colors=['#dae3f3', 'w'], edge_color='w',
410 | bbox=[0, 0, 1, 1], header_columns=0,
411 | ax=None, save=None, **kwargs):
412 | data = df.copy()
413 | for col in data.select_dtypes('datetime'):
414 | data[col] = data[col].dt.strftime("%Y-%m-%d")
415 |
416 | for col in data.select_dtypes('float'):
417 | data[col] = data[col].apply(lambda x: np.nan if pd.isnull(x) else round(x, 4))
418 |
419 | cols_width = [max(data[col].apply(lambda x:len(str(x).encode())).max(), len(str(col).encode())) / 8. for col in data.columns]
420 |
421 | if ax is None:
422 | size = (sum(cols_width), (len(data) + 1) * row_height)
423 | fig, ax = plt.subplots(figsize=size)
424 | ax.axis('off')
425 |
426 | mpl_table = ax.table(cellText=data.values, colWidths=cols_width, bbox=bbox, colLabels=data.columns, **kwargs)
427 |
428 | mpl_table.auto_set_font_size(False)
429 | mpl_table.set_fontsize(font_size)
430 |
431 | for k, cell in six.iteritems(mpl_table._cells):
432 | cell.set_edgecolor(edge_color)
433 | if k[0] == 0 or k[1] < header_columns:
434 | cell.set_text_props(weight='bold', color='w')
435 | cell.set_facecolor(header_color)
436 | else:
437 | cell.set_facecolor(row_colors[k[0]%len(row_colors)])
438 |
439 | if save:
440 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
441 | os.makedirs(os.path.dirname(save))
442 |
443 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight")
444 |
445 | return fig
446 |
447 |
448 | if __name__ == '__main__':
449 | from functools import partial
450 | from multiprocessing import Pool
451 | data = sc.germancredit()
452 |
453 | # 测试数据
454 | data["target"] = data["creditability"].replace({'good':0,'bad':1})
455 | data["credit.amount"].loc[0] = np.nan
456 | data["status.of.existing.checking.account"].loc[0] = np.nan
457 | data["test_a"] = 0.
458 | data["test_a"].loc[0] = np.nan
459 | data["test_b"] = ""
460 | data["test_b"].loc[0] = np.nan
461 | data["test_c"] = np.nan
462 |
463 | # data = data.replace("", np.nan)
464 |
465 | train, test = train_test_split(data, test_size=0.3,)
466 |
467 | target = "target"
468 | cols = ["test_a", "test_b", "test_c", "status.of.existing.checking.account", "credit.amount"]
469 |
470 | combiner = toad.transform.Combiner()
471 | # combiner.fit(data[cols + [target]], target, empty_separate=True, method="chi", min_samples=0.2)
472 |
473 | # 保存结果至 EXCEL 文件
474 | output_excel_name = f"指标有效性验证-{datetime.now().strftime('%Y-%m-%d')}.xlsx"
475 | output_sheet_name = "指标有效性"
476 | tables = {}
477 | merge_row_number = []
478 |
479 | # _run_feature_table = partial(run_feature_table, train=train, feature_dict=feature_dict, rules={}, combiner=combiner, target=target, return_feature=True)
480 | # all_feature_tables = joblib.Parallel(n_jobs=4)(joblib.delayed(_run_feature_table)(feature) for feature in cols)
481 |
482 | # for feature, table in all_feature_tables:
483 | # merge_row_number.append(len(table))
484 | # tables[feature] = table
485 |
486 | for feature in cols:
487 | table = feature_bin_stats(train, feature, feature_dict=feature_dict, rules={}, combiner=combiner)
488 | print(train.shape)
489 | df_psi = cal_psi(train[[feature, target]], test[[feature, target]], feature, combiner=combiner)
490 |
491 | table = table.merge(df_psi, on="分箱", how="left")
492 |
493 | feature_bin = combiner.export()[feature]
494 |         feature_bin_dict = feature_bins(np.array(feature_bin))
495 | table["分箱"] = table["分箱"].map(feature_bin_dict)
496 |
497 |         # table = run_feature_table(feature)  # 等价的封装调用,需显式传入 train、combiner 等参数后再启用
498 | # plot_bin(table, show_na=True)
499 | merge_row_number.append(len(table))
500 | tables[feature] = table
501 |
502 | merge_row_number = np.cumsum(merge_row_number).tolist()
503 | feature_table = pd.concat(tables, ignore_index=True).round(6)
504 | feature_table["分档WOE值"] = feature_table["分档WOE值"].fillna(np.inf)
505 |
506 | workbook = load_workbook(output_excel_name) if os.path.exists(output_excel_name) else None
507 | writer = pd.ExcelWriter(output_excel_name, engine="openpyxl")
508 |
509 | if workbook:
510 | writer.book = workbook
511 | writer.sheets = {ws.title: ws for ws in workbook.worksheets}
512 | start_row = writer.book.get_sheet_by_name(output_sheet_name).max_row
513 | else:
514 | start_row = 0
515 |
516 | feature_table.to_excel(writer, sheet_name=output_sheet_name, index=False, header=True, startcol=0, startrow=start_row)
517 |
518 | writer.close()
519 |
520 | render_excel(output_excel_name, sheet_name=output_sheet_name, conditional_columns=["J", "N"], freeze="D2", merge_rows=merge_row_number, percent_columns=[5, 7, 9, 10], start_row=start_row, header=False if start_row > 0 else True)
521 | # render_excel("变量字典及字段解释.xlsx")
522 | combiner.export(to_json=f"rules_{datetime.now().strftime('%Y-%m-%d')}.json")
523 |
--------------------------------------------------------------------------------
/processing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2022/8/23 13:12
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 |
8 | import re
9 | import os
10 | import toad
11 | import scipy
12 | import warnings
13 | import numpy as np
14 | import pandas as pd
15 | import scorecardpy as sc
16 | import statsmodels.api as sm
17 | from functools import partial
18 | import matplotlib.pyplot as plt
19 | import plotly.graph_objects as go
20 | from IPython.display import Image
21 | from openpyxl import load_workbook
22 | # from joblib import Parallel, delayed
23 | from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
24 | from openpyxl.styles import Alignment
25 | from optbinning import OptimalBinning
26 | from sklearn.decomposition import PCA
27 | from sklearn.pipeline import Pipeline
28 | from sklearn.linear_model import LogisticRegression
29 | from sklearn.utils.validation import check_is_fitted
30 | from sklearn.model_selection import train_test_split, GridSearchCV
31 | from sklearn.ensemble import GradientBoostingClassifier
32 | from toad.plot import bin_plot, proportion_plot, corr_plot, badrate_plot
33 | from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
34 | from statsmodels.stats.outliers_influence import variance_inflation_factor
35 |
36 |
37 | warnings.filterwarnings("ignore")
38 | pd.set_option('display.width', 5000)
39 | # plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体
40 | # plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题
41 |
42 |
43 | def drop_identical(frame, threshold = 0.95, return_drop = False, exclude = None, target = None):
44 | """drop columns by identical
45 | Args:
46 | frame (DataFrame): dataframe that will be used
47 | threshold (number): drop the features whose identical num is greater than threshold. if threshold is float, it will be use as percentage
48 | return_drop (bool): if need to return features' name who has been dropped
49 | exclude (array-like): list of feature names that will not be dropped
50 | target (str): target's name in dataframe
51 | Returns:
52 | DataFrame: selected dataframe
53 | array: list of feature names that has been dropped
54 | """
55 | cols = frame.columns.copy()
56 |
57 | if target is not None:
58 |         cols = cols.drop(target)
59 |
60 | if exclude is not None:
61 | cols = cols.drop(exclude)
62 |
63 | if threshold < 1:
64 | threshold = len(frame) * threshold
65 |
66 | drop_list = []
67 | for col in cols:
68 | n = frame[col].value_counts().max()
69 |
70 | if n > threshold:
71 | drop_list.append(col)
72 |
73 | r = frame.drop(columns = drop_list)
74 |
75 | res = (r,)
76 | if return_drop:
77 | res += (np.array(drop_list),)
78 |
79 | return toad.utils.unpack_tuple(res)
80 |
81 |
82 | def select(frame, target = 'target', empty = 0.9, iv = 0.02, corr = 0.7,
83 | identical=0.95, return_drop = False, exclude = None):
84 | """select features by rate of empty, iv and correlation
85 | Args:
86 | frame (DataFrame)
87 | target (str): target's name in dataframe
88 | empty (number): drop the features which empty num is greater than threshold. if threshold is less than `1`, it will be use as percentage
89 | identical (number): drop the features which identical num is greater than threshold. if threshold is less than `1`, it will be use as percentage
90 | iv (float): drop the features whose IV is less than threshold
91 | corr (float): drop features that has the smallest IV in each groups which correlation is greater than threshold
92 | return_drop (bool): if need to return features' name who has been dropped
93 | exclude (array-like): list of feature name that will not be dropped
94 | Returns:
95 | DataFrame: selected dataframe
96 | dict: list of dropped feature names in each step
97 | """
98 | empty_drop, iv_drop, corr_drop, identical_drop = None, None, None, None
99 |
100 | if empty is not False:
101 | frame, empty_drop = toad.selection.drop_empty(frame, threshold = empty, return_drop = True, exclude = exclude)
102 |
103 | if identical is not False:
104 | frame, identical_drop = drop_identical(frame, threshold = identical, return_drop = True, exclude = exclude, target = target)
105 |
106 | if iv is not False:
107 | frame, iv_drop, iv_list = toad.selection.drop_iv(frame, target = target, threshold = iv, return_drop = True, return_iv = True, exclude = exclude)
108 |
109 | if corr is not False:
110 | weights = 'IV'
111 |
112 | if iv is not False:
113 | weights = iv_list
114 |
115 | frame, corr_drop = toad.selection.drop_corr(frame, target = target, threshold = corr, by = weights, return_drop = True, exclude = exclude)
116 |
117 | res = (frame,)
118 | if return_drop:
119 | d = {
120 | 'empty': empty_drop,
121 | 'identical': identical_drop,
122 | 'iv': iv_drop,
123 | 'corr': corr_drop,
124 | }
125 | res += (d,)
126 |
127 | return toad.utils.unpack_tuple(res)
128 |
129 |
130 | class FeatureSelection(TransformerMixin, BaseEstimator):
131 |
132 | def __init__(self, target="target", empty=0.95, iv=0.02, corr=0.7, exclude=None, return_drop=True, identical=0.95, remove=None, engine="scorecardpy", target_rm=False):
133 | """
134 | ITLUBBER提供的特征筛选方法
135 |
136 | Args:
137 | target: 数据集中标签名称,默认 target
138 | empty: 空值率,默认 0.95, 即空值占比超过 95% 的特征会被剔除
139 | iv: IV值,默认 0.02,即iv值小于 0.02 时特征会被剔除
140 | corr: 相关性,默认 0.7,即特征之间相关性大于 0.7 时会剔除iv较小的特征
141 | identical: 唯一值占比,默认 0.95,即当特征的某个值占比超过 95% 时,特征会被剔除
142 | engine: 特征筛选使用的引擎,可选 "toad", "scorecardpy" 两种,默认 scorecardpy
143 | remove: 引擎使用 scorecardpy 时,可以传入需要强制删除的变量
144 | return_drop: 是否返回删除信息,默认 True,即默认返回删除特征信息
145 | target_rm: 是否剔除标签,默认 False,即不剔除
146 | exclude: 是否需要强制保留某些特征
147 | """
148 | self.engine = engine
149 | self.target = target
150 | self.empty = empty
151 | self.identical = identical
152 | self.iv = iv
153 | self.corr = corr
154 | self.exclude = exclude
155 | self.remove = remove
156 | self.return_drop = return_drop
157 | self.target_rm = target_rm
158 | self.select_columns = None
159 | self.dropped = None
160 |
161 | def fit(self, x, y=None):
162 | if self.engine == "toad":
163 | selected = select(x, target=self.target, empty=self.empty, identical=self.identical, iv=self.iv, corr=self.corr, exclude=self.exclude, return_drop=self.return_drop)
164 | else:
165 | selected = sc.var_filter(x, y=self.target, iv_limit=self.iv, missing_limit=self.empty, identical_limit=self.identical, var_rm=self.remove, var_kp=self.exclude, return_rm_reason=self.return_drop)
166 |
167 | if self.return_drop and isinstance(selected, dict):
168 | self.dropped = selected["rm"]
169 | self.select_columns = list(selected["dt"].columns)
170 | elif self.return_drop and isinstance(selected, (tuple, list)):
171 | self.dropped = pd.DataFrame([(feature, reason) for reason, features in selected[1].items() for feature in features], columns=["variable", "rm_reason"])
172 | self.select_columns = list(selected[0].columns)
173 | else:
174 | self.select_columns = list(selected.columns)
175 |
176 | if self.target_rm and self.target in self.select_columns:
177 | self.select_columns.remove(self.target)
178 |
179 | return self
180 |
181 | def transform(self, x, y=None):
182 | # if self.engine == "toad":
183 | # selected = toad.selection.select(x, target=self.target, empty=self.empty, iv=self.iv, corr=self.corr, exclude=self.exclude, return_drop=self.return_drop)
184 | # else:
185 | # selected = sc.var_filter(x, y=self.target, iv_limit=self.iv, missing_limit=self.empty, identical_limit=self.identical, var_rm=self.remove, var_kp=self.exclude, return_rm_reason=self.return_drop)
186 |
187 | # if self.return_drop and isinstance(selected, dict):
188 | # self.dropped = selected["rm"]
189 | # return selected["dt"]
190 | # elif self.return_drop and isinstance(selected, (tuple, list)):
191 | # self.dropped = pd.DataFrame([(feature, reason) for reason, features in selected[1].items() for feature in features], columns=["variable", "rm_reason"])
192 | # return selected[0]
193 | # else:
194 | # return selected
195 | return x[[col for col in self.select_columns if col in x.columns]]
196 |
197 |
198 | class FeatureImportanceSelector(BaseEstimator, TransformerMixin):
199 |
200 | def __init__(self, top_k=126, target="target", selector="catboost", params=None, max_iv=None):
201 | """
202 | 基于特征重要性的特征筛选方法
203 |
204 | Args:
205 | target: 数据集中标签名称,默认 target
206 | top_k: 依据特征重要性进行排序,筛选最重要的 top_k 个特征
207 | max_iv: 是否需要删除 IV 过高的特征,建议设置为 1.0
208 | selector: 特征选择器,目前只支持 catboost ,可以支持数据集中包含字符串的数据
209 | params: selector 的参数,不传使用默认参数
210 | """
211 | self.target = target
212 | self.top_k = top_k
213 | self.max_iv = max_iv
214 | self.selector = selector
215 | self.params = params
216 | self.feature_names_ = None
217 | self.high_iv_feature_names_ = None
218 | self.low_importance_feature_names_ = None
219 | self.select_columns = None
220 | self.dropped = None
221 |
222 | def fit(self, x, y=None):
223 | x = x.copy()
224 |
225 | if self.max_iv is not None:
226 |             self.high_iv_feature_names_ = list(toad.quality(x, target=self.target, cpu_cores=-1, iv_only=True).query(f"iv > {self.max_iv}").index)
227 | x = x[[c for c in x.columns if c not in self.high_iv_feature_names_]]
228 |
229 | X = x.drop(columns=self.target)
230 | Y = x[self.target]
231 |
232 | self.feature_names_ = list(X.columns)
233 | cat_features_index = [i for i in range(len(self.feature_names_)) if self.feature_names_[i] not in X.select_dtypes("number").columns]
234 |
235 | if self.selector == "catboost":
236 | self.catboost_selector(x=X, y=Y, cat_features=cat_features_index)
237 | else:
238 | pass
239 |
240 | return self
241 |
242 | def transform(self, x, y=None):
243 | return x[self.select_columns + [self.target]]
244 |
245 |
246 | def catboost_selector(self, x, y, cat_features=None):
247 | from catboost import Pool, cv, metrics, CatBoostClassifier
248 |
249 | cat_data = Pool(data=x, label=y, cat_features=cat_features)
250 |
251 | if self.params is None:
252 | self.params = {
253 | "iterations": 256,
254 | "objective": "CrossEntropy",
255 | "eval_metric": "AUC",
256 | "learning_rate": 1e-2,
257 | "colsample_bylevel": 0.1,
258 | "depth": 4,
259 | "boosting_type": "Ordered",
260 | "bootstrap_type": "Bernoulli",
261 | "subsample": 0.8,
262 | "random_seed": 1024,
263 | "early_stopping_rounds": 10,
264 | "verbose": 0,
265 | }
266 |
267 | cat_model = CatBoostClassifier(**self.params)
268 | cat_model.fit(cat_data, eval_set=[cat_data])
269 |
270 | self.select_columns = [name for score, name in sorted(zip(cat_model.feature_importances_, cat_model.feature_names_), reverse=True)][:self.top_k]
271 | self.low_importance_feature_names_ = [c for c in x.columns if c not in self.select_columns]
272 |
273 |
274 | class FeatureDecomposition(BaseEstimator, TransformerMixin):
275 |
276 | def __init__(self, freq, app, key_words=None, combin_features=None, combiner=PCA, n_components=1):
277 | """
278 | 同一类型 + 同一周期 + 新增数/安装数/活跃天数/卸载数 的特征通过降维方法转换为 n_components 个特征
279 |
280 | freq: 周期,例如 90天
281 | app: 类型,例如 银行类
282 | key_words: 不同类型的指标,例如 ["活跃款数", "新增款数", "活跃频次", "活跃天数"]
283 |         combin_features: 手工指定需要进行降维的特征,传入 app、freq、key_words 时不需要传入
284 | combiner: 降维的方法,默认 PCA,参考 sklearn.decomposition 中相关方法的使用
285 | n_components: 降维后的特征数量,默认 1
286 | """
287 | self.freq = freq
288 | self.app = app
289 | self.key_words = key_words
290 | self.combin_features = combin_features
291 | self.n_components = n_components
292 | self.combiner = combiner(n_components=self.n_components)
293 |
294 | def fit(self, x, y=None):
295 | x = x.copy()
296 |
297 | if self.combin_features:
298 | self.combin_features = [c for c in self.combin_features if c in x.columns]
299 | else:
300 | if self.key_words:
301 | if isinstance(self.key_words, str):
302 | self.key_words = [self.key_words]
303 | pattern = re.compile(f"(?=.*{self.freq})(?=.*{self.app})(?=.*(?:{'|'.join(self.key_words)})).+")
304 | else:
305 | pattern = re.compile(f"{self.app}")
306 |
307 | self.combin_features = [c for c in x.columns if pattern.match(c)]
308 |
309 | if len(self.combin_features) > 0 and len(self.combin_features) > self.n_components:
310 | x = x[self.combin_features]
311 | self.combiner.fit(x, y=y)
312 |
313 | else:
314 | raise Exception("组合特征不在数据中。")
315 |
316 | return self
317 |
318 | def transform(self, x, y=None):
319 | x = x[self.combin_features].copy()
320 | return self.combiner.transform(x)
321 |
322 | def inverse_transform(self, x, y=None):
323 | return self.combiner.inverse_transform(x)
324 |
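# FeatureDecomposition 用法示意(特征命名仅为假设,需与数据中的实际列名匹配):
# decomposer = FeatureDecomposition(freq="90天", app="银行类", key_words=["活跃天数", "新增款数"])
# x_reduced = decomposer.fit_transform(train)   # 返回降维后的 n_components 列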
325 |
326 | class Combiner(TransformerMixin, BaseEstimator):
327 |
328 | def __init__(self, target="target", method='chi', engine="toad", empty_separate=False, min_samples=0.05, min_n_bins=2, max_n_bins=3, max_n_prebins=10, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc", rules={}, n_jobs=1):
329 | """
330 | 特征分箱封装方法
331 |
332 | Args:
333 | target: 数据集中标签名称,默认 target
334 | method: 特征分箱方法,可选 "chi", "dt", "quantile", "step", "kmeans", "cart", "mdlp", "uniform", 参考 toad.Combiner & optbinning.OptimalBinning
335 | engine: 分箱引擎,可选 "optbinning", "toad"
336 | empty_separate: 是否空值单独一箱, 默认 False,推荐设置为 True
337 | min_samples: 最小叶子结点样本占比,参考对应文档进行设置,默认 5%
338 | min_n_bins: 最小分箱数,默认 2,即最小拆分2箱
339 |             max_n_bins: 最大分箱数,默认 3,即最大拆分3箱,推荐设置 3 ~ 5,不宜过多,使用 optbinning 时偶尔不生效
340 | max_n_prebins: 使用 optbinning 时预分箱数量
341 | min_prebin_size: 使用 optbinning 时预分箱叶子结点(或者每箱)样本占比,默认 2%
342 | min_bin_size: 使用 optbinning 正式分箱叶子结点(或者每箱)最小样本占比,默认 5%
343 | max_bin_size: 使用 optbinning 正式分箱叶子结点(或者每箱)最大样本占比,默认 None
344 |             gamma: 使用 optbinning 分箱时限制过拟合的正则化参数,值越大惩罚越多,默认 0.01
345 |             monotonic_trend: 使用 optbinning 正式分箱时的坏率策略,默认 auto_asc_desc,可选 "auto", "auto_heuristic", "auto_asc_desc", "ascending", "descending", "convex", "concave", "peak", "valley", "peak_heuristic", "valley_heuristic"
346 | rules: 自定义分箱规则,toad.Combiner 能够接收的形式
347 | n_jobs: 使用多进程加速的worker数量,默认单进程
348 | """
349 | self.combiner = toad.transform.Combiner()
350 | self.method = method
351 | self.empty_separate = empty_separate
352 | self.target = target
353 | self.min_samples = min_samples
354 | self.max_n_bins = max_n_bins
355 | self.min_n_bins = min_n_bins
356 | self.min_bin_size = min_bin_size
357 | self.max_bin_size = max_bin_size
358 | self.max_n_prebins = max_n_prebins
359 | self.min_prebin_size = min_prebin_size
360 | self.gamma = gamma
361 | self.monotonic_trend = monotonic_trend
362 | self.rules = rules
363 | self.engine = engine
364 | self.n_jobs = n_jobs
365 |
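    # 用法示意:
    # combiner = Combiner(target="target", engine="optbinning", max_n_bins=5)
    # combiner.fit(train)                       # train 需包含 target 列
    # train_bins = combiner.transform(train)    # labels=True 时返回分箱标签而非编号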
366 | def optbinning_bins(self, feature, data=None, target="target", min_n_bins=2, max_n_bins=3, max_n_prebins=10, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc"):
367 | if data[feature].dropna().nunique() <= min_n_bins:
368 | splits = []
369 | for v in data[feature].dropna().unique():
370 | splits.append(v)
371 |
372 | if str(data[feature].dtypes) in ["object", "string", "category"]:
373 | rule = {feature: [[s] for s in splits]}
374 | rule[feature].append([[np.nan]])
375 | else:
376 | rule = {feature: sorted(splits) + [np.nan]}
377 | else:
378 | try:
379 | y = data[target]
380 | if str(data[feature].dtypes) in ["object", "string", "category"]:
381 | dtype = "categorical"
382 | x = data[feature].astype("category").values
383 | else:
384 | dtype = "numerical"
385 | x = data[feature].values
386 |
387 | _combiner = OptimalBinning(feature, dtype=dtype, min_n_bins=min_n_bins, max_n_bins=max_n_bins, max_n_prebins=max_n_prebins, min_prebin_size=min_prebin_size, min_bin_size=min_bin_size, max_bin_size=max_bin_size, monotonic_trend=monotonic_trend, gamma=gamma).fit(x, y)
388 | if _combiner.status == "OPTIMAL":
389 | rule = {feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.splits] + [[np.nan] if dtype == "categorical" else np.nan]}
390 | else:
391 | raise Exception("optimalBinning error")
392 |
393 | except Exception as e:
394 | _combiner = toad.transform.Combiner()
395 | _combiner.fit(data[[feature, target]].dropna(), target, method="chi", min_samples=self.min_samples, n_bins=self.max_n_bins)
396 | rule = {feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.export()[feature]] + [[np.nan] if dtype == "categorical" else np.nan]}
397 |
398 | self.combiner.update(rule)
399 |
400 | def fit(self, x, y=None):
401 | if self.engine == "optbinning":
402 | feature_optbinning_bins = partial(self.optbinning_bins, data=x, target=self.target, min_n_bins=self.min_n_bins, max_n_bins=self.max_n_bins, max_n_prebins=self.max_n_prebins, min_prebin_size=self.min_prebin_size, min_bin_size=self.min_bin_size, max_bin_size=self.max_bin_size, gamma=self.gamma, monotonic_trend=self.monotonic_trend)
403 | if self.n_jobs > 1:
404 |                 with ThreadPoolExecutor(max_workers=self.n_jobs) as executor:  # 线程池: optbinning_bins 原地更新 self.combiner,需在当前进程内执行
405 |                     [executor.submit(feature_optbinning_bins, feature) for feature in x.columns.drop(self.target)]
406 | else:
407 | for feature in x.drop(columns=[self.target]):
408 | self.optbinning_bins(feature, data=x, target=self.target, min_n_bins=self.min_n_bins, max_n_bins=self.max_n_bins, max_n_prebins=self.max_n_prebins, min_prebin_size=self.min_prebin_size, min_bin_size=self.min_bin_size, max_bin_size=self.max_bin_size, gamma=self.gamma, monotonic_trend=self.monotonic_trend)
409 | # feature_optbinning_bins(feature)
410 | else:
411 | self.combiner.fit(x, y=self.target, method=self.method, min_samples=self.min_samples, n_bins=self.max_n_bins)
412 |
413 | self.update(self.rules)
414 |
415 | return self
416 |
417 | def transform(self, x, y=None, labels=False):
418 | return self.combiner.transform(x, labels=labels)
419 |
420 | def update(self, rules):
421 | if isinstance(rules, dict):
422 | self.combiner.update(rules)
423 |
424 | def export(self, to_json=None):
425 | return self.combiner.export(to_json=to_json)
426 |
427 | def load(self, from_json=None):
428 | self.combiner.load(from_json=from_json)
429 | return self
430 |
431 | def bin_plot(self, data, x, rule=None, labels=True, result=False, save=None):
432 | if rule:
433 | if isinstance(rule, list):
434 | rule = {x: rule}
435 | self.combiner.update(rule)
436 |
437 | bin_plot(self.combiner.transform(data, labels=labels), x=x, target=self.target)
438 |
439 | if save:
440 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
441 | os.makedirs(os.path.dirname(save))
442 |
443 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight")
444 |
445 | if result:
446 | return self.combiner.export()[x]
447 |
448 | def proportion_plot(self, x, transform=False, labels=False):
449 | if transform:
450 | x = self.combiner.transform(x, labels=labels)
451 | proportion_plot(x)
452 |
453 | def corr_plot(self, data, transform=False, figure_size=(20, 15)):
454 | if transform:
455 | data = self.combiner.transform(data, labels=False)
456 |
457 | corr_plot(data, figure_size=figure_size)
458 |
459 | def badrate_plot(self, data, date_column, feature, labels=True):
460 | badrate_plot(self.combiner.transform(data[[date_column, feature, self.target]], labels=labels), target=self.target, x=date_column, by=feature)
461 |
462 | @property
463 | def rules(self):
464 | return self.combiner._rules
465 |
466 | @rules.setter
467 | def rules(self, value):
468 | self.combiner._rules = value
469 |
470 | def __len__(self):
471 | return len(self.combiner._rules.keys())
472 |
473 | def __contains__(self, key):
474 | return key in self.combiner._rules
475 |
476 | def __getitem__(self, key):
477 | return self.combiner._rules[key]
478 |
479 | def __setitem__(self, key, value):
480 | self.combiner._rules[key] = value
481 |
482 | def __iter__(self):
483 | return iter(self.combiner._rules)
484 |
485 |
486 | class WOETransformer(TransformerMixin, BaseEstimator):
487 |
488 | def __init__(self, target="target", exclude=None):
489 | """
490 | WOE转换器
491 |
492 | Args:
493 | target: 数据集中标签名称,默认 target
494 | exclude: 不需要转换 woe 的列
495 | """
496 | self.target = target
497 | self.exclude = exclude if isinstance(exclude, list) else [exclude] if exclude else []
498 | self.transformer = toad.transform.WOETransformer()
499 |
500 | def fit(self, x, y=None):
501 | self.transformer.fit(x.drop(columns=self.exclude + [self.target]), x[self.target])
502 | return self
503 |
504 | def transform(self, x, y=None):
505 | return self.transformer.transform(x)
506 |
507 | @property
508 | def rules(self):
509 | return self.transformer._rules
510 |
511 | @rules.setter
512 | def rules(self, value):
513 | self.transformer._rules = value
514 |
515 | def __len__(self):
516 | return len(self.transformer._rules.keys())
517 |
518 | def __contains__(self, key):
519 | return key in self.transformer._rules
520 |
521 | def __getitem__(self, key):
522 | return self.transformer._rules[key]
523 |
524 | def __setitem__(self, key, value):
525 | self.transformer._rules[key] = value
526 |
527 | def __iter__(self):
528 | return iter(self.transformer._rules)
529 |
530 |
531 | class StepwiseSelection(TransformerMixin, BaseEstimator):
532 |
533 | def __init__(self, target="target", estimator="ols", direction="both", criterion="aic", max_iter=None, return_drop=True, exclude=None, intercept=True, p_value_enter=0.2, p_remove=0.01, p_enter=0.01, target_rm=False):
534 | """
535 | 逐步回归筛选方法
536 |
537 | Args:
538 | target: 数据集中标签名称,默认 target
539 | estimator: 预估器,默认 ols,可选 "ols", "lr", "lasso", "ridge",通常默认即可
540 | direction: 逐步回归方向,默认both,可选 "forward", "backward", "both",通常默认即可
541 | criterion: 评价指标,默认 aic,可选 "aic", "bic",通常默认即可
542 | max_iter: 最大迭代次数,sklearn中使用的参数,默认为 None
543 | return_drop: 是否返回特征剔除信息,默认 True
544 | exclude: 强制保留的某些特征
545 | intercept: 是否包含截距,默认为 True
546 | p_value_enter: 特征进入的 p 值,用于前向筛选时决定特征是否进入模型
547 | p_remove: 特征剔除的 p 值,用于后向剔除时决定特征是否要剔除
548 | p_enter: 特征 p 值,用于判断双向逐步回归是否剔除或者准入特征
549 |             target_rm: 是否剔除数据集中的标签,默认为 False,即不剔除数据集中的标签
550 | """
551 | self.target = target
552 | self.intercept = intercept
553 | self.p_value_enter = p_value_enter
554 | self.p_remove = p_remove
555 | self.p_enter = p_enter
556 | self.estimator = estimator
557 | self.direction = direction
558 | self.criterion = criterion
559 | self.max_iter = max_iter
560 | self.return_drop = return_drop
561 | self.target_rm = target_rm
562 | self.exclude = exclude
563 | self.select_columns = None
564 | self.dropped = None
565 |
566 | def fit(self, x, y=None):
567 | selected = toad.selection.stepwise(x, target=self.target, estimator=self.estimator, direction=self.direction, criterion=self.criterion, exclude=self.exclude, intercept=self.intercept, p_value_enter=self.p_value_enter,
568 | p_remove=self.p_remove, p_enter=self.p_enter, return_drop=self.return_drop)
569 | if self.return_drop:
570 | self.dropped = pd.DataFrame([(col, "stepwise") for col in selected[1]], columns=["variable", "rm_reason"])
571 | selected = selected[0]
572 |
573 | self.select_columns = list(selected.columns)
574 |
575 | if self.target_rm and self.target in self.select_columns:
576 | self.select_columns.remove(self.target)
577 |
578 | return self
579 |
580 | def transform(self, x, y=None):
581 | return x[[col for col in self.select_columns if col in x.columns]]
582 |
583 |
584 | if __name__ == "__main__":
585 | from model import ITLubberLogisticRegression, StatsLogisticRegression, ScoreCard
586 |
587 | target = "creditability"
588 | data = sc.germancredit()
589 | data[target] = data[target].map({"good": 0, "bad": 1})
590 |
591 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])
592 |
593 | # selection = FeatureSelection(target=target, engine="toad", return_drop=True, corr=0.9, iv=0.01)
594 | # train = selection.fit_transform(train)
595 |
596 | # combiner = Combiner(min_samples=0.2, empty_separate=True, target=target)
597 | # combiner.fit(train)
598 | # train = combiner.transform(train)
599 |
600 | # transformer = WOETransformer(target=target)
601 | # train = transformer.fit_transform(train)
602 |
603 | # stepwise = StepwiseSelection(target=target)
604 | # train = stepwise.fit_transform(train)
605 |
606 | feature_pipeline = Pipeline([
607 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
608 | ("combiner", Combiner(target=target, min_samples=0.2)),
609 | ("transformer", WOETransformer(target=target)),
610 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")),
611 | ("stepwise", StepwiseSelection(target=target, target_rm=False)),
612 | # ("logistic", StatsLogisticRegression(target=target)),
613 | ("logistic", ITLubberLogisticRegression(target=target)),
614 | ])
615 |
616 | # feature_pipeline.fit(train)
617 | # y_pred_train = feature_pipeline.predict(train.drop(columns=target))
618 | # y_pred_test = feature_pipeline.predict(test.drop(columns=target))
619 |
620 | params_grid = {
621 | "logistic__C": [i / 1. for i in range(1, 10, 2)],
622 | "logistic__penalty": ["l2"],
623 | "logistic__class_weight": [None, "balanced"], # + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10)],
624 | "logistic__max_iter": [100],
625 | "logistic__solver": ["sag"] # ["liblinear", "sag", "lbfgs", "newton-cg"],
626 | "logistic__intercept": [True, False],
627 | }
628 |
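# Note: grid keys follow sklearn's "<step name>__<parameter>" convention, e.g. "logistic__C" tunes the C
# parameter of the pipeline step registered as "logistic"; a misspelled step or parameter name raises an
# error when GridSearchCV is fitted.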
629 | clf = GridSearchCV(feature_pipeline, params_grid, cv=5, scoring='roc_auc', verbose=-1, n_jobs=2, return_train_score=True)
630 | clf.fit(train, train[target])
631 |
632 | y_pred_train = clf.best_estimator_.predict(train)
633 | y_pred_test = clf.best_estimator_.predict(test)
634 |
635 | print(clf.best_params_)
636 |
637 | # statmodels methods
638 | # feature_pipeline.named_steps['logistic'].summary_save()
639 |
640 | # print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target]))
641 | # print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target]))
642 |
643 | woe_train = feature_pipeline[:-1].fit_transform(train)    # slice off the final logistic step so only the feature-processing steps are (re)fitted
644 | woe_test = feature_pipeline[:-1].transform(test)
645 |
646 | # lr = StatsLogisticRegression(target=target)
647 | # lr.fit(woe_train)
648 | # lr.summary_save()
649 |
650 | # cols = list(filter(lambda x: x != target, feature_pipeline.named_steps['preprocessing_select'].select_columns))
651 |
652 | combiner = feature_pipeline.named_steps['combiner'].combiner
653 | transformer = feature_pipeline.named_steps['transformer'].transformer
654 |
655 | score_card = ScoreCard(target=target, combiner=combiner, transer=transformer, )
656 | score_card.fit(woe_train)
657 |
658 |
659 | data["score"] = score_card.transform(data)
660 |
661 | print(score_card.KS_bucket(data["score"], data[target]))
662 | pt = score_card.perf_eva(data["score"], data[target], title="train")
663 |
664 | hist = score_card.score_hist(data["score"], data[target])
665 |
666 | print(score_card.KS(data["score"], data[target]), score_card.AUC(data["score"], data[target]))
667 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2023/2/15 17:55
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 | import math
8 | import sys
9 | import re
10 | import matplotlib
11 | import matplotlib.font_manager as font_manager
12 | import matplotlib.pyplot as plt
13 | import pandas as pd
14 | from openpyxl.formatting.rule import Rule
15 | from openpyxl.formatting.rule import ColorScaleRule
16 | from openpyxl.utils.dataframe import dataframe_to_rows
17 | from openpyxl.utils import get_column_letter, column_index_from_string
18 |
19 |
20 | from model import *
21 | from utils.excel_writer import ExcelWriter
22 |
23 |
24 | plt.style.use('seaborn-ticks')
25 | # plt.style.use('seaborn-white')
26 | # plt.rcParams.update({'font.size': 14})
27 |
28 |
29 | def pyplot_chinese(font_path='utils/matplot_chinese.ttf'):
30 | # matplotlib.rcParams['font.size'] = 20
31 | matplotlib.font_manager.fontManager.addfont(font_path)
32 | matplotlib.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name()
33 | matplotlib.rcParams['axes.unicode_minus']=False
34 |
35 |
36 | pyplot_chinese()
37 |
38 |
39 | target = "creditability"
40 | data = sc.germancredit()
41 | data[target] = data[target].map({"good": 0, "bad": 1})
42 |
43 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])
44 | oot = data.copy()
45 |
46 | feature_pipeline = Pipeline([
47 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
48 | ("combiner", Combiner(target=target, min_samples=0.2)),
49 | ("transform", WOETransformer(target=target)),
50 | # ("processing_select", FeatureSelection(target=target, engine="scorecardpy")),
51 | ("stepwise", StepwiseSelection(target=target)),
52 | ])
53 |
54 | feature_pipeline.fit(train)
55 |
56 | woe_train = feature_pipeline.transform(train)
57 | woe_test = feature_pipeline.transform(test)
58 | woe_oot = feature_pipeline.transform(oot)
59 |
60 | # # save all bin_plot
61 | # _combiner = feature_pipeline.named_steps["combiner"]
62 | # for col in woe_train.columns:
63 | # if col != target:
64 | # _combiner.bin_plot(train, col, labels=True, save=f"model_report/bin_plots/train_{col}.png")
65 | # _combiner.bin_plot(test, col, labels=True, save=f"model_report/bin_plots/test_{col}.png")
66 | # _combiner.bin_plot(oot, col, labels=True, save=f"model_report/bin_plots/oot_{col}.png")
67 |
68 | # logistic = StatsLogisticRegression(target=target)
69 | logistic = ITLubberLogisticRegression(target=target)
70 |
71 | logistic.fit(woe_train)
72 |
73 | y_pred_train = logistic.predict_proba(woe_train.drop(columns=target))[:, 1]
74 | y_pred_test = logistic.predict_proba(woe_test.drop(columns=target))[:, 1]
75 | y_pred_oot = logistic.predict_proba(woe_oot.drop(columns=target))[:, 1]
76 |
77 | ScoreCard.ks_plot(y_pred_train, train[target], save="model_report/lr_ksplot_train.png", figsize=(10, 5))
78 | ScoreCard.ks_plot(y_pred_test, test[target], save="model_report/lr_ksplot_test.png", figsize=(10, 5))
79 | ScoreCard.ks_plot(y_pred_oot, oot[target], save="model_report/lr_ksplot_oot.png", figsize=(10, 5))
80 |
81 | summary = logistic.summary().reset_index().rename(columns={"index": "Features"})
82 |
83 | train_corr = logistic.corr(woe_train, save="model_report/train_corr.png")
84 | test_corr = logistic.corr(woe_test, save="model_report/test_corr.png")
85 | oot_corr = logistic.corr(woe_oot, save="model_report/oot_corr.png")
86 |
87 | train_report = logistic.report(woe_train)
88 | test_report = logistic.report(woe_test)
89 | oot_report = logistic.report(woe_oot)
90 |
91 | print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target]))
92 | print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target]))
93 | print("oot: ", toad.metrics.KS(y_pred_oot, oot[target]), toad.metrics.AUC(y_pred_oot, oot[target]))
94 |
95 |
96 | card = ScoreCard(target=target, pipeline=feature_pipeline, pretrain_lr=logistic)
97 | card.fit(woe_train)
98 |
99 | train["score"] = card.predict(train)
100 | test["score"] = card.predict(test)
101 | oot["score"] = card.predict(oot)
102 |
103 |
104 | def sample_distribution(df, date="date", target="target", user_count="count", save="model_report/sample_time_count.png", figsize=(10, 6), colors=["#2639E9", "#F76E6C", "#FE7715"]):
105 | temp = df.set_index(date).assign(
106 | 好样本=lambda x: (x[target] == 0).astype(int),
107 | 坏样本=lambda x: (x[target] == 1).astype(int),
108 | ).resample("W").agg({"好样本": sum, "坏样本": sum})
109 | temp.index = [i.strftime("%Y-%m-%d") for i in temp.index]
110 |
111 | fig, ax1 = plt.subplots(1, 1, figsize=figsize)
112 | temp.plot(kind='bar', stacked=True, ax=ax1, color=colors[:2], hatch="/", legend=False)
113 | ax1.tick_params(axis='x', labelrotation=-90)
114 | ax1.set(xlabel=None)
115 | ax1.set_ylabel('样本数')
116 | ax1.set_title('不同时点数据集样本分布情况\n\n')
117 |
118 | ax2 = plt.twinx()
119 | (temp["坏样本"] / temp.sum(axis=1)).plot(ax=ax2, color=colors[-1], marker=".", linewidth=2, label="坏样本率")
120 | # sns.despine()
121 |
122 | # 合并图例
123 | handles1, labels1 = ax1.get_legend_handles_labels()
124 | handles2, labels2 = ax2.get_legend_handles_labels()
125 | fig.legend(handles1 + handles2, labels1 + labels2, loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.94), frameon=False)
126 | # ax1.legend(frameon=False, labels=["good", "bad"], loc='upper right')
127 | # ax2.legend(loc='upper left', frameon=False, labels=["bad rate"])
128 |
129 | plt.tight_layout()
130 |
131 | if save:
132 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
133 | os.makedirs(os.path.dirname(save))
134 |
135 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight")
136 |
137 | temp = temp.reset_index().rename(columns={date: "日期", "index": "日期", 0: "好样本", 1: "坏样本"})
138 | temp["样本总数"] = temp["坏样本"] + temp["好样本"]
139 | temp["样本占比"] = temp["样本总数"] / temp["样本总数"].sum()
140 | temp["好样本占比"] = temp["好样本"] / temp["好样本"].sum()
141 | temp["坏样本占比"] = temp["坏样本"] / temp["坏样本"].sum()
142 | temp["坏样本率"] = temp["坏样本"] / temp["样本总数"]
143 |
144 | return temp[["日期", "样本总数", "样本占比", "好样本", "好样本占比", "坏样本", "坏样本占比", "坏样本率"]]
145 |
146 |
147 | def bin_plot(feature_table, feature="", figsize=(15, 8), colors=['#8E8BFE', '#FEA3A2', '#9394E7'], max_len=35, save=None):
148 | feature_table = feature_table.copy()
149 |
150 | feature_table["分箱"] = feature_table["分箱"].apply(lambda x: x if re.match("^\[.*\)$", x) else str(x)[:max_len] + "..")
151 |
152 | # 绘制好坏样本分布情况
153 | fig, ax1 = plt.subplots(figsize=figsize)
154 | ax1.barh(feature_table['分箱'], feature_table['好样本数'], color=colors[0], label='好样本', hatch="/")
155 | ax1.barh(feature_table['分箱'], feature_table['坏样本数'], left=feature_table['好样本数'], color=colors[1], label='坏样本', hatch="\\")
156 | ax1.set_xlabel('样本数')
157 |
158 | # 绘制坏样本率的分布情况
159 | ax2 = ax1.twiny()
160 | ax2.plot(feature_table['坏样本率'], feature_table['分箱'], colors[2], label='坏样本率', linestyle='-.')
161 | ax2.set_xlabel('坏样本率: 坏样本数 / 样本总数')
162 |
163 | for i, rate in enumerate(feature_table['坏样本率']):
164 | ax2.scatter(rate, i, color=colors[2])
165 |
166 | # 在图像对应位置显示样本总数和坏样本率
167 | for i, v in feature_table[['样本总数', '好样本数', '坏样本数', '坏样本率']].iterrows():
168 | ax1.text(v['样本总数'] / 2, i + len(feature_table) / 60, f"{int(v['好样本数'])}:{int(v['坏样本数'])}:{v['坏样本率']:.2%}")
169 |
170 | # 逆转y轴顺序
171 | ax1.invert_yaxis()
172 |
173 | # 添加一个标题
174 | fig.suptitle(f'变量 {feature} 分箱图\n\n')
175 |
176 | # 合并图例
177 | handles1, labels1 = ax1.get_legend_handles_labels()
178 | handles2, labels2 = ax2.get_legend_handles_labels()
179 | fig.legend(handles1 + handles2, labels1 + labels2, loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.925), frameon=False)
180 |
181 | # 调整布局,使分箱信息能够完全显示
182 | plt.tight_layout()
183 |
184 | if save:
185 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
186 | os.makedirs(os.path.dirname(save))
187 |
188 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight")
189 |
190 |
191 | writer = ExcelWriter(style_excel="./utils/报告输出模版.xlsx", theme_color="8E8BFE")
192 |
193 |
194 | # ////////////////////////////////////// 样本说明 ///////////////////////////////////// #
195 | df = pd.DataFrame({
196 | "date": pd.date_range(start="2021-01-01", end="2022-06-30"),
197 | "target": np.random.randint(0, 2, 546),
198 | "count": np.random.randint(0, 100, 546),
199 | })
200 |
201 | total_count = len(data)
202 | dataset_summary = pd.DataFrame(
203 | [
204 | ["建模样本", "2022-01-01", "2023-01-31", len(data), len(data) / total_count, data[target].sum(), data[target].sum() / len(data), ""],
205 | ["训练集", "2022-01-01", "2023-12-31", len(train), len(train) / total_count, train[target].sum(), train[target].sum() / len(train), ""],
206 | ["测试集", "2022-01-01", "2023-12-31", len(test), len(test) / total_count, test[target].sum(), test[target].sum() / len(test), ""],
207 | ["跨时间验证集", "2023-01-01", "2023-01-31", len(oot), len(oot) / total_count, oot[target].sum(), oot[target].sum() / len(oot), ""],
208 | ],
209 | columns=["数据集", "开始时间", "结束时间", "样本总数", "样本占比", "坏客户数", "坏客户占比", "备注"],
210 | )
211 |
212 | worksheet = writer.get_sheet_by_name("汇总信息")
213 |
214 | # 样本总体分布情况
215 | start_row, start_col = 2, 2
216 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="样本总体分布情况", style="header")
217 | end_row, end_col = writer.insert_df2sheet(worksheet, dataset_summary, (end_row + 1, start_col), header=True)
218 |
219 | writer.set_number_format(worksheet, f"{get_column_letter(end_col - 2)}{end_row - len(dataset_summary)}:{get_column_letter(end_col - 2)}{end_row}", "0.00%")
220 | writer.set_number_format(worksheet, f"{get_column_letter(end_col - 4)}{end_row - len(dataset_summary)}:{get_column_letter(end_col - 4)}{end_row}", "0.00%")
221 |
222 | # 建模样本时间分布情况
223 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/all_sample_time_count.png")
224 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="建模样本时间分布情况", style="header")
225 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/all_sample_time_count.png", (end_row, start_col), figsize=(720, 370))
226 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False)
227 |
228 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%")
229 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", "0.00%")
230 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%")
231 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%")
232 |
233 | # 训练集样本时间分布情况
234 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/train_sample_time_count.png")
235 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练集样本时间分布情况", style="header")
236 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_sample_time_count.png", (end_row, start_col), figsize=(720, 370))
237 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False)
238 |
239 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%")
240 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", "0.00%")
241 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%")
242 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%")
243 |
244 | # 测试集样本时间分布情况
245 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/test_sample_time_count.png")
246 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试集样本时间分布情况", style="header")
247 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_sample_time_count.png", (end_row, start_col), figsize=(720, 370))
248 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False)
249 |
250 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%")
251 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", "0.00%")
252 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%")
253 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%")
254 |
255 | # 跨时间验证集样本时间分布情况
256 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/oot_sample_time_count.png")
257 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="跨时间验证集样本时间分布情况", style="header")
258 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/oot_sample_time_count.png", (end_row, start_col), figsize=(720, 370))
259 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False)
260 |
261 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%")
262 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", "0.00%")
263 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%")
264 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%")
265 |
266 |
267 | # ////////////////////////////////////// 模型报告 ///////////////////////////////////// #
268 |
269 | # 逻辑回归拟合情况
270 | worksheet = writer.get_sheet_by_name("逻辑回归拟合结果")
271 | start_row, start_col = 2, 2
272 |
273 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="逻辑回归拟合效果", style="header")
274 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{start_row}:{get_column_letter(start_col + len(summary.columns) - 1)}{start_row}")
275 | # worksheet[f"{get_column_letter(start_col)}{start_row}:{get_column_letter(start_col + len(summary.columns) - 1)}{start_row}"].style = "header"
276 | logistic.plot_weights(save="model_report/logistic_train.png")
277 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/logistic_train.png", (end_row + 2, start_col))
278 | end_row, end_col = writer.insert_df2sheet(worksheet, summary, (end_row + 1, start_col))
279 |
280 | conditional_column = get_column_letter(start_col + summary.columns.get_loc("Coef."))
281 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(summary)}', f'{conditional_column}{end_row}')
282 |
283 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{end_row + 2}:{get_column_letter(start_col + len(train_report.columns) - 1)}{end_row + 2}")
284 | # worksheet[f"{get_column_letter(start_col)}{end_row + 2}"].style = "header"
285 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练数据集拟合报告", style="header")
286 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/lr_ksplot_train.png", (end_row, start_col), figsize=(480, 270))
287 | end_row, end_col = writer.insert_df2sheet(worksheet, train_report, (end_row + 1, start_col))
288 |
289 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{end_row + 2}:{get_column_letter(start_col + len(test_report.columns) - 1)}{end_row + 2}")
290 | # worksheet[f"{get_column_letter(start_col)}{end_row + 2}"].style = "header"
291 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试数据集拟合报告", style="header")
292 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/lr_ksplot_test.png", (end_row, start_col), figsize=(480, 270))
293 | end_row, end_col = writer.insert_df2sheet(worksheet, test_report, (end_row + 1, start_col))
294 |
295 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{end_row + 2}:{get_column_letter(start_col + len(oot_report.columns) - 1)}{end_row + 2}")
296 | # worksheet[f"{get_column_letter(start_col)}{end_row + 2}"].style = "header"
297 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="跨时间验证集拟合报告", style="header")
298 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/lr_ksplot_oot.png", (end_row, start_col), figsize=(480, 270))
299 | end_row, end_col = writer.insert_df2sheet(worksheet, oot_report, (end_row + 1, start_col))
300 |
301 |
302 | # ////////////////////////////////////// 特征概述 ///////////////////////////////////// #
303 |
304 | # 模型变量概览
305 | feature_describe = pd.DataFrame([
306 | ["status_account", "支票账户状态"], ["duration", "借款周期"], ["credit_histor", "历史信用"], ["purpose", "借款目的"], ["amount", "信用额度"], ["svaing_account", "储蓄账户状态"], ["present_emp", "当前就业状态"], ["income_rate", "分期付款占可支配收入百分比"], ["personal_status", "性别与婚姻状态"], ["other_debtors", "他人担保信息"], ["residence_info", "现居住地"], ["property", "财产状态"], ["age", "年龄"], ["inst_plans", "其他分期情况"], ["housing", "房产状态"], ["num_credits", "信用卡数量"], ["job", "工作状态"], ["dependents", "赡养人数"], ["telephone", "电话号码注册情况"], ["foreign_worke", "是否有海外工作经历"],
307 | ], columns=["变量名称", "变量含义"])
308 |
309 | worksheet = writer.get_sheet_by_name("模型变量信息")
310 | start_row, start_col = 2, 2
311 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="入模变量信息", style="header")
312 | end_row, end_col = writer.insert_df2sheet(worksheet, feature_describe.reset_index().rename(columns={"index": "序号"}), (end_row + 1, start_col))
313 |
314 | # 变量分布情况
315 | data_info = toad.detect(data[card.rules.keys()]).reset_index().rename(columns={"index": "变量名称", "type": "变量类型", "size": "样本个数", "missing": "缺失值", "unique": "唯一值个数"})
316 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量分布情况", style="header")
317 | end_row, end_col = writer.insert_df2sheet(worksheet, data_info, (end_row + 1, start_col))
318 |
319 | # 变量相关性
320 | data_corr = logistic.corr(feature_pipeline.transform(train), save="model_report/data_corr.png", annot=False)
321 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量相关性", style="header")
322 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/data_corr.png", (end_row + 1, start_col), figsize=(700, 500))
323 | end_row, end_col = writer.insert_df2sheet(worksheet, data_corr.reset_index().rename(columns={"index": ""}), (end_row + 1, start_col))
324 |
325 | conditional_column = f"{get_column_letter(start_col + 1)}{end_row - len(data_corr)}:{get_column_letter(end_col - 1)}{end_row - 1}"
326 | worksheet.conditional_formatting.add(conditional_column, ColorScaleRule(start_type='num', start_value=-1.0, start_color='8E8BFE', mid_type='num', mid_value=0., mid_color='FFFFFF', end_type='num', end_value=1.0, end_color='8E8BFE'))
327 |
328 |
329 | # 变量分箱信息
330 | _combiner = feature_pipeline.named_steps["combiner"]
331 |
332 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量分箱信息", style="header")
333 | for col in card.rules.keys():
334 | feature_table = card.feature_bin_stats(data, col, target=target, desc="逻辑回归入模变量", combiner=card.combiner)
335 | # _combiner.bin_plot(data, col, labels=True, save=f"model_report/bin_plots/data_{col}.png")
336 | bin_plot(feature_table, feature=col, save=f"model_report/bin_plots/data_{col}.png")
337 | end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/bin_plots/data_{col}.png", (end_row + 1, start_col), figsize=(700, 400))
338 | end_row, end_col = writer.insert_df2sheet(worksheet, feature_table, (end_row, start_col))
339 |
340 | for c in ["坏样本率", "LIFT值"]:
341 | conditional_column = get_column_letter(start_col + feature_table.columns.get_loc(c))
342 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(feature_table)}', f'{conditional_column}{end_row}')
343 | # conditional_column = get_column_letter(start_col + feature_table.columns.get_loc("LIFT值"))
344 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(feature_table)}', f'{conditional_column}{end_row}')
345 |
346 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值"]:
347 | conditional_column = get_column_letter(start_col + feature_table.columns.get_loc(c))
348 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(feature_table)}:{conditional_column}{end_row}", "0.00%")
349 |
350 |
351 | # ////////////////////////////////////// 评分卡说明 ///////////////////////////////////// #
352 |
353 | # 评分卡刻度
354 | scorecard_kedu = pd.DataFrame(
355 | [
356 | ["base_odds", card.base_odds, "根据业务经验设置的基础比率(违约概率/正常概率),估算方法:(1-样本坏客户占比)/坏客户占比"],
357 | ["base_score", card.base_score, "基础ODDS对应的分数"],
358 | ["rate", card.rate, "设置分数的倍率"],
359 | ["pdo", card.pdo, "表示分数增长PDO时,ODDS值增长到RATE倍"],
360 | ["B", card.offset, "补偿值,计算方式:pdo / ln(rate)"],
361 | ["A", card.factor, "刻度,计算方式:base_score - B * ln(base_odds)"],
362 | ],
363 | columns=["刻度项", "刻度值", "备注"],
364 | )
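# Illustrative note (assuming the conventional score = A + B·ln(odds) scaling implied by the table above):
# with pdo=60, rate=2, base_odds=35 and base_score=750, B = 60 / ln(2) ≈ 86.56 and
# A = 750 - B·ln(35) ≈ 442.3, so a sample at the base odds scores 750 and the score moves by
# 60 points whenever the odds double or halve.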
365 |
366 | worksheet = writer.get_sheet_by_name("评分卡结果")
367 | start_row, start_col = 2, 2
368 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="评分卡刻度", style="header")
369 | end_row, end_col = writer.insert_df2sheet(worksheet, scorecard_kedu, (end_row + 1, start_col))
370 |
371 | # 评分卡对应分数
372 | card_points = card.export(to_frame=True).rename(columns={"name": "变量名称", "value": "变量分箱", "score": "对应分数"})
373 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡分数", style="header")
374 | end_row, end_col = writer.insert_df2sheet(worksheet, card_points, (end_row + 1, start_col), merge_column="变量名称")
375 |
376 | # 评分效果
377 | clip = 50
378 | clip_start = max(math.ceil(train["score"].min() / clip) * clip, math.ceil(train["score"].quantile(0.01) / clip) * clip)
379 | clip_end = min(math.ceil(train["score"].max() / clip) * clip, math.ceil(train["score"].quantile(0.99) / clip) * clip)
380 | score_clip = [i for i in range(clip_start, clip_end, clip)]
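# The cut points above bucket scores into fixed steps of `clip` points, with the start/end rounded to
# multiples of `clip` and limited to roughly the 1%~99% quantile range so that extreme scores do not
# create near-empty bins.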
381 |
382 | train_score_rank = card.feature_bin_stats(train, "score", target=target, rules=score_clip, verbose=0, method="step", ks=True)
383 | test_score_rank = card.feature_bin_stats(test, "score", target=target, rules=score_clip, verbose=0, method="step", ks=True)
384 | oot_score_rank = card.feature_bin_stats(oot, "score", target=target, rules=score_clip, verbose=0, method="step", ks=True)
385 |
386 | card.ks_plot(train["score"], train[target], title="Train Dataset", save="model_report/train_ksplot.png")
387 | card.ks_plot(test["score"], test[target], title="Test Dataset", save="model_report/test_ksplot.png")
388 | card.ks_plot(oot["score"], oot[target], title="OOT Dataset", save="model_report/oot_ksplot.png")
389 |
390 | card.score_hist(train["score"], train[target], save="model_report/train_scorehist.png", bins=30, figsize=(13, 10))
391 | card.score_hist(test["score"], test[target], save="model_report/test_scorehist.png", bins=30, figsize=(13, 10))
392 | card.score_hist(oot["score"], oot[target], save="model_report/oot_scorehist.png", bins=30, figsize=(13, 10))
393 |
394 |
395 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练数据集评分模型效果", style="header")
396 | ks_row = end_row
397 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_ksplot.png", (ks_row, start_col))
398 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_scorehist.png", (ks_row, end_col))
399 | end_row, end_col = writer.insert_df2sheet(worksheet, train_score_rank, (end_row + 1, start_col))
400 |
401 | for c in ["坏样本率", "LIFT值", "分档KS值"]:
402 | conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc(c))
403 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(train_score_rank)}', f'{conditional_column}{end_row}')
404 |
405 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"]:
406 | conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc(c))
407 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(train_score_rank)}:{conditional_column}{end_row}", "0.00%")
408 |
409 | # conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc("坏样本率"))
410 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_score_rank)}', f'{conditional_column}{end_row}')
411 | # conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc("LIFT值"))
412 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_score_rank)}', f'{conditional_column}{end_row}')
413 | # conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc("分档KS值"))
414 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_score_rank)}', f'{conditional_column}{end_row}')
415 |
416 |
417 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试数据集评分模型效果", style="header")
418 | ks_row = end_row
419 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_ksplot.png", (ks_row, start_col))
420 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_scorehist.png", (ks_row, end_col))
421 | end_row, end_col = writer.insert_df2sheet(worksheet, test_score_rank, (end_row + 1, start_col))
422 |
423 | for c in ["坏样本率", "LIFT值", "分档KS值"]:
424 | conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc(c))
425 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(test_score_rank)}', f'{conditional_column}{end_row}')
426 |
427 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"]:
428 | conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc(c))
429 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(test_score_rank)}:{conditional_column}{end_row}", "0.00%")
430 |
431 | # conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc("坏样本率"))
432 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_score_rank)}', f'{conditional_column}{end_row}')
433 | # conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc("LIFT值"))
434 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_score_rank)}', f'{conditional_column}{end_row}')
435 | # conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc("分档KS值"))
436 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_score_rank)}', f'{conditional_column}{end_row}')
437 |
438 |
439 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="跨时间验证集评分模型效果", style="header")
440 | ks_row = end_row
441 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/oot_ksplot.png", (ks_row, start_col))
442 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/oot_scorehist.png", (ks_row, end_col))
443 | end_row, end_col = writer.insert_df2sheet(worksheet, oot_score_rank, (end_row + 1, start_col))
444 |
445 | for c in ["坏样本率", "LIFT值", "分档KS值"]:
446 | conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc(c))
447 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(oot_score_rank)}', f'{conditional_column}{end_row}')
448 |
449 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"]:
450 | conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc(c))
451 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(oot_score_rank)}:{conditional_column}{end_row}", "0.00%")
452 |
453 | # conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc("坏样本率"))
454 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(oot_score_rank)}', f'{conditional_column}{end_row}')
455 | # conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc("LIFT值"))
456 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(oot_score_rank)}', f'{conditional_column}{end_row}')
457 | # conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc("分档KS值"))
458 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(oot_score_rank)}', f'{conditional_column}{end_row}')
459 |
460 |
461 | def score_psi(expected, actual, labels=["预期", "实际"], save=None, colors=['#8E8BFE', '#FEA3A2', '#9394E7'], figsize=(15, 8)):
462 | expected = expected.rename(columns={"分箱": "评分区间", "样本总数": f"{labels[0]}样本数", "样本占比": f"{labels[0]}样本占比", "坏样本率": f"{labels[0]}坏样本率"})
463 | actual = actual.rename(columns={"分箱": "评分区间", "样本总数": f"{labels[1]}样本数", "样本占比": f"{labels[1]}样本占比", "坏样本率": f"{labels[1]}坏样本率"})
464 | df_psi = expected.merge(actual, on="评分区间", how="outer").replace(np.nan, 0)
465 | df_psi[f"{labels[1]}% - {labels[0]}%"] = df_psi[f"{labels[1]}样本占比"] - df_psi[f"{labels[0]}样本占比"]
466 | df_psi[f"ln({labels[1]}% / {labels[0]}%)"] = np.log(df_psi[f"{labels[1]}样本占比"] / df_psi[f"{labels[0]}样本占比"])
467 | df_psi["分档PSI值"] = (df_psi[f"{labels[1]}% - {labels[0]}%"] * df_psi[f"ln({labels[1]}% / {labels[0]}%)"])
468 | df_psi = df_psi.fillna(0).replace(np.inf, 0).replace(-np.inf, 0)
469 | df_psi["总体PSI值"] = df_psi["分档PSI值"].sum()
470 |
471 | if save:
472 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
473 | os.makedirs(os.path.dirname(save))
474 |
475 | x = df_psi['评分区间']
476 | width = 0.35
477 | x_indexes = np.arange(len(x))
478 | fig, ax1 = plt.subplots(figsize=figsize)
479 |
480 | ax1.bar(x_indexes - width / 2, df_psi[f'{labels[0]}样本占比'], width, label=f'{labels[0]}样本占比', color=colors[0], hatch="/")
481 | ax1.bar(x_indexes + width / 2, df_psi[f'{labels[1]}样本占比'], width, label=f'{labels[1]}样本占比', color=colors[1], hatch="\\")
482 |
483 | ax1.set_ylabel('样本占比: 评分区间内样本数 / 样本总数')
484 | ax1.set_xticks(x_indexes)
485 | ax1.set_xticklabels(x)
486 | ax1.tick_params(axis='x', labelrotation=90)
487 |
488 | ax2 = ax1.twinx()
489 | ax2.plot(df_psi["评分区间"], df_psi[f"{labels[0]}坏样本率"], color=colors[0], label=f"{labels[0]}坏样本率", linestyle=(5, (10, 3)))
490 | ax2.plot(df_psi["评分区间"], df_psi[f"{labels[1]}坏样本率"], color=colors[1], label=f"{labels[1]}坏样本率", linestyle=(5, (10, 3)))
491 |
492 | ax2.scatter(df_psi["评分区间"], df_psi[f"{labels[0]}坏样本率"], marker=".")
493 | ax2.scatter(df_psi["评分区间"], df_psi[f"{labels[1]}坏样本率"], marker=".")
494 |
495 | ax2.set_ylabel('坏样本率: 坏样本数 / 样本总数')
496 |
497 | handles1, labels1 = ax1.get_legend_handles_labels()
498 | handles2, labels2 = ax2.get_legend_handles_labels()
499 | fig.legend(handles1 + handles2, labels1 + labels2, loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.94), frameon=False)
500 |
501 | fig.suptitle(f"{labels[0]} vs {labels[1]} 群体稳定性指数(PSI): {df_psi['分档PSI值'].sum():.4f}\n\n")
502 |
503 | fig.tight_layout()
504 |
505 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight")
506 |
507 | return df_psi[["评分区间", f"{labels[0]}样本数", f"{labels[0]}样本占比", f"{labels[0]}坏样本率", f"{labels[1]}样本数", f"{labels[1]}样本占比", f"{labels[1]}坏样本率", f"{labels[1]}% - {labels[0]}%", f"ln({labels[1]}% / {labels[0]}%)", "分档PSI值", "总体PSI值"]]
508 |
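# Note on the function above: the Population Stability Index sums (actual% - expected%) · ln(actual% / expected%)
# over score bins; e.g. a bin moving from 20% to 25% of the population contributes (0.25 - 0.20) · ln(0.25 / 0.20) ≈ 0.011.
# A common industry rule of thumb (not defined in this repo): PSI < 0.1 stable, 0.1 ~ 0.25 worth attention, > 0.25 unstable.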
509 |
510 | train_test_score_psi = score_psi(train_score_rank, test_score_rank, labels=["训练数据集", "测试数据集"], save="model_report/train_test_psiplot.png")
511 | train_oot_score_psi = score_psi(train_score_rank, oot_score_rank, labels=["训练数据集", "跨时间验证集"], save="model_report/train_oot_psiplot.png")
512 | test_oot_score_psi = score_psi(test_score_rank, oot_score_rank, labels=["测试数据集", "跨时间验证集"], save="model_report/test_oot_psiplot.png")
513 |
514 |
515 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡模型稳定性评估: 训练数据集 vs 测试数据集", style="header")
516 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_test_psiplot.png", (end_row, start_col), figsize=(1000, 400))
517 | end_row, end_col = writer.insert_df2sheet(worksheet, train_test_score_psi, (end_row + 1, start_col))
518 |
519 | conditional_column = get_column_letter(start_col + train_test_score_psi.columns.get_loc("分档PSI值"))
520 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_test_score_psi)}', f'{conditional_column}{end_row}')
521 |
522 | for c in ["训练数据集样本占比", "训练数据集坏样本率", "测试数据集样本占比", "测试数据集坏样本率"]:
523 | conditional_column = get_column_letter(start_col + train_test_score_psi.columns.get_loc(c))
524 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(train_test_score_psi)}:{conditional_column}{end_row}", "0.00%")
525 |
526 |
527 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡模型稳定性评估: 训练数据集 vs 跨时间验证集", style="header")
528 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_oot_psiplot.png", (end_row, start_col), figsize=(1000, 400))
529 | end_row, end_col = writer.insert_df2sheet(worksheet, train_oot_score_psi, (end_row + 1, start_col))
530 |
531 | conditional_column = get_column_letter(start_col + train_oot_score_psi.columns.get_loc("分档PSI值"))
532 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_oot_score_psi)}', f'{conditional_column}{end_row}')
533 |
534 | for c in ["训练数据集样本占比", "训练数据集坏样本率", "跨时间验证集样本占比", "跨时间验证集坏样本率"]:
535 | conditional_column = get_column_letter(start_col + train_oot_score_psi.columns.get_loc(c))
536 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(train_oot_score_psi)}:{conditional_column}{end_row}", "0.00%")
537 |
538 |
539 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡模型稳定性评估: 测试数据集 vs 跨时间验证集", style="header")
540 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_oot_psiplot.png", (end_row, start_col), figsize=(1000, 400))
541 | end_row, end_col = writer.insert_df2sheet(worksheet, test_oot_score_psi, (end_row + 1, start_col))
542 |
543 | conditional_column = get_column_letter(start_col + test_oot_score_psi.columns.get_loc("分档PSI值"))
544 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_oot_score_psi)}', f'{conditional_column}{end_row}')
545 |
546 | for c in ["跨时间验证集样本占比", "跨时间验证集坏样本率", "测试数据集样本占比", "测试数据集坏样本率"]:
547 | conditional_column = get_column_letter(start_col + test_oot_score_psi.columns.get_loc(c))
548 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(test_oot_score_psi)}:{conditional_column}{end_row}", "0.00%")
549 |
550 |
551 | # ////////////////////////////////////// 模型稳定性 ///////////////////////////////////// #
552 | #
553 | # worksheet = writer.get_sheet_by_name("模型稳定性")
554 | # start_row, start_col = 2, 2
555 | #
556 | # # 变量 CSI 表
557 | # end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="入模变量稳定性指标 (Characteristic Stability Index, CSI)", style="header")
558 | #
559 | # # train vs test
560 | #
561 | # # 评分分布稳定性
562 | # end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="模型评分稳定性指标 (Population Stability Index, PSI)", style="header")
563 |
564 |
565 | writer.save("model_report/评分卡模型报告.xlsx")
566 |
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @Time : 2022/8/23 13:12
4 | @Author : itlubber
5 | @Site : itlubber.art
6 | """
7 |
8 | import os
9 | import toad
10 | import warnings
11 | import numpy as np
12 | import pandas as pd
13 | import scorecardpy as sc
14 | from scorecardpy.perf import eva_pks, eva_proc
15 | from optbinning import OptimalBinning
16 | import matplotlib.pyplot as plt
17 | from matplotlib import font_manager
18 | import seaborn as sns
19 | # import plotly.graph_objects as go
20 | # from plotly.io import write_image
21 | from openpyxl import load_workbook
22 | from openpyxl.styles import Alignment, PatternFill
23 |
24 | import scipy
25 | import statsmodels.api as sm
26 | from statsmodels.stats.outliers_influence import variance_inflation_factor
27 |
28 | from sklearn.pipeline import Pipeline
29 | from sklearn.metrics import roc_curve, auc
30 | from sklearn.metrics import classification_report
31 | from sklearn.linear_model import LogisticRegression
32 | from sklearn.model_selection import train_test_split
33 | from sklearn.utils.validation import check_is_fitted
34 | from sklearn.ensemble import GradientBoostingClassifier
35 | from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
36 |
37 | from processing import FeatureSelection, Combiner, WOETransformer, StepwiseSelection
38 |
39 |
40 | warnings.filterwarnings("ignore")
41 | pd.set_option('display.width', 5000)
42 | # plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体
43 | # plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题
44 |
45 |
46 | def pyplot_chinese(font_path='utils/matplot_chinese.ttf'):
47 | font_manager.fontManager.addfont(font_path)
48 | plt.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name()
49 | plt.rcParams['axes.unicode_minus']=False
50 |
51 |
52 | class StatsLogisticRegression(TransformerMixin, BaseEstimator):
53 |
54 | def __init__(self, target="target", intercept=True):
55 | """
56 | 基于statsmodels的逻辑回归方法
57 |
58 | Args:
59 | target: 数据集中标签名称,默认 target
60 | intercept: 是否包含截距,默认 True,即包含截距
61 | """
62 | self.intercept = intercept
63 | self.target = target
64 | self.classifier = None
65 | self.corr = None
66 | self.vif = None
67 | self.coef_normalization = None
68 | self.feature_names_ = None
69 | self.feature_importances_ = None
70 |
71 | def fit(self, x, y=None, vif=True, corr=True, normalization=True):
72 | self.feature_names_ = list(x.drop(columns=[self.target]).columns)
73 | self.feature_importances_ = self.feature_importances(x)
74 |
75 | if vif:
76 | self.vif = self.VIF(x)
77 |
78 | if normalization:
79 | _x = x.drop(columns=[self.target]).apply(lambda x: (x - np.mean(x)) / np.std(x))
80 | _y = x[self.target]
81 | lr_normalization = sm.Logit(_y, sm.add_constant(_x) if self.intercept else _x).fit()
82 | self.coef_normalization = pd.DataFrame(lr_normalization.params, columns=["coef_normalization"])
83 |
84 | if corr:
85 | self.corr = x.drop(columns=[self.target]).corr()
86 |
87 | if self.intercept:
88 | x = sm.add_constant(x)
89 |
90 | self.classes_ = x[self.target].unique()
91 | self.classifier = sm.Logit(x[self.target], x.drop(columns=[self.target])).fit()
92 |
93 | return self
94 |
95 | def transform(self, x):
96 | if self.intercept:
97 | x = sm.add_constant(x)
98 |
99 | return self.classifier.predict(x)
100 |
101 | def predict(self, x):
102 | return self.transform(x)
103 |
104 | def summary(self):
105 | describe = self.classifier.summary2()
106 | return describe
107 |
108 | def feature_importances(self, x):
109 | params = {
110 | "n_estimators": 256,
111 | "max_depth": 4,
112 | "min_samples_split": 5,
113 | "learning_rate": 1e-3,
114 | "loss": "deviance",
115 | "subsample": 0.9,
116 | }
117 | feature_importances_ = GradientBoostingClassifier(**params).fit(x.drop(columns=[self.target]), x[self.target]).feature_importances_
118 | return pd.DataFrame(feature_importances_, index=self.feature_names_, columns=["feature_importances"])
119 |
120 | def VIF(self, x):
121 | if self.intercept:
122 | x = sm.add_constant(x)
123 |
124 | x = x.drop(columns=[self.target])
125 | columns = x.columns
126 | vif = pd.DataFrame({"VIF": [variance_inflation_factor(np.asarray(x), i) for i in range(len(columns))]}, index=columns)
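# Each VIF_j equals 1 / (1 - R_j^2), where R_j^2 comes from regressing column j on the remaining design
# columns; values above roughly 5~10 are usually read as strong collinearity (general rule of thumb).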
127 |
128 | return vif
129 |
130 | def WALD(self):
131 | return self.classifier.wald_test_terms().table[["statistic", "pvalue"]].rename(columns={"pvalue": "wald_test_pvalue", "statistic": "wald_test_statistic"})
132 |
133 | def report(self):
134 | return self.classifier.summary2().tables[1].join([self.coef_normalization, self.WALD(), self.vif, self.feature_importances_]), self.classifier.summary2().tables[0], self.corr
135 |
136 | def summary_save(self, excel_name="逻辑回归模型拟合效果.xlsx", sheet_name="逻辑回归拟合效果"):
137 | writer = pd.ExcelWriter(excel_name, engine='openpyxl')
138 |
139 | coef_report, summary_report, corr_report = self.report()
140 | summary_report.columns = ["逻辑回归模型拟合效果"] * summary_report.shape[1]
141 | summary_report.to_excel(writer, sheet_name=sheet_name, index=False, header=False, startcol=0, startrow=2)
142 | coef_report.reset_index().rename(columns={"index": "variable"}).to_excel(writer, sheet_name=sheet_name, index=False, header=True, startcol=0, startrow=summary_report.shape[0] + 4)
143 | corr_report.to_excel(writer, sheet_name=sheet_name, index=True, header=True, startcol=0, startrow=summary_report.shape[0] + coef_report.shape[0] + 7)
144 |
145 | writer.save()
146 | writer.close()
147 |
148 | if os.path.exists(excel_name):
149 | workbook = load_workbook(excel_name)
150 | worksheet = workbook.get_sheet_by_name(sheet_name)
151 | worksheet["A1"].value = "逻辑回归模型报告"
152 | worksheet["A1"].alignment = Alignment(horizontal='center', vertical='center')
153 | worksheet.merge_cells(f"A1:L1")
154 |
155 | workbook.save(excel_name)
156 | workbook.close()
157 |
158 | try:
159 | from processing import render_excel # From: https://github.com/itlubber/openpyxl-excel-style-template/blob/main/feature_bins.py
160 | render_excel(excel_name, sheet_name=sheet_name, max_column_width=25, merge_rows=np.cumsum([1, len(summary_report), 2, len(coef_report) + 1, 2, len(corr_report) + 1]).tolist())
161 | except:
162 | pass
163 |
164 |
165 | class ITLubberLogisticRegression(LogisticRegression):
166 | """
167 | Extended Logistic Regression.
168 | Extends [sklearn.linear_model.LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).
169 | This class provides the following extra statistics, calculated on `.fit()` and accessible via `.summary()`:
170 | - `cov_matrix_`: covariance matrix for the estimated parameters.
171 | - `std_err_intercept_`: estimated uncertainty for the intercept
172 | - `std_err_coef_`: estimated uncertainty for the coefficients
173 | - `z_intercept_`: estimated z-statistic for the intercept
174 | - `z_coef_`: estimated z-statistic for the coefficients
175 | - `p_value_intercept_`: estimated p-value for the intercept
176 | - `p_value_coef_`: estimated p-value for the coefficients
177 |
178 | Example:
179 | ```python
180 | feature_pipeline = Pipeline([
181 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
182 | ("combiner", Combiner(target=target, min_samples=0.2)),
183 | ("transform", WOETransformer(target=target)),
184 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")),
185 | ("stepwise", StepwiseSelection(target=target)),
186 | # ("logistic", LogisticClassifier(target=target)),
187 | ("logistic", ITLubberLogisticRegression(target=target)),
188 | ])
189 |
190 | feature_pipeline.fit(train)
191 | summary = feature_pipeline.named_steps['logistic'].summary()
192 | ```
193 |
194 | An example output of `.summary()`:
195 |
196 | | | Coef. | Std.Err | z | P>|z| | [ 0.025 | 0.975 ] | VIF |
197 | |:------------------|----------:|----------:|---------:|------------:|-----------:|----------:|--------:|
198 | | const | -0.844037 | 0.0965117 | -8.74544 | 2.22148e-18 | -1.0332 | -0.654874 | 1.05318 |
199 | | duration.in.month | 0.847445 | 0.248873 | 3.40513 | 0.000661323 | 0.359654 | 1.33524 | 1.14522 |
200 | """
201 |
202 | def __init__(self, target="target", penalty="l2", calculate_stats=True, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver="lbfgs", max_iter=100, multi_class="auto", verbose=0, warm_start=False, n_jobs=None, l1_ratio=None,):
203 | """
204 | Extends [sklearn.linear_model.LogisticRegression.fit()](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html).
205 |
206 | Args:
207 | target (str): your dataset's target name
208 | calculate_stats (bool): If true, calculate statistics like standard error during fit, accessible with .summary()
209 | """
210 | super().__init__(penalty=penalty, dual=dual, tol=tol, C=C, fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight=class_weight, random_state=random_state, solver=solver, max_iter=max_iter, multi_class=multi_class, verbose=verbose, warm_start=warm_start, n_jobs=n_jobs, l1_ratio=l1_ratio,)
211 | self.target = target
212 | self.calculate_stats = calculate_stats
213 |
214 | def fit(self, x, sample_weight=None, **kwargs):
215 | y = x[self.target]
216 | x = x.drop(columns=[self.target])
217 |
218 | if not self.calculate_stats:
219 | return super().fit(x, y, sample_weight=sample_weight, **kwargs)
220 |
221 | x = self.convert_sparse_matrix(x)
222 |
223 | if isinstance(x, pd.DataFrame):
224 | self.names_ = ["const"] + [f for f in x.columns]
225 | else:
226 | self.names_ = ["const"] + [f"x{i}" for i in range(x.shape[1])]
227 |
228 | lr = super().fit(x, y, sample_weight=sample_weight, **kwargs)
229 |
230 | predProbs = self.predict_proba(x)
231 |
232 | # Design matrix -- add column of 1's at the beginning of your x matrix
233 | if lr.fit_intercept:
234 | x_design = np.hstack([np.ones((x.shape[0], 1)), x])
235 | else:
236 | x_design = x
237 |
238 | self.vif = [variance_inflation_factor(np.asarray(x_design), i) for i in range(x_design.shape[-1])]
239 | p = np.prod(predProbs, axis=1)
240 | self.cov_matrix_ = np.linalg.inv((x_design * p[..., np.newaxis]).T @ x_design)
241 | std_err = np.sqrt(np.diag(self.cov_matrix_)).reshape(1, -1)
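# The matrix above is the inverse observed Fisher information (X' W X)^-1 with W = diag(p_i · (1 - p_i));
# its diagonal square roots are the standard errors used for the Wald z statistics and the two-sided
# normal-approximation p-values computed below.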
242 |
243 | # In case fit_intercept is set to True, then in the std_error array
244 | # Index 0 corresponds to the intercept, from index 1 onwards it relates to the coefficients
245 | # If fit intercept is False, then all the values are related to the coefficients
246 | if lr.fit_intercept:
247 |
248 | self.std_err_intercept_ = std_err[:, 0]
249 | self.std_err_coef_ = std_err[:, 1:][0]
250 |
251 | self.z_intercept_ = self.intercept_ / self.std_err_intercept_
252 |
253 | # Get p-values under the gaussian assumption
254 | self.p_val_intercept_ = scipy.stats.norm.sf(abs(self.z_intercept_)) * 2
255 |
256 | else:
257 | self.std_err_intercept_ = np.array([np.nan])
258 | self.std_err_coef_ = std_err[0]
259 |
260 | self.z_intercept_ = np.array([np.nan])
261 |
262 | # Get p-values under the gaussian assumption
263 | self.p_val_intercept_ = np.array([np.nan])
264 |
265 | self.z_coef_ = self.coef_ / self.std_err_coef_
266 | self.p_val_coef_ = scipy.stats.norm.sf(abs(self.z_coef_)) * 2
267 |
268 | return self
269 |
270 | def corr(self, data, save=None, annot=True):
271 | corr = data.drop(columns=[self.target]).corr()
272 |
273 | if save:
274 | self.corr_plot(data.drop(columns=[self.target]), save=save, annot=annot)
275 |
276 | return corr
277 |
278 | @staticmethod
279 | def corr_plot(data, figure_size=(16, 8), fontsize=14, color=["#2639E9", "#F76E6C", "#FE7715"], mask=False, save=None, annot=True):
280 | corr = data.corr()
281 | corr_mask = np.zeros_like(corr, dtype=bool)
282 | corr_mask[np.triu_indices_from(corr_mask)] = True
283 |
284 | map_plot = toad.tadpole.tadpole.heatmap(
285 | corr,
286 | mask = corr_mask if mask else None,
287 | cmap = sns.diverging_palette(267, 267, n=10, s=100, l=40),
288 | vmax = 1,
289 | vmin = -1,
290 | center = 0,
291 | square = True,
292 | linewidths = .1,
293 | annot = annot,
294 | fmt = '.2f',
295 | figure_size = figure_size,
296 | )
297 |
298 | map_plot.tick_params(axis='x', labelrotation=270, labelsize=fontsize)
299 | map_plot.tick_params(axis='y', labelrotation=0, labelsize=fontsize)
300 |
301 | if save:
302 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
303 | os.makedirs(os.path.dirname(save))
304 |
305 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight")
306 |
307 | return map_plot
308 |
309 | def report(self, data):
310 | report_dict = classification_report(data[self.target], self.predict(data.drop(columns=self.target)), output_dict=True, target_names=["好客户", "坏客户"])
311 | accuracy = report_dict.pop("accuracy")
312 | _report = pd.DataFrame(report_dict).T.reset_index().rename(columns={"index": "desc"})
313 | _report.loc[len(_report)] = ['accuracy', '', '', accuracy, len(data)]
314 | return _report
315 |
316 | def summary(self):
317 | """
318 | Puts the summary statistics of the fit() function into a pandas DataFrame.
319 | Returns:
320 | data (pandas DataFrame): The statistics dataframe, indexed by the column name
321 | """
322 | check_is_fitted(self)
323 |
324 | if not hasattr(self, "std_err_coef_"):
325 | msg = "Summary statistics were not calculated on .fit(). Options to fix:\n"
326 | msg += "\t- Re-fit using .fit(X, y, calculate_stats=True)\n"
327 | msg += "\t- Re-inititialize using LogisticRegression(calculate_stats=True)"
328 | raise AssertionError(msg)
329 |
330 | data = {
331 | "Coef.": (self.intercept_.tolist() + self.coef_.tolist()[0]),
332 | "Std.Err": (self.std_err_intercept_.tolist() + self.std_err_coef_.tolist()),
333 | "z": (self.z_intercept_.tolist() + self.z_coef_.tolist()[0]),
334 | "P>|z|": (self.p_val_intercept_.tolist() + self.p_val_coef_.tolist()[0]),
335 | }
336 |
337 | stats = pd.DataFrame(data, index=self.names_)
338 | stats["[ 0.025"] = stats["Coef."] - 1.96 * stats["Std.Err"]
339 | stats["0.975 ]"] = stats["Coef."] + 1.96 * stats["Std.Err"]
340 |
341 | stats["VIF"] = self.vif
342 |
343 | return stats
344 |
345 | @staticmethod
346 | def convert_sparse_matrix(x):
347 | """
348 | Converts a sparse matrix to a numpy array.
349 | This can prevent problems arising from, e.g. OneHotEncoder.
350 | Args:
351 | x: numpy array, sparse matrix
352 | Returns:
353 | numpy array of x
354 | """
355 | if scipy.sparse.issparse(x):
356 | return x.toarray()
357 | else:
358 | return x
359 |
360 | def plot_weights(self, save=None, figsize=(15, 8), fontsize=14, color=["#2639E9", "#F76E6C", "#FE7715"]):
361 | summary = self.summary()
362 |
363 | x = summary["Coef."]
364 | y = summary.index
365 | lower_error = summary["Coef."] - summary["[ 0.025"]
366 | upper_error = summary["0.975 ]"] - summary["Coef."]
367 |
368 | fig, ax = plt.subplots(1, 1, figsize=figsize)
369 | ax.errorbar(x, y, xerr=[lower_error, upper_error], fmt="o", ecolor=color[0], elinewidth=2, capthick=2, capsize=4, ms=6, mfc=color[0], mec=color[0])
370 | # ax.tick_params(axis='x', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize)
371 | # ax.tick_params(axis='y', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize)
372 | ax.axvline(0, color=color[0], linestyle='--', ymax=len(y), alpha=0.5)
373 | ax.spines['top'].set_color(color[0])
374 | ax.spines['bottom'].set_color(color[0])
375 | ax.spines['right'].set_color(color[0])
376 | ax.spines['left'].set_color(color[0])
377 | ax.spines['top'].set_visible(False)
378 | ax.spines['right'].set_visible(False)
379 |
380 | ax.set_title("Regression Meta Analysis - Weight Plot", fontsize=fontsize, fontweight="bold")
381 | ax.set_xlabel("Weight Estimates", fontsize=fontsize, weight="bold")
382 | ax.set_ylabel("Variable", fontsize=fontsize, weight="bold")
383 |
384 | if save:
385 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
386 | os.makedirs(os.path.dirname(save))
387 |
388 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight")
389 |
390 | return fig
391 |
392 | # def plot_weights(self, save=None):
393 | # """
394 | # Generates a weight plot(plotly chart) from `stats`
395 | # Example:
396 | # ```
397 | # pipeline = Pipeline([
398 | # ('clf', LogisticRegression(calculate_stats=True))
399 | # ])
400 | # pipeline.fit(X, y)
401 | # stats = pipeline.named_steps['clf'].plot_weights()
402 | # ```
403 | # Args:
404 | # stats: The statistics to display
405 | # format: The format of the image, such as 'png'. The default None returns a plotly image.
406 | # scale: If format is specified, the scale of the image
407 | # width: If format is specified, the width of the image
408 |     #         height: If format is specified, the height of the image
409 | # """
410 | # stats = self.summary()
411 |
412 | # fig = go.Figure()
413 |
414 | # fig.add_trace(
415 | # go.Scatter(
416 | # x=stats['Coef.'],
417 | # y=stats['Coef.'].index,
418 | # line=dict(color='#2639E9', width=2),
419 | # mode='markers',
420 |
421 | # error_x=dict(
422 | # type='data',
423 | # symmetric=False,
424 | # array=stats['0.975 ]'] - stats['Coef.'],
425 | # arrayminus=stats['Coef.'] - stats['[ 0.025'],
426 | # color='#2639E9')
427 | # )
428 | # )
429 |
430 | # fig.add_shape(type="line",
431 | # x0=0, y0=0, x1=0, y1=len(stats),
432 | # line=dict(color="#a29bfe", width=3, dash='dash')
433 | # )
434 |
435 | # fig.update_layout(
436 | # title='Regression Meta Analysis - Weight Plot',
437 | # xaxis_title='Weight Estimates',
438 | # yaxis_title='Variable',
439 | # xaxis_showgrid=False,
440 | # yaxis_showgrid=False
441 | # )
442 |
443 | # fig.update_layout(template="simple_white")
444 |
445 | # if save:
446 | # write_image(fig, save)
447 |
448 | # return fig
449 |
450 |
451 | class ScoreCard(toad.ScoreCard, TransformerMixin):
452 |
453 | def __init__(self, target="target", pdo=60, rate=2, base_odds=35, base_score=750, combiner={}, transer=None, pretrain_lr=None, pipeline=None, **kwargs):
454 | """
455 | 评分卡模型转换
456 |
457 | Args:
458 | target: 数据集中标签名称,默认 target
459 | pdo: odds 每增加 rate 倍时减少 pdo 分,默认 60
460 | rate: 倍率
461 | base_odds: 基础 odds,通常根据业务经验设置的基础比率(违约概率/正常概率),估算方法:(1-样本坏客户占比)/坏客户占比,默认 35,即 35:1 => 0.972 => 坏样本率 2.8%
462 | base_score: 基础 odds 对应的分数,默认 750
463 | combiner: 分箱转换器,传入 pipeline 时可以为None
464 | transer: woe转换器,传入 pipeline 时可以为None
465 | pretrain_lr: 预训练好的逻辑回归模型,可以不传
466 | pipeline: 训练好的 pipeline,必须包含 Combiner 和 WOETransformer
467 | **kwargs: 其他相关参数,具体参考 toad.ScoreCard
468 | """
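        # Illustrative scaling sketch (comment only; assumes toad's convention factor = pdo / ln(rate),
        # offset = base_score - factor * ln(base_odds), score = offset + factor * ln(good:bad odds)):
        #   with pdo=60, rate=2, base_odds=35, base_score=750
        #   factor ≈ 60 / 0.693 ≈ 86.56, offset ≈ 750 - 86.56 * ln(35) ≈ 442.3
        #   doubling the odds to 70 gives ≈ 442.3 + 86.56 * ln(70) ≈ 810 = 750 + pdo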
469 | if pipeline:
470 | combiner = self.class_steps(pipeline, Combiner)[0]
471 | transer = self.class_steps(pipeline, WOETransformer)[0]
472 |
473 | if self.class_steps(pipeline, (ITLubberLogisticRegression, LogisticRegression)):
474 | pretrain_lr = self.class_steps(pipeline, (ITLubberLogisticRegression, LogisticRegression))[0]
475 |
476 | super().__init__(
477 | combiner=combiner.combiner if isinstance(combiner, Combiner) else combiner, transer=transer.transformer if isinstance(transer, WOETransformer) else transer,
478 | pdo=pdo, rate=rate, base_odds=base_odds, base_score=base_score, **kwargs
479 | )
480 |
481 | self.target = target
482 | self.pipeline = pipeline
483 | self.pretrain_lr = pretrain_lr
484 |
485 | def fit(self, x):
486 | y = x[self.target]
487 | x = x.drop(columns=[self.target])
488 |
489 | self._feature_names = x.columns.tolist()
490 |
491 | for f in self.features_:
492 | if f not in self.transer:
493 | raise Exception('column \'{f}\' is not in transer'.format(f = f))
494 |
495 | if self.pretrain_lr:
496 | self.model = self.pretrain_lr
497 | else:
498 | self.model.fit(x, y)
499 |
500 | self.rules = self._generate_rules()
501 |
502 | sub_score = self.woe_to_score(x)
503 | self.base_effect = pd.Series(np.median(sub_score, axis=0), index = self.features_)
504 |
505 | return self
506 |
507 | def transform(self, x):
508 | return self.predict(x)
509 |
510 | def scorecard_scale(self):
511 | scorecard_kedu = pd.DataFrame(
512 | [
513 | ["base_odds", self.base_odds, "根据业务经验设置的基础比率(违约概率/正常概率),估算方法:(1-样本坏客户占比)/坏客户占比"],
514 | ["base_score", self.base_score, "基础ODDS对应的分数"],
515 | ["rate", self.rate, "设置分数的倍率"],
516 | ["pdo", self.pdo, "表示分数增长PDO时,ODDS值增长到RATE倍"],
517 |                 ["B", self.factor, "刻度,计算方式:pdo / ln(rate)"],
518 |                 ["A", self.offset, "补偿值,计算方式:base_score - B * ln(base_odds)"],
519 | ],
520 | columns=["刻度项", "刻度值", "备注"],
521 | )
522 | return scorecard_kedu
523 |
524 | @staticmethod
525 | def KS_bucket(y_pred, y_true, bucket=10, method="quantile"):
526 | return toad.metrics.KS_bucket(y_pred, y_true, bucket=bucket, method=method)
527 |
528 | @staticmethod
529 | def KS(y_pred, y_true):
530 | return toad.metrics.KS(y_pred, y_true)
531 |
532 | @staticmethod
533 | def AUC(y_pred, y_true):
534 | return toad.metrics.AUC(y_pred, y_true)
535 |
536 | @staticmethod
537 | def perf_eva(y_pred, y_true, title="", plot_type=["ks", "roc"], save=None, figsize=(14, 6)):
538 | # plt.figure(figsize=figsize)
539 | rt = sc.perf_eva(y_true, y_pred, title=title, plot_type=plot_type, show_plot=True)
540 |
541 | if save:
542 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
543 | os.makedirs(os.path.dirname(save))
544 |
545 | rt["pic"].savefig(save, dpi=240, format="png", bbox_inches="tight")
546 |
547 | return rt
548 |
549 | @staticmethod
550 | def ks_plot(score, target, title="", fontsize=14, figsize=(16, 8), save=None, colors=["#2639E9", "#F76E6C", "#FE7715"]):
551 | if np.mean(score) < 0 or np.mean(score) > 1:
552 |             warnings.warn('Since the average of pred is not in [0,1], it is treated as a predicted score rather than a probability.')
553 | score = -score
554 |
555 | df = pd.DataFrame({'label': target, 'pred': score})
556 | def n0(x): return sum(x==0)
557 | def n1(x): return sum(x==1)
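        # KS table: sort by predicted value (descending), then track the cumulative share of
        # good and bad samples captured; KS is the largest absolute gap between the two curves.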
558 | df_ks = df.sort_values('pred', ascending=False).reset_index(drop=True) \
559 | .assign(group=lambda x: np.ceil((x.index+1)/(len(x.index)/len(df.index)))) \
560 | .groupby('group')['label'].agg([n0, n1]) \
561 | .reset_index().rename(columns={'n0':'good','n1':'bad'}) \
562 | .assign(
563 | group=lambda x: (x.index+1)/len(x.index),
564 | cumgood=lambda x: np.cumsum(x.good)/sum(x.good),
565 | cumbad=lambda x: np.cumsum(x.bad)/sum(x.bad)
566 | ).assign(ks=lambda x:abs(x.cumbad-x.cumgood))
567 |
568 | fig, ax = plt.subplots(1, 2, figsize = figsize)
569 |
570 | # KS曲线
571 | dfks = df_ks.loc[lambda x: x.ks==max(x.ks)].sort_values('group').iloc[0]
572 |
573 | ax[0].plot(df_ks.group, df_ks.ks, color=colors[0], label="KS曲线")
574 | ax[0].plot(df_ks.group, df_ks.cumgood, color=colors[1], label="累积好客户占比")
575 | ax[0].plot(df_ks.group, df_ks.cumbad, color=colors[2], label="累积坏客户占比")
576 | ax[0].fill_between(df_ks.group, df_ks.cumbad, df_ks.cumgood, color=colors[0], alpha=0.25)
577 |
578 | ax[0].plot([dfks['group'], dfks['group']], [0, dfks['ks']], 'r--')
579 | ax[0].text(dfks['group'], dfks['ks'], f"KS: {round(dfks['ks'],4)} at: {dfks.group:.2%}", horizontalalignment='center', fontsize=fontsize)
580 |
581 | ax[0].spines['top'].set_color(colors[0])
582 | ax[0].spines['bottom'].set_color(colors[0])
583 | ax[0].spines['right'].set_color(colors[0])
584 | ax[0].spines['left'].set_color(colors[0])
585 | ax[0].set_xlabel('% of Population', fontsize=fontsize)
586 | ax[0].set_ylabel('% of Total Bad / Good', fontsize=fontsize)
587 |
588 | ax[0].set_xlim((0, 1))
589 | ax[0].set_ylim((0, 1))
590 |
591 | handles1, labels1 = ax[0].get_legend_handles_labels()
592 |
593 | ax[0].legend(loc='upper center', ncol=len(labels1), bbox_to_anchor=(0.5, 1.1), frameon=False)
594 |
595 | # ROC 曲线
596 | fpr, tpr, thresholds = roc_curve(target, score)
597 | auc_value = toad.metrics.AUC(score, target)
598 |
599 | ax[1].plot(fpr, tpr, color=colors[0], label="ROC Curve")
600 | ax[1].stackplot(fpr, tpr, color=colors[0], alpha=0.25)
601 | ax[1].plot([0, 1], [0, 1], color=colors[1], lw=2, linestyle=':')
602 | # ax[1].tick_params(axis='x', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize)
603 | # ax[1].tick_params(axis='y', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize)
604 | ax[1].text(0.5, 0.5, f"AUC: {auc_value:.4f}", fontsize=fontsize, horizontalalignment="center", transform=ax[1].transAxes)
605 |
606 | ax[1].spines['top'].set_color(colors[0])
607 | ax[1].spines['bottom'].set_color(colors[0])
608 | ax[1].spines['right'].set_color(colors[0])
609 | ax[1].spines['left'].set_color(colors[0])
610 | ax[1].set_xlabel("False Positive Rate", fontsize=fontsize)
611 | ax[1].set_ylabel('True Positive Rate', fontsize=fontsize)
612 |
613 | ax[1].set_xlim((0, 1))
614 | ax[1].set_ylim((0, 1))
615 |
616 | ax[1].yaxis.tick_right()
617 | ax[1].yaxis.set_label_position("right")
618 |
619 | handles2, labels2 = ax[1].get_legend_handles_labels()
620 |
621 | ax[1].legend(loc='upper center', ncol=len(labels2), bbox_to_anchor=(0.5, 1.1), frameon=False)
622 |
623 | if title: title += " "
624 | fig.suptitle(f"{title}K-S & ROC CURVE\n", fontsize=fontsize, fontweight="bold")
625 |
626 | plt.tight_layout()
627 |
628 | if save:
629 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
630 | os.makedirs(os.path.dirname(save))
631 |
632 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight")
633 |
634 | return fig
635 |
636 | @staticmethod
637 | def PSI(y_pred_train, y_pred_oot):
638 | return toad.metrics.PSI(y_pred_train, y_pred_oot)
639 |
640 | @staticmethod
641 | def perf_psi(y_pred_train, y_pred_oot, y_true_train, y_true_oot, keys=["train", "test"], x_limits=None, x_tick_break=50, show_plot=True, return_distr_dat=False):
642 | return sc.perf_psi(
643 | score = {keys[0]: y_pred_train, keys[1]: y_pred_oot},
644 | label = {keys[0]: y_true_train, keys[1]: y_true_oot},
645 | x_limits = x_limits,
646 | x_tick_break = x_tick_break,
647 | show_plot = show_plot,
648 | return_distr_dat = return_distr_dat,
649 | )
650 |
651 | @staticmethod
652 | def score_hist(score, y_true, figsize=(15, 10), bins=20, alpha=1, save=None):
653 | fig, ax = plt.subplots(1, 1, figsize = figsize)
654 | palette = sns.diverging_palette(340, 267, n=2, s=100, l=40)
655 |
656 | sns.histplot(
657 | x=score, hue=y_true.replace({0: "good", 1: "bad"}), element="step", stat="density", bins=bins, common_bins=True, common_norm=True, palette=palette, ax=ax
658 | )
659 |
660 | sns.despine()
661 |
662 | ax.spines['top'].set_color("#2639E9")
663 | ax.spines['bottom'].set_color("#2639E9")
664 | ax.spines['right'].set_color("#2639E9")
665 | ax.spines['left'].set_color("#2639E9")
666 |
667 | ax.set_xlabel("score")
668 | ax.set_ylabel("density")
669 |
670 | ax.legend(["坏样本", "好样本"], loc='upper center', ncol=len(y_true.unique()), bbox_to_anchor=(0.5, 1.05), frameon=False, fontsize=14)
671 |
672 | fig.tight_layout()
673 |
674 | if save:
675 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)):
676 | os.makedirs(os.path.dirname(save))
677 |
678 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight")
679 |
680 | return fig
681 |
682 | def _format_rule(self, rule, decimal = 2, **kwargs):
683 | bins = self.format_bins(rule['bins'])
684 | scores = np.around(rule['scores'], decimals = decimal).tolist()
685 |
686 | return dict(zip(bins, scores))
687 |
688 | @staticmethod
689 | def class_steps(pipeline, query):
690 | return [v for k, v in pipeline.named_steps.items() if isinstance(v, query)]
691 |
692 | @staticmethod
693 | def round_float(num, decimal = 4):
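        # Truncates (rather than rounds) a float to `decimal` places,
        # e.g. round_float(3.141592) -> 3.1415; non-float / NaN inputs are returned unchanged.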
694 |         if not pd.isnull(num) and isinstance(num, float):
695 | return float(str(num).split(".")[0] + "." + str(num).split(".")[1][:decimal])
696 | else:
697 | return num
698 |
699 | def feature_bins(self, bins, decimal = 4):
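        # Maps split points to readable bin labels, e.g. (illustrative):
        # feature_bins(np.array([0.5, 1.5, np.nan])) ->
        #   {0: '[负无穷 , 0.5)', 1: '[0.5 , 1.5)', 2: '[1.5 , 正无穷)', 3: '缺失值'}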
700 | if isinstance(bins, list): bins = np.array(bins)
701 | EMPTYBINS = len(bins) if not isinstance(bins[0], (set, list, np.ndarray)) else -1
702 |
703 | l = []
704 | if np.issubdtype(bins.dtype, np.number):
705 | has_empty = len(bins) > 0 and np.isnan(bins[-1])
706 | if has_empty: bins = bins[:-1]
707 | sp_l = ["负无穷"] + [self.round_float(b, decimal=decimal) for b in bins.tolist()] + ["正无穷"]
708 | for i in range(len(sp_l) - 1): l.append('['+str(sp_l[i])+' , '+str(sp_l[i+1])+')')
709 | if has_empty: l.append('缺失值')
710 | else:
711 | for keys in bins:
712 | keys_update = set()
713 | for key in keys:
714 | if pd.isnull(key) or key == "nan":
715 | keys_update.add("缺失值")
716 | elif key.strip() == "":
717 | keys_update.add("空字符串")
718 | else:
719 | keys_update.add(key)
720 | label = ','.join(keys_update)
721 | l.append(label)
722 |
723 | return {i if b != "缺失值" else EMPTYBINS: b for i, b in enumerate(l)}
724 |
725 | def feature_bin_stats(self, data, feature, target="target", rules={}, empty_separate=True, method='step', max_n_bins=10, clip_v=None, desc="评分卡分数", verbose=0, combiner=None, ks=False):
726 | if method not in ['dt', 'chi', 'quantile', 'step', 'kmeans', 'cart']:
727 |             raise ValueError("method must be one of ['dt', 'chi', 'quantile', 'step', 'kmeans', 'cart']")
728 |
729 | if combiner is None:
730 | combiner = toad.transform.Combiner()
731 |
732 | if method == "cart":
733 | x = data[feature].values
734 | y = data[target]
735 | _combiner = OptimalBinning(feature, dtype="numerical", max_n_bins=max_n_bins, monotonic_trend="auto_asc_desc", gamma=0.01).fit(x, y)
736 | if _combiner.status == "OPTIMAL":
737 | rules.update({feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.splits] + [np.nan]})
738 | else:
739 | if method == "step":
740 | combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, n_bins=max_n_bins, clip_v=clip_v)
741 | else:
742 | combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, n_bins=max_n_bins)
743 |
744 | if verbose > 0:
745 | print(data[feature].describe())
746 |
747 | if rules and isinstance(rules, list): rules = {feature: rules}
748 | if rules and isinstance(rules, dict): combiner.update(rules)
749 |
750 | feature_bin = combiner.export()[feature]
751 | feature_bin_dict = self.feature_bins(np.array(feature_bin))
752 |
753 | df_bin = combiner.transform(data[[feature, target]], labels=False)
754 |
755 | table = df_bin[[feature, target]].groupby([feature, target]).agg(len).unstack()
756 | table.columns.name = None
757 | table = table.rename(columns = {0 : '好样本数', 1 : '坏样本数'}).fillna(0)
758 | if "好样本数" not in table.columns:
759 | table["好样本数"] = 0
760 | if "坏样本数" not in table.columns:
761 | table["坏样本数"] = 0
762 |
763 | table["指标名称"] = feature
764 | table["指标含义"] = desc
765 | table = table.reset_index().rename(columns={feature: "分箱"})
766 |
767 | table['样本总数'] = table['好样本数'] + table['坏样本数']
768 | table['样本占比'] = table['样本总数'] / table['样本总数'].sum()
769 | table['好样本占比'] = table['好样本数'] / table['好样本数'].sum()
770 | table['坏样本占比'] = table['坏样本数'] / table['坏样本数'].sum()
771 | table['坏样本率'] = table['坏样本数'] / table['样本总数']
772 |
773 | table = table.fillna(0.)
774 |
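        # Per-bin WOE and IV (a 1e-6 epsilon guards against division by zero):
        #   WOE_i = ln(good%_i / bad%_i),  IV = Σ_i (good%_i - bad%_i) · WOE_i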
775 | table['分档WOE值'] = table.apply(lambda x : np.log(x['好样本占比'] / (x['坏样本占比'] + 1e-6)),axis=1)
776 | table['分档IV值'] = table.apply(lambda x : (x['好样本占比'] - x['坏样本占比']) * np.log(x['好样本占比'] / (x['坏样本占比'] + 1e-6)), axis=1)
777 |
778 | table = table.replace(np.inf, 0).replace(-np.inf, 0)
779 |
780 | table['指标IV值'] = table['分档IV值'].sum()
781 |
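        # LIFT_i = bin bad rate / overall bad rate; the cumulative LIFT uses counts
        # accumulated from the first bin down to the current one.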
782 | table["LIFT值"] = table['坏样本率'] / (table["坏样本数"].sum() / table["样本总数"].sum())
783 | table["累积LIFT值"] = (table['坏样本数'].cumsum() / table['样本总数'].cumsum()) / (table["坏样本数"].sum() / table["样本总数"].sum())
784 | # table["累积LIFT值"] = table["LIFT值"].cumsum()
785 |
786 | if ks:
787 | table = table.sort_values("分箱")
788 | table["累积好样本数"] = table["好样本数"].cumsum()
789 | table["累积坏样本数"] = table["坏样本数"].cumsum()
790 | table["分档KS值"] = table["累积坏样本数"] / table['坏样本数'].sum() - table["累积好样本数"] / table['好样本数'].sum()
791 |
792 | table["分箱"] = table["分箱"].map(feature_bin_dict)
793 | table = table.set_index(['指标名称', '指标含义', '分箱']).reindex([(feature, desc, b) for b in feature_bin_dict.values()]).fillna(0).reset_index()
794 |
795 | if ks:
796 | return table[['指标名称', "指标含义", '分箱', '样本总数', '样本占比', '好样本数', '好样本占比', '坏样本数', '坏样本占比', '坏样本率', '分档WOE值', '分档IV值', '指标IV值', 'LIFT值', '累积LIFT值', '累积好样本数', '累积坏样本数', '分档KS值']]
797 | else:
798 | return table[['指标名称', "指标含义", '分箱', '样本总数', '样本占比', '好样本数', '好样本占比', '坏样本数', '坏样本占比', '坏样本率', '分档WOE值', '分档IV值', '指标IV值', 'LIFT值', '累积LIFT值']]
799 |
800 |
801 | if __name__ == '__main__':
802 | # https://github.com/itlubber/openpyxl-excel-style-template/blob/main/pipeline_model.py
803 | plt.ion()
804 |
805 | target = "creditability"
806 | data = sc.germancredit()
807 | data[target] = data[target].map({"good": 0, "bad": 1})
808 |
809 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target])
810 | oot = data.copy()
811 | feature_pipeline = Pipeline([
812 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")),
813 | ("combiner", Combiner(target=target, min_samples=0.2)),
814 | ("transform", WOETransformer(target=target)),
815 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")),
816 | ("stepwise", StepwiseSelection(target=target)),
817 | ])
818 |
819 | feature_pipeline.fit(train)
820 |
821 | woe_train = feature_pipeline.transform(train)
822 | woe_test = feature_pipeline.transform(test)
823 | woe_oot = feature_pipeline.transform(oot)
824 |
825 | # save all bin_plot
826 | _combiner = feature_pipeline.named_steps["combiner"]
827 | for col in woe_train.columns:
828 | if col != target:
829 | _combiner.bin_plot(train, col, labels=True, save=f"outputs/bin_plots/train_{col}.png")
830 | _combiner.bin_plot(test, col, labels=True, save=f"outputs/bin_plots/test_{col}.png")
831 | _combiner.bin_plot(oot, col, labels=True, save=f"outputs/bin_plots/oot_{col}.png")
832 |
833 | # logistic = StatsLogisticRegression(target=target)
834 | logistic = ITLubberLogisticRegression(target=target)
835 |
836 | logistic.fit(woe_train)
837 |
838 | y_pred_train = logistic.predict_proba(woe_train.drop(columns=target))[:, 1]
839 | y_pred_test = logistic.predict_proba(woe_test.drop(columns=target))[:, 1]
840 | y_pred_oot = logistic.predict_proba(woe_oot.drop(columns=target))[:, 1]
841 |
842 | # params_grid = {
843 | # # "logistic__C": [i / 1. for i in range(1, 10, 2)],
844 | # # "logistic__penalty": ["l2"],
845 | # # "logistic__class_weight": [None, "balanced"], # + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10)],
846 | # # "logistic__max_iter": [100],
847 | # # "logistic__solver": ["sag"] # ["liblinear", "sag", "lbfgs", "newton-cg"],
848 | # "logistic__intercept": [True, False],
849 | # }
850 |
851 | # clf = GridSearchCV(feature_pipeline, params_grid, cv=5, scoring='roc_auc', verbose=-1, n_jobs=2, return_train_score=True)
852 | # clf.fit(train, train[target])
853 |
854 | # y_pred_train = clf.best_estimator_.predict(train)
855 | # y_pred_test = clf.best_estimator_.predict(test)
856 |
857 | # print(clf.best_params_)
858 |
859 | # model summary
860 | # logistic.summary_save()
861 |
862 | logistic.plot_weights(save="outputs/logistic_train.png")
863 |
864 | summary = logistic.summary().reset_index().rename(columns={"index": "Features"})
865 |
866 | train_corr = logistic.corr(woe_train, save="outputs/train_corr.png")
867 | test_corr = logistic.corr(woe_test, save="outputs/test_corr.png")
868 | oot_corr = logistic.corr(woe_oot, save="outputs/oot_corr.png")
869 |
870 | train_report = logistic.report(woe_train)
871 | test_report = logistic.report(woe_test)
872 | oot_report = logistic.report(woe_oot)
873 |
874 | print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target]))
875 | print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target]))
876 | print("oot: ", toad.metrics.KS(y_pred_oot, oot[target]), toad.metrics.AUC(y_pred_oot, oot[target]))
877 |
878 | card = ScoreCard(target=target, pipeline=feature_pipeline, pretrain_lr=logistic)
879 | card.fit(woe_train)
880 |
881 | train["score"] = card.predict(train)
882 | test["score"] = card.predict(test)
883 | oot["score"] = card.predict(oot)
884 |
885 | card.perf_eva(train["score"], train[target], title="Train Dataset", save="outputs/train_ksplot.png")
886 | card.perf_eva(test["score"], test[target], title="Test Dataset", save="outputs/test_ksplot.png")
887 | card.perf_eva(oot["score"], oot[target], title="OOT Dataset", save="outputs/oot_ksplot.png")
888 |
889 | card.score_hist(train["score"], train[target], save="outputs/train_scorehist.png")
890 | card.score_hist(test["score"], test[target], save="outputs/test_scorehist.png")
891 | card.score_hist(oot["score"], oot[target], save="outputs/oot_scorehist.png")
892 |
893 | train_score_rank = card.feature_bin_stats(train, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
894 | test_score_rank = card.feature_bin_stats(test, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
895 | oot_score_rank = card.feature_bin_stats(oot, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")
896 |
897 | card_points = card.export(to_frame=True)
898 |
899 | writer = pd.ExcelWriter("outputs/评分卡结果验证表.xlsx", engine="openpyxl")
900 |
901 | summary.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=1, index=False)
902 | train_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + 5, index=False)
903 | test_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + 9, index=False)
904 | oot_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + len(test_report) + 13, index=False)
905 |
906 | worksheet = writer.sheets['逻辑回归拟合结果']
907 | worksheet.cell(row=1, column=1).value = "入模变量系数及相关统计指标"
908 | worksheet.cell(row=len(summary) + 5, column=1).value = "训练数据集模型预测报告"
909 | worksheet.cell(row=len(summary) + len(train_report) + 9, column=1).value = "测试数据集模型预测报告"
910 | worksheet.cell(row=len(summary) + len(train_report) + len(test_report) + 13, column=1).value = "跨时间验证集模型预测报告"
911 |
912 | train_corr.to_excel(writer, sheet_name="入模变量相关性", startrow=1, index=True)
913 | test_corr.to_excel(writer, sheet_name="入模变量相关性", startrow=len(train_corr) + 5, index=True)
914 | oot_corr.to_excel(writer, sheet_name="入模变量相关性", startrow=len(train_corr) + len(test_corr) + 9, index=True)
915 |
916 | worksheet = writer.sheets['入模变量相关性']
917 | worksheet.cell(row=2, column=1).value = "训练数据集入模变量相关性"
918 | worksheet.cell(row=len(train_corr) + 6, column=1).value = "测试数据集入模变量相关性"
919 | worksheet.cell(row=len(train_corr) + len(test_corr) + 10, column=1).value = "跨时间验证集入模变量相关性"
920 |
921 | card_points.to_excel(writer, sheet_name="评分卡", index=False)
922 |
923 | train_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=1, index=False)
924 | test_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + 5, index=False)
925 | oot_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + len(test_score_rank) + 9, index=False)
926 |
927 | worksheet = writer.sheets['评分卡排序性']
928 |
929 | worksheet.cell(row=1, column=1).value = "训练数据集评分排序性"
930 | worksheet.cell(row=len(train_score_rank) + 5, column=1).value = "测试数据集评分排序性"
931 | worksheet.cell(row=len(train_score_rank) + len(test_score_rank) + 9, column=1).value = "跨时间验证集评分排序性"
932 |
933 | writer.close()
934 |
935 | from utils.tools import render_excel
936 |
937 | render_excel("outputs/评分卡结果验证表.xlsx", border=False)
938 |
939 |
--------------------------------------------------------------------------------