├── utils ├── 报告输出模版.xlsx ├── matplot_chinese.ttf ├── __init__.py ├── perf_eva.py ├── excel_writer.py └── tools.py ├── requertments.txt ├── clear_cache.sh ├── LICENSE ├── .gitignore ├── README.md ├── tree_ming.py ├── rules_auto_mining.py ├── processing.py ├── main.py └── model.py /utils/报告输出模版.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itlubber/LogisticRegressionPipeline/HEAD/utils/报告输出模版.xlsx -------------------------------------------------------------------------------- /requertments.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy<1.20 3 | ortools>=9.4 4 | ropwr>=0.4.0 5 | scikit-learn>=1.0.2 6 | scipy>=1.6.0 7 | -------------------------------------------------------------------------------- /utils/matplot_chinese.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itlubber/LogisticRegressionPipeline/HEAD/utils/matplot_chinese.ttf -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2023/2/14 09:08 4 | @Author : itlubber 5 | @Site : itlubber.art 6 | """ 7 | -------------------------------------------------------------------------------- /clear_cache.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYSTRING="$(find . | grep -E "(__pycache__|\.pyc|\.pyo$)")" 4 | IPYNBSTRING="$(find . | grep -E "(ipynb_checkpoints|\.ipynb$)")" 5 | 6 | # 删除 __pycache__ 缓存文件 7 | if [ -n "$PYSTRING" ]; then 8 | echo "删除以下缓存文件 :" 9 | echo "-----------------------------------------------------" 10 | echo "$PYSTRING" 11 | echo "-----------------------------------------------------" 12 | find . | grep -E "(__pycache__|\.pyc|\.pyo$)" | xargs rm -rf 13 | else 14 | echo "不存在 __pycache__ 缓存文件" 15 | fi 16 | 17 | # # 删除 ipynb_checkpoints 缓存文件 18 | # if [ -n "$IPYNBSTRING" ]; then 19 | # echo "删除以下缓存文件 :" 20 | # echo "-----------------------------------------------------" 21 | # echo "$IPYNBSTRING" 22 | # echo "-----------------------------------------------------" 23 | # find . | grep -E "(ipynb_checkpoints|\.ipynb$)" | xargs rm -rf 24 | # else 25 | # echo "不存在 ipynb_checkpoints 缓存文件" 26 | # fi -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 itlubber 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | *.ipynb 131 | *.zip 132 | .DS_store 133 | catboost_info/ 134 | test.py 135 | .idea 136 | .vscode -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 可用于 `超参数搜索` & `pipeline` 的逻辑回归 2 | 3 | ## 交流 4 | 5 | 6 | 7 | 10 | 13 | 14 | 15 | 18 | 21 | 22 |
8 | 微信: itlubber
11 | 微信公众号: itlubber_art
16 | itlubber.png
19 | itlubber_art.png
23 | 24 | ## 概述 25 | 26 | 分别基于 `statsmodels` 和 `scikit-learn` 实现两种可用于 `sklearn pipeline` 的 `LogisticRegression`,并输出相应的报告,效果如下: 27 | 28 | > 基于 `statsmodels` 的 `StatsLogisticRegression` 29 | 30 | 31 | 32 | 33 | 34 | 35 | > 基于 `sklearn` 的 `ITLubberLogisticRegression` 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | ## 使用方法 45 | 46 | ```python 47 | target = "creditability" 48 | data = sc.germancredit() 49 | data[target] = data[target].map({"good": 0, "bad": 1}) 50 | 51 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target]) 52 | oot = data.copy() 53 | feature_pipeline = Pipeline([ 54 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")), 55 | ("combiner", Combiner(target=target, min_samples=0.2)), 56 | ("transform", WOETransformer(target=target)), 57 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")), 58 | ("stepwise", StepwiseSelection(target=target)), 59 | ]) 60 | 61 | feature_pipeline.fit(train) 62 | 63 | woe_train = feature_pipeline.transform(train) 64 | woe_test = feature_pipeline.transform(test) 65 | woe_oot = feature_pipeline.transform(oot) 66 | 67 | # logistic = StatsLogisticRegression(target=target) 68 | logistic = ITLubberLogisticRegression(target=target) 69 | 70 | logistic.fit(woe_train) 71 | 72 | y_pred_train = logistic.predict_proba(woe_train.drop(columns=target))[:, 1] 73 | y_pred_test = logistic.predict_proba(woe_test.drop(columns=target))[:, 1] 74 | y_pred_oot = logistic.predict_proba(woe_oot.drop(columns=target))[:, 1] 75 | 76 | # params_grid = { 77 | # # "logistic__C": [i / 1. for i in range(1, 10, 2)], 78 | # # "logistic__penalty": ["l2"], 79 | # # "logistic__class_weight": [None, "balanced"], # + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10)], 80 | # # "logistic__max_iter": [100], 81 | # # "logistic__solver": ["sag"] # ["liblinear", "sag", "lbfgs", "newton-cg"], 82 | # "logistic__intercept": [True, False], 83 | # } 84 | 85 | # clf = GridSearchCV(feature_pipeline, params_grid, cv=5, scoring='roc_auc', verbose=-1, n_jobs=2, return_train_score=True) 86 | # clf.fit(train, train[target]) 87 | 88 | # y_pred_train = clf.best_estimator_.predict(train) 89 | # y_pred_test = clf.best_estimator_.predict(test) 90 | 91 | # print(clf.best_params_) 92 | 93 | # model summary 94 | # logistic.summary_save() 95 | # logistic.plot_weights(save="logistic_train.png") 96 | summary = logistic.summary().reset_index().rename(columns={"index": "Features"}) 97 | 98 | train_report = logistic.report(woe_train) 99 | test_report = logistic.report(woe_test) 100 | oot_report = logistic.report(woe_oot) 101 | 102 | print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target])) 103 | print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target])) 104 | print("oot: ", toad.metrics.KS(y_pred_oot, oot[target]), toad.metrics.AUC(y_pred_oot, oot[target])) 105 | 106 | card = ScoreCard(target=target, pipeline=feature_pipeline, pretrain_lr=logistic) 107 | card.fit(woe_train) 108 | 109 | train["score"] = card.predict(train) 110 | test["score"] = card.predict(test) 111 | oot["score"] = card.predict(oot) 112 | 113 | # print(card.feature_bin_stats(train, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step")) 114 | # print(card.feature_bin_stats(train, "score", target=target, verbose=0, method="cart")) 115 | 116 | train_score_rank = card.feature_bin_stats(train, "score", target=target, rules=[i for i in 
range(400, 800, 50)], verbose=0, method="step") 117 | test_score_rank = card.feature_bin_stats(test, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step") 118 | oot_score_rank = card.feature_bin_stats(oot, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step") 119 | 120 | writer = pd.ExcelWriter("评分卡结果验证表.xlsx", engine="openpyxl") 121 | 122 | summary.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=1, index=False) 123 | train_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + 5, index=False) 124 | test_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + 9, index=False) 125 | oot_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + len(test_report) + 13, index=False) 126 | 127 | worksheet = writer.sheets['逻辑回归拟合结果'] 128 | worksheet.cell(row=1, column=1).value = "入模变量系数及相关统计指标" 129 | worksheet.cell(row=len(summary) + 5, column=1).value = "训练数据集模型预测报告" 130 | worksheet.cell(row=len(summary) + len(train_report) + 9, column=1).value = "测试数据集模型预测报告" 131 | worksheet.cell(row=len(summary) + len(train_report) + len(test_report) + 13, column=1).value = "跨时间验证集模型预测报告" 132 | 133 | train_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=1, index=False) 134 | test_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + 5, index=False) 135 | oot_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + len(test_score_rank) + 9, index=False) 136 | 137 | worksheet = writer.sheets['评分卡排序性'] 138 | 139 | worksheet.cell(row=1, column=1).value = "训练数据集评分排序性" 140 | worksheet.cell(row=len(train_score_rank) + 5, column=1).value = "测试数据集评分排序性" 141 | worksheet.cell(row=len(train_score_rank) + len(test_score_rank) + 9, column=1).value = "跨时间验证集评分排序性" 142 | 143 | writer.close() 144 | 145 | from utils import render_excel 146 | 147 | render_excel("评分卡结果验证表.xlsx", border=False) 148 | ``` 149 | 150 | 151 | ## 参考 152 | 153 | > https://github.com/ing-bank/skorecard/blob/main/skorecard/linear_model/linear_model.py 154 | > 155 | > https://github.com/itlubber/openpyxl-excel-style-template/blob/main/pipeline_model.py 156 | > -------------------------------------------------------------------------------- /tree_ming.py: -------------------------------------------------------------------------------- 1 | import os 2 | import graphviz 3 | import warnings 4 | import numpy as np 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | from matplotlib import font_manager 8 | import dtreeviz 9 | 10 | import category_encoders as ce 11 | from sklearn.preprocessing import LabelEncoder 12 | from sklearn.tree import _tree, DecisionTreeClassifier, plot_tree, export_graphviz 13 | 14 | 15 | warnings.filterwarnings("ignore") 16 | pd.set_option('display.width', 5000) 17 | plt.style.use('seaborn-ticks') 18 | plt.rcParams["font.sans-serif"]=["SimHei"] 19 | plt.rcParams["axes.unicode_minus"]=False 20 | 21 | 22 | def get_dt_rules(tree, feature_names, total_bad_rate, total_count): 23 | tree_ = tree.tree_ 24 | left = tree.tree_.children_left 25 | right = tree.tree_.children_right 26 | feature_name = [feature_names[i] if i != -2 else "undefined!" 
for i in tree_.feature] 27 | rules=dict() 28 | 29 | global res_df 30 | res_df = pd.DataFrame() 31 | 32 | def recurse(node, depth, parent): # 搜每个节点的规则 33 | 34 | if tree_.feature[node] != -2: # 非叶子节点,搜索每个节点的规则 35 | name = feature_name[node] 36 | thd = np.round(tree_.threshold[node],3) 37 | s= "{} <= {} ".format( name, thd, node ) 38 | # 左子 39 | if node == 0: 40 | rules[node]=s 41 | else: 42 | rules[node]=rules[parent]+' & ' +s 43 | recurse(left[node], depth + 1, node) 44 | s="{} > {}".format(name, thd) 45 | # 右子 46 | if node == 0: 47 | rules[node]=s 48 | else: 49 | rules[node]=rules[parent]+' & ' +s 50 | recurse(right[node], depth + 1, node) 51 | else: 52 | df = pd.DataFrame() 53 | df['组合策略'] = rules[parent], 54 | df['好样本数'] = tree_.value[node][0][0].astype(int) 55 | df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate)) 56 | df['坏样本数'] = tree_.value[node][0][1].astype(int) 57 | df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate) 58 | df['命中数'] = df['好样本数'] + df['坏样本数'] 59 | df['命中率'] = df['命中数'] / total_count 60 | df['坏率'] = df['坏样本数'] / df['命中数'] 61 | df['样本整体坏率'] = total_bad_rate 62 | df['LIFT值'] = df['坏率'] / df['样本整体坏率'] 63 | 64 | global res_df 65 | 66 | res_df = pd.concat([res_df, df], 0) 67 | 68 | recurse(0, 1, 0) 69 | 70 | return res_df.sort_values("LIFT值", ascending=True).reset_index(drop=True) 71 | 72 | 73 | def dtreeviz_plot(tree, X_TE, y, target="target", save=None): 74 | viz_model = dtreeviz.model(tree, 75 | X_train=X_TE, y_train=y, 76 | feature_names=X_TE.columns, 77 | target_name=target, class_names=["GOOD", f"BAD"]) 78 | viz = viz_model.view( 79 | scale=1.5, 80 | orientation='LR', 81 | colors={ 82 | "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]], 83 | "arrow": "#2639E9", 84 | 'text_wedge': "#F76E6C", 85 | "pie": "#2639E9", 86 | "tile_alpha": 1, 87 | "legend_edge": "#FFFFFF", 88 | }, 89 | ticks_fontsize=10, 90 | label_fontsize=10, 91 | ) 92 | 93 | # viz = dtreeviz.model( 94 | # decision_tree, 95 | # X_TE, 96 | # y, 97 | # # title="DecisionTreeClassifier", 98 | # # title_fontsize=10, 99 | # ticks_fontsize=10, 100 | # label_fontsize=10, 101 | # target_name=target, 102 | # feature_names=X_TE.columns, 103 | # class_names=["good", "bad"], 104 | # orientation='LR', 105 | # scale=1.5, 106 | # colors={ 107 | # "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]], 108 | # "arrow": "#2639E9", 109 | # 'text_wedge': "#F76E6C", 110 | # "pie": "#2639E9", 111 | # "tile_alpha": 1, 112 | # "legend_edge": "#FFFFFF", 113 | # }, 114 | # ) 115 | 116 | if save: 117 | viz.save(save) 118 | 119 | return viz 120 | 121 | 122 | if __name__ == '__main__': 123 | import scorecardpy as sc 124 | 125 | target = "creditability" 126 | data = sc.germancredit() 127 | data[target] = data[target].map({"good": 0, "bad": 1}) 128 | 129 | cat_features = list(set(data.select_dtypes(include=[object, pd.CategoricalDtype]).columns) - set([target])) 130 | cat_features_index = [i for i, f in enumerate(data.columns) if f in cat_features] 131 | 132 | X = data.drop(columns=[target]) 133 | y = data[target] 134 | 135 | target_enc = ce.TargetEncoder(cols=cat_features) 136 | target_enc.fit(X[cat_features], y) 137 | 138 | X_TE = X.join(target_enc.transform(X[cat_features]).add_suffix('_target')) 139 | 140 | target_enc.target_mapping = {} 141 | for col in cat_features: 142 | mapping = X_TE[[col, f"{col}_target"]].drop_duplicates() 143 | target_enc.target_mapping[col] = dict(zip(mapping[col], mapping[f"{col}_target"])) 144 | 145 | X_TE 
= X_TE.drop(columns=cat_features) 146 | X_TE = X_TE.rename(columns={f"{c}_target": c for c in cat_features}) 147 | 148 | removes = [] 149 | dt_rules = pd.DataFrame() 150 | 151 | for i in range(128): 152 | decision_tree = DecisionTreeClassifier(max_depth=2, min_samples_split=8, min_samples_leaf=5, max_features="auto") 153 | decision_tree = decision_tree.fit(X_TE, y) 154 | 155 | if decision_tree.score(X_TE, y) < 0.8: 156 | break 157 | 158 | rules = get_dt_rules(decision_tree, X_TE.columns, sum(y) / len(y), len(y)) 159 | viz_model = dtreeviz.model(decision_tree, 160 | X_train=X_TE, y_train=y, 161 | feature_names=X_TE.columns, 162 | target_name=target, class_names=["DPD 0", f"DPD {dpd}+"]) 163 | 164 | rules = rules.query("LIFT值 > 4 & 命中率 < 0.1") 165 | 166 | if len(rules) > 0: 167 | print("/" * 150) 168 | rules["组合策略"] = rules["组合策略"].replace(feature_map, regex=True) 169 | display(rules) 170 | c = viz_model.view( 171 | scale=1.5, 172 | orientation='LR', 173 | colors={ 174 | "classes": [None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]], 175 | "arrow": "#2639E9", 176 | 'text_wedge': "#F76E6C", 177 | "pie": "#2639E9", 178 | "tile_alpha": 1, 179 | "legend_edge": "#FFFFFF", 180 | }, 181 | ticks_fontsize=10, 182 | label_fontsize=10, 183 | ) 184 | display(c) 185 | 186 | dt_rules = pd.concat([dt_rules, rules]).reset_index(drop=True) 187 | removes.append(decision_tree.feature_names_in_[list(decision_tree.feature_importances_).index(max(decision_tree.feature_importances_))]) 188 | X_TE = X_TE.drop(columns=removes[-1]) 189 | print("-" * 150) 190 | 191 | pd.set_option('display.max_row', None) 192 | dt_rules.sort_values(["LIFT值", "命中率"], ascending=False) 193 | 194 | # decision_tree = DecisionTreeClassifier(random_state=0, max_depth=2) 195 | # decision_tree = decision_tree.fit(X_TE, y) 196 | 197 | # rules = get_dt_rules(decision_tree, X_TE.columns, sum(y) / len(y), len(y)) 198 | 199 | # dtreeviz_plot(decision_tree, X_TE, y, save="decision_tree.svg") 200 | # rules.to_excel("组合策略挖掘.xlsx") 201 | 202 | # dot_data = export_graphviz(decision_tree, feature_names=X_TE.columns, class_names=True, filled=True, rounded=False, out_file=None) 203 | # graph = graphviz.Source(dot_data) 204 | 205 | # graph.render("组合策略挖掘") 206 | -------------------------------------------------------------------------------- /rules_auto_mining.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cairosvg 3 | import graphviz 4 | import dtreeviz 5 | import warnings 6 | import numpy as np 7 | import pandas as pd 8 | 9 | import category_encoders as ce 10 | from sklearn.preprocessing import LabelEncoder 11 | from sklearn.tree import _tree, DecisionTreeClassifier, plot_tree, export_graphviz 12 | 13 | 14 | warnings.filterwarnings("ignore") 15 | 16 | 17 | class ParseDecisionTreeRules: 18 | 19 | def __init__(self, target="target", labels=["positive", "negative"], feature_map={}, nan=-1., max_iter=128, output="model_report/auto_mining_rules/决策树组合策略挖掘.xlsx", writer=None): 20 | self.target = target 21 | self.labels = labels 22 | self.feature_map = feature_map 23 | self.nan = nan 24 | self.max_iter = max_iter 25 | self.output = output 26 | self.decision_trees = [] 27 | self.target_enc = None 28 | self.feature_names = None 29 | self.dt_rules = pd.DataFrame() 30 | self.end_row = 2 31 | self.start_col = 2 32 | self.describe_columns = ["组合策略", "命中数", "命中率", "好样本数", "好样本占比", "坏样本数", "坏样本占比", "坏率", "样本整体坏率", "LIFT值"] 33 | 34 | if output: 35 | from utils.excel_writer import 
ExcelWriter 36 | from openpyxl.utils import get_column_letter, column_index_from_string 37 | init_setting() 38 | if writer: 39 | self.writer = writer 40 | else: 41 | self.writer = ExcelWriter(style_excel="./utils/报告输出模版.xlsx", theme_color="2639E9") 42 | 43 | self.worksheet = self.writer.get_sheet_by_name("决策树组合策略挖掘") 44 | 45 | def encode_cat_features(self, X, y): 46 | cat_features = list(set(X.select_dtypes(include=[object, pd.CategoricalDtype]).columns)) 47 | cat_features_index = [i for i, f in enumerate(X.columns) if f in cat_features] 48 | 49 | if len(cat_features) > 0: 50 | if self.target_enc is None: 51 | self.target_enc = ce.TargetEncoder(cols=cat_features) 52 | self.target_enc.fit(X[cat_features], y) 53 | self.target_enc.target_mapping = {} 54 | X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target')) 55 | for col in cat_features: 56 | mapping = X_TE[[col, f"{col}_target"]].drop_duplicates() 57 | self.target_enc.target_mapping[col] = dict(zip(mapping[col], mapping[f"{col}_target"])) 58 | else: 59 | X_TE = X.join(self.target_enc.transform(X[cat_features]).add_suffix('_target')) 60 | 61 | X_TE = X_TE.drop(columns=cat_features) 62 | return X_TE.rename(columns={f"{c}_target": c for c in cat_features}) 63 | else: 64 | return X 65 | 66 | def get_dt_rules(self, tree, feature_names, total_bad_rate, total_count): 67 | tree_ = tree.tree_ 68 | left = tree.tree_.children_left 69 | right = tree.tree_.children_right 70 | feature_name = [feature_names[i] if i != -2 else "undefined!" for i in tree_.feature] 71 | rules=dict() 72 | 73 | global res_df 74 | res_df = pd.DataFrame() 75 | 76 | def recurse(node, depth, parent): # 搜每个节点的规则 77 | 78 | if tree_.feature[node] != -2: # 非叶子节点,搜索每个节点的规则 79 | name = feature_name[node] 80 | thd = np.round(tree_.threshold[node],3) 81 | s= "{} <= {} ".format( name, thd, node ) 82 | # 左子 83 | if node == 0: 84 | rules[node]=s 85 | else: 86 | rules[node]=rules[parent]+' & ' +s 87 | recurse(left[node], depth + 1, node) 88 | s="{} > {}".format(name, thd) 89 | # 右子 90 | if node == 0: 91 | rules[node]=s 92 | else: 93 | rules[node]=rules[parent]+' & ' +s 94 | recurse(right[node], depth + 1, node) 95 | else: 96 | df = pd.DataFrame() 97 | df['组合策略'] = rules[parent], 98 | df['好样本数'] = tree_.value[node][0][0].astype(int) 99 | df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate)) 100 | df['坏样本数'] = tree_.value[node][0][1].astype(int) 101 | df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate) 102 | df['命中数'] = df['好样本数'] + df['坏样本数'] 103 | df['命中率'] = df['命中数'] / total_count 104 | df['坏率'] = df['坏样本数'] / df['命中数'] 105 | df['样本整体坏率'] = total_bad_rate 106 | df['LIFT值'] = df['坏率'] / df['样本整体坏率'] 107 | 108 | global res_df 109 | 110 | res_df = pd.concat([res_df, df], 0) 111 | 112 | recurse(0, 1, 0) 113 | 114 | return res_df.sort_values("LIFT值", ascending=True)[self.describe_columns].reset_index(drop=True) 115 | 116 | def select_dt_rules(self, decision_tree, x, y, lift=3., max_samples=0.05, labels=["positive", "negative"], save=None, verbose=False, drop=False): 117 | rules = self.get_dt_rules(decision_tree, x.columns, sum(y) / len(y), len(y)) 118 | viz_model = dtreeviz.model(decision_tree, 119 | X_train=x, 120 | y_train=y, 121 | feature_names=x.columns, 122 | target_name=target, 123 | class_names=labels, 124 | ) 125 | rules = rules.query(f"LIFT值 >= {lift} & 命中率 <= {max_samples}").reset_index(drop=True) 126 | 127 | if len(rules) > 0: 128 | decision_tree_viz = viz_model.view( 129 | scale=1.5, 130 | orientation='LR', 131 | colors={ 132 | "classes": 
[None, None, ["#2639E9", "#F76E6C"], ["#2639E9", "#F76E6C", "#FE7715", "#FFFFFF"]], 133 | "arrow": "#2639E9", 134 | 'text_wedge': "#F76E6C", 135 | "pie": "#2639E9", 136 | "tile_alpha": 1, 137 | "legend_edge": "#FFFFFF", 138 | }, 139 | ticks_fontsize=10, 140 | label_fontsize=10, 141 | ) 142 | if verbose: 143 | if self.feature_map is not None and len(self.feature_map) > 0: 144 | display(rules.replace(self.feature_map, regex=True)) 145 | else: 146 | display(rules) 147 | display(decision_tree_viz) 148 | if save: 149 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 150 | os.makedirs(os.path.dirname(save)) 151 | 152 | decision_tree_viz.save("combine_rules_cache.svg") 153 | cairosvg.svg2png(url="combine_rules_cache.svg", write_to=save, dpi=240) 154 | 155 | if drop: 156 | return rules, decision_tree.feature_names_in_[list(decision_tree.feature_importances_).index(max(decision_tree.feature_importances_))] 157 | else: 158 | return rules 159 | 160 | def query_dt_rules(self, x, y, parsed_rules=None): 161 | total_count = len(y) 162 | total_bad_rate = y.sum() / len(y) 163 | 164 | rules = pd.DataFrame() 165 | for rule in parsed_rules["组合策略"].unique(): 166 | select_index = x.query(rule).index 167 | if len(select_index) > 0: 168 | y_select = y[select_index] 169 | df = pd.Series() 170 | df['组合策略'] = rule 171 | df['好样本数'] = len(y_select) - y_select.sum() 172 | df['好样本占比'] = df['好样本数'] / (total_count * (1 - total_bad_rate)) 173 | df['坏样本数'] = y_select.sum() 174 | df['坏样本占比'] = df['坏样本数'] / (total_count * total_bad_rate) 175 | df['命中数'] = df['好样本数'] + df['坏样本数'] 176 | df['命中率'] = df['命中数'] / total_count 177 | df['坏率'] = df['坏样本数'] / df['命中数'] 178 | df['样本整体坏率'] = total_bad_rate 179 | df['LIFT值'] = df['坏率'] / df['样本整体坏率'] 180 | else: 181 | df = pd.Series({'组合策略': rule,'好样本数': 0,'好样本占比': 0.,'坏样本数': 0,'坏样本占比': 0.,'命中数': 0,'命中率': 0.,'坏率': 0.,'样本整体坏率': total_bad_rate,'LIFT值': 0.,}) 182 | 183 | rules = pd.concat([rules, pd.DataFrame(df).T]).reset_index(drop=True) 184 | 185 | return rules[self.describe_columns] 186 | 187 | def insert_dt_rules(self, parsed_rules, end_row, start_col, save=None): 188 | end_row, end_col = self.writer.insert_df2sheet(self.worksheet, parsed_rules, (end_row + 2, start_col)) 189 | 190 | for c in ['好样本占比', '坏样本占比', '命中率', '坏率', '样本整体坏率', 'LIFT值']: 191 | conditional_column = get_column_letter(start_col + parsed_rules.columns.get_loc(c)) 192 | self.writer.set_number_format(self.worksheet, f"{conditional_column}{end_row - len(parsed_rules)}:{conditional_column}{end_row - 1}", "0.00%") 193 | for c in ["坏率", "LIFT值"]: 194 | conditional_column = get_column_letter(start_col + parsed_rules.columns.get_loc(c)) 195 | self.writer.add_conditional_formatting(self.worksheet, f'{conditional_column}{end_row - len(parsed_rules)}', f'{conditional_column}{end_row - 1}') 196 | 197 | if save is not None: 198 | end_row, end_col = self.writer.insert_pic2sheet(self.worksheet, save, (end_row + 1, start_col), figsize=(400, 300)) 199 | 200 | return end_row, end_col 201 | 202 | def fit(self, x, y=None, max_depth=2, lift=3, max_samples=0.2, min_score=None, verbose=False, **kwargs): 203 | y = x[self.target] 204 | X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y) 205 | X_TE = X_TE.fillna(self.nan) 206 | 207 | self.feature_names = list(X_TE.columns) 208 | 209 | for i in range(self.max_iter): 210 | decision_tree = DecisionTreeClassifier(max_depth=max_depth, **kwargs) 211 | decision_tree = decision_tree.fit(X_TE, y) 212 | 213 | if (min_score is not None and decision_tree.score(X_TE, y) 
< min_score) or len(X_TE.columns) < max_depth: 214 | break 215 | 216 | try: 217 | parsed_rules, remove = self.select_dt_rules(decision_tree, X_TE, y, lift=lift, max_samples=max_samples, labels=self.labels, verbose=verbose, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png", drop=True) 218 | 219 | if len(parsed_rules) > 0: 220 | self.dt_rules = pd.concat([self.dt_rules, parsed_rules]).reset_index(drop=True) 221 | 222 | if self.writer is not None: 223 | if self.feature_map is not None and len(self.feature_map) > 0: 224 | parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True) 225 | self.end_row, _ = self.insert_dt_rules(parsed_rules, self.end_row, self.start_col, save=f"model_report/auto_mining_rules/combiner_rules_{i}.png") 226 | 227 | X_TE = X_TE.drop(columns=remove) 228 | self.decision_trees.append(decision_tree) 229 | except: 230 | pass 231 | 232 | return self 233 | 234 | def transform(self, x, y=None): 235 | y = x[self.target] 236 | X_TE = self.encode_cat_features(x.drop(columns=[self.target]), y) 237 | X_TE = X_TE.fillna(self.nan) 238 | parsed_rules = self.query_dt_rules(X_TE, y, parsed_rules=self.dt_rules) 239 | if self.feature_map is not None and len(self.feature_map) > 0: 240 | parsed_rules["组合策略"] = parsed_rules["组合策略"].replace(self.feature_map, regex=True) 241 | return parsed_rules 242 | 243 | def insert_all_rules(self, val=None, test=None): 244 | parsed_rules_train = self.dt_rules.copy() 245 | if self.feature_map is not None and len(self.feature_map) > 0: 246 | parsed_rules_train["组合策略"] = parsed_rules_train["组合策略"].replace(self.feature_map, regex=True) 247 | self.end_row, _ = self.writer.insert_value2sheet(self.worksheet, (self.end_row + 2, self.start_col), value="训练集决策树组合策略") 248 | self.end_row, _ = self.insert_dt_rules(parsed_rules_train, self.end_row, self.start_col) 249 | 250 | if val is not None: 251 | parsed_rules_val = self.transform(val) 252 | self.end_row, _ = self.writer.insert_value2sheet(self.worksheet, (self.end_row + 2, self.start_col), value="验证集决策树组合策略") 253 | self.end_row, _ = self.insert_dt_rules(parsed_rules_val, self.end_row, self.start_col) 254 | 255 | if test is not None: 256 | parsed_rules_test = self.transform(test) 257 | self.end_row, _ = self.writer.insert_value2sheet(self.worksheet, (self.end_row + 2, self.start_col), value="测试集决策树组合策略") 258 | self.end_row, _ = self.insert_dt_rules(parsed_rules_test, self.end_row, self.start_col) 259 | 260 | def save(self): 261 | self.writer.save(self.output) 262 | 263 | 264 | if __name__ == '__main__': 265 | pdtr = ParseDecisionTreeRules(target=target, feature_map=feature_map, max_iter=8) 266 | pdtr.fit(train, lift=3., max_depth=2, max_samples=0.1, verbose=False, min_samples_split=8, min_samples_leaf=5, max_features="auto") 267 | pdtr.insert_all_rules(test=test) 268 | pdtr.save() 269 | -------------------------------------------------------------------------------- /utils/perf_eva.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import warnings 7 | from pandas.api.types import is_numeric_dtype 8 | 9 | 10 | def check_y(dat, y, positive): 11 | positive = str(positive) 12 | # ncol of dt 13 | if isinstance(dat, pd.DataFrame) & (dat.shape[1] <= 1): 14 | raise Exception("Incorrect inputs; dat should be a DataFrame with at least two columns.") 15 | 16 | # y ------ 17 | if isinstance(y, str): 18 | y = [y] 19 | # length of y == 
1 20 | if len(y) != 1: 21 | raise Exception("Incorrect inputs; the length of y should be one") 22 | 23 | y = y[0] 24 | # y not in dat.columns 25 | if y not in dat.columns: 26 | raise Exception("Incorrect inputs; there is no \'{}\' column in dat.".format(y)) 27 | 28 | # remove na in y 29 | if pd.isna(dat[y]).any(): 30 | warnings.warn("There are NaNs in \'{}\' column. The rows with NaN in \'{}\' were removed from dat.".format(y,y)) 31 | dat = dat.dropna(subset=[y]) 32 | # dat = dat[pd.notna(dat[y])] 33 | 34 | 35 | # numeric y to int 36 | if is_numeric_dtype(dat[y]): 37 | dat.loc[:,y] = dat[y].apply(lambda x: x if pd.isnull(x) else int(x)) #dat[y].astype(int) 38 | # length of unique values in y 39 | unique_y = np.unique(dat[y].values) 40 | if len(unique_y) == 2: 41 | # if [v not in [0,1] for v in unique_y] == [True, True]: 42 | if True in [bool(re.search(positive, str(v))) for v in unique_y]: 43 | y1 = dat[y] 44 | y2 = dat[y].apply(lambda x: 1 if str(x) in re.split('\|', positive) else 0) 45 | if (y1 != y2).any(): 46 | dat.loc[:,y] = y2#dat[y] = y2 47 | warnings.warn("The positive value in \"{}\" was replaced by 1 and negative value by 0.".format(y)) 48 | else: 49 | raise Exception("Incorrect inputs; the positive value in \"{}\" is not specified".format(y)) 50 | else: 51 | raise Exception("Incorrect inputs; the length of unique values in y column \'{}\' != 2.".format(y)) 52 | 53 | return dat 54 | 55 | 56 | 57 | def eva_dfkslift(df, groupnum=None): 58 | if groupnum is None: groupnum=len(df.index) 59 | # good bad func 60 | def n0(x): return sum(x==0) 61 | def n1(x): return sum(x==1) 62 | df_kslift = df.sort_values('pred', ascending=False).reset_index(drop=True)\ 63 | .assign(group=lambda x: np.ceil((x.index+1)/(len(x.index)/groupnum)))\ 64 | .groupby('group')['label'].agg([n0,n1])\ 65 | .reset_index().rename(columns={'n0':'good','n1':'bad'})\ 66 | .assign( 67 | group=lambda x: (x.index+1)/len(x.index), 68 | good_distri=lambda x: x.good/sum(x.good), 69 | bad_distri=lambda x: x.bad/sum(x.bad), 70 | badrate=lambda x: x.bad/(x.good+x.bad), 71 | cumbadrate=lambda x: np.cumsum(x.bad)/np.cumsum(x.good+x.bad), 72 | lift=lambda x: (np.cumsum(x.bad)/np.cumsum(x.good+x.bad))/(sum(x.bad)/sum(x.good+x.bad)), 73 | cumgood=lambda x: np.cumsum(x.good)/sum(x.good), 74 | cumbad=lambda x: np.cumsum(x.bad)/sum(x.bad) 75 | ).assign(ks=lambda x:abs(x.cumbad-x.cumgood)) 76 | # bind 0 77 | df_kslift=pd.concat([ 78 | pd.DataFrame({'group':0, 'good':0, 'bad':0, 'good_distri':0, 'bad_distri':0, 'badrate':0, 'cumbadrate':np.nan, 'cumgood':0, 'cumbad':0, 'ks':0, 'lift':np.nan}, index=np.arange(1)), 79 | df_kslift 80 | ], ignore_index=True) 81 | # return 82 | return df_kslift 83 | # plot ks 84 | def eva_pks(dfkslift, title): 85 | dfks = dfkslift.loc[lambda x: x.ks==max(x.ks)].sort_values('group').iloc[0] 86 | ###### plot ###### 87 | # fig, ax = plt.subplots() 88 | # ks, cumbad, cumgood 89 | plt.plot(dfkslift.group, dfkslift.ks, 'b-', 90 | dfkslift.group, dfkslift.cumgood, 'k-', 91 | dfkslift.group, dfkslift.cumbad, 'k-') 92 | # ks vline 93 | plt.plot([dfks['group'], dfks['group']], [0, dfks['ks']], 'r--') 94 | # set xylabel 95 | plt.gca().set(title=title+'K-S', 96 | xlabel='% of population', ylabel='% of total Good/Bad', 97 | xlim=[0,1], ylim=[0,1], aspect='equal') 98 | # text 99 | # plt.text(0.5,0.96,'K-S', fontsize=15,horizontalalignment='center') 100 | plt.text(0.2,0.8,'Bad',horizontalalignment='center') 101 | plt.text(0.8,0.55,'Good',horizontalalignment='center') 102 | plt.text(dfks['group'], dfks['ks'], 'KS:'+ 
str(round(dfks['ks'],4)), horizontalalignment='center',color='b') 103 | # plt.grid() 104 | # plt.show() 105 | # return fig 106 | # plot lift 107 | def eva_plift(dfkslift, title): 108 | badrate_avg = sum(dfkslift.bad)/sum(dfkslift.good+dfkslift.bad) 109 | ###### plot ###### 110 | # fig, ax = plt.subplots() 111 | # ks, cumbad, cumgood 112 | plt.plot(dfkslift.group, dfkslift.cumbadrate, 'k-') 113 | # ks vline 114 | plt.plot([0, 1], [badrate_avg, badrate_avg], 'r--') 115 | # set xylabel 116 | plt.gca().set(title=title+'Lift', 117 | xlabel='% of population', ylabel='% of Bad', 118 | xlim=[0,1], ylim=[0,1], aspect='equal') 119 | # text 120 | # plt.text(0.5,0.96,'Lift', fontsize=15,horizontalalignment='center') 121 | plt.text(0.7,np.mean(dfkslift.cumbadrate),'cumulate badrate',horizontalalignment='center') 122 | plt.text(0.7,badrate_avg,'average badrate',horizontalalignment='center') 123 | # plt.grid() 124 | # plt.show() 125 | # return fig 126 | 127 | def eva_dfrocpr(df): 128 | def n0(x): return sum(x==0) 129 | def n1(x): return sum(x==1) 130 | dfrocpr = df.sort_values('pred')\ 131 | .groupby('pred')['label'].agg([n0,n1,len])\ 132 | .reset_index().rename(columns={'n0':'countN','n1':'countP','len':'countpred'})\ 133 | .assign( 134 | FN = lambda x: np.cumsum(x.countP), 135 | TN = lambda x: np.cumsum(x.countN) 136 | ).assign( 137 | TP = lambda x: sum(x.countP) - x.FN, 138 | FP = lambda x: sum(x.countN) - x.TN 139 | ).assign( 140 | TPR = lambda x: x.TP/(x.TP+x.FN), 141 | FPR = lambda x: x.FP/(x.TN+x.FP), 142 | precision = lambda x: x.TP/(x.TP+x.FP), 143 | recall = lambda x: x.TP/(x.TP+x.FN) 144 | ).assign( 145 | F1 = lambda x: 2*x.precision*x.recall/(x.precision+x.recall) 146 | ) 147 | return dfrocpr 148 | # plot roc 149 | def eva_proc(dfrocpr, title): 150 | dfrocpr = pd.concat( 151 | [dfrocpr[['FPR','TPR']], pd.DataFrame({'FPR':[0,1], 'TPR':[0,1]})], 152 | ignore_index=True).sort_values(['FPR','TPR']) 153 | auc = dfrocpr.sort_values(['FPR','TPR'])\ 154 | .assign( 155 | TPR_lag=lambda x: x['TPR'].shift(1), FPR_lag=lambda x: x['FPR'].shift(1) 156 | ).assign( 157 | auc=lambda x: (x.TPR+x.TPR_lag)*(x.FPR-x.FPR_lag)/2 158 | )['auc'].sum() 159 | ###### plot ###### 160 | # fig, ax = plt.subplots() 161 | # ks, cumbad, cumgood 162 | plt.plot(dfrocpr.FPR, dfrocpr.TPR, 'k-') 163 | # ks vline 164 | x=np.array(np.arange(0,1.1,0.1)) 165 | plt.plot(x, x, 'r--') 166 | # fill 167 | plt.fill_between(dfrocpr.FPR, 0, dfrocpr.TPR, color='blue', alpha=0.1) 168 | # set xylabel 169 | plt.gca().set(title=title+'ROC', 170 | xlabel='FPR', ylabel='TPR', 171 | xlim=[0,1], ylim=[0,1], aspect='equal') 172 | # text 173 | # plt.text(0.5,0.96, 'ROC', fontsize=15, horizontalalignment='center') 174 | plt.text(0.55,0.45, 'AUC:'+str(round(auc,4)), horizontalalignment='center', color='b') 175 | # plt.grid() 176 | # plt.show() 177 | # return fig 178 | # plot ppr 179 | def eva_ppr(dfrocpr, title): 180 | ###### plot ###### 181 | # fig, ax = plt.subplots() 182 | # ks, cumbad, cumgood 183 | plt.plot(dfrocpr.recall, dfrocpr.precision, 'k-') 184 | # ks vline 185 | x=np.array(np.arange(0,1.1,0.1)) 186 | plt.plot(x, x, 'r--') 187 | # set xylabel 188 | plt.gca().set(title=title+'P-R', 189 | xlabel='Recall', ylabel='Precision', 190 | xlim=[0,1], ylim=[0,1], aspect='equal') 191 | # text 192 | # plt.text(0.5,0.96, 'P-R', fontsize=15, horizontalalignment='center') 193 | # plt.grid() 194 | # plt.show() 195 | # return fig 196 | # plot f1 197 | def eva_pf1(dfrocpr, title): 198 | dfrocpr=dfrocpr.assign(pop=lambda x: 
np.cumsum(x.countpred)/sum(x.countpred)) 199 | ###### plot ###### 200 | # fig, ax = plt.subplots() 201 | # ks, cumbad, cumgood 202 | plt.plot(dfrocpr['pop'], dfrocpr['F1'], 'k-') 203 | # ks vline 204 | F1max_pop = dfrocpr.loc[dfrocpr['F1'].idxmax(),'pop'] 205 | F1max_F1 = dfrocpr.loc[dfrocpr['F1'].idxmax(),'F1'] 206 | plt.plot([F1max_pop,F1max_pop], [0,F1max_F1], 'r--') 207 | # set xylabel 208 | plt.gca().set(title=title+'F1', 209 | xlabel='% of population', ylabel='F1', 210 | xlim=[0,1], ylim=[0,1], aspect='equal') 211 | # pred text 212 | pred_0=dfrocpr.loc[dfrocpr['pred'].idxmin(),'pred'] 213 | pred_F1max=dfrocpr.loc[dfrocpr['F1'].idxmax(),'pred'] 214 | pred_1=dfrocpr.loc[dfrocpr['pred'].idxmax(),'pred'] 215 | if np.mean(dfrocpr.pred) < 0 or np.mean(dfrocpr.pred) > 1: 216 | pred_0 = -pred_0 217 | pred_F1max = -pred_F1max 218 | pred_1 = -pred_1 219 | plt.text(0, 0, 'pred \n'+str(round(pred_0,4)), horizontalalignment='left',color='b') 220 | plt.text(F1max_pop, 0, 'pred \n'+str(round(pred_F1max,4)), horizontalalignment='center',color='b') 221 | plt.text(1, 0, 'pred \n'+str(round(pred_1,4)), horizontalalignment='right',color='b') 222 | # title F1 223 | plt.text(F1max_pop, F1max_F1, 'F1 max: \n'+ str(round(F1max_F1,4)), horizontalalignment='center',color='b') 224 | # plt.grid() 225 | # plt.show() 226 | # return fig 227 | 228 | 229 | 230 | def perf_eva(label, pred, title=None, groupnum=None, plot_type=["ks", "roc"], show_plot=True, positive="bad|1", seed=186): 231 | 232 | # inputs checking 233 | if len(label) != len(pred): 234 | warnings.warn('Incorrect inputs; label and pred should be list with the same length.') 235 | # if pred is score 236 | if np.mean(pred) < 0 or np.mean(pred) > 1: 237 | warnings.warn('Since the average of pred is not in [0,1], it is treated as predicted score but not probability.') 238 | pred = -pred 239 | # random sort datatable 240 | df = pd.DataFrame({'label':label, 'pred':pred}).sample(frac=1, random_state=seed) 241 | # remove NAs 242 | if any(np.unique(df.isna())): 243 | warnings.warn('The NANs in \'label\' or \'pred\' were removed.') 244 | df = df.dropna() 245 | # check label 246 | df = check_y(df, 'label', positive) 247 | # title 248 | title='' if title is None else str(title)+': ' 249 | 250 | ### data ### 251 | # dfkslift ------ 252 | if any([i in plot_type for i in ['ks', 'lift']]): 253 | dfkslift = eva_dfkslift(df, groupnum) 254 | if 'ks' in plot_type: df_ks = dfkslift 255 | if 'lift' in plot_type: df_lift = dfkslift 256 | # dfrocpr ------ 257 | if any([i in plot_type for i in ["roc","pr",'f1']]): 258 | dfrocpr = eva_dfrocpr(df) 259 | if 'roc' in plot_type: df_roc = dfrocpr 260 | if 'pr' in plot_type: df_pr = dfrocpr 261 | if 'f1' in plot_type: df_f1 = dfrocpr 262 | ### return list ### 263 | rt = {} 264 | # plot, KS ------ 265 | if 'ks' in plot_type: 266 | rt['KS'] = round(dfkslift.loc[lambda x: x.ks==max(x.ks),'ks'].iloc[0],4) 267 | # plot, ROC ------ 268 | if 'roc' in plot_type: 269 | auc = pd.concat( 270 | [dfrocpr[['FPR','TPR']], pd.DataFrame({'FPR':[0,1], 'TPR':[0,1]})], 271 | ignore_index=True).sort_values(['FPR','TPR'])\ 272 | .assign( 273 | TPR_lag=lambda x: x['TPR'].shift(1), FPR_lag=lambda x: x['FPR'].shift(1) 274 | ).assign( 275 | auc=lambda x: (x.TPR+x.TPR_lag)*(x.FPR-x.FPR_lag)/2 276 | )['auc'].sum() 277 | ### 278 | rt['AUC'] = round(auc, 4) 279 | rt['Gini'] = round(2*auc-1, 4) 280 | 281 | ### export plot ### 282 | if show_plot: 283 | plist = ["eva_p"+i+'(df_'+i+',title)' for i in plot_type] 284 | subplot_nrows = np.ceil(len(plist)/2) 285 | 
subplot_ncols = np.ceil(len(plist)/subplot_nrows) 286 | 287 | fig = plt.figure() 288 | for i in np.arange(len(plist)): 289 | plt.subplot(int(subplot_nrows),int(subplot_ncols),i+1) 290 | eval(plist[i]) 291 | 292 | rt['pic'] = fig 293 | 294 | return rt 295 | 296 | 297 | 298 | def perf_psi(score, label=None, title=None, x_limits=None, x_tick_break=50, show_plot=True, seed=186, return_distr_dat=False): 299 | 300 | # inputs checking 301 | ## score 302 | if not isinstance(score, dict) and len(score) != 2: 303 | raise Exception("Incorrect inputs; score should be a dictionary with two elements.") 304 | else: 305 | if any([not isinstance(i, pd.DataFrame) for i in score.values()]): 306 | raise Exception("Incorrect inputs; score is a dictionary of two dataframes.") 307 | score_columns = [list(i.columns) for i in score.values()] 308 | if set(score_columns[0]) != set(score_columns[1]): 309 | raise Exception("Incorrect inputs; the column names of two dataframes in score should be the same.") 310 | ## label 311 | if label is not None: 312 | if not isinstance(label, dict) and len(label) != 2: 313 | raise Exception("Incorrect inputs; label should be a dictionary with two elements.") 314 | else: 315 | if set(score.keys()) != set(label.keys()): 316 | raise Exception("Incorrect inputs; the keys of score and label should be the same. ") 317 | for i in label.keys(): 318 | if isinstance(label[i], pd.DataFrame): 319 | if len(label[i].columns) == 1: 320 | label[i] = label[i].iloc[:,0] 321 | else: 322 | raise Exception("Incorrect inputs; the number of columns in label should be 1.") 323 | # score dataframe column names 324 | score_names = score[list(score.keys())[0]].columns 325 | # merge label with score 326 | for i in score.keys(): 327 | score[i] = score[i].copy(deep=True) 328 | if label is not None: 329 | score[i].loc[:,'y'] = label[i] 330 | else: 331 | score[i].copy(deep=True).loc[:,'y'] = np.nan 332 | # dateset of score and label 333 | dt_sl = pd.concat(score, names=['ae', 'rowid']).reset_index()\ 334 | .sample(frac=1, random_state=seed) 335 | # ae refers to 'Actual & Expected' 336 | 337 | # PSI function 338 | def psi(dat): 339 | dt_bae = dat.groupby(['ae','bin']).size().reset_index(name='N')\ 340 | .pivot_table(values='N', index='bin', columns='ae').fillna(0.9)\ 341 | .agg(lambda x: x/sum(x)) 342 | dt_bae.columns = ['A','E'] 343 | psi_dt = dt_bae.assign( 344 | AE = lambda x: x.A-x.E, 345 | logAE = lambda x: np.log(x.A/x.E) 346 | ).assign( 347 | bin_PSI=lambda x: x.AE*x.logAE 348 | )['bin_PSI'].sum() 349 | return psi_dt 350 | 351 | # return psi and pic 352 | rt_psi = {} 353 | rt_pic = {} 354 | rt_dat = {} 355 | rt = {} 356 | for sn in score_names: 357 | # dataframe with columns of ae y sn 358 | dat = dt_sl[['ae', 'y', sn]] 359 | if len(dt_sl[sn].unique()) > 10: 360 | # breakpoints 361 | if x_limits is None: 362 | x_limits = dat[sn].quantile([0.02, 0.98]) 363 | x_limits = round(x_limits/x_tick_break)*x_tick_break 364 | x_limits = list(x_limits) 365 | 366 | brkp = np.unique([np.floor(min(dt_sl[sn])/x_tick_break)*x_tick_break]+\ 367 | list(np.arange(x_limits[0], x_limits[1], x_tick_break))+\ 368 | [np.ceil(max(dt_sl[sn])/x_tick_break)*x_tick_break]) 369 | # cut 370 | labels = ['[{},{})'.format(int(brkp[i]), int(brkp[i+1])) for i in range(len(brkp)-1)] 371 | dat.loc[:,'bin'] = pd.cut(dat[sn], brkp, right=False, labels=labels) 372 | else: 373 | dat.loc[:,'bin'] = dat[sn] 374 | # psi ------ 375 | rt_psi[sn] = pd.DataFrame({'PSI':psi(dat)},index=np.arange(1)) 376 | 377 | # distribution of scorecard probability 378 | 
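        # note: psi(dat) above computes PSI = sum_i (A_i - E_i) * ln(A_i / E_i), where A_i / E_i are the
        # per-bin sample proportions of the two score datasets ("Actual" vs "Expected"); empty bins are
        # filled with a small count (0.9) before normalising so the log term stays finite.
        # the block below then builds, per bin and per dataset, the good/bad counts, the score
        # distribution (distr = N / total N) and the bad rate (badprob = bad / (good + bad)) that feed
        # the distribution bars and bad-rate lines of the plot.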
def good(x): return sum(x==0) 379 | def bad(x): return sum(x==1) 380 | distr_prob = dat.groupby(['ae', 'bin'])\ 381 | ['y'].agg([good, bad])\ 382 | .assign(N=lambda x: x.good+x.bad, 383 | badprob=lambda x: x.bad/(x.good+x.bad) 384 | ).reset_index() 385 | distr_prob.loc[:,'distr'] = distr_prob.groupby('ae')['N'].transform(lambda x:x/sum(x)) 386 | # pivot table 387 | distr_prob = distr_prob.pivot_table(values=['N','badprob', 'distr'], index='bin', columns='ae') 388 | 389 | # plot ------ 390 | if show_plot: 391 | ###### param ###### 392 | ind = np.arange(len(distr_prob.index)) # the x locations for the groups 393 | width = 0.35 # the width of the bars: can also be len(x) sequence 394 | ###### plot ###### 395 | fig, ax1 = plt.subplots() 396 | ax2 = ax1.twinx() 397 | title_string = sn+'_PSI: '+str(round(psi(dat),4)) 398 | title_string = title_string if title is None else str(title)+' '+title_string 399 | # ax1 400 | p1 = ax1.bar(ind, distr_prob.distr.iloc[:,0], width, color=(24/254, 192/254, 196/254), alpha=0.6) 401 | p2 = ax1.bar(ind+width, distr_prob.distr.iloc[:,1], width, color=(246/254, 115/254, 109/254), alpha=0.6) 402 | # ax2 403 | p3 = ax2.plot(ind+width/2, distr_prob.badprob.iloc[:,0], color=(24/254, 192/254, 196/254)) 404 | ax2.scatter(ind+width/2, distr_prob.badprob.iloc[:,0], facecolors='w', edgecolors=(24/254, 192/254, 196/254)) 405 | p4 = ax2.plot(ind+width/2, distr_prob.badprob.iloc[:,1], color=(246/254, 115/254, 109/254)) 406 | ax2.scatter(ind+width/2, distr_prob.badprob.iloc[:,1], facecolors='w', edgecolors=(246/254, 115/254, 109/254)) 407 | # settings 408 | ax1.set_ylabel('Score distribution') 409 | ax2.set_ylabel('Bad probability')#, color='blue') 410 | # ax2.tick_params(axis='y', colors='blue') 411 | # ax1.set_yticks(np.arange(0, np.nanmax(distr_prob['distr'].values), 0.2)) 412 | # ax2.set_yticks(np.arange(0, 1+0.2, 0.2)) 413 | ax1.set_ylim([0,np.ceil(np.nanmax(distr_prob['distr'].values)*10)/10]) 414 | ax2.set_ylim([0,1]) 415 | plt.xticks(ind+width/2, distr_prob.index) 416 | plt.title(title_string, loc='left') 417 | ax1.legend((p1[0], p2[0]), list(distr_prob.columns.levels[1]), loc='upper left') 418 | ax2.legend((p3[0], p4[0]), list(distr_prob.columns.levels[1]), loc='upper right') 419 | # show plot 420 | plt.show() 421 | 422 | # return of pic 423 | rt_pic[sn] = fig 424 | 425 | # return distr_dat ------ 426 | if return_distr_dat: 427 | rt_dat[sn] = distr_prob[['N','badprob']].reset_index() 428 | # return rt 429 | rt['psi'] = pd.concat(rt_psi).reset_index().rename(columns={'level_0':'variable'})[['variable', 'PSI']] 430 | rt['pic'] = rt_pic 431 | if return_distr_dat: rt['dat'] = rt_dat 432 | return rt 433 | -------------------------------------------------------------------------------- /utils/excel_writer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2023/2/14 16:23 4 | @Author : itlubber 5 | @Site : itlubber.art 6 | """ 7 | import re 8 | import os 9 | 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | import pandas as pd 13 | 14 | from openpyxl.cell.cell import Cell 15 | from openpyxl.drawing.image import Image 16 | from openpyxl import load_workbook, Workbook 17 | from openpyxl.formatting.rule import DataBarRule 18 | from openpyxl.utils.dataframe import dataframe_to_rows 19 | from openpyxl.utils import get_column_letter, column_index_from_string 20 | from openpyxl.styles import NamedStyle, Border, Side, Alignment, PatternFill, Font 21 | 22 | 23 | class ExcelWriter: 24 | 
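    # 使用示例（示意性草稿，整理自本仓库 rules_auto_mining.py 中对 ExcelWriter 的实际调用方式，
    # 其中 sheet 名称、单元格位置与 summary 数据框均为任意示例取值）：
    #
    #     writer = ExcelWriter(style_excel="./utils/报告输出模版.xlsx", theme_color="2639E9")
    #     worksheet = writer.get_sheet_by_name("模型报告")
    #     end_row, end_col = writer.insert_value2sheet(worksheet, "B2", value="入模变量系数及相关统计指标", style="header", auto_width=True)
    #     end_row, end_col = writer.insert_df2sheet(worksheet, summary, (end_row + 1, 2), auto_width=True)
    #     writer.save("模型报告.xlsx")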
25 | def __init__(self, style_excel='报告输出模版.xlsx', style_sheet_name="初始化", fontsize=10, font='楷体', theme_color='8E8BFE'): 26 | """ 27 | excel 文件内容写入公共方法 28 | 29 | :param style_excel: 样式模版文件,默认当前路径下的 报告输出模版.xlsx ,如果项目路径调整需要进行相应的调整 30 | :param style_sheet_name: 模版文件内初始样式sheet名称,默认即可 31 | :param fontsize: 插入excel文件中内容的字体大小,默认 10 32 | :param font: 插入excel文件中内容的字体,默认 楷体 33 | :param theme_color: 主题色,默认 8E8BFE,注意不包含 # 34 | """ 35 | # english_width,chinese_width 36 | self.english_width = 0.12 37 | self.chinese_width = 0.21 38 | self.theme_color = theme_color 39 | self.fontsize = 10 40 | self.font = '楷体' 41 | 42 | self.workbook = load_workbook(style_excel) 43 | self.style_sheet = self.workbook[style_sheet_name] 44 | 45 | self.name_styles = [] 46 | self.init_style(font, fontsize, theme_color) 47 | for style in self.name_styles: 48 | if style.name not in self.workbook.style_names: 49 | self.workbook.add_named_style(style) 50 | 51 | def add_conditional_formatting(self, worksheet, start_space, end_space): 52 | """ 53 | 设置条件格式 54 | 55 | :param worksheet: 当前选择设置条件格式的sheet 56 | :param start_space: 开始单元格位置 57 | :param end_space: 结束单元格位置 58 | """ 59 | worksheet.conditional_formatting.add(f'{start_space}:{end_space}', DataBarRule(start_type='min', end_type='max', color=self.theme_color)) 60 | 61 | @staticmethod 62 | def set_column_width(worksheet, column, width): 63 | """ 64 | 调整excel列宽 65 | 66 | :param worksheet: 当前选择调整列宽的sheet 67 | :param column: 列,可以直接输入 index 或者 字母 68 | :param width: 设置列的宽度 69 | """ 70 | worksheet.column_dimensions[column if isinstance(column, str) else get_column_letter(column)] = width 71 | 72 | @staticmethod 73 | def set_number_format(worksheet, space, _format): 74 | """ 75 | 设置数值显示格式 76 | 77 | :param worksheet: 当前选择调整数值显示格式的sheet 78 | :param space: 单元格范围 79 | :param _format: 显示格式,参考 openpyxl 80 | """ 81 | cells = worksheet[space] 82 | if isinstance(cells, Cell): 83 | cells = [cells] 84 | 85 | for cell in cells: 86 | if isinstance(cell, tuple): 87 | for c in cell: 88 | c.number_format = _format 89 | else: 90 | cell.number_format = _format 91 | 92 | def get_sheet_by_name(self, name): 93 | """ 94 | 获取sheet名称为name的工作簿,如果不存在,则从初始模版文件中拷贝一个名称为name的sheet 95 | 96 | :param name: 需要获取的工作簿名称 97 | """ 98 | if name not in self.workbook.sheetnames: 99 | worksheet = self.workbook.copy_worksheet(self.style_sheet) 100 | worksheet.title = name 101 | else: 102 | worksheet = self.workbook[name] 103 | 104 | return worksheet 105 | 106 | def insert_value2sheet(self, worksheet, insert_space, value="", style="content", auto_width=False): 107 | """ 108 | 向sheet中的某个单元格插入某种样式的内容 109 | 110 | :param worksheet: 需要插入内容的sheet 111 | :param insert_space: 内容插入的单元格位置,可以是 "B2" 或者 (2, 2) 任意一种形式 112 | :param value: 需要插入的内容 113 | :param style: 渲染的样式,参考 init_style 中初始设置的样式 114 | :param auto_width: 是否开启自动调整列宽 115 | :return 返回插入元素最后一列之后、最后一行之后的位置 116 | """ 117 | if isinstance(insert_space, str): 118 | worksheet[insert_space] = value 119 | cell = worksheet[insert_space] 120 | start_col = re.findall('\D+', insert_space)[0] 121 | start_row = int(re.findall("\d+", insert_space)[0]) 122 | else: 123 | cell = worksheet.cell(insert_space[0], insert_space[1], value) 124 | start_col = get_column_letter(insert_space[1]) 125 | start_row = insert_space[0] 126 | cell.style = style 127 | 128 | if auto_width: 129 | curr_width = worksheet.column_dimensions[start_col].width 130 | auto_width = min(max([(self.check_contain_chinese(value)[1] * self.english_width + self.check_contain_chinese(value)[2] * self.chinese_width) * self.fontsize, 10, 
curr_width]), 50) 131 | worksheet.column_dimensions[start_col].width = auto_width 132 | 133 | return start_row + 1, column_index_from_string(start_col) + 1 134 | 135 | def insert_pic2sheet(self, worksheet, fig, insert_space, figsize=(600, 250)): 136 | """ 137 | 向excel中插入图片内容 138 | 139 | :param worksheet: 需要插入内容的sheet 140 | :param fig: 需要插入的图片路径 141 | :param insert_space: 插入图片的起始单元格 142 | :param figsize: 图片大小设置 143 | :return 返回插入元素最后一列之后、最后一行之后的位置 144 | """ 145 | if isinstance(insert_space, str): 146 | start_row = int(re.findall("\d+", insert_space)[0]) 147 | start_col = re.findall('\D+', insert_space)[0] 148 | else: 149 | start_row, start_col = insert_space 150 | start_col = get_column_letter(start_col) 151 | 152 | image = Image(fig) 153 | image.width, image.height = figsize 154 | worksheet.add_image(image, f"{start_col}{start_row}") 155 | 156 | return start_row + int(figsize[1] / 17.5), column_index_from_string(start_col) + 8 157 | 158 | def insert_rows(self, worksheet, row, row_index, col_index, merge_rows=None, style="", auto_width=False): 159 | curr_col = column_index_from_string(col_index) 160 | for j, v in enumerate(row): 161 | if merge_rows is not None and row_index + 1 not in merge_rows: 162 | if j == 0: 163 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style="merge_left", auto_width=auto_width) 164 | elif j == len(row) - 1: 165 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style="merge_right", auto_width=auto_width) 166 | else: 167 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style="merge_middle", auto_width=auto_width) 168 | else: 169 | if j == 0: 170 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style=f"{style}_left" if style else "left", auto_width=auto_width) 171 | elif j == len(row) - 1: 172 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style=f"{style}_right" if style else "right", auto_width=auto_width) 173 | else: 174 | self.insert_value2sheet(worksheet, f'{get_column_letter(curr_col + j)}{row_index}', self.astype_insertvalue(v), style=f"{style}_middle" if style else "middle", auto_width=auto_width) 175 | 176 | def insert_df2sheet(self, worksheet, data, insert_space, merge_column=None, header=True, index=False, auto_width=False): 177 | """ 178 | 向excel文件中插入制定样式的dataframe数据 179 | 180 | :param worksheet: 需要插入内容的sheet 181 | :param data: 需要插入的dataframe 182 | :param insert_space: 插入内容的起始单元格位置 183 | :param merge_column: 需要分组显示的列,index或者列明 184 | :param header: 是否存储dataframe的header,暂不支持多级表头 185 | :param index: 是否存储dataframe的index 186 | :param auto_width: 是否自动调整列宽 187 | :return 返回插入元素最后一列之后、最后一行之后的位置 188 | """ 189 | df = data.copy() 190 | 191 | if isinstance(insert_space, str): 192 | start_row = int(re.findall("\d+", insert_space)[0]) 193 | start_col = re.findall('\D+', insert_space)[0] 194 | else: 195 | start_row, start_col = insert_space 196 | start_col = get_column_letter(start_col) 197 | 198 | if merge_column: 199 | if isinstance(merge_column, str): 200 | merge_column = [merge_column] 201 | 202 | if isinstance(merge_column[0], (int, float)): 203 | merge_cols = None 204 | merge_rows = merge_rows 205 | else: 206 | merge_cols = [get_column_letter(df.columns.get_loc(col) + column_index_from_string(start_col)) for col in merge_column] 207 | df = 
df.sort_values(merge_column) 208 | merge_rows = list(np.cumsum(df.groupby(merge_column)[merge_column].count().values[:, 0]) + start_row + 1) 209 | 210 | for i, row in enumerate(dataframe_to_rows(df, header=header, index=index)): 211 | if i == 0: 212 | if header: 213 | self.insert_rows(worksheet, row, start_row + i, start_col, style="header", auto_width=auto_width) 214 | else: 215 | self.insert_rows(worksheet, row, start_row + i, start_col, style="first", auto_width=auto_width) 216 | elif (header and i == len(df)) or (not header and i + 1 == len(df)): 217 | self.insert_rows(worksheet, row, start_row + i, start_col, style="last", auto_width=auto_width) 218 | else: 219 | self.insert_rows(worksheet, row, start_row + i, start_col, auto_width=auto_width, merge_rows=merge_rows if merge_column else None) 220 | 221 | # if merge_column and merge_cols is not None: 222 | # merge_rows = [start_row + 2] + merge_rows 223 | # for s, e in zip(merge_rows[:-1], merge_rows[1:]): 224 | # if e - s > 1: 225 | # for merge_col in merge_cols: 226 | # worksheet.merge_cells(f"{merge_col}{s-1}:{merge_col}{e-1}") 227 | 228 | end_row = start_row + len(data) + 1 if header else start_row + len(data) 229 | 230 | return (end_row, column_index_from_string(start_col) + len(data.columns)) 231 | 232 | @staticmethod 233 | def check_contain_chinese(check_str): 234 | out = [] 235 | for ch in str(check_str).encode('utf-8').decode('utf-8'): 236 | if u'\u4e00' <= ch <= u'\u9fff': 237 | out.append(True) 238 | else: 239 | out.append(False) 240 | return out, len(out) - sum(out), sum(out) 241 | 242 | @staticmethod 243 | def astype_insertvalue(value, decimal_point=4): 244 | if re.search('tuple|list|numpy.dtype|bool|str|numpy.ndarray|Interval|Categorical', str(type(value))): 245 | value = str(value) 246 | elif re.search('int', str(type(value))): 247 | value = value 248 | elif re.search('float', str(type(value))): 249 | value = round(float(value), decimal_point) 250 | else: 251 | value = 'nan' 252 | 253 | return value 254 | 255 | @staticmethod 256 | def calc_continuous_cnt(list_, index_=0): 257 | """ 258 | Clac continuous_cnt 259 | 260 | Examples:s 261 | list_ = ['A','A','A','A','B','C','C','D','D','D'] 262 | (1) calc_continuous_cnt(list_, 0) ===>('A', 0, 4) 263 | (2) calc_continuous_cnt(list_, 4) ===>('B', 4, 1) 264 | (3) calc_continuous_cnt(list_, 6) ===>('C', 6, 1) 265 | """ 266 | if index_ >= len(list_): 267 | return None, None, None 268 | 269 | else: 270 | cnt, str_ = 0, list_[index_] 271 | for i in range(index_, len(list_), 1): 272 | if list_[i] == str_: 273 | cnt = cnt + 1 274 | else: 275 | break 276 | return str_, index_, cnt 277 | 278 | @staticmethod 279 | def itlubber_border(border, color): 280 | if len(border) == 3: 281 | return Border(left=Side(border_style=border[0], color=color[0]), right=Side(border_style=border[1], color=color[1]), bottom=Side(border_style=border[2], color=color[2]),) 282 | else: 283 | return Border(left=Side(border_style=border[0], color=color[0]), right=Side(border_style=border[1], color=color[1]), bottom=Side(border_style=border[2], color=color[2]), top=Side(border_style=border[3], color=color[3]),) 284 | 285 | @staticmethod 286 | def get_cell_space(space): 287 | if isinstance(space, str): 288 | start_row = int(re.findall("\d+", space)[0]) 289 | start_col = re.findall('\D+', space)[0] 290 | return start_row, column_index_from_string(start_col) 291 | else: 292 | start_row = space[0] 293 | if isinstance(space[1], int): 294 | start_col = get_column_letter(space[1]) 295 | else: 296 | start_col = space[1] 297 
| return f"{start_row}{start_col}" 298 | 299 | def init_style(self, font, fontsize, theme_color): 300 | header_style, header_left_style, header_middle_style, header_right_style = NamedStyle(name="header"), NamedStyle(name="header_left"), NamedStyle(name="header_middle"), NamedStyle(name="header_right") 301 | last_style, last_left_style, last_middle_style, last_right_style = NamedStyle(name="last"), NamedStyle(name="last_left"), NamedStyle(name="last_middle"), NamedStyle(name="last_right") 302 | content_style, left_style, middle_style, right_style = NamedStyle(name="content"), NamedStyle(name="left"), NamedStyle(name="middle"), NamedStyle(name="right") 303 | merge_style, merge_left_style, merge_middle_style, merge_right_style = NamedStyle(name="merge"), NamedStyle(name="merge_left"), NamedStyle(name="merge_middle"), NamedStyle(name="merge_right") 304 | first_style, first_left_style, first_middle_style, first_right_style = NamedStyle(name="first"), NamedStyle(name="first_left"), NamedStyle(name="first_middle"), NamedStyle(name="first_right") 305 | 306 | header_font = Font(size=fontsize, name=font, color="FFFFFF", bold=True) 307 | header_fill = PatternFill(fill_type="solid", start_color=theme_color) 308 | alignment = Alignment(horizontal='center', vertical='center', wrap_text=False) 309 | content_fill = PatternFill(fill_type="solid", start_color="FFFFFF") 310 | content_font = Font(size=fontsize, name=font, color="000000") 311 | 312 | header_style.font, header_left_style.font, header_middle_style.font, header_right_style.font = header_font, header_font, header_font, header_font 313 | header_style.fill, header_left_style.fill, header_middle_style.fill, header_right_style.fill = header_fill, header_fill, header_fill, header_fill 314 | header_style.alignment, header_left_style.alignment, header_middle_style.alignment, header_right_style.alignment = Alignment(horizontal='left', vertical='center', wrap_text=True), alignment, alignment, alignment 315 | 316 | header_style.border = self.itlubber_border(["medium", "medium", "medium", "medium"], [theme_color, theme_color, theme_color, theme_color]) 317 | header_left_style.border = self.itlubber_border(["medium", "thin", "medium", "medium"], [theme_color, "FFFFFF", theme_color, theme_color]) 318 | header_middle_style.border = self.itlubber_border(["thin", "thin", "medium", "medium"], ["FFFFFF", "FFFFFF", theme_color, theme_color]) 319 | header_right_style.border = self.itlubber_border(["thin", "medium", "medium", "medium"], ["FFFFFF", theme_color, theme_color, theme_color]) 320 | 321 | last_style.font, last_left_style.font, last_middle_style.font, last_right_style.font = content_font, content_font, content_font, content_font 322 | last_style.fill, last_left_style.fill, last_middle_style.fill, last_right_style.fill = content_fill, content_fill, content_fill, content_fill 323 | last_style.alignment, last_left_style.alignment, last_middle_style.alignment, last_right_style.alignment = alignment, alignment, alignment, alignment 324 | 325 | last_style.border = self.itlubber_border(["medium", "medium", "medium"], [theme_color, theme_color, theme_color]) 326 | last_left_style.border = self.itlubber_border(["medium", "thin", "medium"], [theme_color, "FFFFFF", theme_color]) 327 | last_middle_style.border = self.itlubber_border(["thin", "thin", "medium"], ["FFFFFF", "FFFFFF", theme_color]) 328 | last_right_style.border = self.itlubber_border(["thin", "medium", "medium"], ["FFFFFF", theme_color, theme_color]) 329 | 330 | content_style.font, left_style.font, 
middle_style.font, right_style.font = content_font, content_font, content_font, content_font 331 | content_style.fill, left_style.fill, middle_style.fill, right_style.fill = content_fill, content_fill, content_fill, content_fill 332 | content_style.alignment, left_style.alignment, middle_style.alignment, right_style.alignment = alignment, alignment, alignment, alignment 333 | 334 | content_style.border = self.itlubber_border(["medium", "medium", "thin"], [theme_color, theme_color, theme_color]) 335 | left_style.border = self.itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", theme_color]) 336 | middle_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", "FFFFFF", theme_color]) 337 | right_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, theme_color]) 338 | 339 | merge_style.font, merge_left_style.font, merge_middle_style.font, merge_right_style.font = content_font, content_font, content_font, content_font 340 | merge_style.fill, merge_left_style.fill, merge_middle_style.fill, merge_right_style.fill = content_fill, content_fill, content_fill, content_fill 341 | merge_style.alignment, merge_left_style.alignment, merge_middle_style.alignment, merge_right_style.alignment = alignment, alignment, alignment, alignment 342 | 343 | merge_style.border = self.itlubber_border(["medium", "medium", "thin"], ["FFFFFF", "FFFFFF", "FFFFFF"]) 344 | merge_left_style.border = self.itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", "FFFFFF"]) 345 | merge_middle_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", "FFFFFF", "FFFFFF"]) 346 | merge_right_style.border = self.itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, "FFFFFF"]) 347 | 348 | first_style.font, first_left_style.font, first_middle_style.font, first_right_style.font = content_font, content_font, content_font, content_font 349 | first_style.fill, first_left_style.fill, first_middle_style.fill, first_right_style.fill = content_fill, content_fill, content_fill, content_fill 350 | first_style.alignment, first_left_style.alignment, first_middle_style.alignment, first_right_style.alignment = alignment, alignment, alignment, alignment 351 | 352 | first_style.border = self.itlubber_border(["medium", "medium", "thin", "medium"], [theme_color, theme_color, theme_color, theme_color]) 353 | first_left_style.border = self.itlubber_border(["medium", "thin", "thin", "medium"], [theme_color, "FFFFFF", theme_color, theme_color]) 354 | first_middle_style.border = self.itlubber_border(["thin", "thin", "thin", "medium"], ["FFFFFF", "FFFFFF", theme_color, theme_color]) 355 | first_right_style.border = self.itlubber_border(["thin", "medium", "thin", "medium"], ["FFFFFF", theme_color, theme_color, theme_color]) 356 | 357 | self.name_styles.extend([ 358 | header_style, header_left_style, header_middle_style, header_right_style, 359 | last_style, last_left_style, last_middle_style, last_right_style, 360 | content_style, left_style, middle_style, right_style, 361 | merge_style, merge_left_style, merge_middle_style, merge_right_style, 362 | first_style, first_left_style, first_middle_style, first_right_style 363 | ]) 364 | 365 | def save(self, filename): 366 | """ 367 | 保存excel文件 368 | 369 | :param filename: 需要保存 excel 文件的路径 370 | """ 371 | self.workbook.remove(self.style_sheet) 372 | self.workbook.save(filename) 373 | self.workbook.close() 374 | 375 | 376 | if __name__ == '__main__': 377 | writer = 
ExcelWriter(style_excel="/Users/lubberit/Desktop/金融/兴业银行贷中行为评分/utils/报告输出模版.xlsx") 378 | worksheet = writer.get_sheet_by_name("模型报告") 379 | writer.insert_value2sheet(worksheet, "B2", value="模型报告", style="header") 380 | writer.insert_value2sheet(worksheet, "B3", value="当前模型主要为评分卡模型", style="content", auto_width=True) 381 | end_row = writer.insert_pic2sheet(worksheet, "/Users/lubberit/Desktop/金融/兴业银行贷中行为评分/tests/mypic.png", "B5") 382 | end_row = writer.insert_pic2sheet(worksheet, "/Users/lubberit/Desktop/金融/兴业银行贷中行为评分/tests/mypic.png", "H5") 383 | sample = pd.DataFrame(np.concatenate([np.random.random_sample((10, 10)) * 40, np.random.randint(0, 3, (10, 2))], axis=1), columns=[f"B{i}" for i in range(10)] + ["target", "type"]) 384 | end_row, end_col = writer.insert_df2sheet(worksheet, sample, (end_row + 2, column_index_from_string("B"))) 385 | end_row, end_col = writer.insert_df2sheet(worksheet, sample, (end_row + 2, column_index_from_string("B")), merge_column="target") 386 | end_row, end_col = writer.insert_df2sheet(worksheet, sample, (end_row + 2, column_index_from_string("B")), merge_column=["target", "type"]) 387 | writer.save("test.xlsx") 388 | -------------------------------------------------------------------------------- /utils/tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2022/8/23 13:12 4 | @Author : itlubber 5 | @Site : itlubber.art 6 | """ 7 | 8 | import os 9 | import six 10 | import toad 11 | import joblib 12 | import warnings 13 | import numpy as np 14 | import pandas as pd 15 | from tqdm import tqdm 16 | import scorecardpy as sc 17 | from datetime import datetime 18 | import matplotlib.pyplot as plt 19 | from optbinning import OptimalBinning 20 | from sklearn.metrics import make_scorer 21 | from sklearn.model_selection import train_test_split 22 | 23 | from openpyxl import load_workbook, Workbook 24 | from openpyxl.formatting.rule import DataBarRule 25 | from openpyxl.styles import Border, Side, Alignment, PatternFill, Font 26 | 27 | 28 | def init_setting(font_path="./utils/matplot_chinese.ttf"): 29 | import warnings 30 | import matplotlib 31 | from matplotlib import font_manager 32 | warnings.filterwarnings("ignore") 33 | pd.options.display.float_format = '{:.4f}'.format 34 | pd.set_option('display.max_colwidth', 300) 35 | plt.style.use('seaborn-ticks') 36 | matplotlib.font_manager.fontManager.addfont(font_path) 37 | matplotlib.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name() 38 | matplotlib.rcParams['axes.unicode_minus'] = False 39 | 40 | 41 | # warnings.filterwarnings("ignore") 42 | # pd.set_option('display.width', 5000) 43 | # plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体 44 | # plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题 45 | 46 | 47 | try: 48 | feature_describe = pd.read_excel("变量字典及字段解释.xlsx", sheet_name="数据字段表", header=0, engine="openpyxl", usecols=[0, 1]) 49 | feature_describe = feature_describe.drop_duplicates(subset=["变量名称"], keep="last") 50 | feature_dict = dict(zip(feature_describe["变量名称"], feature_describe["含义"])) 51 | except: 52 | feature_dict = {} 53 | 54 | 55 | def ks_score(y, y_pred): 56 | return toad.KS(y_pred[:, 1], y) 57 | 58 | 59 | ks_score = make_scorer(ks_score, needs_proba=True) 60 | 61 | 62 | def round_float(num): 63 | if ~pd.isnull(num) and isinstance(num, float): 64 | return float(str(num).split(".")[0] + "." 
+ str(num).split(".")[1][:4]) 65 | else: 66 | return num 67 | 68 | 69 | def feature_bins(bins): 70 | if isinstance(bins, list): bins = np.array(bins) 71 | EMPTYBINS = len(bins) if not isinstance(bins[0], (set, list, np.ndarray)) else -1 72 | 73 | l = [] 74 | if np.issubdtype(bins.dtype, np.number): 75 | has_empty = len(bins) > 0 and np.isnan(bins[-1]) 76 | if has_empty: bins = bins[:-1] 77 | sp_l = ["负无穷"] + [round_float(b) for b in bins.tolist()] + ["正无穷"] 78 | for i in range(len(sp_l) - 1): l.append('['+str(sp_l[i])+' , '+str(sp_l[i+1])+')') 79 | if has_empty: l.append('缺失值') 80 | else: 81 | for keys in bins: 82 | keys_update = set() 83 | for key in keys: 84 | if pd.isnull(key) or key == "nan": 85 | keys_update.add("缺失值") 86 | elif key.strip() == "": 87 | keys_update.add("空字符串") 88 | else: 89 | keys_update.add(key) 90 | label = ','.join(keys_update) 91 | l.append(label) 92 | 93 | return {i if b != "缺失值" else EMPTYBINS: b for i, b in enumerate(l)} 94 | 95 | 96 | def feature_bin_stats(data, feature, combiner=None, target="target", rules={}, empty_separate=True, method='cart', min_samples=0.15, max_n_bins=3, gamma=0.01, monotonic_trend="auto_asc_desc", feature_dict={}): 97 | # if combiner is None: 98 | # combiner = toad.transform.Combiner() 99 | # combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, min_samples=min_samples) 100 | if feature not in rules: 101 | if data[feature].nunique(dropna=True) < 3: 102 | splits = [] 103 | for v in data[feature].unique(): 104 | if not pd.isnull(v): 105 | splits.append(v) 106 | 107 | if str(data[feature].dtypes) in ["object", "string", "category"]: 108 | rule = {feature: [[s] for s in splits]} 109 | rule[feature].append([[np.nan]]) 110 | else: 111 | rule = {feature: sorted(splits) + [np.nan]} 112 | else: 113 | try: 114 | y = data[target] 115 | if str(data[feature].dtypes) in ["object", "string", "category"]: 116 | dtype = "categorical" 117 | x = data[feature].astype("category").values 118 | else: 119 | dtype = "numerical" 120 | x = data[feature].values 121 | _combiner = OptimalBinning(feature, dtype=dtype, max_n_bins=max_n_bins, monotonic_trend=monotonic_trend, gamma=gamma).fit(x, y) 122 | if _combiner.status == "OPTIMAL": 123 | rule = {feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.splits] + [[np.nan] if dtype == "categorical" else np.nan]} 124 | else: 125 | raise "OptimalBinning error" 126 | except Exception as e: 127 | if method not in ["dt", "chi", ]: 128 | method = "chi" 129 | _combiner = toad.transform.Combiner() 130 | _combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, min_samples=min_samples) 131 | rule = _combiner.export() 132 | 133 | if combiner is None: 134 | combiner = toad.transform.Combiner() 135 | 136 | combiner.update(rule) 137 | 138 | if rules and isinstance(rules, list): rules = {feature: rules} 139 | if rules and isinstance(rules, dict): combiner.update(rules) 140 | 141 | # feature_bin = combiner.export()[feature] 142 | # feature_bin_dict = format_bins(np.array(feature_bin)) 143 | 144 | df_bin = combiner.transform(data[[feature, target]], labels=False) 145 | 146 | table = df_bin[[feature, target]].groupby([feature, target]).agg(len).unstack() 147 | table.columns.name = None 148 | table = table.rename(columns = {0 : '好样本数', 1 : '坏样本数'}).fillna(0) 149 | table["指标名称"] = feature 150 | table["指标含义"] = feature_dict.get(feature, "") 151 | table = table.reset_index().rename(columns={feature: "分箱"}) 152 | # table["分箱"] = 
table["分箱"].map(feature_bin_dict) 153 | 154 | table['样本总数'] = table['好样本数'] + table['坏样本数'] 155 | table['样本占比'] = table['样本总数'] / table['样本总数'].sum() 156 | table['好样本占比'] = table['好样本数'] / table['好样本数'].sum() 157 | table['坏样本占比'] = table['坏样本数'] / table['坏样本数'].sum() 158 | table['坏样本率'] = table['坏样本数'] / table['样本总数'] 159 | 160 | table = table.fillna(0.) 161 | 162 | table['分档WOE值'] = table.apply(lambda x : np.log(x['坏样本占比'] / (x['好样本占比'] + 1e-6)),axis=1) 163 | table['分档IV值'] = table.apply(lambda x : (x['坏样本占比'] - x['好样本占比']) * np.log(x['坏样本占比'] / (x['好样本占比'] + 1e-6)), axis=1) 164 | table['指标IV值'] = table['分档IV值'].sum() 165 | 166 | table["LIFT值"] = table['坏样本率'] / (table["坏样本数"].sum() / table["样本总数"].sum()) 167 | table["累积LIFT值"] = table["LIFT值"].cumsum() 168 | 169 | return table[['指标名称', "指标含义", '分箱', '样本总数', '样本占比', '好样本数', '好样本占比', '坏样本数', '坏样本占比', '坏样本率', '分档WOE值', '分档IV值', '指标IV值', 'LIFT值', '累积LIFT值']] 170 | 171 | 172 | def plot_bin(binx, title="", show_iv=True, show_na=True, colors=["#2639E9", "#a29bfe", "#ff7675"], figsize=(10, 8)): 173 | if not show_na: 174 | binx = binx[binx["分箱"] != "缺失值"].reset_index(drop=True) 175 | # y_right_max 176 | y_right_max = np.ceil(binx['坏样本率'].max()*10) 177 | if y_right_max % 2 == 1: y_right_max=y_right_max+1 178 | if y_right_max - binx['坏样本率'].max()*10 <= 0.3: y_right_max = y_right_max+2 179 | y_right_max = y_right_max/10 180 | if y_right_max>1 or y_right_max<=0 or y_right_max is np.nan or y_right_max is None: y_right_max=1 181 | ## y_left_max 182 | y_left_max = np.ceil(binx['样本占比'].max()*10)/10 183 | if y_left_max>1 or y_left_max<=0 or y_left_max is np.nan or y_left_max is None: y_left_max=1 184 | # title 185 | title_string = binx.loc[0,'指标名称']+" (iv:"+str(round(binx['分档IV值'].sum(),4))+")" if show_iv else binx.loc[0,'指标名称'] 186 | title_string = title + '-' + title_string if title else title_string 187 | # param 188 | ind = np.arange(len(binx.index)) # the x locations for the groups 189 | width = 0.35 # the width of the bars: can also be len(x) sequence 190 | ###### plot ###### 191 | fig, ax1 = plt.subplots(figsize=figsize) 192 | ax2 = ax1.twinx() 193 | # ax1 194 | p1 = ax1.bar(ind, binx['好样本占比'], width, color=colors[1]) 195 | p2 = ax1.bar(ind, binx['坏样本占比'], width, bottom=binx['好样本占比'], color=colors[2]) 196 | for i in ind: 197 | ax1.text(i, binx.loc[i,'样本占比']*1.02, str(round(binx.loc[i,'样本占比']*100,1))+'%, '+str(binx.loc[i,'样本总数']), ha='center') 198 | # ax2 199 | ax2.plot(ind, binx['坏样本率'], marker='o', color=colors[0]) 200 | for i in ind: 201 | ax2.text(i, binx.loc[i,'坏样本率']*1.02, str(round(binx.loc[i,'坏样本率']*100,1))+'%', color=colors[0], ha='center') 202 | # settings 203 | ax1.set_ylabel('样本分布情况') 204 | ax2.set_ylabel('坏样本率', color=colors[0]) 205 | ax1.set_yticks(np.arange(0, y_left_max+0.2, 0.2)) 206 | ax2.set_yticks(np.arange(0, y_right_max+0.2, 0.2)) 207 | ax2.tick_params(axis='y', colors=colors[0]) 208 | plt.xticks(ind, binx['分箱'], fontsize=12) 209 | plt.title(title_string, loc='center') 210 | plt.legend((p2[0], p1[0]), ('好样本', '坏样本'), loc='upper right') 211 | 212 | 213 | # def bin_plot(feature_table, feature="", desc="", figsize=(8, 6), colors=['#8E8BFE', '#FEA3A2', '#9394E7'], max_len=35, save=None): 214 | # feature_table = feature_table.copy() 215 | # 216 | # feature_table["分箱"] = feature_table["分箱"].apply(lambda x: x if re.match("^\[.*\)$", x) else str(x)[:max_len] + "..") 217 | # 218 | # # 绘制好坏样本分布情况 219 | # fig, ax1 = plt.subplots(figsize=figsize) 220 | # ax1.barh(feature_table['分箱'], feature_table['好样本数'], color=colors[0], label='好样本') 221 
| # ax1.barh(feature_table['分箱'], feature_table['坏样本数'], left=feature_table['好样本数'], color=colors[1], label='坏样本') 222 | # ax1.set_xlabel('样本数') 223 | # 224 | # # 绘制坏样本率的分布情况 225 | # ax2 = ax1.twiny() 226 | # ax2.plot(feature_table['坏样本率'], feature_table['分箱'], colors[2], label='坏样本率', linestyle='-.') 227 | # ax2.set_xlabel('坏样本率: 坏样本数 / 样本总数') 228 | # 229 | # for i, rate in enumerate(feature_table['坏样本率']): 230 | # ax2.scatter(rate, i, color=colors[2], s=3) 231 | # 232 | # # 在图像对应位置显示样本总数和坏样本率 233 | # for i, v in feature_table[['样本总数', '好样本数', '坏样本数', '坏样本率', '样本占比']].iterrows(): 234 | # ax1.text(v['样本总数'] / 2, i + len(feature_table) / 60, f"{int(v['好样本数'])}:{int(v['坏样本数'])}:{v['样本占比']:.1%}:{v['坏样本率']:.1%}") 235 | # 236 | # # 逆转y轴顺序 237 | # ax1.invert_yaxis() 238 | # 239 | # desc = desc if desc else feature 240 | # 241 | # # 添加一个标题 242 | # fig.suptitle(f'变量 {desc} 分箱图\n\n') 243 | # 244 | # # 合并图例 245 | # handles1, labels1 = ax1.get_legend_handles_labels() 246 | # handles2, labels2 = ax2.get_legend_handles_labels() 247 | # fig.legend(handles1 + handles2, labels1 + labels2, loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.95), frameon=False) 248 | # 249 | # # 调整布局,使分箱信息能够完全显示 250 | # plt.tight_layout() 251 | # 252 | # if save: 253 | # if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 254 | # os.makedirs(os.path.dirname(save)) 255 | # 256 | # fig.savefig(save, dpi=240, format="png", bbox_inches="tight") 257 | 258 | 259 | def cal_psi(train, test, feature, combiner=None): 260 | # feature_bin = combiner.export()[feature] 261 | # feature_bin_dict = format_bins(np.array(feature_bin)) 262 | try: 263 | A = (combiner.transform(train[[feature]]).value_counts() / len(train[[feature]])).reset_index().rename(columns={feature: "分箱", 0: "A"}) 264 | E = (combiner.transform(test[[feature]]).value_counts() / len(test[[feature]])).reset_index().rename(columns={feature: "分箱", 0: "E"}) 265 | except: 266 | A = (combiner.transform(train[[feature]])[feature].value_counts() / len(train)).reset_index().rename(columns={"index": "分箱", feature: "A"}) 267 | E = (combiner.transform(test[[feature]])[feature].value_counts() / len(test)).reset_index().rename(columns={"index": "分箱", feature: "E"}) 268 | df_psi = A.merge(E, on="分箱", how="outer").fillna(0.) 
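    # Added note (sketch, not in the original source): the PSI columns computed just below follow
    # the population stability index, PSI_i = (A_i - E_i) * ln(A_i / E_i), where A_i and E_i are the
    # bin proportions of the feature in the train (actual) and test (expected) sets; the small 1e-6
    # added to E guards against division by zero for empty bins.
    # Illustrative example: A_i = 0.30, E_i = 0.25 gives (0.30 - 0.25) * ln(0.30 / 0.25) ≈ 0.0091;
    # the feature-level PSI is the sum over all bins (a common rule of thumb treats < 0.1 as stable).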
269 | # df_psi["分箱"] = df_psi["分箱"].map(feature_bin_dict) 270 | df_psi["分档PSI"] = (df_psi["A"] - df_psi["E"]) * np.log(df_psi["A"] / (df_psi["E"] + 1e-6)) 271 | df_psi["指标PSI"] = df_psi["分档PSI"].replace(np.inf, 0).sum() 272 | 273 | return df_psi[["分箱", "分档PSI", "指标PSI"]] 274 | 275 | 276 | def itlubber_border(border, color): 277 | if len(border) == 3: 278 | return Border( 279 | left=Side(border_style=border[0], color=color[0]), 280 | right=Side(border_style=border[1], color=color[1]), 281 | bottom=Side(border_style=border[2], color=color[2]), 282 | ) 283 | else: 284 | return Border( 285 | left=Side(border_style=border[0], color=color[0]), 286 | right=Side(border_style=border[1], color=color[1]), 287 | bottom=Side(border_style=border[2], color=color[2]), 288 | top=Side(border_style=border[3], color=color[3]), 289 | ) 290 | 291 | 292 | def render_excel(excel_name, sheet_name=None, conditional_columns=[], freeze=None, merge_rows=[], percent_columns=[], theme_color="2639E9", conditional_color="9980FA", font="楷体", fontsize=10, max_column_width=50, header=True, start_row=0, n_jobs=4, bar=True, border=True): 293 | workbook = load_workbook(excel_name) 294 | 295 | if sheet_name and isinstance(sheet_name, str): 296 | sheet_names = [sheet_name] 297 | else: 298 | sheet_names = workbook.get_sheet_names() 299 | 300 | merge_rows = [i + start_row if header else i + start_row - 1 for i in merge_rows] 301 | 302 | for sheet_name in sheet_names: 303 | worksheet = workbook.get_sheet_by_name(sheet_name) 304 | 305 | def add_conditional_formatting(column, theme_color="FDA7DF"): 306 | worksheet.conditional_formatting.add(f'{column}2:{column}{worksheet.max_row}', DataBarRule(start_type='min', end_type='max', color=theme_color)) 307 | 308 | for conditional_column in conditional_columns: 309 | add_conditional_formatting(f"{conditional_column}", theme_color=conditional_color) 310 | 311 | def render_cell(row_index, row): 312 | if row_index > start_row: 313 | if header and row_index == start_row + 1: 314 | for col_index, cell in enumerate(row, start=1): 315 | cell.font = Font(size=fontsize, name=font, color="FFFFFF", bold=True) 316 | cell.fill = PatternFill(fill_type="solid", start_color=theme_color) 317 | cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=False) 318 | 319 | if col_index == 1: 320 | cell.border = itlubber_border(["medium", "thin", "medium", "medium"], [theme_color, "FFFFFF", theme_color, theme_color]) 321 | elif col_index == len(row): 322 | cell.border = itlubber_border(["thin", "medium", "medium", "medium"], ["FFFFFF", theme_color, theme_color, theme_color]) 323 | else: 324 | cell.border = itlubber_border(["thin", "thin", "medium", "medium"], ["FFFFFF", "FFFFFF", theme_color, theme_color]) 325 | else: 326 | for col_index, cell in enumerate(row, start=1): 327 | cell.font = Font(size=fontsize, name=font, color="000000") 328 | cell.fill = PatternFill(fill_type="solid", start_color="FFFFFF") 329 | cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=False) 330 | 331 | if col_index in percent_columns: 332 | # cell.alignment = Alignment(horizontal='right', vertical='center', wrap_text=False) 333 | cell.number_format = "0.00%" 334 | else: 335 | pass 336 | # cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=False) 337 | 338 | if row_index == worksheet.max_row: 339 | if col_index == 1: 340 | cell.border = itlubber_border(["medium", "thin", "medium"], [theme_color, "FFFFFF", theme_color]) 341 | elif col_index == len(row): 342 | 
cell.border = itlubber_border(["thin", "medium", "medium"], ["FFFFFF", theme_color, theme_color]) 343 | else: 344 | cell.border = itlubber_border(["thin", "thin", "medium"], ["FFFFFF", "FFFFFF", theme_color]) 345 | else: 346 | if merge_rows in [[], None] or (row_index - 1 in merge_rows): 347 | if col_index == 1: 348 | cell.border = itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", theme_color]) 349 | elif col_index == len(row): 350 | cell.border = itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, theme_color]) 351 | else: 352 | cell.border = itlubber_border(["thin", "thin", "thin"], ["FFFFFF", "FFFFFF", theme_color]) 353 | else: 354 | if col_index == 1: 355 | cell.border = itlubber_border(["medium", "thin", "thin"], [theme_color, "FFFFFF", "FFFFFF"]) 356 | elif col_index == len(row): 357 | cell.border = itlubber_border(["thin", "medium", "thin"], ["FFFFFF", theme_color, "FFFFFF"]) 358 | else: 359 | cell.border = itlubber_border(["thin", "thin", "thin"], ["FFFFFF", "FFFFFF", "FFFFFF"]) 360 | 361 | if border: 362 | iterrows = tqdm(enumerate(worksheet.rows, start=1), total=worksheet.max_row - 1) if bar else enumerate(worksheet.rows, start=1) 363 | if n_jobs > 0: 364 | joblib.Parallel(n_jobs=n_jobs)(joblib.delayed(render_cell)(row_index, row) for row_index, row in iterrows) 365 | else: 366 | for row_index, row in iterrows: 367 | render_cell(row_index, row) 368 | 369 | feature_table = pd.read_excel( 370 | excel_name, sheet_name=sheet_name, engine="openpyxl" 371 | ) 372 | feature_table_len_max = feature_table.apply(lambda x: [(len(str(i).encode('utf-8')) - len(str(i))) / 2 + len(str(i)) for i in x]).max() 373 | for i in feature_table.columns: 374 | # 列的字母 375 | j = list(feature_table.columns) 376 | column_letter = [chr(j.index(i) + 65) if j.index(i) <= 25 else 'A' + chr(j.index(i) - 26 + 65) ][0] 377 | # 列的宽度 378 | columns_length = (len(str(i).encode('utf-8')) - len(str(i)))/2 + len(str(i)) 379 | data_max_length = feature_table_len_max[i] 380 | column_width = [data_max_length if columns_length < data_max_length else columns_length][0] 381 | column_width = [column_width if column_width <= max_column_width else max_column_width][0] + 3 382 | # 更改列的宽度 383 | worksheet.column_dimensions['{}'.format(column_letter)].width = column_width 384 | 385 | if freeze: 386 | worksheet.freeze_panes = freeze 387 | 388 | workbook.save(excel_name) 389 | workbook.close() 390 | 391 | 392 | def run_feature_table(feature, train=None, feature_dict=None, rules={}, combiner=None, target="target", return_feature=False): 393 | table = feature_bin_stats(train, feature, feature_dict=feature_dict, rules=rules, combiner=combiner) 394 | df_psi = cal_psi(train[[feature, target]], test[[feature, target]], feature, combiner=combiner) 395 | 396 | table = table.merge(df_psi, on="分箱", how="left") 397 | 398 | feature_bin = combiner.export()[feature] 399 | feature_bin_dict = format_bins(np.array(feature_bin)) 400 | table["分箱"] = table["分箱"].map(feature_bin_dict) 401 | 402 | if return_feature: 403 | return feature, table 404 | else: 405 | return table 406 | 407 | 408 | def render_dataframe(df, row_height=0.4, font_size=14, 409 | header_color='#2639E9', row_colors=['#dae3f3', 'w'], edge_color='w', 410 | bbox=[0, 0, 1, 1], header_columns=0, 411 | ax=None, save=None, **kwargs): 412 | data = df.copy() 413 | for col in data.select_dtypes('datetime'): 414 | data[col] = data[col].dt.strftime("%Y-%m-%d") 415 | 416 | for col in data.select_dtypes('float'): 417 | data[col] = data[col].apply(lambda x: np.nan if 
pd.isnull(x) else round(x, 4)) 418 | 419 | cols_width = [max(data[col].apply(lambda x:len(str(x).encode())).max(), len(str(col).encode())) / 8. for col in data.columns] 420 | 421 | if ax is None: 422 | size = (sum(cols_width), (len(data) + 1) * row_height) 423 | fig, ax = plt.subplots(figsize=size) 424 | ax.axis('off') 425 | 426 | mpl_table = ax.table(cellText=data.values, colWidths=cols_width, bbox=bbox, colLabels=data.columns, **kwargs) 427 | 428 | mpl_table.auto_set_font_size(False) 429 | mpl_table.set_fontsize(font_size) 430 | 431 | for k, cell in six.iteritems(mpl_table._cells): 432 | cell.set_edgecolor(edge_color) 433 | if k[0] == 0 or k[1] < header_columns: 434 | cell.set_text_props(weight='bold', color='w') 435 | cell.set_facecolor(header_color) 436 | else: 437 | cell.set_facecolor(row_colors[k[0]%len(row_colors)]) 438 | 439 | if save: 440 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 441 | os.makedirs(os.path.dirname(save)) 442 | 443 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight") 444 | 445 | return fig 446 | 447 | 448 | if __name__ == '__main__': 449 | from functools import partial 450 | from multiprocessing import Pool 451 | data = sc.germancredit() 452 | 453 | # 测试数据 454 | data["target"] = data["creditability"].replace({'good':0,'bad':1}) 455 | data["credit.amount"].loc[0] = np.nan 456 | data["status.of.existing.checking.account"].loc[0] = np.nan 457 | data["test_a"] = 0. 458 | data["test_a"].loc[0] = np.nan 459 | data["test_b"] = "" 460 | data["test_b"].loc[0] = np.nan 461 | data["test_c"] = np.nan 462 | 463 | # data = data.replace("", np.nan) 464 | 465 | train, test = train_test_split(data, test_size=0.3,) 466 | 467 | target = "target" 468 | cols = ["test_a", "test_b", "test_c", "status.of.existing.checking.account", "credit.amount"] 469 | 470 | combiner = toad.transform.Combiner() 471 | # combiner.fit(data[cols + [target]], target, empty_separate=True, method="chi", min_samples=0.2) 472 | 473 | # 保存结果至 EXCEL 文件 474 | output_excel_name = f"指标有效性验证-{datetime.now().strftime('%Y-%m-%d')}.xlsx" 475 | output_sheet_name = "指标有效性" 476 | tables = {} 477 | merge_row_number = [] 478 | 479 | # _run_feature_table = partial(run_feature_table, train=train, feature_dict=feature_dict, rules={}, combiner=combiner, target=target, return_feature=True) 480 | # all_feature_tables = joblib.Parallel(n_jobs=4)(joblib.delayed(_run_feature_table)(feature) for feature in cols) 481 | 482 | # for feature, table in all_feature_tables: 483 | # merge_row_number.append(len(table)) 484 | # tables[feature] = table 485 | 486 | for feature in cols: 487 | table = feature_bin_stats(train, feature, feature_dict=feature_dict, rules={}, combiner=combiner) 488 | print(train.shape) 489 | df_psi = cal_psi(train[[feature, target]], test[[feature, target]], feature, combiner=combiner) 490 | 491 | table = table.merge(df_psi, on="分箱", how="left") 492 | 493 | feature_bin = combiner.export()[feature] 494 | feature_bin_dict = format_bins(np.array(feature_bin)) 495 | table["分箱"] = table["分箱"].map(feature_bin_dict) 496 | 497 | table = run_feature_table(feature) 498 | # plot_bin(table, show_na=True) 499 | merge_row_number.append(len(table)) 500 | tables[feature] = table 501 | 502 | merge_row_number = np.cumsum(merge_row_number).tolist() 503 | feature_table = pd.concat(tables, ignore_index=True).round(6) 504 | feature_table["分档WOE值"] = feature_table["分档WOE值"].fillna(np.inf) 505 | 506 | workbook = load_workbook(output_excel_name) if os.path.exists(output_excel_name) else None 507 | writer 
= pd.ExcelWriter(output_excel_name, engine="openpyxl") 508 | 509 | if workbook: 510 | writer.book = workbook 511 | writer.sheets = {ws.title: ws for ws in workbook.worksheets} 512 | start_row = writer.book.get_sheet_by_name(output_sheet_name).max_row 513 | else: 514 | start_row = 0 515 | 516 | feature_table.to_excel(writer, sheet_name=output_sheet_name, index=False, header=True, startcol=0, startrow=start_row) 517 | 518 | writer.close() 519 | 520 | render_excel(output_excel_name, sheet_name=output_sheet_name, conditional_columns=["J", "N"], freeze="D2", merge_rows=merge_row_number, percent_columns=[5, 7, 9, 10], start_row=start_row, header=False if start_row > 0 else True) 521 | # render_excel("变量字典及字段解释.xlsx") 522 | combiner.export(to_json=f"rules_{datetime.now().strftime('%Y-%m-%d')}.json") 523 | -------------------------------------------------------------------------------- /processing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2022/8/23 13:12 4 | @Author : itlubber 5 | @Site : itlubber.art 6 | """ 7 | 8 | import re 9 | import os 10 | import toad 11 | import scipy 12 | import warnings 13 | import numpy as np 14 | import pandas as pd 15 | import scorecardpy as sc 16 | import statsmodels.api as sm 17 | from functools import partial 18 | import matplotlib.pyplot as plt 19 | import plotly.graph_objects as go 20 | from IPython.display import Image 21 | from openpyxl import load_workbook 22 | # from joblib import Parallel, delayed 23 | from concurrent.futures import ProcessPoolExecutor 24 | from openpyxl.styles import Alignment 25 | from optbinning import OptimalBinning 26 | from sklearn.decomposition import PCA 27 | from sklearn.pipeline import Pipeline 28 | from sklearn.linear_model import LogisticRegression 29 | from sklearn.utils.validation import check_is_fitted 30 | from sklearn.model_selection import train_test_split, GridSearchCV 31 | from sklearn.ensemble import GradientBoostingClassifier 32 | from toad.plot import bin_plot, proportion_plot, corr_plot, badrate_plot 33 | from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin 34 | from statsmodels.stats.outliers_influence import variance_inflation_factor 35 | 36 | 37 | warnings.filterwarnings("ignore") 38 | pd.set_option('display.width', 5000) 39 | # plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体 40 | # plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题 41 | 42 | 43 | def drop_identical(frame, threshold = 0.95, return_drop = False, exclude = None, target = None): 44 | """drop columns by identical 45 | Args: 46 | frame (DataFrame): dataframe that will be used 47 | threshold (number): drop the features whose identical num is greater than threshold. 
if threshold is float, it will be use as percentage 48 | return_drop (bool): if need to return features' name who has been dropped 49 | exclude (array-like): list of feature names that will not be dropped 50 | target (str): target's name in dataframe 51 | Returns: 52 | DataFrame: selected dataframe 53 | array: list of feature names that has been dropped 54 | """ 55 | cols = frame.columns.copy() 56 | 57 | if target is not None: 58 | cols.drop(target) 59 | 60 | if exclude is not None: 61 | cols = cols.drop(exclude) 62 | 63 | if threshold < 1: 64 | threshold = len(frame) * threshold 65 | 66 | drop_list = [] 67 | for col in cols: 68 | n = frame[col].value_counts().max() 69 | 70 | if n > threshold: 71 | drop_list.append(col) 72 | 73 | r = frame.drop(columns = drop_list) 74 | 75 | res = (r,) 76 | if return_drop: 77 | res += (np.array(drop_list),) 78 | 79 | return toad.utils.unpack_tuple(res) 80 | 81 | 82 | def select(frame, target = 'target', empty = 0.9, iv = 0.02, corr = 0.7, 83 | identical=0.95, return_drop = False, exclude = None): 84 | """select features by rate of empty, iv and correlation 85 | Args: 86 | frame (DataFrame) 87 | target (str): target's name in dataframe 88 | empty (number): drop the features which empty num is greater than threshold. if threshold is less than `1`, it will be use as percentage 89 | identical (number): drop the features which identical num is greater than threshold. if threshold is less than `1`, it will be use as percentage 90 | iv (float): drop the features whose IV is less than threshold 91 | corr (float): drop features that has the smallest IV in each groups which correlation is greater than threshold 92 | return_drop (bool): if need to return features' name who has been dropped 93 | exclude (array-like): list of feature name that will not be dropped 94 | Returns: 95 | DataFrame: selected dataframe 96 | dict: list of dropped feature names in each step 97 | """ 98 | empty_drop, iv_drop, corr_drop, identical_drop = None, None, None, None 99 | 100 | if empty is not False: 101 | frame, empty_drop = toad.selection.drop_empty(frame, threshold = empty, return_drop = True, exclude = exclude) 102 | 103 | if identical is not False: 104 | frame, identical_drop = drop_identical(frame, threshold = identical, return_drop = True, exclude = exclude, target = target) 105 | 106 | if iv is not False: 107 | frame, iv_drop, iv_list = toad.selection.drop_iv(frame, target = target, threshold = iv, return_drop = True, return_iv = True, exclude = exclude) 108 | 109 | if corr is not False: 110 | weights = 'IV' 111 | 112 | if iv is not False: 113 | weights = iv_list 114 | 115 | frame, corr_drop = toad.selection.drop_corr(frame, target = target, threshold = corr, by = weights, return_drop = True, exclude = exclude) 116 | 117 | res = (frame,) 118 | if return_drop: 119 | d = { 120 | 'empty': empty_drop, 121 | 'identical': identical_drop, 122 | 'iv': iv_drop, 123 | 'corr': corr_drop, 124 | } 125 | res += (d,) 126 | 127 | return toad.utils.unpack_tuple(res) 128 | 129 | 130 | class FeatureSelection(TransformerMixin, BaseEstimator): 131 | 132 | def __init__(self, target="target", empty=0.95, iv=0.02, corr=0.7, exclude=None, return_drop=True, identical=0.95, remove=None, engine="scorecardpy", target_rm=False): 133 | """ 134 | ITLUBBER提供的特征筛选方法 135 | 136 | Args: 137 | target: 数据集中标签名称,默认 target 138 | empty: 空值率,默认 0.95, 即空值占比超过 95% 的特征会被剔除 139 | iv: IV值,默认 0.02,即iv值小于 0.02 时特征会被剔除 140 | corr: 相关性,默认 0.7,即特征之间相关性大于 0.7 时会剔除iv较小的特征 141 | identical: 唯一值占比,默认 0.95,即当特征的某个值占比超过 95% 时,特征会被剔除 142 | 
engine: 特征筛选使用的引擎,可选 "toad", "scorecardpy" 两种,默认 scorecardpy 143 | remove: 引擎使用 scorecardpy 时,可以传入需要强制删除的变量 144 | return_drop: 是否返回删除信息,默认 True,即默认返回删除特征信息 145 | target_rm: 是否剔除标签,默认 False,即不剔除 146 | exclude: 是否需要强制保留某些特征 147 | """ 148 | self.engine = engine 149 | self.target = target 150 | self.empty = empty 151 | self.identical = identical 152 | self.iv = iv 153 | self.corr = corr 154 | self.exclude = exclude 155 | self.remove = remove 156 | self.return_drop = return_drop 157 | self.target_rm = target_rm 158 | self.select_columns = None 159 | self.dropped = None 160 | 161 | def fit(self, x, y=None): 162 | if self.engine == "toad": 163 | selected = select(x, target=self.target, empty=self.empty, identical=self.identical, iv=self.iv, corr=self.corr, exclude=self.exclude, return_drop=self.return_drop) 164 | else: 165 | selected = sc.var_filter(x, y=self.target, iv_limit=self.iv, missing_limit=self.empty, identical_limit=self.identical, var_rm=self.remove, var_kp=self.exclude, return_rm_reason=self.return_drop) 166 | 167 | if self.return_drop and isinstance(selected, dict): 168 | self.dropped = selected["rm"] 169 | self.select_columns = list(selected["dt"].columns) 170 | elif self.return_drop and isinstance(selected, (tuple, list)): 171 | self.dropped = pd.DataFrame([(feature, reason) for reason, features in selected[1].items() for feature in features], columns=["variable", "rm_reason"]) 172 | self.select_columns = list(selected[0].columns) 173 | else: 174 | self.select_columns = list(selected.columns) 175 | 176 | if self.target_rm and self.target in self.select_columns: 177 | self.select_columns.remove(self.target) 178 | 179 | return self 180 | 181 | def transform(self, x, y=None): 182 | # if self.engine == "toad": 183 | # selected = toad.selection.select(x, target=self.target, empty=self.empty, iv=self.iv, corr=self.corr, exclude=self.exclude, return_drop=self.return_drop) 184 | # else: 185 | # selected = sc.var_filter(x, y=self.target, iv_limit=self.iv, missing_limit=self.empty, identical_limit=self.identical, var_rm=self.remove, var_kp=self.exclude, return_rm_reason=self.return_drop) 186 | 187 | # if self.return_drop and isinstance(selected, dict): 188 | # self.dropped = selected["rm"] 189 | # return selected["dt"] 190 | # elif self.return_drop and isinstance(selected, (tuple, list)): 191 | # self.dropped = pd.DataFrame([(feature, reason) for reason, features in selected[1].items() for feature in features], columns=["variable", "rm_reason"]) 192 | # return selected[0] 193 | # else: 194 | # return selected 195 | return x[[col for col in self.select_columns if col in x.columns]] 196 | 197 | 198 | class FeatureImportanceSelector(BaseEstimator, TransformerMixin): 199 | 200 | def __init__(self, top_k=126, target="target", selector="catboost", params=None, max_iv=None): 201 | """ 202 | 基于特征重要性的特征筛选方法 203 | 204 | Args: 205 | target: 数据集中标签名称,默认 target 206 | top_k: 依据特征重要性进行排序,筛选最重要的 top_k 个特征 207 | max_iv: 是否需要删除 IV 过高的特征,建议设置为 1.0 208 | selector: 特征选择器,目前只支持 catboost ,可以支持数据集中包含字符串的数据 209 | params: selector 的参数,不传使用默认参数 210 | """ 211 | self.target = target 212 | self.top_k = top_k 213 | self.max_iv = max_iv 214 | self.selector = selector 215 | self.params = params 216 | self.feature_names_ = None 217 | self.high_iv_feature_names_ = None 218 | self.low_importance_feature_names_ = None 219 | self.select_columns = None 220 | self.dropped = None 221 | 222 | def fit(self, x, y=None): 223 | x = x.copy() 224 | 225 | if self.max_iv is not None: 226 | self.high_iv_feature_names_ = 
list(toad.quality(train, target=target, cpu_cores=-1, iv_only=True).query("iv > 1.0").index) 227 | x = x[[c for c in x.columns if c not in self.high_iv_feature_names_]] 228 | 229 | X = x.drop(columns=self.target) 230 | Y = x[self.target] 231 | 232 | self.feature_names_ = list(X.columns) 233 | cat_features_index = [i for i in range(len(self.feature_names_)) if self.feature_names_[i] not in X.select_dtypes("number").columns] 234 | 235 | if self.selector == "catboost": 236 | self.catboost_selector(x=X, y=Y, cat_features=cat_features_index) 237 | else: 238 | pass 239 | 240 | return self 241 | 242 | def transform(self, x, y=None): 243 | return x[self.select_columns + [self.target]] 244 | 245 | 246 | def catboost_selector(self, x, y, cat_features=None): 247 | from catboost import Pool, cv, metrics, CatBoostClassifier 248 | 249 | cat_data = Pool(data=x, label=y, cat_features=cat_features) 250 | 251 | if self.params is None: 252 | self.params = { 253 | "iterations": 256, 254 | "objective": "CrossEntropy", 255 | "eval_metric": "AUC", 256 | "learning_rate": 1e-2, 257 | "colsample_bylevel": 0.1, 258 | "depth": 4, 259 | "boosting_type": "Ordered", 260 | "bootstrap_type": "Bernoulli", 261 | "subsample": 0.8, 262 | "random_seed": 1024, 263 | "early_stopping_rounds": 10, 264 | "verbose": 0, 265 | } 266 | 267 | cat_model = CatBoostClassifier(**self.params) 268 | cat_model.fit(cat_data, eval_set=[cat_data]) 269 | 270 | self.select_columns = [name for score, name in sorted(zip(cat_model.feature_importances_, cat_model.feature_names_), reverse=True)][:self.top_k] 271 | self.low_importance_feature_names_ = [c for c in x.columns if c not in self.select_columns] 272 | 273 | 274 | class FeatureDecomposition(BaseEstimator, TransformerMixin): 275 | 276 | def __init__(self, freq, app, key_words=None, combin_features=None, combiner=PCA, n_components=1): 277 | """ 278 | 同一类型 + 同一周期 + 新增数/安装数/活跃天数/卸载数 的特征通过降维方法转换为 n_components 个特征 279 | 280 | freq: 周期,例如 90天 281 | app: 类型,例如 银行类 282 | key_words: 不同类型的指标,例如 ["活跃款数", "新增款数", "活跃频次", "活跃天数"] 283 | combin_features: 手工制定需要进行降维的特征,传入app、freq、freq时不需要传入 284 | combiner: 降维的方法,默认 PCA,参考 sklearn.decomposition 中相关方法的使用 285 | n_components: 降维后的特征数量,默认 1 286 | """ 287 | self.freq = freq 288 | self.app = app 289 | self.key_words = key_words 290 | self.combin_features = combin_features 291 | self.n_components = n_components 292 | self.combiner = combiner(n_components=self.n_components) 293 | 294 | def fit(self, x, y=None): 295 | x = x.copy() 296 | 297 | if self.combin_features: 298 | self.combin_features = [c for c in self.combin_features if c in x.columns] 299 | else: 300 | if self.key_words: 301 | if isinstance(self.key_words, str): 302 | self.key_words = [self.key_words] 303 | pattern = re.compile(f"(?=.*{self.freq})(?=.*{self.app})(?=.*(?:{'|'.join(self.key_words)})).+") 304 | else: 305 | pattern = re.compile(f"{self.app}") 306 | 307 | self.combin_features = [c for c in x.columns if pattern.match(c)] 308 | 309 | if len(self.combin_features) > 0 and len(self.combin_features) > self.n_components: 310 | x = x[self.combin_features] 311 | self.combiner.fit(x, y=y) 312 | 313 | else: 314 | raise Exception("组合特征不在数据中。") 315 | 316 | return self 317 | 318 | def transform(self, x, y=None): 319 | x = x[self.combin_features].copy() 320 | return self.combiner.transform(x) 321 | 322 | def inverse_transform(self, x, y=None): 323 | return self.combiner.inverse_transform(x) 324 | 325 | 326 | class Combiner(TransformerMixin, BaseEstimator): 327 | 328 | def __init__(self, target="target", 
method='chi', engine="toad", empty_separate=False, min_samples=0.05, min_n_bins=2, max_n_bins=3, max_n_prebins=10, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc", rules={}, n_jobs=1): 329 | """ 330 | 特征分箱封装方法 331 | 332 | Args: 333 | target: 数据集中标签名称,默认 target 334 | method: 特征分箱方法,可选 "chi", "dt", "quantile", "step", "kmeans", "cart", "mdlp", "uniform", 参考 toad.Combiner & optbinning.OptimalBinning 335 | engine: 分箱引擎,可选 "optbinning", "toad" 336 | empty_separate: 是否空值单独一箱, 默认 False,推荐设置为 True 337 | min_samples: 最小叶子结点样本占比,参考对应文档进行设置,默认 5% 338 | min_n_bins: 最小分箱数,默认 2,即最小拆分2箱 339 | max_n_bins: 最大分像素,默认 3,即最大拆分3箱,推荐设置 3 ~ 5,不宜过多,偶尔使用 optbinning 时不起效 340 | max_n_prebins: 使用 optbinning 时预分箱数量 341 | min_prebin_size: 使用 optbinning 时预分箱叶子结点(或者每箱)样本占比,默认 2% 342 | min_bin_size: 使用 optbinning 正式分箱叶子结点(或者每箱)最小样本占比,默认 5% 343 | max_bin_size: 使用 optbinning 正式分箱叶子结点(或者每箱)最大样本占比,默认 None 344 | gamma: 使用 optbinning 分箱时限制过拟合的正则化参数,值越大惩罚越多,默认 0。01 345 | monotonic_trend: 使用 optbinning 正式分箱时的坏率策略,默认 auto,可选 "auto", "auto_heuristic", "auto_asc_desc", "ascending", "descending", "convex", "concave", "peak", "valley", "peak_heuristic", "valley_heuristic" 346 | rules: 自定义分箱规则,toad.Combiner 能够接收的形式 347 | n_jobs: 使用多进程加速的worker数量,默认单进程 348 | """ 349 | self.combiner = toad.transform.Combiner() 350 | self.method = method 351 | self.empty_separate = empty_separate 352 | self.target = target 353 | self.min_samples = min_samples 354 | self.max_n_bins = max_n_bins 355 | self.min_n_bins = min_n_bins 356 | self.min_bin_size = min_bin_size 357 | self.max_bin_size = max_bin_size 358 | self.max_n_prebins = max_n_prebins 359 | self.min_prebin_size = min_prebin_size 360 | self.gamma = gamma 361 | self.monotonic_trend = monotonic_trend 362 | self.rules = rules 363 | self.engine = engine 364 | self.n_jobs = n_jobs 365 | 366 | def optbinning_bins(self, feature, data=None, target="target", min_n_bins=2, max_n_bins=3, max_n_prebins=10, min_prebin_size=0.02, min_bin_size=0.05, max_bin_size=None, gamma=0.01, monotonic_trend="auto_asc_desc"): 367 | if data[feature].dropna().nunique() <= min_n_bins: 368 | splits = [] 369 | for v in data[feature].dropna().unique(): 370 | splits.append(v) 371 | 372 | if str(data[feature].dtypes) in ["object", "string", "category"]: 373 | rule = {feature: [[s] for s in splits]} 374 | rule[feature].append([[np.nan]]) 375 | else: 376 | rule = {feature: sorted(splits) + [np.nan]} 377 | else: 378 | try: 379 | y = data[target] 380 | if str(data[feature].dtypes) in ["object", "string", "category"]: 381 | dtype = "categorical" 382 | x = data[feature].astype("category").values 383 | else: 384 | dtype = "numerical" 385 | x = data[feature].values 386 | 387 | _combiner = OptimalBinning(feature, dtype=dtype, min_n_bins=min_n_bins, max_n_bins=max_n_bins, max_n_prebins=max_n_prebins, min_prebin_size=min_prebin_size, min_bin_size=min_bin_size, max_bin_size=max_bin_size, monotonic_trend=monotonic_trend, gamma=gamma).fit(x, y) 388 | if _combiner.status == "OPTIMAL": 389 | rule = {feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.splits] + [[np.nan] if dtype == "categorical" else np.nan]} 390 | else: 391 | raise Exception("optimalBinning error") 392 | 393 | except Exception as e: 394 | _combiner = toad.transform.Combiner() 395 | _combiner.fit(data[[feature, target]].dropna(), target, method="chi", min_samples=self.min_samples, n_bins=self.max_n_bins) 396 | rule = {feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in 
_combiner.export()[feature]] + [[np.nan] if dtype == "categorical" else np.nan]} 397 | 398 | self.combiner.update(rule) 399 | 400 | def fit(self, x, y=None): 401 | if self.engine == "optbinning": 402 | feature_optbinning_bins = partial(self.optbinning_bins, data=x, target=self.target, min_n_bins=self.min_n_bins, max_n_bins=self.max_n_bins, max_n_prebins=self.max_n_prebins, min_prebin_size=self.min_prebin_size, min_bin_size=self.min_bin_size, max_bin_size=self.max_bin_size, gamma=self.gamma, monotonic_trend=self.monotonic_trend) 403 | if self.n_jobs > 1: 404 | with ProcessPoolExecutor(max_workers=self.n_jobs) as executor: 405 | [executor.submit(feature_optbinning_bins(feature)) for feature in x.columns.drop(self.target)] 406 | else: 407 | for feature in x.drop(columns=[self.target]): 408 | self.optbinning_bins(feature, data=x, target=self.target, min_n_bins=self.min_n_bins, max_n_bins=self.max_n_bins, max_n_prebins=self.max_n_prebins, min_prebin_size=self.min_prebin_size, min_bin_size=self.min_bin_size, max_bin_size=self.max_bin_size, gamma=self.gamma, monotonic_trend=self.monotonic_trend) 409 | # feature_optbinning_bins(feature) 410 | else: 411 | self.combiner.fit(x, y=self.target, method=self.method, min_samples=self.min_samples, n_bins=self.max_n_bins) 412 | 413 | self.update(self.rules) 414 | 415 | return self 416 | 417 | def transform(self, x, y=None, labels=False): 418 | return self.combiner.transform(x, labels=labels) 419 | 420 | def update(self, rules): 421 | if isinstance(rules, dict): 422 | self.combiner.update(rules) 423 | 424 | def export(self, to_json=None): 425 | return self.combiner.export(to_json=to_json) 426 | 427 | def load(self, from_json=None): 428 | self.combiner.load(from_json=from_json) 429 | return self 430 | 431 | def bin_plot(self, data, x, rule=None, labels=True, result=False, save=None): 432 | if rule: 433 | if isinstance(rule, list): 434 | rule = {x: rule} 435 | self.combiner.update(rule) 436 | 437 | bin_plot(self.combiner.transform(data, labels=labels), x=x, target=self.target) 438 | 439 | if save: 440 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 441 | os.makedirs(os.path.dirname(save)) 442 | 443 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight") 444 | 445 | if result: 446 | return self.combiner.export()[x] 447 | 448 | def proportion_plot(self, x, transform=False, labels=False): 449 | if transform: 450 | x = self.combiner.transform(x, labels=labels) 451 | proportion_plot(x) 452 | 453 | def corr_plot(self, data, transform=False, figure_size=(20, 15)): 454 | if transform: 455 | data = self.combiner.transform(data, labels=False) 456 | 457 | corr_plot(data, figure_size=figure_size) 458 | 459 | def badrate_plot(self, data, date_column, feature, labels=True): 460 | badrate_plot(self.combiner.transform(data[[date_column, feature, self.target]], labels=labels), target=self.target, x=date_column, by=feature) 461 | 462 | @property 463 | def rules(self): 464 | return self.combiner._rules 465 | 466 | @rules.setter 467 | def rules(self, value): 468 | self.combiner._rules = value 469 | 470 | def __len__(self): 471 | return len(self.combiner._rules.keys()) 472 | 473 | def __contains__(self, key): 474 | return key in self.combiner._rules 475 | 476 | def __getitem__(self, key): 477 | return self.combiner._rules[key] 478 | 479 | def __setitem__(self, key, value): 480 | self.combiner._rules[key] = value 481 | 482 | def __iter__(self): 483 | return iter(self.combiner._rules) 484 | 485 | 486 | class WOETransformer(TransformerMixin, 
BaseEstimator): 487 | 488 | def __init__(self, target="target", exclude=None): 489 | """ 490 | WOE转换器 491 | 492 | Args: 493 | target: 数据集中标签名称,默认 target 494 | exclude: 不需要转换 woe 的列 495 | """ 496 | self.target = target 497 | self.exclude = exclude if isinstance(exclude, list) else [exclude] if exclude else [] 498 | self.transformer = toad.transform.WOETransformer() 499 | 500 | def fit(self, x, y=None): 501 | self.transformer.fit(x.drop(columns=self.exclude + [self.target]), x[self.target]) 502 | return self 503 | 504 | def transform(self, x, y=None): 505 | return self.transformer.transform(x) 506 | 507 | @property 508 | def rules(self): 509 | return self.transformer._rules 510 | 511 | @rules.setter 512 | def rules(self, value): 513 | self.transformer._rules = value 514 | 515 | def __len__(self): 516 | return len(self.transformer._rules.keys()) 517 | 518 | def __contains__(self, key): 519 | return key in self.transformer._rules 520 | 521 | def __getitem__(self, key): 522 | return self.transformer._rules[key] 523 | 524 | def __setitem__(self, key, value): 525 | self.transformer._rules[key] = value 526 | 527 | def __iter__(self): 528 | return iter(self.transformer._rules) 529 | 530 | 531 | class StepwiseSelection(TransformerMixin, BaseEstimator): 532 | 533 | def __init__(self, target="target", estimator="ols", direction="both", criterion="aic", max_iter=None, return_drop=True, exclude=None, intercept=True, p_value_enter=0.2, p_remove=0.01, p_enter=0.01, target_rm=False): 534 | """ 535 | 逐步回归筛选方法 536 | 537 | Args: 538 | target: 数据集中标签名称,默认 target 539 | estimator: 预估器,默认 ols,可选 "ols", "lr", "lasso", "ridge",通常默认即可 540 | direction: 逐步回归方向,默认both,可选 "forward", "backward", "both",通常默认即可 541 | criterion: 评价指标,默认 aic,可选 "aic", "bic",通常默认即可 542 | max_iter: 最大迭代次数,sklearn中使用的参数,默认为 None 543 | return_drop: 是否返回特征剔除信息,默认 True 544 | exclude: 强制保留的某些特征 545 | intercept: 是否包含截距,默认为 True 546 | p_value_enter: 特征进入的 p 值,用于前向筛选时决定特征是否进入模型 547 | p_remove: 特征剔除的 p 值,用于后向剔除时决定特征是否要剔除 548 | p_enter: 特征 p 值,用于判断双向逐步回归是否剔除或者准入特征 549 | target_rm: 是否剔除数据集中的标签,默认为 False,即剔除数据集中的标签 550 | """ 551 | self.target = target 552 | self.intercept = intercept 553 | self.p_value_enter = p_value_enter 554 | self.p_remove = p_remove 555 | self.p_enter = p_enter 556 | self.estimator = estimator 557 | self.direction = direction 558 | self.criterion = criterion 559 | self.max_iter = max_iter 560 | self.return_drop = return_drop 561 | self.target_rm = target_rm 562 | self.exclude = exclude 563 | self.select_columns = None 564 | self.dropped = None 565 | 566 | def fit(self, x, y=None): 567 | selected = toad.selection.stepwise(x, target=self.target, estimator=self.estimator, direction=self.direction, criterion=self.criterion, exclude=self.exclude, intercept=self.intercept, p_value_enter=self.p_value_enter, 568 | p_remove=self.p_remove, p_enter=self.p_enter, return_drop=self.return_drop) 569 | if self.return_drop: 570 | self.dropped = pd.DataFrame([(col, "stepwise") for col in selected[1]], columns=["variable", "rm_reason"]) 571 | selected = selected[0] 572 | 573 | self.select_columns = list(selected.columns) 574 | 575 | if self.target_rm and self.target in self.select_columns: 576 | self.select_columns.remove(self.target) 577 | 578 | return self 579 | 580 | def transform(self, x, y=None): 581 | return x[[col for col in self.select_columns if col in x.columns]] 582 | 583 | 584 | if __name__ == "__main__": 585 | from model import ITLubberLogisticRegression, StatsLogisticRegression, ScoreCard 586 | 587 | target = "creditability" 588 | data = 
sc.germancredit() 589 | data[target] = data[target].map({"good": 0, "bad": 1}) 590 | 591 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target]) 592 | 593 | # selection = FeatureSelection(target=target, engine="toad", return_drop=True, corr=0.9, iv=0.01) 594 | # train = selection.fit_transform(train) 595 | 596 | # combiner = Combiner(min_samples=0.2, empty_separate=True, target=target) 597 | # combiner.fit(train) 598 | # train = combiner.transform(train) 599 | 600 | # transformer = WOETransformer(target=target) 601 | # train = transformer.fit_transform(train) 602 | 603 | # stepwise = StepwiseSelection(target=target) 604 | # train = stepwise.fit_transform(train) 605 | 606 | feature_pipeline = Pipeline([ 607 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")), 608 | ("combiner", Combiner(target=target, min_samples=0.2)), 609 | ("transformer", WOETransformer(target=target)), 610 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")), 611 | ("stepwise", StepwiseSelection(target=target, target_rm=False)), 612 | # ("logistic", StatsLogisticRegression(target=target)), 613 | ("logistic", ITLubberLogisticRegression(target=target)), 614 | ]) 615 | 616 | # feature_pipeline.fit(train) 617 | # y_pred_train = feature_pipeline.predict(train.drop(columns=target)) 618 | # y_pred_test = feature_pipeline.predict(test.drop(columns=target)) 619 | 620 | params_grid = { 621 | "logistic__C": [i / 1. for i in range(1, 10, 2)], 622 | "logistic__penalty": ["l2"], 623 | "logistic__class_weight": [None, "balanced"], # + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10)], 624 | "logistic__max_iter": [100], 625 | "logistic__solver": ["sag"] # ["liblinear", "sag", "lbfgs", "newton-cg"], 626 | "logistic__intercept": [True, False], 627 | } 628 | 629 | clf = GridSearchCV(feature_pipeline, params_grid, cv=5, scoring='roc_auc', verbose=-1, n_jobs=2, return_train_score=True) 630 | clf.fit(train, train[target]) 631 | 632 | y_pred_train = clf.best_estimator_.predict(train) 633 | y_pred_test = clf.best_estimator_.predict(test) 634 | 635 | print(clf.best_params_) 636 | 637 | # statmodels methods 638 | # feature_pipeline.named_steps['logistic'].summary_save() 639 | 640 | # print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target])) 641 | # print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target])) 642 | 643 | woe_train = feature_pipeline.fit_transform(train) 644 | woe_test = feature_pipeline.transform(test) 645 | 646 | # lr = StatsLogisticRegression(target=target) 647 | # lr.fit(woe_train) 648 | # lr.summary_save() 649 | 650 | # cols = list(filter(lambda x: x != target, feature_pipeline.named_steps['preprocessing_select'].select_columns)) 651 | 652 | combiner = feature_pipeline.named_steps['combiner'].combiner 653 | transformer = feature_pipeline.named_steps['transformer'].transformer 654 | 655 | score_card = ScoreCard(target=target, combiner=combiner, transer=transformer, ) 656 | score_card.fit(woe_train) 657 | 658 | 659 | data["score"] = score_card.transform(data) 660 | 661 | print(score_card.KS_bucket(data["score"], data[target])) 662 | pt = score_card.perf_eva(data["score"], data[target], title="train") 663 | 664 | sc = score_card.score_hist(data["score"], data[target]) 665 | 666 | print(score_card.KS(data["score"], data[target]), score_card.AUC(data["score"], data[target])) 667 | 
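    # Added sketch (not part of the original demo): a minimal, self-contained illustration of the
    # usual scorecard scaling that sits behind a ScoreCard-style model, assuming base odds of 1:35
    # mapped to 600 points and PDO (points to double the odds) of 60; the function name and the
    # exact defaults are illustrative and may differ from what ScoreCard actually uses.
    def prob_to_score(p, base_point=600, base_odds=1 / 35, pdo=60):
        # factor B converts log-odds into points; offset A anchors base_odds at base_point
        B = pdo / np.log(2)
        A = base_point + B * np.log(base_odds)
        # higher bad probability -> higher odds -> lower score
        return A - B * np.log(p / (1 - p))

    # a bad probability equal to the base odds (~2.8%) maps to exactly 600 points,
    # while p = 0.05 maps to roughly 547 and p = 0.20 to roughly 412
    print(prob_to_score(np.array([0.05, 0.20, 0.50])))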
-------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2023/2/15 17:55 4 | @Author : itlubber 5 | @Site : itlubber.art 6 | """ 7 | import math 8 | import sys 9 | import re 10 | import matplotlib 11 | import matplotlib.font_manager as font_manager 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | from openpyxl.formatting.rule import Rule 15 | from openpyxl.formatting.rule import ColorScaleRule 16 | from openpyxl.utils.dataframe import dataframe_to_rows 17 | from openpyxl.utils import get_column_letter, column_index_from_string 18 | 19 | 20 | from model import * 21 | from utils.excel_writer import ExcelWriter 22 | 23 | 24 | plt.style.use('seaborn-ticks') 25 | # plt.style.use('seaborn-white') 26 | # plt.rcParams.update({'font.size': 14}) 27 | 28 | 29 | def pyplot_chinese(font_path='utils/matplot_chinese.ttf'): 30 | # matplotlib.rcParams['font.size'] = 20 31 | matplotlib.font_manager.fontManager.addfont(font_path) 32 | matplotlib.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name() 33 | matplotlib.rcParams['axes.unicode_minus']=False 34 | 35 | 36 | pyplot_chinese() 37 | 38 | 39 | target = "creditability" 40 | data = sc.germancredit() 41 | data[target] = data[target].map({"good": 0, "bad": 1}) 42 | 43 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target]) 44 | oot = data.copy() 45 | 46 | feature_pipeline = Pipeline([ 47 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")), 48 | ("combiner", Combiner(target=target, min_samples=0.2)), 49 | ("transform", WOETransformer(target=target)), 50 | # ("processing_select", FeatureSelection(target=target, engine="scorecardpy")), 51 | ("stepwise", StepwiseSelection(target=target)), 52 | ]) 53 | 54 | feature_pipeline.fit(train) 55 | 56 | woe_train = feature_pipeline.transform(train) 57 | woe_test = feature_pipeline.transform(test) 58 | woe_oot = feature_pipeline.transform(oot) 59 | 60 | # # save all bin_plot 61 | # _combiner = feature_pipeline.named_steps["combiner"] 62 | # for col in woe_train.columns: 63 | # if col != target: 64 | # _combiner.bin_plot(train, col, labels=True, save=f"model_report/bin_plots/train_{col}.png") 65 | # _combiner.bin_plot(test, col, labels=True, save=f"model_report/bin_plots/test_{col}.png") 66 | # _combiner.bin_plot(oot, col, labels=True, save=f"model_report/bin_plots/oot_{col}.png") 67 | 68 | # logistic = StatsLogisticRegression(target=target) 69 | logistic = ITLubberLogisticRegression(target=target) 70 | 71 | logistic.fit(woe_train) 72 | 73 | y_pred_train = logistic.predict_proba(woe_train.drop(columns=target))[:, 1] 74 | y_pred_test = logistic.predict_proba(woe_test.drop(columns=target))[:, 1] 75 | y_pred_oot = logistic.predict_proba(woe_oot.drop(columns=target))[:, 1] 76 | 77 | ScoreCard.ks_plot(y_pred_train, train[target], save="model_report/lr_ksplot_train.png", figsize=(10, 5)) 78 | ScoreCard.ks_plot(y_pred_test, test[target], save="model_report/lr_ksplot_test.png", figsize=(10, 5)) 79 | ScoreCard.ks_plot(y_pred_oot, oot[target], save="model_report/lr_ksplot_oot.png", figsize=(10, 5)) 80 | 81 | summary = logistic.summary().reset_index().rename(columns={"index": "Features"}) 82 | 83 | train_corr = logistic.corr(woe_train, save="model_report/train_corr.png") 84 | test_corr = logistic.corr(woe_test, save="model_report/test_corr.png") 85 | oot_corr = 
logistic.corr(woe_oot, save="model_report/oot_corr.png") 86 | 87 | train_report = logistic.report(woe_train) 88 | test_report = logistic.report(woe_test) 89 | oot_report = logistic.report(woe_oot) 90 | 91 | print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target])) 92 | print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target])) 93 | print("oot: ", toad.metrics.KS(y_pred_oot, oot[target]), toad.metrics.AUC(y_pred_oot, oot[target])) 94 | 95 | 96 | card = ScoreCard(target=target, pipeline=feature_pipeline, pretrain_lr=logistic) 97 | card.fit(woe_train) 98 | 99 | train["score"] = card.predict(train) 100 | test["score"] = card.predict(test) 101 | oot["score"] = card.predict(oot) 102 | 103 | 104 | def sample_distribution(df, date="date", target="target", user_count="count", save="model_report/sample_time_count.png", figsize=(10, 6), colors=["#2639E9", "#F76E6C", "#FE7715"]): 105 | temp = df.set_index(date).assign( 106 | 好样本=lambda x: (x[target] == 0).astype(int), 107 | 坏样本=lambda x: (x[target] == 1).astype(int), 108 | ).resample("W").agg({"好样本": sum, "坏样本": sum}) 109 | temp.index = [i.strftime("%Y-%m-%d") for i in temp.index] 110 | 111 | fig, ax1 = plt.subplots(1, 1, figsize=figsize) 112 | temp.plot(kind='bar', stacked=True, ax=ax1, color=colors[:2], hatch="/", legend=False) 113 | ax1.tick_params(axis='x', labelrotation=-90) 114 | ax1.set(xlabel=None) 115 | ax1.set_ylabel('样本数') 116 | ax1.set_title('不同时点数据集样本分布情况\n\n') 117 | 118 | ax2 = plt.twinx() 119 | (temp["坏样本"] / temp.sum(axis=1)).plot(ax=ax2, color=colors[-1], marker=".", linewidth=2, label="坏样本率") 120 | # sns.despine() 121 | 122 | # 合并图例 123 | handles1, labels1 = ax1.get_legend_handles_labels() 124 | handles2, labels2 = ax2.get_legend_handles_labels() 125 | fig.legend(handles1 + handles2, labels1 + labels2, loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.94), frameon=False) 126 | # ax1.legend(frameon=False, labels=["good", "bad"], loc='upper right') 127 | # ax2.legend(loc='upper left', frameon=False, labels=["bad rate"]) 128 | 129 | plt.tight_layout() 130 | 131 | if save: 132 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 133 | os.makedirs(os.path.dirname(save)) 134 | 135 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight") 136 | 137 | temp = temp.reset_index().rename(columns={date: "日期", "index": "日期", 0: "好样本", 1: "坏样本"}) 138 | temp["样本总数"] = temp["坏样本"] + temp["好样本"] 139 | temp["样本占比"] = temp["样本总数"] / temp["样本总数"].sum() 140 | temp["好样本占比"] = temp["好样本"] / temp["好样本"].sum() 141 | temp["坏样本占比"] = temp["坏样本"] / temp["坏样本"].sum() 142 | temp["坏样本率"] = temp["坏样本"] / temp["样本总数"] 143 | 144 | return temp[["日期", "样本总数", "样本占比", "好样本", "好样本占比", "坏样本", "坏样本占比", "坏样本率"]] 145 | 146 | 147 | def bin_plot(feature_table, feature="", figsize=(15, 8), colors=['#8E8BFE', '#FEA3A2', '#9394E7'], max_len=35, save=None): 148 | feature_table = feature_table.copy() 149 | 150 | feature_table["分箱"] = feature_table["分箱"].apply(lambda x: x if re.match("^\[.*\)$", x) else str(x)[:max_len] + "..") 151 | 152 | # 绘制好坏样本分布情况 153 | fig, ax1 = plt.subplots(figsize=figsize) 154 | ax1.barh(feature_table['分箱'], feature_table['好样本数'], color=colors[0], label='好样本', hatch="/") 155 | ax1.barh(feature_table['分箱'], feature_table['坏样本数'], left=feature_table['好样本数'], color=colors[1], label='坏样本', hatch="\\") 156 | ax1.set_xlabel('样本数') 157 | 158 | # 绘制坏样本率的分布情况 159 | ax2 = ax1.twiny() 160 | ax2.plot(feature_table['坏样本率'], 
feature_table['分箱'], colors[2], label='坏样本率', linestyle='-.') 161 | ax2.set_xlabel('坏样本率: 坏样本数 / 样本总数') 162 | 163 | for i, rate in enumerate(feature_table['坏样本率']): 164 | ax2.scatter(rate, i, color=colors[2]) 165 | 166 | # 在图像对应位置显示样本总数和坏样本率 167 | for i, v in feature_table[['样本总数', '好样本数', '坏样本数', '坏样本率']].iterrows(): 168 | ax1.text(v['样本总数'] / 2, i + len(feature_table) / 60, f"{int(v['好样本数'])}:{int(v['坏样本数'])}:{v['坏样本率']:.2%}") 169 | 170 | # 逆转y轴顺序 171 | ax1.invert_yaxis() 172 | 173 | # 添加一个标题 174 | fig.suptitle(f'变量 {feature} 分箱图\n\n') 175 | 176 | # 合并图例 177 | handles1, labels1 = ax1.get_legend_handles_labels() 178 | handles2, labels2 = ax2.get_legend_handles_labels() 179 | fig.legend(handles1 + handles2, labels1 + labels2, loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.925), frameon=False) 180 | 181 | # 调整布局,使分箱信息能够完全显示 182 | plt.tight_layout() 183 | 184 | if save: 185 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 186 | os.makedirs(os.path.dirname(save)) 187 | 188 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight") 189 | 190 | 191 | writer = ExcelWriter(style_excel="./utils/报告输出模版.xlsx", theme_color="8E8BFE") 192 | 193 | 194 | # ////////////////////////////////////// 样本说明 ///////////////////////////////////// # 195 | df = pd.DataFrame({ 196 | "date": pd.date_range(start="2021-01-01", end="2022-06-30"), 197 | "target": np.random.randint(0, 2, 546), 198 | "count": np.random.randint(0, 100, 546), 199 | }) 200 | 201 | total_count = len(data) 202 | dataset_summary = pd.DataFrame( 203 | [ 204 | ["建模样本", "2022-01-01", "2023-01-31", len(data), len(data) / total_count, data[target].sum(), data[target].sum() / len(data), ""], 205 | ["训练集", "2022-01-01", "2023-12-31", len(train), len(train) / total_count, train[target].sum(), train[target].sum() / len(train), ""], 206 | ["测试集", "2022-01-01", "2023-12-31", len(test), len(test) / total_count, test[target].sum(), test[target].sum() / len(test), ""], 207 | ["跨时间验证集", "2023-01-01", "2023-01-31", len(oot), len(oot) / total_count, oot[target].sum(), oot[target].sum() / len(oot), ""], 208 | ], 209 | columns=["数据集", "开始时间", "结束时间", "样本总数", "样本占比", "坏客户数", "坏客户占比", "备注"], 210 | ) 211 | 212 | worksheet = writer.get_sheet_by_name("汇总信息") 213 | 214 | # 样本总体分布情况 215 | start_row, start_col = 2, 2 216 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="样本总体分布情况", style="header") 217 | end_row, end_col = writer.insert_df2sheet(worksheet, dataset_summary, (end_row + 1, start_col), header=True) 218 | 219 | writer.set_number_format(worksheet, f"{get_column_letter(end_col - 2)}{end_row - len(dataset_summary)}:{get_column_letter(end_col - 2)}{end_row}", "0.00%") 220 | writer.set_number_format(worksheet, f"{get_column_letter(end_col - 4)}{end_row - len(dataset_summary)}:{get_column_letter(end_col - 4)}{end_row}", "0.00%") 221 | 222 | # 建模样本时间分布情况 223 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/all_sample_time_count.png") 224 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="建模样本时间分布情况", style="header") 225 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/all_sample_time_count.png", (end_row, start_col), figsize=(720, 370)) 226 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False) 227 | 228 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 
1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%") 229 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", "0.00%") 230 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%") 231 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%") 232 | 233 | # 训练集样本时间分布情况 234 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/train_sample_time_count.png") 235 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练集样本时间分布情况", style="header") 236 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_sample_time_count.png", (end_row, start_col), figsize=(720, 370)) 237 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False) 238 | 239 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%") 240 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", "0.00%") 241 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%") 242 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%") 243 | 244 | # 测试集样本时间分布情况 245 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/test_sample_time_count.png") 246 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试集样本时间分布情况", style="header") 247 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_sample_time_count.png", (end_row, start_col), figsize=(720, 370)) 248 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False) 249 | 250 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%") 251 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", "0.00%") 252 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%") 253 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%") 254 | 255 | # 跨时间验证集样本时间分布情况 256 | temp = sample_distribution(df, date="date", target="target", user_count="count", save="model_report/oot_sample_time_count.png") 257 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="跨时间验证集样本时间分布情况", style="header") 258 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/oot_sample_time_count.png", (end_row, start_col), figsize=(720, 370)) 259 | end_row, end_col = writer.insert_df2sheet(worksheet, temp.T.reset_index(), (end_row, start_col), header=False) 260 | 261 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 1}:{get_column_letter(end_col)}{end_row - 1}", "0.00%") 262 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 2}:{get_column_letter(end_col)}{end_row - 2}", 
"0.00%") 263 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 4}:{get_column_letter(end_col)}{end_row - 4}", "0.00%") 264 | writer.set_number_format(worksheet, f"{get_column_letter(start_col)}{end_row - 6}:{get_column_letter(end_col)}{end_row - 6}", "0.00%") 265 | 266 | 267 | # ////////////////////////////////////// 模型报告 ///////////////////////////////////// # 268 | 269 | # 逻辑回归拟合情况 270 | worksheet = writer.get_sheet_by_name("逻辑回归拟合结果") 271 | start_row, start_col = 2, 2 272 | 273 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="逻辑回归拟合效果", style="header") 274 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{start_row}:{get_column_letter(start_col + len(summary.columns) - 1)}{start_row}") 275 | # worksheet[f"{get_column_letter(start_col)}{start_row}:{get_column_letter(start_col + len(summary.columns) - 1)}{start_row}"].style = "header" 276 | logistic.plot_weights(save="model_report/logistic_train.png") 277 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/logistic_train.png", (end_row + 2, start_col)) 278 | end_row, end_col = writer.insert_df2sheet(worksheet, summary, (end_row + 1, start_col)) 279 | 280 | conditional_column = get_column_letter(start_col + summary.columns.get_loc("Coef.")) 281 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(summary)}', f'{conditional_column}{end_row}') 282 | 283 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{end_row + 2}:{get_column_letter(start_col + len(train_report.columns) - 1)}{end_row + 2}") 284 | # worksheet[f"{get_column_letter(start_col)}{end_row + 2}"].style = "header" 285 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练数据集拟合报告", style="header") 286 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/lr_ksplot_train.png", (end_row, start_col), figsize=(480, 270)) 287 | end_row, end_col = writer.insert_df2sheet(worksheet, train_report, (end_row + 1, start_col)) 288 | 289 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{end_row + 2}:{get_column_letter(start_col + len(test_report.columns) - 1)}{end_row + 2}") 290 | # worksheet[f"{get_column_letter(start_col)}{end_row + 2}"].style = "header" 291 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试数据集拟合报告", style="header") 292 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/lr_ksplot_test.png", (end_row, start_col), figsize=(480, 270)) 293 | end_row, end_col = writer.insert_df2sheet(worksheet, test_report, (end_row + 1, start_col)) 294 | 295 | # worksheet.merge_cells(f"{get_column_letter(start_col)}{end_row + 2}:{get_column_letter(start_col + len(oot_report.columns) - 1)}{end_row + 2}") 296 | # worksheet[f"{get_column_letter(start_col)}{end_row + 2}"].style = "header" 297 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="跨时间验证集拟合报告", style="header") 298 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/lr_ksplot_oot.png", (end_row, start_col), figsize=(480, 270)) 299 | end_row, end_col = writer.insert_df2sheet(worksheet, oot_report, (end_row + 1, start_col)) 300 | 301 | 302 | # ////////////////////////////////////// 特征概述 ///////////////////////////////////// # 303 | 304 | # 模型变量概览 305 | feature_describe = pd.DataFrame([ 306 | ["status_account", "支票账户状态"], ["duration", "借款周期"], ["credit_histor", "历史信用"], ["purpose", "借款目的"], ["amount", "信用额度"], ["svaing_account", 
"储蓄账户状态"], ["present_emp", "当前就业状态"], ["income_rate", "分期付款占可支配收入百分比"], ["personal_status", "性别与婚姻状态"], ["other_debtors", "他人担保信息"], ["residence_info", "现居住地"], ["property", "财产状态"], ["age", "年龄"], ["inst_plans", "其他分期情况"], ["housing", "房产状态"], ["num_credits", "信用卡数量"], ["job", "工作状态"], ["dependents", "赡养人数"], ["telephone", "电话号码注册情况"], ["foreign_worke", "是否有海外工作经历"], 307 | ], columns=["变量名称", "变量含义"]) 308 | 309 | worksheet = writer.get_sheet_by_name("模型变量信息") 310 | start_row, start_col = 2, 2 311 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="入模变量信息", style="header") 312 | end_row, end_col = writer.insert_df2sheet(worksheet, feature_describe.reset_index().rename(columns={"index": "序号"}), (end_row + 1, start_col)) 313 | 314 | # 变量分布情况 315 | data_info = toad.detect(data[card.rules.keys()]).reset_index().rename(columns={"index": "变量名称", "type": "变量类型", "size": "样本个数", "missing": "缺失值", "unique": "唯一值个数"}) 316 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量分布情况", style="header") 317 | end_row, end_col = writer.insert_df2sheet(worksheet, data_info, (end_row + 1, start_col)) 318 | 319 | # 变量相关性 320 | data_corr = logistic.corr(feature_pipeline.transform(train), save="model_report/data_corr.png", annot=False) 321 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量相关性", style="header") 322 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/data_corr.png", (end_row + 1, start_col), figsize=(700, 500)) 323 | end_row, end_col = writer.insert_df2sheet(worksheet, data_corr.reset_index().rename(columns={"index": ""}), (end_row + 1, start_col)) 324 | 325 | conditional_column = f"{get_column_letter(start_col + 1)}{end_row - len(data_corr)}:{get_column_letter(end_col - 1)}{end_row - 1}" 326 | worksheet.conditional_formatting.add(conditional_column, ColorScaleRule(start_type='num', start_value=-1.0, start_color='8E8BFE', mid_type='num', mid_value=0., mid_color='FFFFFF', end_type='num', end_value=1.0, end_color='8E8BFE')) 327 | 328 | 329 | # 变量分箱信息 330 | _combiner = feature_pipeline.named_steps["combiner"] 331 | 332 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="变量分箱信息", style="header") 333 | for col in card.rules.keys(): 334 | feature_table = card.feature_bin_stats(data, col, target=target, desc="逻辑回归入模变量", combiner=card.combiner) 335 | # _combiner.bin_plot(data, col, labels=True, save=f"model_report/bin_plots/data_{col}.png") 336 | bin_plot(feature_table, feature=col, save=f"model_report/bin_plots/data_{col}.png") 337 | end_row, end_col = writer.insert_pic2sheet(worksheet, f"model_report/bin_plots/data_{col}.png", (end_row + 1, start_col), figsize=(700, 400)) 338 | end_row, end_col = writer.insert_df2sheet(worksheet, feature_table, (end_row, start_col)) 339 | 340 | for c in ["坏样本率", "LIFT值"]: 341 | conditional_column = get_column_letter(start_col + feature_table.columns.get_loc(c)) 342 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(feature_table)}', f'{conditional_column}{end_row}') 343 | # conditional_column = get_column_letter(start_col + feature_table.columns.get_loc("LIFT值")) 344 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(feature_table)}', f'{conditional_column}{end_row}') 345 | 346 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值"]: 347 | conditional_column = get_column_letter(start_col + 
feature_table.columns.get_loc(c)) 348 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(feature_table)}:{conditional_column}{end_row}", "0.00%") 349 | 350 | 351 | # ////////////////////////////////////// 评分卡说明 ///////////////////////////////////// # 352 | 353 | # 评分卡刻度 354 | scorecard_kedu = pd.DataFrame( 355 | [ 356 | ["base_odds", card.base_odds, "根据业务经验设置的基础比率(违约概率/正常概率),估算方法:(1-样本坏客户占比)/坏客户占比"], 357 | ["base_score", card.base_score, "基础ODDS对应的分数"], 358 | ["rate", card.rate, "设置分数的倍率"], 359 | ["pdo", card.pdo, "表示分数增长PDO时,ODDS值增长到RATE倍"], 360 | ["B", card.offset, "补偿值,计算方式:pdo / ln(rate)"], 361 | ["A", card.factor, "刻度,计算方式:base_score - B * ln(base_odds)"], 362 | ], 363 | columns=["刻度项", "刻度值", "备注"], 364 | ) 365 | 366 | worksheet = writer.get_sheet_by_name("评分卡结果") 367 | start_row, start_col = 2, 2 368 | end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="评分卡刻度", style="header") 369 | end_row, end_col = writer.insert_df2sheet(worksheet, scorecard_kedu, (end_row + 1, start_col)) 370 | 371 | # 评分卡对应分数 372 | card_points = card.export(to_frame=True).rename(columns={"name": "变量名称", "value": "变量分箱", "score": "对应分数"}) 373 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡分数", style="header") 374 | end_row, end_col = writer.insert_df2sheet(worksheet, card_points, (end_row + 1, start_col), merge_column="变量名称") 375 | 376 | # 评分效果 377 | clip = 50 378 | clip_start = max(math.ceil(train["score"].min() / clip) * clip, math.ceil(train["score"].quantile(0.01) / clip) * clip) 379 | clip_end = min(math.ceil(train["score"].max() / clip) * clip, math.ceil(train["score"].quantile(0.99) / clip) * clip) 380 | score_clip = [i for i in range(clip_start, clip_end, clip)] 381 | 382 | train_score_rank = card.feature_bin_stats(train, "score", target=target, rules=score_clip, verbose=0, method="step", ks=True) 383 | test_score_rank = card.feature_bin_stats(test, "score", target=target, rules=score_clip, verbose=0, method="step", ks=True) 384 | oot_score_rank = card.feature_bin_stats(oot, "score", target=target, rules=score_clip, verbose=0, method="step", ks=True) 385 | 386 | card.ks_plot(train["score"], train[target], title="Train Dataset", save="model_report/train_ksplot.png") 387 | card.ks_plot(test["score"], test[target], title="Test Dataset", save="model_report/test_ksplot.png") 388 | card.ks_plot(oot["score"], oot[target], title="OOT Dataset", save="model_report/oot_ksplot.png") 389 | 390 | card.score_hist(train["score"], train[target], save="model_report/train_scorehist.png", bins=30, figsize=(13, 10)) 391 | card.score_hist(test["score"], test[target], save="model_report/test_scorehist.png", bins=30, figsize=(13, 10)) 392 | card.score_hist(oot["score"], oot[target], save="model_report/oot_scorehist.png", bins=30, figsize=(13, 10)) 393 | 394 | 395 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="训练数据集评分模型效果", style="header") 396 | ks_row = end_row 397 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_ksplot.png", (ks_row, start_col)) 398 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_scorehist.png", (ks_row, end_col)) 399 | end_row, end_col = writer.insert_df2sheet(worksheet, train_score_rank, (end_row + 1, start_col)) 400 | 401 | for c in ["坏样本率", "LIFT值", "分档KS值"]: 402 | conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc(c)) 403 | writer.add_conditional_formatting(worksheet, 
f'{conditional_column}{end_row - len(train_score_rank)}', f'{conditional_column}{end_row}') 404 | 405 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"]: 406 | conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc(c)) 407 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(train_score_rank)}:{conditional_column}{end_row}", "0.00%") 408 | 409 | # conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc("坏样本率")) 410 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_score_rank)}', f'{conditional_column}{end_row}') 411 | # conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc("LIFT值")) 412 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_score_rank)}', f'{conditional_column}{end_row}') 413 | # conditional_column = get_column_letter(start_col + train_score_rank.columns.get_loc("分档KS值")) 414 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_score_rank)}', f'{conditional_column}{end_row}') 415 | 416 | 417 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="测试数据集评分模型效果", style="header") 418 | ks_row = end_row 419 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_ksplot.png", (ks_row, start_col)) 420 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_scorehist.png", (ks_row, end_col)) 421 | end_row, end_col = writer.insert_df2sheet(worksheet, test_score_rank, (end_row + 1, start_col)) 422 | 423 | for c in ["坏样本率", "LIFT值", "分档KS值"]: 424 | conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc(c)) 425 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(test_score_rank)}', f'{conditional_column}{end_row}') 426 | 427 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"]: 428 | conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc(c)) 429 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(test_score_rank)}:{conditional_column}{end_row}", "0.00%") 430 | 431 | # conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc("坏样本率")) 432 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_score_rank)}', f'{conditional_column}{end_row}') 433 | # conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc("LIFT值")) 434 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_score_rank)}', f'{conditional_column}{end_row}') 435 | # conditional_column = get_column_letter(start_col + test_score_rank.columns.get_loc("分档KS值")) 436 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_score_rank)}', f'{conditional_column}{end_row}') 437 | 438 | 439 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="跨时间验证集评分模型效果", style="header") 440 | ks_row = end_row 441 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/oot_ksplot.png", (ks_row, start_col)) 442 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/oot_scorehist.png", (ks_row, end_col)) 443 | end_row, end_col = writer.insert_df2sheet(worksheet, oot_score_rank, (end_row + 1, start_col)) 444 | 445 | for c in ["坏样本率", "LIFT值", "分档KS值"]: 446 | conditional_column = 
get_column_letter(start_col + oot_score_rank.columns.get_loc(c)) 447 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row - len(oot_score_rank)}', f'{conditional_column}{end_row}') 448 | 449 | for c in ["样本占比", "好样本占比", "坏样本占比", "坏样本率", "LIFT值", "累积LIFT值", "分档KS值"]: 450 | conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc(c)) 451 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(oot_score_rank)}:{conditional_column}{end_row}", "0.00%") 452 | 453 | # conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc("坏样本率")) 454 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(oot_score_rank)}', f'{conditional_column}{end_row}') 455 | # conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc("LIFT值")) 456 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(oot_score_rank)}', f'{conditional_column}{end_row}') 457 | # conditional_column = get_column_letter(start_col + oot_score_rank.columns.get_loc("分档KS值")) 458 | # writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(oot_score_rank)}', f'{conditional_column}{end_row}') 459 | 460 | 461 | def score_psi(expected, actual, labels=["预期", "实际"], save=None, colors=['#8E8BFE', '#FEA3A2', '#9394E7'], figsize=(15, 8)): 462 | expected = expected.rename(columns={"分箱": "评分区间", "样本总数": f"{labels[0]}样本数", "样本占比": f"{labels[0]}样本占比", "坏样本率": f"{labels[0]}坏样本率"}) 463 | actual = actual.rename(columns={"分箱": "评分区间", "样本总数": f"{labels[1]}样本数", "样本占比": f"{labels[1]}样本占比", "坏样本率": f"{labels[1]}坏样本率"}) 464 | df_psi = expected.merge(actual, on="评分区间", how="outer").replace(np.nan, 0) 465 | df_psi[f"{labels[1]}% - {labels[0]}%"] = df_psi[f"{labels[1]}样本占比"] - df_psi[f"{labels[0]}样本占比"] 466 | df_psi[f"ln({labels[1]}% / {labels[0]}%)"] = np.log(df_psi[f"{labels[1]}样本占比"] / df_psi[f"{labels[0]}样本占比"]) 467 | df_psi["分档PSI值"] = (df_psi[f"{labels[1]}% - {labels[0]}%"] * df_psi[f"ln({labels[1]}% / {labels[0]}%)"]) 468 | df_psi = df_psi.fillna(0).replace(np.inf, 0).replace(-np.inf, 0) 469 | df_psi["总体PSI值"] = df_psi["分档PSI值"].sum() 470 | 471 | if save: 472 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 473 | os.makedirs(os.path.dirname(save)) 474 | 475 | x = df_psi['评分区间'] 476 | width = 0.35 477 | x_indexes = np.arange(len(x)) 478 | fig, ax1 = plt.subplots(figsize=figsize) 479 | 480 | ax1.bar(x_indexes - width / 2, df_psi[f'{labels[0]}样本占比'], width, label=f'{labels[0]}样本占比', color=colors[0], hatch="/") 481 | ax1.bar(x_indexes + width / 2, df_psi[f'{labels[1]}样本占比'], width, label=f'{labels[1]}样本占比', color=colors[1], hatch="\\") 482 | 483 | ax1.set_ylabel('样本占比: 评分区间内样本数 / 样本总数') 484 | ax1.set_xticks(x_indexes) 485 | ax1.set_xticklabels(x) 486 | ax1.tick_params(axis='x', labelrotation=90) 487 | 488 | ax2 = ax1.twinx() 489 | ax2.plot(df_psi["评分区间"], df_psi[f"{labels[0]}坏样本率"], color=colors[0], label=f"{labels[0]}坏样本率", linestyle=(5, (10, 3))) 490 | ax2.plot(df_psi["评分区间"], df_psi[f"{labels[1]}坏样本率"], color=colors[1], label=f"{labels[1]}坏样本率", linestyle=(5, (10, 3))) 491 | 492 | ax2.scatter(df_psi["评分区间"], df_psi[f"{labels[0]}坏样本率"], marker=".") 493 | ax2.scatter(df_psi["评分区间"], df_psi[f"{labels[1]}坏样本率"], marker=".") 494 | 495 | ax2.set_ylabel('坏样本率: 坏样本数 / 样本总数') 496 | 497 | handles1, labels1 = ax1.get_legend_handles_labels() 498 | handles2, labels2 = ax2.get_legend_handles_labels() 499 | fig.legend(handles1 + handles2, labels1 + labels2, 
loc='upper center', ncol=len(labels1 + labels2), bbox_to_anchor=(0.5, 0.94), frameon=False) 500 | 501 | fig.suptitle(f"{labels[0]} vs {labels[1]} 群体稳定性指数(PSI): {df_psi['分档PSI值'].sum():.4f}\n\n") 502 | 503 | fig.tight_layout() 504 | 505 | fig.savefig(save, dpi=240, format="png", bbox_inches="tight") 506 | 507 | return df_psi[["评分区间", f"{labels[0]}样本数", f"{labels[0]}样本占比", f"{labels[0]}坏样本率", f"{labels[1]}样本数", f"{labels[1]}样本占比", f"{labels[1]}坏样本率", f"{labels[1]}% - {labels[0]}%", f"ln({labels[1]}% / {labels[0]}%)", "分档PSI值", "总体PSI值"]] 508 | 509 | 510 | train_test_score_psi = score_psi(train_score_rank, test_score_rank, labels=["训练数据集", "测试数据集"], save="model_report/train_test_psiplot.png") 511 | train_oot_score_psi = score_psi(train_score_rank, oot_score_rank, labels=["训练数据集", "跨时间验证集"], save="model_report/train_oot_psiplot.png") 512 | test_oot_score_psi = score_psi(test_score_rank, oot_score_rank, labels=["测试数据集", "跨时间验证集"], save="model_report/test_oot_psiplot.png") 513 | 514 | 515 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡模型稳定性评估: 训练数据集 vs 测试数据集", style="header") 516 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_test_psiplot.png", (end_row, start_col), figsize=(1000, 400)) 517 | end_row, end_col = writer.insert_df2sheet(worksheet, train_test_score_psi, (end_row + 1, start_col)) 518 | 519 | conditional_column = get_column_letter(start_col + train_test_score_psi.columns.get_loc("分档PSI值")) 520 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_test_score_psi)}', f'{conditional_column}{end_row}') 521 | 522 | for c in ["训练数据集样本占比", "训练数据集坏样本率", "测试数据集样本占比", "测试数据集坏样本率"]: 523 | conditional_column = get_column_letter(start_col + train_test_score_psi.columns.get_loc(c)) 524 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(train_test_score_psi)}:{conditional_column}{end_row}", "0.00%") 525 | 526 | 527 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡模型稳定性评估: 训练数据集 vs 跨时间验证集", style="header") 528 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/train_oot_psiplot.png", (end_row, start_col), figsize=(1000, 400)) 529 | end_row, end_col = writer.insert_df2sheet(worksheet, train_oot_score_psi, (end_row + 1, start_col)) 530 | 531 | conditional_column = get_column_letter(start_col + train_oot_score_psi.columns.get_loc("分档PSI值")) 532 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(train_oot_score_psi)}', f'{conditional_column}{end_row}') 533 | 534 | for c in ["训练数据集样本占比", "训练数据集坏样本率", "跨时间验证集样本占比", "跨时间验证集坏样本率"]: 535 | conditional_column = get_column_letter(start_col + train_oot_score_psi.columns.get_loc(c)) 536 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(train_oot_score_psi)}:{conditional_column}{end_row}", "0.00%") 537 | 538 | 539 | end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="评分卡模型稳定性评估: 测试数据集 vs 跨时间验证集", style="header") 540 | end_row, end_col = writer.insert_pic2sheet(worksheet, "model_report/test_oot_psiplot.png", (end_row, start_col), figsize=(1000, 400)) 541 | end_row, end_col = writer.insert_df2sheet(worksheet, test_oot_score_psi, (end_row + 1, start_col)) 542 | 543 | conditional_column = get_column_letter(start_col + test_oot_score_psi.columns.get_loc("分档PSI值")) 544 | writer.add_conditional_formatting(worksheet, f'{conditional_column}{end_row-len(test_oot_score_psi)}', 
f'{conditional_column}{end_row}') 545 | 546 | for c in ["跨时间验证集样本占比", "跨时间验证集坏样本率", "测试数据集样本占比", "测试数据集坏样本率"]: 547 | conditional_column = get_column_letter(start_col + test_oot_score_psi.columns.get_loc(c)) 548 | writer.set_number_format(worksheet, f"{conditional_column}{end_row - len(test_oot_score_psi)}:{conditional_column}{end_row}", "0.00%") 549 | 550 | 551 | # ////////////////////////////////////// 模型稳定性 ///////////////////////////////////// # 552 | # 553 | # worksheet = writer.get_sheet_by_name("模型稳定性") 554 | # start_row, start_col = 2, 2 555 | # 556 | # # 变量 CSI 表 557 | # end_row, end_col = writer.insert_value2sheet(worksheet, (start_row, start_col), value="入模变量稳定性指标 (Characteristic Stability Index, CSI)", style="header") 558 | # 559 | # # train vs test 560 | # 561 | # # 评分分布稳定性 562 | # end_row, end_col = writer.insert_value2sheet(worksheet, (end_row + 2, start_col), value="模型评分稳定性指标 (Population Stability Index, PSI)", style="header") 563 | 564 | 565 | writer.save("model_report/评分卡模型报告.xlsx") 566 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @Time : 2022/8/23 13:12 4 | @Author : itlubber 5 | @Site : itlubber.art 6 | """ 7 | 8 | import os 9 | import toad 10 | import warnings 11 | import numpy as np 12 | import pandas as pd 13 | import scorecardpy as sc 14 | from scorecardpy.perf import eva_pks, eva_proc 15 | from optbinning import OptimalBinning 16 | import matplotlib.pyplot as plt 17 | from matplotlib import font_manager 18 | import seaborn as sns 19 | # import plotly.graph_objects as go 20 | # from plotly.io import write_image 21 | from openpyxl import load_workbook 22 | from openpyxl.styles import Alignment, PatternFill 23 | 24 | import scipy 25 | import statsmodels.api as sm 26 | from statsmodels.stats.outliers_influence import variance_inflation_factor 27 | 28 | from sklearn.pipeline import Pipeline 29 | from sklearn.metrics import roc_curve, auc 30 | from sklearn.metrics import classification_report 31 | from sklearn.linear_model import LogisticRegression 32 | from sklearn.model_selection import train_test_split 33 | from sklearn.utils.validation import check_is_fitted 34 | from sklearn.ensemble import GradientBoostingClassifier 35 | from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin 36 | 37 | from processing import FeatureSelection, Combiner, WOETransformer, StepwiseSelection 38 | 39 | 40 | warnings.filterwarnings("ignore") 41 | pd.set_option('display.width', 5000) 42 | # plt.rcParams["font.sans-serif"]=["SimHei"] #设置字体 43 | # plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题 44 | 45 | 46 | def pyplot_chinese(font_path='utils/matplot_chinese.ttf'): 47 | font_manager.fontManager.addfont(font_path) 48 | plt.rcParams['font.family'] = font_manager.FontProperties(fname=font_path).get_name() 49 | plt.rcParams['axes.unicode_minus']=False 50 | 51 | 52 | class StatsLogisticRegression(TransformerMixin, BaseEstimator): 53 | 54 | def __init__(self, target="target", intercept=True): 55 | """ 56 | 基于statsmodels的逻辑回归方法 57 | 58 | Args: 59 | target: 数据集中标签名称,默认 target 60 | intercept: 是否包含截距,默认 True,即包含截距 61 | """ 62 | self.intercept = intercept 63 | self.target = target 64 | self.classifier = None 65 | self.corr = None 66 | self.vif = None 67 | self.coef_normalization = None 68 | self.feature_names_ = None 69 | self.feature_importances_ = None 70 | 71 | def fit(self, x, y=None, vif=True, 
corr=True, normalization=True): 72 | self.feature_names_ = list(x.drop(columns=[self.target]).columns) 73 | self.feature_importances_ = self.feature_importances(x) 74 | 75 | if vif: 76 | self.vif = self.VIF(x) 77 | 78 | if normalization: 79 | _x = x.drop(columns=[self.target]).apply(lambda x: (x - np.mean(x)) / np.std(x)) 80 | _y = x[self.target] 81 | lr_normalization = sm.Logit(_y, sm.add_constant(_x) if self.intercept else _x).fit() 82 | self.coef_normalization = pd.DataFrame(lr_normalization.params, columns=["coef_normalization"]) 83 | 84 | if corr: 85 | self.corr = x.drop(columns=[self.target]).corr() 86 | 87 | if self.intercept: 88 | x = sm.add_constant(x) 89 | 90 | self.classes_ = x[self.target].unique() 91 | self.classifier = sm.Logit(x[self.target], x.drop(columns=[self.target])).fit() 92 | 93 | return self 94 | 95 | def transform(self, x): 96 | if self.intercept: 97 | x = sm.add_constant(x) 98 | 99 | return self.classifier.predict(x) 100 | 101 | def predict(self, x): 102 | return self.transform(x) 103 | 104 | def summary(self): 105 | describe = self.classifier.summary2() 106 | return describe 107 | 108 | def feature_importances(self, x): 109 | params = { 110 | "n_estimators": 256, 111 | "max_depth": 4, 112 | "min_samples_split": 5, 113 | "learning_rate": 1e-3, 114 | "loss": "deviance", 115 | "subsample": 0.9, 116 | } 117 | feature_importances_ = GradientBoostingClassifier(**params).fit(x.drop(columns=[self.target]), x[self.target]).feature_importances_ 118 | return pd.DataFrame(feature_importances_, index=self.feature_names_, columns=["feature_importances"]) 119 | 120 | def VIF(self, x): 121 | if self.intercept: 122 | x = sm.add_constant(x) 123 | 124 | x = x.drop(columns=[self.target]) 125 | columns = x.columns 126 | vif = pd.DataFrame({"VIF": [variance_inflation_factor(np.matrix(x), i) for i in range(len(columns))]}, index=columns) 127 | 128 | return vif 129 | 130 | def WALD(self): 131 | return self.classifier.wald_test_terms().table[["statistic", "pvalue"]].rename(columns={"pvalue": "wald_test_pvalue", "statistic": "wald_test_statistic"}) 132 | 133 | def report(self): 134 | return self.classifier.summary2().tables[1].join([self.coef_normalization, self.WALD(), self.vif, self.feature_importances_]), self.classifier.summary2().tables[0], self.corr 135 | 136 | def summary_save(self, excel_name="逻辑回归模型拟合效果.xlsx", sheet_name="逻辑回归拟合效果"): 137 | writer = pd.ExcelWriter(excel_name, engine='openpyxl') 138 | 139 | coef_report, summary_report, corr_report = self.report() 140 | summary_report.columns = ["逻辑回归模型拟合效果"] * summary_report.shape[1] 141 | summary_report.to_excel(writer, sheet_name=sheet_name, index=False, header=False, startcol=0, startrow=2) 142 | coef_report.reset_index().rename(columns={"index": "variable"}).to_excel(writer, sheet_name=sheet_name, index=False, header=True, startcol=0, startrow=summary_report.shape[0] + 4) 143 | corr_report.to_excel(writer, sheet_name=sheet_name, index=True, header=True, startcol=0, startrow=summary_report.shape[0] + coef_report.shape[0] + 7) 144 | 145 | writer.save() 146 | writer.close() 147 | 148 | if os.path.exists(excel_name): 149 | workbook = load_workbook(excel_name) 150 | worksheet = workbook.get_sheet_by_name(sheet_name) 151 | worksheet["A1"].value = "逻辑回归模型报告" 152 | worksheet["A1"].alignment = Alignment(horizontal='center', vertical='center') 153 | worksheet.merge_cells(f"A1:L1") 154 | 155 | workbook.save(excel_name) 156 | workbook.close() 157 | 158 | try: 159 | from processing import render_excel # From: 
https://github.com/itlubber/openpyxl-excel-style-template/blob/main/feature_bins.py 160 | render_excel(excel_name, sheet_name=sheet_name, max_column_width=25, merge_rows=np.cumsum([1, len(summary_report), 2, len(coef_report) + 1, 2, len(corr_report) + 1]).tolist()) 161 | except: 162 | pass 163 | 164 | 165 | class ITLubberLogisticRegression(LogisticRegression): 166 | """ 167 | Extended Logistic Regression. 168 | Extends [sklearn.linear_model.LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). 169 | This class provides the following extra statistics, calculated on `.fit()` and accessible via `.summary()`: 170 | - `cov_matrix_`: covariance matrix for the estimated parameters. 171 | - `std_err_intercept_`: estimated uncertainty for the intercept 172 | - `std_err_coef_`: estimated uncertainty for the coefficients 173 | - `z_intercept_`: estimated z-statistic for the intercept 174 | - `z_coef_`: estimated z-statistic for the coefficients 175 | - `p_value_intercept_`: estimated p-value for the intercept 176 | - `p_value_coef_`: estimated p-value for the coefficients 177 | 178 | Example: 179 | ```python 180 | feature_pipeline = Pipeline([ 181 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")), 182 | ("combiner", Combiner(target=target, min_samples=0.2)), 183 | ("transform", WOETransformer(target=target)), 184 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")), 185 | ("stepwise", StepwiseSelection(target=target)), 186 | # ("logistic", LogisticClassifier(target=target)), 187 | ("logistic", ITLubberLogisticRegression(target=target)), 188 | ]) 189 | 190 | feature_pipeline.fit(train) 191 | summary = feature_pipeline.named_steps['logistic'].summary() 192 | ``` 193 | 194 | An example output of `.summary()`: 195 | 196 | | | Coef. | Std.Err | z | P>|z| | [ 0.025 | 0.975 ] | VIF | 197 | |:------------------|----------:|----------:|---------:|------------:|-----------:|----------:|--------:| 198 | | const | -0.844037 | 0.0965117 | -8.74544 | 2.22148e-18 | -1.0332 | -0.654874 | 1.05318 | 199 | | duration.in.month | 0.847445 | 0.248873 | 3.40513 | 0.000661323 | 0.359654 | 1.33524 | 1.14522 | 200 | """ 201 | 202 | def __init__(self, target="target", penalty="l2", calculate_stats=True, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver="lbfgs", max_iter=100, multi_class="auto", verbose=0, warm_start=False, n_jobs=None, l1_ratio=None,): 203 | """ 204 | Extends [sklearn.linear_model.LogisticRegression.fit()](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html). 
205 | 206 | Args: 207 | target (str): your dataset's target name 208 | calculate_stats (bool): If true, calculate statistics like standard error during fit, accessible with .summary() 209 | """ 210 | super().__init__(penalty=penalty, dual=dual, tol=tol, C=C, fit_intercept=fit_intercept, intercept_scaling=intercept_scaling, class_weight=class_weight, random_state=random_state, solver=solver, max_iter=max_iter, multi_class=multi_class, verbose=verbose, warm_start=warm_start, n_jobs=n_jobs, l1_ratio=l1_ratio,) 211 | self.target = target 212 | self.calculate_stats = calculate_stats 213 | 214 | def fit(self, x, sample_weight=None, **kwargs): 215 | y = x[self.target] 216 | x = x.drop(columns=[self.target]) 217 | 218 | if not self.calculate_stats: 219 | return super().fit(x, y, sample_weight=sample_weight, **kwargs) 220 | 221 | x = self.convert_sparse_matrix(x) 222 | 223 | if isinstance(x, pd.DataFrame): 224 | self.names_ = ["const"] + [f for f in x.columns] 225 | else: 226 | self.names_ = ["const"] + [f"x{i}" for i in range(x.shape[1])] 227 | 228 | lr = super().fit(x, y, sample_weight=sample_weight, **kwargs) 229 | 230 | predProbs = self.predict_proba(x) 231 | 232 | # Design matrix -- add column of 1's at the beginning of your x matrix 233 | if lr.fit_intercept: 234 | x_design = np.hstack([np.ones((x.shape[0], 1)), x]) 235 | else: 236 | x_design = x 237 | 238 | self.vif = [variance_inflation_factor(np.matrix(x_design), i) for i in range(x_design.shape[-1])] 239 | p = np.product(predProbs, axis=1) 240 | self.cov_matrix_ = np.linalg.inv((x_design * p[..., np.newaxis]).T @ x_design) 241 | std_err = np.sqrt(np.diag(self.cov_matrix_)).reshape(1, -1) 242 | 243 | # In case fit_intercept is set to True, then in the std_error array 244 | # Index 0 corresponds to the intercept, from index 1 onwards it relates to the coefficients 245 | # If fit intercept is False, then all the values are related to the coefficients 246 | if lr.fit_intercept: 247 | 248 | self.std_err_intercept_ = std_err[:, 0] 249 | self.std_err_coef_ = std_err[:, 1:][0] 250 | 251 | self.z_intercept_ = self.intercept_ / self.std_err_intercept_ 252 | 253 | # Get p-values under the gaussian assumption 254 | self.p_val_intercept_ = scipy.stats.norm.sf(abs(self.z_intercept_)) * 2 255 | 256 | else: 257 | self.std_err_intercept_ = np.array([np.nan]) 258 | self.std_err_coef_ = std_err[0] 259 | 260 | self.z_intercept_ = np.array([np.nan]) 261 | 262 | # Get p-values under the gaussian assumption 263 | self.p_val_intercept_ = np.array([np.nan]) 264 | 265 | self.z_coef_ = self.coef_ / self.std_err_coef_ 266 | self.p_val_coef_ = scipy.stats.norm.sf(abs(self.z_coef_)) * 2 267 | 268 | return self 269 | 270 | def corr(self, data, save=None, annot=True): 271 | corr = data.drop(columns=[self.target]).corr() 272 | 273 | if save: 274 | self.corr_plot(data.drop(columns=[self.target]), save=save, annot=annot) 275 | 276 | return corr 277 | 278 | @staticmethod 279 | def corr_plot(data, figure_size=(16, 8), fontsize=14, color=["#2639E9", "#F76E6C", "#FE7715"], mask=False, save=None, annot=True): 280 | corr = data.corr() 281 | corr_mask = np.zeros_like(corr, dtype = np.bool) 282 | corr_mask[np.triu_indices_from(corr_mask)] = True 283 | 284 | map_plot = toad.tadpole.tadpole.heatmap( 285 | corr, 286 | mask = corr_mask if mask else None, 287 | cmap = sns.diverging_palette(267, 267, n=10, s=100, l=40), 288 | vmax = 1, 289 | vmin = -1, 290 | center = 0, 291 | square = True, 292 | linewidths = .1, 293 | annot = annot, 294 | fmt = '.2f', 295 | figure_size = figure_size, 
296 | ) 297 | 298 | map_plot.tick_params(axis='x', labelrotation=270, labelsize=fontsize) 299 | map_plot.tick_params(axis='y', labelrotation=0, labelsize=fontsize) 300 | 301 | if save: 302 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 303 | os.makedirs(os.path.dirname(save)) 304 | 305 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight") 306 | 307 | return map_plot 308 | 309 | def report(self, data): 310 | report_dict = classification_report(data[self.target], self.predict(data.drop(columns=self.target)), output_dict=True, target_names=["好客户", "坏客户"]) 311 | accuracy = report_dict.pop("accuracy") 312 | _report = pd.DataFrame(report_dict).T.reset_index().rename(columns={"index": "desc"}) 313 | _report.loc[len(_report)] = ['accuracy', '', '', accuracy, len(data)] 314 | return _report 315 | 316 | def summary(self): 317 | """ 318 | Puts the summary statistics of the fit() function into a pandas DataFrame. 319 | Returns: 320 | data (pandas DataFrame): The statistics dataframe, indexed by the column name 321 | """ 322 | check_is_fitted(self) 323 | 324 | if not hasattr(self, "std_err_coef_"): 325 | msg = "Summary statistics were not calculated on .fit(). Options to fix:\n" 326 | msg += "\t- Re-fit using .fit(X, y, calculate_stats=True)\n" 327 | msg += "\t- Re-inititialize using LogisticRegression(calculate_stats=True)" 328 | raise AssertionError(msg) 329 | 330 | data = { 331 | "Coef.": (self.intercept_.tolist() + self.coef_.tolist()[0]), 332 | "Std.Err": (self.std_err_intercept_.tolist() + self.std_err_coef_.tolist()), 333 | "z": (self.z_intercept_.tolist() + self.z_coef_.tolist()[0]), 334 | "P>|z|": (self.p_val_intercept_.tolist() + self.p_val_coef_.tolist()[0]), 335 | } 336 | 337 | stats = pd.DataFrame(data, index=self.names_) 338 | stats["[ 0.025"] = stats["Coef."] - 1.96 * stats["Std.Err"] 339 | stats["0.975 ]"] = stats["Coef."] + 1.96 * stats["Std.Err"] 340 | 341 | stats["VIF"] = self.vif 342 | 343 | return stats 344 | 345 | @staticmethod 346 | def convert_sparse_matrix(x): 347 | """ 348 | Converts a sparse matrix to a numpy array. 349 | This can prevent problems arising from, e.g. OneHotEncoder. 
350 | Args: 351 | x: numpy array, sparse matrix 352 | Returns: 353 | numpy array of x 354 | """ 355 | if scipy.sparse.issparse(x): 356 | return x.toarray() 357 | else: 358 | return x 359 | 360 | def plot_weights(self, save=None, figsize=(15, 8), fontsize=14, color=["#2639E9", "#F76E6C", "#FE7715"]): 361 | summary = self.summary() 362 | 363 | x = summary["Coef."] 364 | y = summary.index 365 | lower_error = summary["Coef."] - summary["[ 0.025"] 366 | upper_error = summary["0.975 ]"] - summary["Coef."] 367 | 368 | fig, ax = plt.subplots(1, 1, figsize=figsize) 369 | ax.errorbar(x, y, xerr=[lower_error, upper_error], fmt="o", ecolor=color[0], elinewidth=2, capthick=2, capsize=4, ms=6, mfc=color[0], mec=color[0]) 370 | # ax.tick_params(axis='x', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize) 371 | # ax.tick_params(axis='y', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize) 372 | ax.axvline(0, color=color[0], linestyle='--', ymax=len(y), alpha=0.5) 373 | ax.spines['top'].set_color(color[0]) 374 | ax.spines['bottom'].set_color(color[0]) 375 | ax.spines['right'].set_color(color[0]) 376 | ax.spines['left'].set_color(color[0]) 377 | ax.spines['top'].set_visible(False) 378 | ax.spines['right'].set_visible(False) 379 | 380 | ax.set_title("Regression Meta Analysis - Weight Plot", fontsize=fontsize, fontweight="bold") 381 | ax.set_xlabel("Weight Estimates", fontsize=fontsize, weight="bold") 382 | ax.set_ylabel("Variable", fontsize=fontsize, weight="bold") 383 | 384 | if save: 385 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 386 | os.makedirs(os.path.dirname(save)) 387 | 388 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight") 389 | 390 | return fig 391 | 392 | # def plot_weights(self, save=None): 393 | # """ 394 | # Generates a weight plot(plotly chart) from `stats` 395 | # Example: 396 | # ``` 397 | # pipeline = Pipeline([ 398 | # ('clf', LogisticRegression(calculate_stats=True)) 399 | # ]) 400 | # pipeline.fit(X, y) 401 | # stats = pipeline.named_steps['clf'].plot_weights() 402 | # ``` 403 | # Args: 404 | # stats: The statistics to display 405 | # format: The format of the image, such as 'png'. The default None returns a plotly image. 
406 | # scale: If format is specified, the scale of the image 407 | # width: If format is specified, the width of the image 408 | # height: If format is specified, the image of the image 409 | # """ 410 | # stats = self.summary() 411 | 412 | # fig = go.Figure() 413 | 414 | # fig.add_trace( 415 | # go.Scatter( 416 | # x=stats['Coef.'], 417 | # y=stats['Coef.'].index, 418 | # line=dict(color='#2639E9', width=2), 419 | # mode='markers', 420 | 421 | # error_x=dict( 422 | # type='data', 423 | # symmetric=False, 424 | # array=stats['0.975 ]'] - stats['Coef.'], 425 | # arrayminus=stats['Coef.'] - stats['[ 0.025'], 426 | # color='#2639E9') 427 | # ) 428 | # ) 429 | 430 | # fig.add_shape(type="line", 431 | # x0=0, y0=0, x1=0, y1=len(stats), 432 | # line=dict(color="#a29bfe", width=3, dash='dash') 433 | # ) 434 | 435 | # fig.update_layout( 436 | # title='Regression Meta Analysis - Weight Plot', 437 | # xaxis_title='Weight Estimates', 438 | # yaxis_title='Variable', 439 | # xaxis_showgrid=False, 440 | # yaxis_showgrid=False 441 | # ) 442 | 443 | # fig.update_layout(template="simple_white") 444 | 445 | # if save: 446 | # write_image(fig, save) 447 | 448 | # return fig 449 | 450 | 451 | class ScoreCard(toad.ScoreCard, TransformerMixin): 452 | 453 | def __init__(self, target="target", pdo=60, rate=2, base_odds=35, base_score=750, combiner={}, transer=None, pretrain_lr=None, pipeline=None, **kwargs): 454 | """ 455 | 评分卡模型转换 456 | 457 | Args: 458 | target: 数据集中标签名称,默认 target 459 | pdo: odds 每增加 rate 倍时减少 pdo 分,默认 60 460 | rate: 倍率 461 | base_odds: 基础 odds,通常根据业务经验设置的基础比率(违约概率/正常概率),估算方法:(1-样本坏客户占比)/坏客户占比,默认 35,即 35:1 => 0.972 => 坏样本率 2.8% 462 | base_score: 基础 odds 对应的分数,默认 750 463 | combiner: 分箱转换器,传入 pipeline 时可以为None 464 | transer: woe转换器,传入 pipeline 时可以为None 465 | pretrain_lr: 预训练好的逻辑回归模型,可以不传 466 | pipeline: 训练好的 pipeline,必须包含 Combiner 和 WOETransformer 467 | **kwargs: 其他相关参数,具体参考 toad.ScoreCard 468 | """ 469 | if pipeline: 470 | combiner = self.class_steps(pipeline, Combiner)[0] 471 | transer = self.class_steps(pipeline, WOETransformer)[0] 472 | 473 | if self.class_steps(pipeline, (ITLubberLogisticRegression, LogisticRegression)): 474 | pretrain_lr = self.class_steps(pipeline, (ITLubberLogisticRegression, LogisticRegression))[0] 475 | 476 | super().__init__( 477 | combiner=combiner.combiner if isinstance(combiner, Combiner) else combiner, transer=transer.transformer if isinstance(transer, WOETransformer) else transer, 478 | pdo=pdo, rate=rate, base_odds=base_odds, base_score=base_score, **kwargs 479 | ) 480 | 481 | self.target = target 482 | self.pipeline = pipeline 483 | self.pretrain_lr = pretrain_lr 484 | 485 | def fit(self, x): 486 | y = x[self.target] 487 | x = x.drop(columns=[self.target]) 488 | 489 | self._feature_names = x.columns.tolist() 490 | 491 | for f in self.features_: 492 | if f not in self.transer: 493 | raise Exception('column \'{f}\' is not in transer'.format(f = f)) 494 | 495 | if self.pretrain_lr: 496 | self.model = self.pretrain_lr 497 | else: 498 | self.model.fit(x, y) 499 | 500 | self.rules = self._generate_rules() 501 | 502 | sub_score = self.woe_to_score(x) 503 | self.base_effect = pd.Series(np.median(sub_score, axis=0), index = self.features_) 504 | 505 | return self 506 | 507 | def transform(self, x): 508 | return self.predict(x) 509 | 510 | def scorecard_scale(self): 511 | scorecard_kedu = pd.DataFrame( 512 | [ 513 | ["base_odds", self.base_odds, "根据业务经验设置的基础比率(违约概率/正常概率),估算方法:(1-样本坏客户占比)/坏客户占比"], 514 | ["base_score", self.base_score, "基础ODDS对应的分数"], 515 | ["rate", 
self.rate, "设置分数的倍率"], 516 | ["pdo", self.pdo, "表示分数增长PDO时,ODDS值增长到RATE倍"], 517 | ["B", self.offset, "补偿值,计算方式:pdo / ln(rate)"], 518 | ["A", self.factor, "刻度,计算方式:base_score - B * ln(base_odds)"], 519 | ], 520 | columns=["刻度项", "刻度值", "备注"], 521 | ) 522 | return scorecard_kedu 523 | 524 | @staticmethod 525 | def KS_bucket(y_pred, y_true, bucket=10, method="quantile"): 526 | return toad.metrics.KS_bucket(y_pred, y_true, bucket=bucket, method=method) 527 | 528 | @staticmethod 529 | def KS(y_pred, y_true): 530 | return toad.metrics.KS(y_pred, y_true) 531 | 532 | @staticmethod 533 | def AUC(y_pred, y_true): 534 | return toad.metrics.AUC(y_pred, y_true) 535 | 536 | @staticmethod 537 | def perf_eva(y_pred, y_true, title="", plot_type=["ks", "roc"], save=None, figsize=(14, 6)): 538 | # plt.figure(figsize=figsize) 539 | rt = sc.perf_eva(y_true, y_pred, title=title, plot_type=plot_type, show_plot=True) 540 | 541 | if save: 542 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 543 | os.makedirs(os.path.dirname(save)) 544 | 545 | rt["pic"].savefig(save, dpi=240, format="png", bbox_inches="tight") 546 | 547 | return rt 548 | 549 | @staticmethod 550 | def ks_plot(score, target, title="", fontsize=14, figsize=(16, 8), save=None, colors=["#2639E9", "#F76E6C", "#FE7715"]): 551 | if np.mean(score) < 0 or np.mean(score) > 1: 552 | warnings.warn('Since the average of pred is not in [0,1], it is treated as predicted score but not probability.') 553 | score = -score 554 | 555 | df = pd.DataFrame({'label': target, 'pred': score}) 556 | def n0(x): return sum(x==0) 557 | def n1(x): return sum(x==1) 558 | df_ks = df.sort_values('pred', ascending=False).reset_index(drop=True) \ 559 | .assign(group=lambda x: np.ceil((x.index+1)/(len(x.index)/len(df.index)))) \ 560 | .groupby('group')['label'].agg([n0, n1]) \ 561 | .reset_index().rename(columns={'n0':'good','n1':'bad'}) \ 562 | .assign( 563 | group=lambda x: (x.index+1)/len(x.index), 564 | cumgood=lambda x: np.cumsum(x.good)/sum(x.good), 565 | cumbad=lambda x: np.cumsum(x.bad)/sum(x.bad) 566 | ).assign(ks=lambda x:abs(x.cumbad-x.cumgood)) 567 | 568 | fig, ax = plt.subplots(1, 2, figsize = figsize) 569 | 570 | # KS曲线 571 | dfks = df_ks.loc[lambda x: x.ks==max(x.ks)].sort_values('group').iloc[0] 572 | 573 | ax[0].plot(df_ks.group, df_ks.ks, color=colors[0], label="KS曲线") 574 | ax[0].plot(df_ks.group, df_ks.cumgood, color=colors[1], label="累积好客户占比") 575 | ax[0].plot(df_ks.group, df_ks.cumbad, color=colors[2], label="累积坏客户占比") 576 | ax[0].fill_between(df_ks.group, df_ks.cumbad, df_ks.cumgood, color=colors[0], alpha=0.25) 577 | 578 | ax[0].plot([dfks['group'], dfks['group']], [0, dfks['ks']], 'r--') 579 | ax[0].text(dfks['group'], dfks['ks'], f"KS: {round(dfks['ks'],4)} at: {dfks.group:.2%}", horizontalalignment='center', fontsize=fontsize) 580 | 581 | ax[0].spines['top'].set_color(colors[0]) 582 | ax[0].spines['bottom'].set_color(colors[0]) 583 | ax[0].spines['right'].set_color(colors[0]) 584 | ax[0].spines['left'].set_color(colors[0]) 585 | ax[0].set_xlabel('% of Population', fontsize=fontsize) 586 | ax[0].set_ylabel('% of Total Bad / Good', fontsize=fontsize) 587 | 588 | ax[0].set_xlim((0, 1)) 589 | ax[0].set_ylim((0, 1)) 590 | 591 | handles1, labels1 = ax[0].get_legend_handles_labels() 592 | 593 | ax[0].legend(loc='upper center', ncol=len(labels1), bbox_to_anchor=(0.5, 1.1), frameon=False) 594 | 595 | # ROC 曲线 596 | fpr, tpr, thresholds = roc_curve(target, score) 597 | auc_value = toad.metrics.AUC(score, target) 598 | 599 | ax[1].plot(fpr, tpr, 
color=colors[0], label="ROC Curve") 600 | ax[1].stackplot(fpr, tpr, color=colors[0], alpha=0.25) 601 | ax[1].plot([0, 1], [0, 1], color=colors[1], lw=2, linestyle=':') 602 | # ax[1].tick_params(axis='x', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize) 603 | # ax[1].tick_params(axis='y', labelrotation=0, grid_color="#FFFFFF", labelsize=fontsize) 604 | ax[1].text(0.5, 0.5, f"AUC: {auc_value:.4f}", fontsize=fontsize, horizontalalignment="center", transform=ax[1].transAxes) 605 | 606 | ax[1].spines['top'].set_color(colors[0]) 607 | ax[1].spines['bottom'].set_color(colors[0]) 608 | ax[1].spines['right'].set_color(colors[0]) 609 | ax[1].spines['left'].set_color(colors[0]) 610 | ax[1].set_xlabel("False Positive Rate", fontsize=fontsize) 611 | ax[1].set_ylabel('True Positive Rate', fontsize=fontsize) 612 | 613 | ax[1].set_xlim((0, 1)) 614 | ax[1].set_ylim((0, 1)) 615 | 616 | ax[1].yaxis.tick_right() 617 | ax[1].yaxis.set_label_position("right") 618 | 619 | handles2, labels2 = ax[1].get_legend_handles_labels() 620 | 621 | ax[1].legend(loc='upper center', ncol=len(labels2), bbox_to_anchor=(0.5, 1.1), frameon=False) 622 | 623 | if title: title += " " 624 | fig.suptitle(f"{title}K-S & ROC CURVE\n", fontsize=fontsize, fontweight="bold") 625 | 626 | plt.tight_layout() 627 | 628 | if save: 629 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 630 | os.makedirs(os.path.dirname(save)) 631 | 632 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight") 633 | 634 | return fig 635 | 636 | @staticmethod 637 | def PSI(y_pred_train, y_pred_oot): 638 | return toad.metrics.PSI(y_pred_train, y_pred_oot) 639 | 640 | @staticmethod 641 | def perf_psi(y_pred_train, y_pred_oot, y_true_train, y_true_oot, keys=["train", "test"], x_limits=None, x_tick_break=50, show_plot=True, return_distr_dat=False): 642 | return sc.perf_psi( 643 | score = {keys[0]: y_pred_train, keys[1]: y_pred_oot}, 644 | label = {keys[0]: y_true_train, keys[1]: y_true_oot}, 645 | x_limits = x_limits, 646 | x_tick_break = x_tick_break, 647 | show_plot = show_plot, 648 | return_distr_dat = return_distr_dat, 649 | ) 650 | 651 | @staticmethod 652 | def score_hist(score, y_true, figsize=(15, 10), bins=20, alpha=1, save=None): 653 | fig, ax = plt.subplots(1, 1, figsize = figsize) 654 | palette = sns.diverging_palette(340, 267, n=2, s=100, l=40) 655 | 656 | sns.histplot( 657 | x=score, hue=y_true.replace({0: "good", 1: "bad"}), element="step", stat="density", bins=bins, common_bins=True, common_norm=True, palette=palette, ax=ax 658 | ) 659 | 660 | sns.despine() 661 | 662 | ax.spines['top'].set_color("#2639E9") 663 | ax.spines['bottom'].set_color("#2639E9") 664 | ax.spines['right'].set_color("#2639E9") 665 | ax.spines['left'].set_color("#2639E9") 666 | 667 | ax.set_xlabel("score") 668 | ax.set_ylabel("density") 669 | 670 | ax.legend(["坏样本", "好样本"], loc='upper center', ncol=len(y_true.unique()), bbox_to_anchor=(0.5, 1.05), frameon=False, fontsize=14) 671 | 672 | fig.tight_layout() 673 | 674 | if save: 675 | if os.path.dirname(save) and not os.path.exists(os.path.dirname(save)): 676 | os.makedirs(os.path.dirname(save)) 677 | 678 | plt.savefig(save, dpi=240, format="png", bbox_inches="tight") 679 | 680 | return fig 681 | 682 | def _format_rule(self, rule, decimal = 2, **kwargs): 683 | bins = self.format_bins(rule['bins']) 684 | scores = np.around(rule['scores'], decimals = decimal).tolist() 685 | 686 | return dict(zip(bins, scores)) 687 | 688 | @staticmethod 689 | def class_steps(pipeline, query): 690 | return [v for k, v 
in pipeline.named_steps.items() if isinstance(v, query)] 691 | 692 | @staticmethod 693 | def round_float(num, decimal = 4): 694 | if not pd.isnull(num) and isinstance(num, float): 695 | return float(str(num).split(".")[0] + "." + str(num).split(".")[1][:decimal]) 696 | else: 697 | return num 698 | 699 | def feature_bins(self, bins, decimal = 4): 700 | if isinstance(bins, list): bins = np.array(bins) 701 | EMPTYBINS = len(bins) if not isinstance(bins[0], (set, list, np.ndarray)) else -1 702 | 703 | l = [] 704 | if np.issubdtype(bins.dtype, np.number): 705 | has_empty = len(bins) > 0 and np.isnan(bins[-1]) 706 | if has_empty: bins = bins[:-1] 707 | sp_l = ["负无穷"] + [self.round_float(b, decimal=decimal) for b in bins.tolist()] + ["正无穷"] 708 | for i in range(len(sp_l) - 1): l.append('['+str(sp_l[i])+' , '+str(sp_l[i+1])+')') 709 | if has_empty: l.append('缺失值') 710 | else: 711 | for keys in bins: 712 | keys_update = set() 713 | for key in keys: 714 | if pd.isnull(key) or key == "nan": 715 | keys_update.add("缺失值") 716 | elif key.strip() == "": 717 | keys_update.add("空字符串") 718 | else: 719 | keys_update.add(key) 720 | label = ','.join(keys_update) 721 | l.append(label) 722 | 723 | return {i if b != "缺失值" else EMPTYBINS: b for i, b in enumerate(l)} 724 | 725 | def feature_bin_stats(self, data, feature, target="target", rules={}, empty_separate=True, method='step', max_n_bins=10, clip_v=None, desc="评分卡分数", verbose=0, combiner=None, ks=False): 726 | if method not in ['dt', 'chi', 'quantile', 'step', 'kmeans', 'cart']: 727 | raise ValueError("method must be one of ['dt', 'chi', 'quantile', 'step', 'kmeans', 'cart']") 728 | 729 | if combiner is None: 730 | combiner = toad.transform.Combiner() 731 | 732 | if method == "cart": 733 | x = data[feature].values 734 | y = data[target] 735 | _combiner = OptimalBinning(feature, dtype="numerical", max_n_bins=max_n_bins, monotonic_trend="auto_asc_desc", gamma=0.01).fit(x, y) 736 | if _combiner.status == "OPTIMAL": 737 | rules.update({feature: [s.tolist() if isinstance(s, np.ndarray) else s for s in _combiner.splits] + [np.nan]}) 738 | else: 739 | if method == "step": 740 | combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, n_bins=max_n_bins, clip_v=clip_v) 741 | else: 742 | combiner.fit(data[[feature, target]], target, empty_separate=empty_separate, method=method, n_bins=max_n_bins) 743 | 744 | if verbose > 0: 745 | print(data[feature].describe()) 746 | 747 | if rules and isinstance(rules, list): rules = {feature: rules} 748 | if rules and isinstance(rules, dict): combiner.update(rules) 749 | 750 | feature_bin = combiner.export()[feature] 751 | feature_bin_dict = self.feature_bins(np.array(feature_bin)) 752 | 753 | df_bin = combiner.transform(data[[feature, target]], labels=False) 754 | 755 | table = df_bin[[feature, target]].groupby([feature, target]).agg(len).unstack() 756 | table.columns.name = None 757 | table = table.rename(columns = {0 : '好样本数', 1 : '坏样本数'}).fillna(0) 758 | if "好样本数" not in table.columns: 759 | table["好样本数"] = 0 760 | if "坏样本数" not in table.columns: 761 | table["坏样本数"] = 0 762 | 763 | table["指标名称"] = feature 764 | table["指标含义"] = desc 765 | table = table.reset_index().rename(columns={feature: "分箱"}) 766 | 767 | table['样本总数'] = table['好样本数'] + table['坏样本数'] 768 | table['样本占比'] = table['样本总数'] / table['样本总数'].sum() 769 | table['好样本占比'] = table['好样本数'] / table['好样本数'].sum() 770 | table['坏样本占比'] = table['坏样本数'] / table['坏样本数'].sum() 771 | table['坏样本率'] = table['坏样本数'] / table['样本总数'] 772 | 773 | table =
table.fillna(0.) 774 | 775 | table['分档WOE值'] = table.apply(lambda x : np.log(x['好样本占比'] / (x['坏样本占比'] + 1e-6)),axis=1) 776 | table['分档IV值'] = table.apply(lambda x : (x['好样本占比'] - x['坏样本占比']) * np.log(x['好样本占比'] / (x['坏样本占比'] + 1e-6)), axis=1) 777 | 778 | table = table.replace(np.inf, 0).replace(-np.inf, 0) 779 | 780 | table['指标IV值'] = table['分档IV值'].sum() 781 | 782 | table["LIFT值"] = table['坏样本率'] / (table["坏样本数"].sum() / table["样本总数"].sum()) 783 | table["累积LIFT值"] = (table['坏样本数'].cumsum() / table['样本总数'].cumsum()) / (table["坏样本数"].sum() / table["样本总数"].sum()) 784 | # table["累积LIFT值"] = table["LIFT值"].cumsum() 785 | 786 | if ks: 787 | table = table.sort_values("分箱") 788 | table["累积好样本数"] = table["好样本数"].cumsum() 789 | table["累积坏样本数"] = table["坏样本数"].cumsum() 790 | table["分档KS值"] = table["累积坏样本数"] / table['坏样本数'].sum() - table["累积好样本数"] / table['好样本数'].sum() 791 | 792 | table["分箱"] = table["分箱"].map(feature_bin_dict) 793 | table = table.set_index(['指标名称', '指标含义', '分箱']).reindex([(feature, desc, b) for b in feature_bin_dict.values()]).fillna(0).reset_index() 794 | 795 | if ks: 796 | return table[['指标名称', "指标含义", '分箱', '样本总数', '样本占比', '好样本数', '好样本占比', '坏样本数', '坏样本占比', '坏样本率', '分档WOE值', '分档IV值', '指标IV值', 'LIFT值', '累积LIFT值', '累积好样本数', '累积坏样本数', '分档KS值']] 797 | else: 798 | return table[['指标名称', "指标含义", '分箱', '样本总数', '样本占比', '好样本数', '好样本占比', '坏样本数', '坏样本占比', '坏样本率', '分档WOE值', '分档IV值', '指标IV值', 'LIFT值', '累积LIFT值']] 799 | 800 | 801 | if __name__ == '__main__': 802 | # https://github.com/itlubber/openpyxl-excel-style-template/blob/main/pipeline_model.py 803 | plt.ion() 804 | 805 | target = "creditability" 806 | data = sc.germancredit() 807 | data[target] = data[target].map({"good": 0, "bad": 1}) 808 | 809 | train, test = train_test_split(data, test_size=0.3, shuffle=True, stratify=data[target]) 810 | oot = data.copy() 811 | feature_pipeline = Pipeline([ 812 | ("preprocessing_select", FeatureSelection(target=target, engine="scorecardpy")), 813 | ("combiner", Combiner(target=target, min_samples=0.2)), 814 | ("transform", WOETransformer(target=target)), 815 | ("processing_select", FeatureSelection(target=target, engine="scorecardpy")), 816 | ("stepwise", StepwiseSelection(target=target)), 817 | ]) 818 | 819 | feature_pipeline.fit(train) 820 | 821 | woe_train = feature_pipeline.transform(train) 822 | woe_test = feature_pipeline.transform(test) 823 | woe_oot = feature_pipeline.transform(oot) 824 | 825 | # save all bin_plot 826 | _combiner = feature_pipeline.named_steps["combiner"] 827 | for col in woe_train.columns: 828 | if col != target: 829 | _combiner.bin_plot(train, col, labels=True, save=f"outputs/bin_plots/train_{col}.png") 830 | _combiner.bin_plot(test, col, labels=True, save=f"outputs/bin_plots/test_{col}.png") 831 | _combiner.bin_plot(oot, col, labels=True, save=f"outputs/bin_plots/oot_{col}.png") 832 | 833 | # logistic = StatsLogisticRegression(target=target) 834 | logistic = ITLubberLogisticRegression(target=target) 835 | 836 | logistic.fit(woe_train) 837 | 838 | y_pred_train = logistic.predict_proba(woe_train.drop(columns=target))[:, 1] 839 | y_pred_test = logistic.predict_proba(woe_test.drop(columns=target))[:, 1] 840 | y_pred_oot = logistic.predict_proba(woe_oot.drop(columns=target))[:, 1] 841 | 842 | # params_grid = { 843 | # # "logistic__C": [i / 1. 
for i in range(1, 10, 2)], 844 | # # "logistic__penalty": ["l2"], 845 | # # "logistic__class_weight": [None, "balanced"], # + [{1: i / 10.0, 0: 1 - i / 10.0} for i in range(1, 10)], 846 | # # "logistic__max_iter": [100], 847 | # # "logistic__solver": ["sag"] # ["liblinear", "sag", "lbfgs", "newton-cg"], 848 | # "logistic__intercept": [True, False], 849 | # } 850 | 851 | # clf = GridSearchCV(feature_pipeline, params_grid, cv=5, scoring='roc_auc', verbose=-1, n_jobs=2, return_train_score=True) 852 | # clf.fit(train, train[target]) 853 | 854 | # y_pred_train = clf.best_estimator_.predict(train) 855 | # y_pred_test = clf.best_estimator_.predict(test) 856 | 857 | # print(clf.best_params_) 858 | 859 | # model summary 860 | # logistic.summary_save() 861 | 862 | logistic.plot_weights(save="outputs/logistic_train.png") 863 | 864 | summary = logistic.summary().reset_index().rename(columns={"index": "Features"}) 865 | 866 | train_corr = logistic.corr(woe_train, save="outputs/train_corr.png") 867 | test_corr = logistic.corr(woe_test, save="outputs/test_corr.png") 868 | oot_corr = logistic.corr(woe_oot, save="outputs/oot_corr.png") 869 | 870 | train_report = logistic.report(woe_train) 871 | test_report = logistic.report(woe_test) 872 | oot_report = logistic.report(woe_oot) 873 | 874 | print("train: ", toad.metrics.KS(y_pred_train, train[target]), toad.metrics.AUC(y_pred_train, train[target])) 875 | print("test: ", toad.metrics.KS(y_pred_test, test[target]), toad.metrics.AUC(y_pred_test, test[target])) 876 | print("oot: ", toad.metrics.KS(y_pred_oot, oot[target]), toad.metrics.AUC(y_pred_oot, oot[target])) 877 | 878 | card = ScoreCard(target=target, pipeline=feature_pipeline, pretrain_lr=logistic) 879 | card.fit(woe_train) 880 | 881 | train["score"] = card.predict(train) 882 | test["score"] = card.predict(test) 883 | oot["score"] = card.predict(oot) 884 | 885 | card.perf_eva(train["score"], train[target], title="Train Dataset", save="outputs/train_ksplot.png") 886 | card.perf_eva(test["score"], test[target], title="Test Dataset", save="outputs/test_ksplot.png") 887 | card.perf_eva(oot["score"], oot[target], title="OOT Dataset", save="outputs/oot_ksplot.png") 888 | 889 | card.score_hist(train["score"], train[target], save="outputs/train_scorehist.png") 890 | card.score_hist(test["score"], test[target], save="outputs/test_scorehist.png") 891 | card.score_hist(oot["score"], oot[target], save="outputs/oot_scorehist.png") 892 | 893 | train_score_rank = card.feature_bin_stats(train, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step") 894 | test_score_rank = card.feature_bin_stats(test, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step") 895 | oot_score_rank = card.feature_bin_stats(oot, "score", target=target, rules=[i for i in range(400, 800, 50)], verbose=0, method="step") 896 | 897 | card_points = card.export(to_frame=True) 898 | 899 | writer = pd.ExcelWriter("outputs/评分卡结果验证表.xlsx", engine="openpyxl") 900 | 901 | summary.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=1, index=False) 902 | train_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + 5, index=False) 903 | test_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + 9, index=False) 904 | oot_report.to_excel(writer, sheet_name="逻辑回归拟合结果", startrow=len(summary) + len(train_report) + len(test_report) + 13, index=False) 905 | 906 | worksheet = writer.sheets['逻辑回归拟合结果'] 907 | worksheet.cell(row=1, column=1).value = 
"入模变量系数及相关统计指标" 908 | worksheet.cell(row=len(summary) + 5, column=1).value = "训练数据集模型预测报告" 909 | worksheet.cell(row=len(summary) + len(train_report) + 9, column=1).value = "测试数据集模型预测报告" 910 | worksheet.cell(row=len(summary) + len(train_report) + len(test_report) + 13, column=1).value = "跨时间验证集模型预测报告" 911 | 912 | train_corr.to_excel(writer, sheet_name="入模变量相关性", startrow=1, index=True) 913 | test_corr.to_excel(writer, sheet_name="入模变量相关性", startrow=len(train_corr) + 5, index=True) 914 | oot_corr.to_excel(writer, sheet_name="入模变量相关性", startrow=len(train_corr) + len(test_corr) + 9, index=True) 915 | 916 | worksheet = writer.sheets['入模变量相关性'] 917 | worksheet.cell(row=2, column=1).value = "训练数据集入模变量相关性" 918 | worksheet.cell(row=len(train_corr) + 6, column=1).value = "测试数据集入模变量相关性" 919 | worksheet.cell(row=len(train_corr) + len(test_corr) + 10, column=1).value = "跨时间验证集入模变量相关性" 920 | 921 | card_points.to_excel(writer, sheet_name="评分卡", index=False) 922 | 923 | train_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=1, index=False) 924 | test_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + 5, index=False) 925 | oot_score_rank.to_excel(writer, sheet_name="评分卡排序性", startrow=len(train_score_rank) + len(test_score_rank) + 9, index=False) 926 | 927 | worksheet = writer.sheets['评分卡排序性'] 928 | 929 | worksheet.cell(row=1, column=1).value = "训练数据集评分排序性" 930 | worksheet.cell(row=len(train_score_rank) + 5, column=1).value = "测试数据集评分排序性" 931 | worksheet.cell(row=len(train_score_rank) + len(test_score_rank) + 9, column=1).value = "跨时间验证集评分排序性" 932 | 933 | writer.close() 934 | 935 | from utils.tools import render_excel 936 | 937 | render_excel("outputs/评分卡结果验证表.xlsx", border=False) 938 | 939 | --------------------------------------------------------------------------------