├── utils
│   ├── __init__.py
│   ├── time_utils.py
│   ├── text_utils.py
│   └── data_utils.py
├── data
│   ├── graph_data
│   │   ├── graph_demo.adjlist
│   │   ├── readme.md
│   │   ├── order.x
│   │   ├── order.y
│   │   ├── order.graph
│   │   └── graph_demo.embeddings
│   ├── order_data.xlsx
│   ├── order_feas.xlsx
│   ├── model
│   │   ├── xgb_model.pkl
│   │   └── xgb_model.fmap
│   ├── README.md
│   ├── text_data
│   │   ├── test.txt
│   │   └── stopwords.txt
│   └── german_score.csv
├── chapter2
│   ├── ch2_00_german_credit.py
│   ├── ch2_13_fs_variation.py
│   ├── ch2_28_feature_extraction_mds.py
│   ├── ch2_12_woe_encoder.py
│   ├── ch2_25_feature_extraction_pca.py
│   ├── ch2_11_target_encoder.py
│   ├── ch2_27_feature_extraction_lle.py
│   ├── ch2_15_fs_corr_scipy.py
│   ├── ch2_02_toad_eda_detect.py
│   ├── ch2_04_preprocess_missing_value.py
│   ├── ch2_26_feature_extraction_lda.py
│   ├── ch2_18_fs_chi.py
│   ├── ch2_17_fs_iv.py
│   ├── ch2_16_fs_vif.py
│   ├── ch2_06_preprocess_value_bining.py
│   ├── ch2_14_fs_corr_pandas.py
│   ├── ch2_10_one_hot_based_category_encoders.py
│   ├── ch2_22_fs_select_from_model.py
│   ├── ch2_19_fs_stepwise.py
│   ├── ch2_21_fs_l1_norm.py
│   ├── ch2_31_model_deployment_pickle.py
│   ├── ch2_20_fs_rfe.py
│   ├── ch2_35_decision_tree.py
│   ├── ch2_29_p_to_score.py
│   ├── ch2_01_train_test_split.py
│   ├── ch2_05_preprocess_value_scaler.py
│   ├── ch2_08_ordinal_encode_based_category_encoders.py
│   ├── ch2_39_lightgbm.py
│   ├── ch2_23_fs_psi.py
│   ├── ch2_36_randomforest.py
│   ├── ch2_37_gbdt.py
│   ├── ch2_03_missrate_by_month.py
│   ├── ch2_09_one_hot_based_sklearn.py
│   ├── ch2_40_DNN_credit_data.py
│   ├── ch2_34_svm.py
│   ├── ch2_07_ordinal_encoder_based_sklearn.py
│   ├── ch2_30_validation_curve.py
│   ├── ch2_33_lr.py
│   ├── ch2_24_fs_badrate_by_month.py
│   ├── ch2_41_CNN_credit_data.py
│   ├── ch2_32_model_deployment_pmml.py
│   └── ch2_38_xgboost.py
├── chapter3
│   ├── ch3_07_jieba_demo.py
│   ├── ch3_13_random_walk.py
│   ├── ch3_10_fasttext_vec.py
│   ├── ch3_03_tsfresh_orders.py
│   ├── ch3_09_word2vec.py
│   ├── ch3_14_node2vec.py
│   ├── ch3_11_text_classifier_bayes.py
│   ├── ch3_05_gbdt_construct_feature.py
│   ├── ch3_04_feature_evaluation.py
│   ├── ch3_00_order_data_preprocess.py
│   ├── ch3_06_cluster_alg.py
│   ├── ch3_12_text_classifier_fasttext.py
│   ├── ch3_08_bag_of_words.py
│   ├── ch3_01_order_fea_gen_manual.py
│   ├── ch3_02_order_fea_gen_rfm_auto.py
│   └── ch3_15_gcn_order.py
├── 勘误.md
├── requirements.txt
├── chapter4
│   ├── ch4_03_rules_for_isolationforest.py
│   ├── ch4_00_rules_for_iv.py
│   ├── ch4_02_rules_for_decisiontree.py
│   ├── ch4_01_rules_for_outliers.py
│   ├── ch4_04_modelstrategy_for_optimization.py
│   └── ch4_02_rules_for_decisiontree.ipynb
├── README.md
├── .gitignore
└── LICENSE
/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/graph_data/graph_demo.adjlist: -------------------------------------------------------------------------------- 1 | 1 2 4 2 | 2 1 3 4 3 | 3 2 5 4 | 4 1 2 5 | 5 3 -------------------------------------------------------------------------------- /data/graph_data/readme.md: -------------------------------------------------------------------------------- 1 | - order.x: 特征 2 | - order.y: label 3 | - order.graph:图结构 4 | -------------------------------------------------------------------------------- /data/order_data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/order_data.xlsx -------------------------------------------------------------------------------- /data/order_feas.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/order_feas.xlsx -------------------------------------------------------------------------------- /data/graph_data/order.x: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/graph_data/order.x -------------------------------------------------------------------------------- /data/graph_data/order.y: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/graph_data/order.y -------------------------------------------------------------------------------- /data/model/xgb_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/model/xgb_model.pkl -------------------------------------------------------------------------------- /data/graph_data/order.graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/graph_data/order.graph -------------------------------------------------------------------------------- /chapter2/ch2_00_german_credit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scorecardpy as sc 4 | 5 | # 加载数据集 6 | german_credit_data = sc.germancredit() 7 | # 打印前5行, 前4列和最后一列 8 | print(german_credit_data.iloc[:5, list(range(-1, 4))]) -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | 数据说明: 2 | 1. text\_data: 文本相关数据 3 | 2. graph\_data: 图特征相关数据 4 | 3. model: 模型相关数据 5 | 4. order\_data.xlsx: 订单原始数据,RFM生成订单特征挖掘的输入 6 | 5. order\_feas.xlsx: 订单生产的特征,以及label逾期标签数据 7 | 6. 
german\_score.csv: 每个用户的模型得分及是否逾期 -------------------------------------------------------------------------------- /chapter3/ch3_07_jieba_demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # 结巴分词使用示例 4 | from utils.text_utils import cut_words 5 | 6 | text_demo = "通过资料审核与电话沟通用户审批通过借款金额10000元操作人小明审批时间2020年10月5日 经过电话核实用户确认所有资料均为本人提交提交时间2020年11月5日用户当前未逾期" 7 | segs = cut_words(text_demo) 8 | print("原文: ", text_demo) 9 | print("切词后的结果:", list(segs)) 10 | -------------------------------------------------------------------------------- /chapter2/ch2_13_fs_variation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | from utils import data_utils 8 | from scipy.stats import variation 9 | 10 | # 导入数值型样例数据 11 | all_x_y = data_utils.get_all_x_y() 12 | x = all_x_y.drop(data_utils.label, axis=1) 13 | # 计算各个特征的变异系数 14 | x_var = variation(x, nan_policy='omit') 15 | result = dict(zip(x.columns ,x_var)) 16 | print("变异系数结果: \n", result) -------------------------------------------------------------------------------- /勘误.md: -------------------------------------------------------------------------------- 1 | # 书籍勘误 2 | * P16:修正图2-7和2-8,删除201903和MOB12对应的数字,以及201906和MOB9对应的数字,曲线同时修改 3 | * P51:图2-24,“字数”修订为“总客户数” 4 | * P74:第一行"分类任务采用逻辑损失函数"修改为"学习任务为二分类任务";第二行"构建1000可决策树"修改为"采用传统梯度提升树方法构建弱学习器";第四、五行"10%"修改"100%" 5 | * P108:data_preprocess下注释,data和back_time后面","修改为": " 6 | * P114:表3-9邻接矩阵第2行第1列数字"0"修改为数字"1",拉普拉斯矩阵中第2行第1列"0"以及这一行的"1"均修改为"-1",第3行最后一个"1"修改为"-1" 7 | * P116:代码优化,采用本代码库中代码 8 | * P119-122:代码优化,采用本代码库中代码 9 | * P126:表格3-12中的"0.11"修改为"0.1" 10 | * P138:代码块下方第一句应删除“在Word2Vec” 11 | -------------------------------------------------------------------------------- /data/graph_data/graph_demo.embeddings: -------------------------------------------------------------------------------- 1 | 5 8 2 | 2 -0.1042217 0.20679143 0.2427775 0.72288877 0.31595996 0.39856002 -0.58908206 0.17186889 3 | 4 -0.030496102 0.10907486 0.1395583 0.6750262 0.29104736 0.3746776 -0.6623706 0.23489219 4 | 1 0.01014287 0.13469146 0.18281066 0.77614796 0.22247209 0.28817466 -0.7041028 0.21778071 5 | 3 0.0357248 0.16088022 0.12562852 0.7775058 0.34339327 0.32849503 -0.74697566 0.15016618 6 | 5 -0.05321218 0.1133668 0.20365518 0.81072015 0.28992698 0.28150764 -0.71600515 0.17657782 7 | -------------------------------------------------------------------------------- /chapter2/ch2_28_feature_extraction_mds.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import pandas as pd 5 | sys.path.append("./") 6 | sys.path.append("../") 7 | 8 | from utils import data_utils 9 | from sklearn.manifold import MDS 10 | 11 | 12 | # 导入数值型样例数据 13 | all_x_y = data_utils.get_all_x_y() 14 | x = all_x_y.drop(data_utils.label, axis=1) 15 | mds = MDS(n_components=10) 16 | x_new = mds.fit_transform(x) 17 | x_new_df = pd.DataFrame(x_new) 18 | print("利用sklearn进行MDS特征提取结果: \n", x_new_df) 19 | -------------------------------------------------------------------------------- /chapter2/ch2_12_woe_encoder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | from utils import data_utils 8 | from category_encoders.woe import WOEEncoder 9 | 10 | # 
加载数据 11 | german_credit_data = data_utils.get_data() 12 | y = german_credit_data['creditability'] 13 | x = german_credit_data[['purpose', 'personal.status.and.sex']] 14 | 15 | # WOE编码 16 | encoder = WOEEncoder(cols=x.columns) 17 | result = encoder.fit_transform(x, y) 18 | print("WOE编码结果: \n", result) 19 | -------------------------------------------------------------------------------- /chapter2/ch2_25_feature_extraction_pca.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import toad 5 | import pandas as pd 6 | sys.path.append("./") 7 | sys.path.append("../") 8 | 9 | from utils import data_utils 10 | from sklearn.decomposition import PCA 11 | 12 | 13 | # 导入数值型样例数据 14 | all_x_y = data_utils.get_all_x_y() 15 | x = all_x_y.drop(data_utils.label, axis=1) 16 | pca = PCA(n_components=0.9) 17 | x_new = pca.fit_transform(x) 18 | x_new_df = pd.DataFrame(x_new) 19 | print("利用sklearn进行PCA特征提取, 保留90%信息后结果: \n", x_new_df) 20 | -------------------------------------------------------------------------------- /chapter2/ch2_11_target_encoder.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | from utils import data_utils 8 | from category_encoders.target_encoder import TargetEncoder 9 | 10 | 11 | # 加载数据 12 | german_credit_data = data_utils.get_data() 13 | y = german_credit_data['creditability'] 14 | x = german_credit_data[['purpose', 'personal.status.and.sex']] 15 | # 目标编码 16 | enc = TargetEncoder(cols=x.columns) 17 | result = enc.fit_transform(x, y) 18 | print("目标编码结果: \n", result) 19 | -------------------------------------------------------------------------------- /chapter2/ch2_27_feature_extraction_lle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import pandas as pd 5 | sys.path.append("./") 6 | sys.path.append("../") 7 | 8 | from utils import data_utils 9 | from sklearn.manifold import LocallyLinearEmbedding 10 | 11 | # 导入数值型样例数据 12 | all_x_y = data_utils.get_all_x_y() 13 | x = all_x_y.drop(data_utils.label, axis=1) 14 | lle = LocallyLinearEmbedding(n_neighbors=5, n_components=10) 15 | x_new = lle.fit_transform(x) 16 | x_new_df = pd.DataFrame(x_new) 17 | print("利用sklearn进行LLE特征提取结果: \n", x_new_df) 18 | -------------------------------------------------------------------------------- /chapter2/ch2_15_fs_corr_scipy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | from utils import data_utils 8 | from scipy.stats import pearsonr 9 | 10 | 11 | # 导入数值型样例数据 12 | all_x_y = data_utils.get_all_x_y() 13 | x = all_x_y.drop(data_utils.label, axis=1) 14 | x1, x2 = x.loc[:, 'age.in.years'], x.loc[:, 'credit.history',] 15 | r, p_value = pearsonr(x1, x2) 16 | print("scipy库计算 特征'age.in.years'和'credit.history'的pearson相关系数 \n", 17 | "pearson相关系数: %s, \n" % r, "p_value: %s" % p_value) 18 | -------------------------------------------------------------------------------- /chapter2/ch2_02_toad_eda_detect.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import toad 8 | from utils import data_utils 9 | 10 | # 加载数据集 11 | 
german_credit_data = data_utils.get_data() 12 | detect_res = toad.detector.detect(german_credit_data) 13 | # 打印前5行, 前4列 14 | 15 | print("前5行, 前4列:") 16 | print(detect_res.iloc[:5, :4]) 17 | print("前5行, 第5至9列:") 18 | # 打印前5行, 第5至9列 19 | print(detect_res.iloc[:5, 4:9]) 20 | # 打印前5行, 第10至14列 21 | print("前5行, 第10至14列:") 22 | print(detect_res.iloc[:5, 9:]) 23 | 24 | -------------------------------------------------------------------------------- /chapter2/ch2_04_preprocess_missing_value.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from utils import data_utils 10 | from sklearn.impute import SimpleImputer 11 | 12 | # 导入数值型样例数据 13 | data = data_utils.get_data() 14 | # 缺失值处理 15 | imp = SimpleImputer(missing_values=np.nan, strategy='mean') 16 | imped_data = imp.fit_transform(data[data_utils.numeric_cols]) 17 | imped_df = pd.DataFrame(imped_data, columns=data_utils.numeric_cols) 18 | print("缺失值填充结果: \n", imped_df) 19 | -------------------------------------------------------------------------------- /chapter2/ch2_26_feature_extraction_lda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import pandas as pd 5 | sys.path.append("./") 6 | sys.path.append("../") 7 | 8 | from utils import data_utils 9 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 10 | 11 | # 导入数值型样例数据 12 | all_x_y = data_utils.get_all_x_y() 13 | x = all_x_y.drop(data_utils.label, axis=1) 14 | y = all_x_y[data_utils.label] 15 | lda = LinearDiscriminantAnalysis(n_components=1) 16 | x_new = lda.fit_transform(x, y) 17 | x_new_df = pd.DataFrame(x_new) 18 | print("利用sklearn进行LDA特征提取结果: \n", x_new_df) 19 | -------------------------------------------------------------------------------- /chapter2/ch2_18_fs_chi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | from utils import data_utils 8 | from sklearn.feature_selection import chi2 9 | from sklearn.feature_selection import SelectKBest 10 | 11 | # 导入数值型样例数据 12 | all_x_y = data_utils.get_all_x_y() 13 | y = all_x_y.pop(data_utils.label) 14 | # 选择K个最好的特征,返回选择特征后的数据 15 | fs_chi = SelectKBest(chi2, k=5) 16 | fs_chi.fit(all_x_y, y) 17 | x_new = fs_chi.transform(all_x_y) 18 | 19 | selected_cols = all_x_y.columns[fs_chi.get_support()].tolist() 20 | print("卡方检验筛选得到%s个特征: \n" % len(selected_cols), selected_cols) 21 | -------------------------------------------------------------------------------- /data/model/xgb_model.fmap: -------------------------------------------------------------------------------- 1 | 0 status.of.existing.checking.account q 2 | 1 credit.history q 3 | 2 savings.account.and.bonds q 4 | 3 present.employment.since q 5 | 4 personal.status.and.sex q 6 | 5 other.debtors.or.guarantors q 7 | 6 property q 8 | 7 other.installment.plans q 9 | 8 housing q 10 | 9 job q 11 | 10 telephone q 12 | 11 foreign.worker q 13 | 12 purpose q 14 | 13 duration.in.month q 15 | 14 credit.amount q 16 | 15 age.in.years q 17 | 16 present.residence.since q 18 | 17 number.of.existing.credits.at.this.bank q 19 | 18 installment.rate.in.percentage.of.disposable.income q 20 | 19 number.of.people.being.liable.to.provide.maintenance.for q 21 | 
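The feature map above follows the XGBoost fmap convention: one record per feature with its index, name, and field type ("q" marking a quantitative feature). As a minimal sketch that is not part of the original repository, the snippet below shows how such a file could be regenerated from a list of column names; the helper name `write_fmap` and the output path are illustrative assumptions.

```python
# Sketch: write an XGBoost-style feature map (index, name, type) from column
# names, mirroring the layout of data/model/xgb_model.fmap ("q" = quantitative).
def write_fmap(feature_names, path="xgb_model.fmap"):
    with open(path, "w") as f:
        for i, name in enumerate(feature_names):
            # one whitespace-separated record per feature
            f.write("{0}\t{1}\tq\n".format(i, name))

# Hypothetical usage: a file written this way can be passed to XGBoost helpers
# such as Booster.get_score(fmap=...) or xgboost.plot_importance(fmap=...)
# so that importance reports show readable feature names.
```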
-------------------------------------------------------------------------------- /chapter2/ch2_17_fs_iv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import toad 5 | sys.path.append("./") 6 | sys.path.append("../") 7 | 8 | from utils import data_utils 9 | 10 | # 导入数值型样例数据 11 | all_x_y = data_utils.get_all_x_y() 12 | # 利用toad库quality()方法计算IV 13 | var_iv = toad.quality(all_x_y, 14 | target='creditability', 15 | method='quantile', 16 | n_bins=6, 17 | iv_only=True) 18 | 19 | selected_cols = var_iv[var_iv.iv > 0.1].index.tolist() 20 | print("各特征的iv值计算结果: \n", var_iv) 21 | print("设置iv阈值为0.1, 筛选得到%s个特征: \n" % len(selected_cols), selected_cols) 22 | -------------------------------------------------------------------------------- /chapter2/ch2_16_fs_vif.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | from utils import data_utils 8 | from statsmodels.stats.outliers_influence import variance_inflation_factor 9 | 10 | # 导入数值型样例数据 11 | all_x_y = data_utils.get_all_x_y() 12 | x = all_x_y.drop(data_utils.label, axis=1) 13 | vif = [variance_inflation_factor(x.values, ix) for ix in range(x.shape[1])] 14 | print("各特征的vif值计算结果: \n", dict(zip(x.columns, vif))) 15 | 16 | # 筛选阈值小于10的特征 17 | selected_cols = x.iloc[:, [f < 10 for f in vif]].columns.tolist() 18 | print("设置vif阈值为10, 筛选得到%s个特征: \n" % len(selected_cols), selected_cols) 19 | -------------------------------------------------------------------------------- /chapter2/ch2_06_preprocess_value_bining.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import toad 8 | from toad.plot import bin_plot 9 | from utils import data_utils 10 | 11 | german_credit_data = data_utils.get_data() 12 | # 利用toad库等频分箱 13 | # 初始化分箱对象 14 | c = toad.transform.Combiner() 15 | c.fit(german_credit_data[data_utils.x_cols], 16 | y=german_credit_data[data_utils.label], n_bins=6, method='quantile', empty_separate=True) 17 | # 特征age.in.years分箱结果画图 18 | data_binned = c.transform(german_credit_data, labels=True) 19 | bin_plot(data_binned, x='age.in.years', target=data_utils.label) 20 | -------------------------------------------------------------------------------- /chapter2/ch2_14_fs_corr_pandas.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | from utils import data_utils 8 | 9 | 10 | # 导入数值型样例数据 11 | all_x_y = data_utils.get_all_x_y() 12 | x = all_x_y.drop(data_utils.label, axis=1) 13 | # 利用pandas库计算相关系数 14 | # pearson相关系数 15 | pearson_corr = x.corr(method='pearson') 16 | print("pandas库计算 pearson相关系数: \n", pearson_corr) 17 | # spearman相关系数 18 | spearman_corr = x.corr(method='spearman') 19 | print("pandas库计算 spearman相关系数: \n", spearman_corr) 20 | # kendall相关系数 21 | kendall_corr = x.corr(method='kendall') 22 | print("pandas库计算 kendall相关系数: \n", kendall_corr) 23 | -------------------------------------------------------------------------------- /chapter2/ch2_10_one_hot_based_category_encoders.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 
| from utils import data_utils 8 | from category_encoders.one_hot import OneHotEncoder 9 | 10 | 11 | # 加载数据 12 | german_credit_data = data_utils.get_data() 13 | # 初始化OneHotEncoder类 14 | encoder = OneHotEncoder(cols=['purpose', 'personal.status.and.sex'], 15 | handle_unknown='indicator', 16 | handle_missing='indicator', 17 | use_cat_names=True) 18 | # 转换数据集 19 | result = encoder.fit_transform(german_credit_data) 20 | print("one-hot编码结果: \n", result) -------------------------------------------------------------------------------- /chapter2/ch2_22_fs_select_from_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import toad 5 | sys.path.append("./") 6 | sys.path.append("../") 7 | 8 | from utils import data_utils 9 | from sklearn.feature_selection import SelectFromModel 10 | from sklearn.ensemble import GradientBoostingClassifier 11 | 12 | # 导入数值型样例数据 13 | all_x_y = data_utils.get_all_x_y() 14 | y = all_x_y.pop(data_utils.label) 15 | x = all_x_y 16 | # GBDT作为基模型的特征选择 17 | sf = SelectFromModel(GradientBoostingClassifier()) 18 | x_new = sf.fit_transform(x, y) 19 | 20 | selected_cols = x.columns[sf.get_support()].tolist() 21 | print("基于树模型筛选得到%s个特征: \n" % len(selected_cols), selected_cols) 22 | -------------------------------------------------------------------------------- /chapter3/ch3_13_random_walk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 使用DeepWalk算法生成特征(可以直接在shell命令窗口中运行deepwalk命令) 4 | """ 5 | 6 | import os 7 | import sys 8 | import pandas as pd 9 | sys.path.append("./") 10 | sys.path.append("../") 11 | 12 | size = 8 13 | os.system( 14 | "deepwalk --input data/graph_data/graph_demo.adjlist " 15 | f"--output data/graph_data/graph_demo.embeddings --representation-size {size}") 16 | 17 | fea_vec = pd.read_csv('data/graph_data/graph_demo.embeddings', sep=' ', skiprows=1, index_col=0, 18 | names=['fea_%s' % i for i in range(size)]).sort_index() 19 | print('词向量维度:', fea_vec.shape) 20 | print('词向量结果:', fea_vec) 21 | -------------------------------------------------------------------------------- /chapter2/ch2_19_fs_stepwise.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import toad 5 | sys.path.append("./") 6 | sys.path.append("../") 7 | 8 | from utils import data_utils 9 | 10 | # 导入数值型样例数据 11 | all_x_y = data_utils.get_all_x_y() 12 | final_data = toad.selection.stepwise(all_x_y, 13 | target=data_utils.label, 14 | estimator='lr', 15 | direction='both', 16 | criterion='aic', 17 | return_drop=False) 18 | selected_cols = final_data.columns 19 | print("通过stepwise筛选得到%s个特征: \n" % len(selected_cols), selected_cols) 20 | -------------------------------------------------------------------------------- /chapter2/ch2_21_fs_l1_norm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import toad 5 | sys.path.append("./") 6 | sys.path.append("../") 7 | 8 | from utils import data_utils 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.feature_selection import SelectFromModel 11 | 12 | # 导入数值型样例数据 13 | all_x_y = data_utils.get_all_x_y() 14 | y = all_x_y.pop(data_utils.label) 15 | x = all_x_y 16 | # 带L1惩罚项的逻辑回归作为基模型的特征选择 17 | LR = LogisticRegression(penalty='l1', C=0.1, solver='liblinear') 18 | sf = SelectFromModel(LR) 19 | x_new = 
sf.fit_transform(x, y) 20 | 21 | selected_cols = x.columns[sf.get_support()].tolist() 22 | print("基于L1范数筛选得到%s个特征: \n" % len(selected_cols), selected_cols) 23 | -------------------------------------------------------------------------------- /chapter2/ch2_31_model_deployment_pickle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import numpy as np 5 | import pandas as pd 6 | sys.path.append("./") 7 | sys.path.append("../") 8 | 9 | 10 | # Pickle方式保存和读取模型 11 | def save_model_as_pkl(model, path): 12 | """ 13 | 保存模型到路径path 14 | :param model: 训练完成的模型 15 | :param path: 保存的目标路径 16 | """ 17 | import pickle 18 | with open(path, 'wb') as f: 19 | pickle.dump(model, f, protocol=2) 20 | 21 | 22 | def load_model_from_pkl(path): 23 | """ 24 | 从路径path加载模型 25 | :param path: 保存的目标路径 26 | """ 27 | import pickle 28 | with open(path, 'rb') as f: 29 | model = pickle.load(f) 30 | return model 31 | 32 | -------------------------------------------------------------------------------- /chapter2/ch2_20_fs_rfe.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import toad 5 | sys.path.append("./") 6 | sys.path.append("../") 7 | 8 | from utils import data_utils 9 | from sklearn.feature_selection import RFE 10 | from sklearn.linear_model import LogisticRegression 11 | 12 | 13 | # 导入数值型样例数据 14 | all_x_y = data_utils.get_all_x_y() 15 | y = all_x_y.pop(data_utils.label) 16 | x = all_x_y 17 | # 递归特征消除法,返回特征选择后的数据 18 | # 参数estimator为基模型 19 | # 参数n_features_to_select为选择的特征个数 20 | rfe = RFE(estimator=LogisticRegression(), n_features_to_select=10) 21 | x_new = rfe.fit_transform(x, y) 22 | 23 | selected_cols = x.columns[rfe.get_support()].tolist() 24 | print("通过递归特征消除法筛选得到%s个特征: \n" % len(selected_cols), selected_cols) 25 | -------------------------------------------------------------------------------- /chapter2/ch2_35_decision_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | 8 | from sklearn.tree import DecisionTreeClassifier 9 | from utils import data_utils 10 | from sklearn.metrics import roc_auc_score 11 | from sklearn.tree import DecisionTreeClassifier 12 | 13 | # 导入数值型样例数据 14 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2) 15 | # 导入数值型样例数据 16 | clf = DecisionTreeClassifier(criterion='gini', 17 | max_depth=8, 18 | min_samples_leaf=15, 19 | random_state=88) 20 | clf.fit(train_x, train_y) 21 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1]) 22 | print("决策树模型 AUC: ", auc_score) 23 | -------------------------------------------------------------------------------- /chapter2/ch2_29_p_to_score.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import numpy as np 4 | import pandas as pd 5 | sys.path.append("./") 6 | sys.path.append("../") 7 | 8 | def p_to_score(p, pdo, base, odds): 9 | """ 10 | 逾期概率转换分数 11 | :param p: 逾期概率 12 | :param pdo: points double odds. default = 60 13 | :param base: base points. default = 600 14 | :param odds: odds. 
default = 1.0/15.0 15 | :returns: 模型分数 16 | """ 17 | B = pdo / np.log(2) 18 | A = base + B * np.log(odds) 19 | score = A - B * np.log(p / (1 - p)) 20 | return round(score, 0) 21 | 22 | pros = pd.Series(np.random.rand(100)) 23 | pros_score = p_to_score(pros, pdo=60.0, base=600, odds=1.0 / 15.0) 24 | print("随机产生100个概率并转化为score结果: \n", dict(zip(pros, pros_score))) 25 | -------------------------------------------------------------------------------- /chapter2/ch2_01_train_test_split.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | from utils import data_utils 8 | from sklearn.model_selection import train_test_split 9 | 10 | # 导入添加month列的数据 11 | model_data = data_utils.get_data() 12 | # 选取OOT样本 13 | oot_set = model_data[model_data['month'] == '2020-05'] 14 | # 划分训练集和测试集 15 | train_valid_set = model_data[model_data['month'] != '2020-05'] 16 | X = train_valid_set[data_utils.x_cols] 17 | Y = train_valid_set['creditability'] 18 | X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.3, random_state=88) 19 | model_data.loc[oot_set.index, 'sample_set'] = 'oot' 20 | model_data.loc[X_train.index, 'sample_set'] = 'train' 21 | model_data.loc[X_valid.index, 'sample_set'] = 'valid' 22 | -------------------------------------------------------------------------------- /chapter2/ch2_05_preprocess_value_scaler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import pandas as pd 8 | from utils import data_utils 9 | from sklearn.preprocessing import MinMaxScaler 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | # 导入数值型样例数据 13 | data = data_utils.get_data() 14 | # max-min标准化 15 | X_MinMaxScaler = MinMaxScaler().fit_transform(data[data_utils.numeric_cols]) 16 | max_min_df = pd.DataFrame(X_MinMaxScaler, columns=data_utils.numeric_cols) 17 | print("max-min标准化结果: \n", max_min_df) 18 | # z-score标准化 19 | X_StandardScaler = StandardScaler().fit_transform(data[data_utils.numeric_cols]) 20 | standard_df = pd.DataFrame(X_StandardScaler, columns=data_utils.numeric_cols) 21 | print("z-score标准化结果: \n", standard_df) 22 | -------------------------------------------------------------------------------- /chapter2/ch2_08_ordinal_encode_based_category_encoders.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | from utils import data_utils 8 | from category_encoders.ordinal import OrdinalEncoder 9 | 10 | # 加载数据 11 | german_credit_data = data_utils.get_data() 12 | # 初始化OrdinalEncoder类 13 | encoder = OrdinalEncoder(cols=['purpose', 'personal.status.and.sex'], 14 | handle_unknown='value', 15 | handle_missing='value') 16 | # 将 handle_unknown设为"value",即测试集中的未知特征值将被标记为-1 17 | # 将 handle_missing设为"value",即测试集中的缺失值将被标记为-2 18 | # 当设为"error",即报错;当设为"return_nan",即未知值/缺失值被标记为nan 19 | result = encoder.fit_transform(german_credit_data) 20 | category_mapping = encoder.category_mapping 21 | print("类别编码结果: \n", result) 22 | print("类别编码映射关系: \n", category_mapping) 23 | -------------------------------------------------------------------------------- /chapter2/ch2_39_lightgbm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 
import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import lightgbm as lgb 8 | from utils import data_utils 9 | from sklearn.metrics import roc_auc_score 10 | 11 | # 导入数值型样例数据 12 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2) 13 | clf = lgb.LGBMClassifier(objective='binary', 14 | boosting_type='gbdt', 15 | max_depth=3, 16 | n_estimators=1000, 17 | subsample=1, 18 | colsample_bytree=1) 19 | lgb_model = clf.fit(train_x, train_y, eval_set=[(test_x, test_y)], eval_metric='auc', early_stopping_rounds=30) 20 | auc_score = roc_auc_score(test_y, lgb_model.predict_proba(test_x)[:, 1]) 21 | print("LightGBM模型 AUC: ", auc_score) 22 | -------------------------------------------------------------------------------- /chapter2/ch2_23_fs_psi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import toad 5 | sys.path.append("./") 6 | sys.path.append("../") 7 | 8 | from utils import data_utils 9 | 10 | # 加载数据 11 | all_x_y = data_utils.get_all_x_y() 12 | # 定义分箱方法 13 | Combiner = toad.transform.Combiner() 14 | Combiner.fit(all_x_y, 15 | y=data_utils.label, 16 | n_bins=6, 17 | method='quantile', 18 | empty_separate=True) 19 | # 计算psi 20 | var_psi = toad.metrics.PSI(all_x_y.iloc[:500, :], 21 | all_x_y.iloc[500:, :], 22 | combiner=Combiner) 23 | var_psi_df = var_psi.to_frame(name='psi') 24 | 25 | selected_cols = var_psi[var_psi_df.psi < 0.1].index.tolist() 26 | print("各特征的psi值计算结果: \n", var_psi_df) 27 | print("设置psi阈值为0.1, 筛选得到%s个特征: \n" % len(selected_cols), selected_cols) 28 | -------------------------------------------------------------------------------- /chapter2/ch2_36_randomforest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | from sklearn.ensemble import RandomForestClassifier 8 | from utils import data_utils 9 | from sklearn.metrics import roc_auc_score 10 | 11 | # 导入数值型样例数据 12 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2) 13 | clf = RandomForestClassifier(n_estimators=200, 14 | criterion='gini', 15 | max_depth=6, 16 | min_samples_leaf=15, 17 | bootstrap=True, 18 | oob_score=True, 19 | random_state=88) 20 | clf.fit(train_x, train_y) 21 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1]) 22 | print("随机森林模型 AUC: ", auc_score) 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | graphviz==0.16 2 | gensim==3.8.3 3 | deepwalk==1.0.3 4 | fasttext==0.9.2 5 | xgboost==1.4.2 6 | pyod==0.8.8 7 | simplejson==3.16.0 8 | scorecardpy==0.1.9.2 9 | toad==0.1.0 10 | pytz==2021.1 11 | bayesian_optimization==1.2.0 12 | pydantic==1.8.2 13 | tensorflow==2.5.0 14 | pypmml==0.9.11 15 | jieba==0.42.1 16 | sklearn2pmml==0.71.1 17 | torch==1.7.1 18 | mlxtend==0.18.0 19 | category_encoders==2.2.2 20 | matplotlib==3.4.2 21 | scipy==1.6.3 22 | tsfresh==0.18.0 23 | pandas==1.2.4 24 | gevent==21.1.2 25 | requests==2.25.1 26 | shap==0.39.0 27 | lightgbm==3.2.1 28 | sklearn_pandas==2.2.0 29 | scikit-image==0.17.2 30 | statsmodels==0.12.2 31 | python_dateutil==2.8.1 32 | node2vec==0.4.3 33 | pyltp==0.2.1 34 | openpyxl==3.0.10 35 | networkx==2.8.2 36 | numpy==1.19.5 37 | scikit-learn==0.24.2 38 | python-dateutil==2.8.1 39 | nltk==3.7 40 | textrank4zh==0.3 
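requirements.txt pins exact package versions for the examples. As a small sketch that is not part of the original repository, the following check compares the pinned versions against what is installed before running the chapter scripts; the function name `check_requirements` is an illustrative assumption, and distribution names that differ from their import names may need manual adjustment.

```python
# Sketch: verify that installed package versions match the pins in
# requirements.txt (importlib.metadata requires Python 3.8+).
from importlib.metadata import version, PackageNotFoundError

def check_requirements(path="requirements.txt"):
    for line in open(path, encoding="utf-8"):
        line = line.strip()
        if not line or line.startswith("#") or "==" not in line:
            continue  # skip blanks, comments and unpinned entries
        name, expected = line.split("==", 1)
        try:
            installed = version(name)
        except PackageNotFoundError:
            installed = "not installed"
        status = "OK" if installed == expected else "MISMATCH"
        print(f"{name}: expected {expected}, installed {installed} [{status}]")

if __name__ == "__main__":
    check_requirements()
```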
-------------------------------------------------------------------------------- /chapter2/ch2_37_gbdt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | from sklearn.ensemble import GradientBoostingClassifier 8 | from utils import data_utils 9 | from sklearn.metrics import roc_auc_score 10 | from sklearn.ensemble import GradientBoostingClassifier 11 | 12 | # 导入数值型样例数据 13 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2) 14 | clf = GradientBoostingClassifier(n_estimators=100, 15 | learning_rate=0.1, 16 | subsample=0.9, 17 | max_depth=4, 18 | min_samples_leaf=20, 19 | random_state=88) 20 | clf.fit(train_x, train_y) 21 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1]) 22 | print("GBDT模型 AUC: ", auc_score) 23 | -------------------------------------------------------------------------------- /chapter2/ch2_03_missrate_by_month.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | from utils import data_utils 7 | 8 | def missrate_by_month(x_with_month, month_col, x_cols): 9 | """ 10 | 按月统计缺失率 11 | :param x_cols: x变量列名 12 | :param month_col: 月份时间列名 13 | :param x_with_month: 包含月份的数据 14 | :return: 15 | """ 16 | df = x_with_month.groupby(month_col)[x_cols].apply(lambda x: x.isna().sum() / len(x)) 17 | df = df.T 18 | df['miss_rate_std'] = df.std(axis=1) 19 | return df 20 | 21 | def main(): 22 | """ 23 | 主函数 24 | """ 25 | # 导入添加month列的数据 26 | model_data = data_utils.get_data() 27 | miss_rate_by_month = missrate_by_month(model_data, month_col='month', x_cols=data_utils.numeric_cols) 28 | print("按月统计缺失率结果: \n", miss_rate_by_month) 29 | 30 | if __name__ == "__main__": 31 | main() 32 | 33 | -------------------------------------------------------------------------------- /chapter3/ch3_10_fasttext_vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # 文本特征挖掘:fasttext 8 | import pandas as pd 9 | from utils.text_utils import sentences_prepare 10 | import fasttext 11 | 12 | if __name__ == '__main__': 13 | # 加载语料 14 | sentences = sentences_prepare() 15 | 16 | # 预处理过后的文本写入文件unsupervised_train_data 17 | with open('data/text_data/unsupervised_train_data.txt', 'w') as f: 18 | for sentence in sentences: 19 | f.write(sentence) 20 | f.write('\n') 21 | 22 | # 获取fasttext词向量 23 | model = fasttext.train_unsupervised('data/text_data/unsupervised_train_data.txt', model='skipgram', dim=8) 24 | fea_vec = pd.DataFrame([model.get_sentence_vector(x).tolist() for x in sentences]) 25 | fea_vec.columns = ['fea_%s' % i for i in range(model.get_dimension())] 26 | print('词向量维度:', fea_vec.shape) 27 | -------------------------------------------------------------------------------- /chapter4/ch4_03_rules_for_isolationforest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import numpy as np 8 | from chapter4.ch4_01_rules_for_outliers import rule_discover 9 | from pyod.models.iforest import IForest 10 | from utils import data_utils 11 | 12 | # 加载数据 13 | german_credit_data = data_utils.get_data() 14 | 15 | # 构造数据集 16 | X = 
german_credit_data[data_utils.numeric_cols] 17 | y = german_credit_data['creditability'] 18 | 19 | # 初始化模型 20 | clf = IForest(behaviour='new', bootstrap=False, contamination=0.1, max_features=1.0, max_samples='auto', n_estimators=500, random_state=20, verbose=0) 21 | 22 | # 训练模型 23 | clf.fit(X) 24 | 25 | # 预测结果 26 | german_credit_data['out_pred'] = clf.predict_proba(X)[:, 1] 27 | # 将预测概率大于0.7以上的设为异常值 28 | german_credit_data['iforest_rule'] = np.where(german_credit_data['out_pred'] > 0.7, 1, 0) 29 | 30 | # 效果评估 31 | rule_iforest = rule_discover(data_df=german_credit_data, var='iforest_rule', target='creditability', rule_term='==1') 32 | print("孤立森林评估结果: \n", rule_iforest.T) 33 | -------------------------------------------------------------------------------- /chapter3/ch3_03_tsfresh_orders.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # 时间序列特征挖掘 8 | import pandas as pd 9 | from tsfresh.feature_extraction import extract_features 10 | 11 | if __name__ == '__main__': 12 | # 读取数据 13 | orders = pd.read_excel('data/order_data.xlsx') 14 | orders_new = [] 15 | for i in range(len(orders)): 16 | sub_data = pd.DataFrame.from_records(eval(orders['data'][i])) 17 | sub_data['uid'] = orders['uid'][i] 18 | orders_new.append(sub_data) 19 | orders_new_df = pd.concat(orders_new) 20 | # 数据格式 21 | orders_new_df['application_amount'] = orders_new_df['application_amount'].astype(float) 22 | orders_new_df['has_overdue'] = orders_new_df['has_overdue'].astype(float) 23 | 24 | # 调用extract_features生成时间序列特征:order_feas 25 | order_feas = extract_features(orders_new_df[['uid', 'create_time', 'application_amount', 'has_overdue']], column_id="uid", column_sort="create_time") 26 | print("时间序列挖掘特征数: \n", order_feas.shape[1]) 27 | print("时间序列特征挖掘结果: \n", order_feas.head()) 28 | -------------------------------------------------------------------------------- /chapter2/ch2_09_one_hot_based_sklearn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import pandas as pd 8 | from utils import data_utils 9 | from sklearn.preprocessing import OneHotEncoder 10 | from sklearn.preprocessing import OrdinalEncoder 11 | 12 | 13 | def one_hot_encode(x): 14 | """ 15 | 将原始类别变量进行one-hot编码 16 | :param str x: 需要编码的原始变量 17 | :returns: x_oht one-hot编码后的变量 18 | """ 19 | # 首先将类别值进行数值化 20 | re = OrdinalEncoder() 21 | x_encoded = re.fit_transform(x.astype(str)) 22 | x_encoded = pd.DataFrame(x_encoded).values 23 | # 在对数值化后的类别变量进行one-hot编码 24 | ohe = OneHotEncoder(handle_unknown='ignore') 25 | x_oht = ohe.fit_transform(x_encoded).toarray() 26 | return x_oht 27 | 28 | def main(): 29 | """ 30 | 主函数 31 | """ 32 | # 加载数据 33 | german_credit_data = data_utils.get_data() 34 | # 以特征purpose为例,进行one-hot编码 35 | label_encode_x = one_hot_encode(german_credit_data[['purpose']]) 36 | label_encode_df = pd.DataFrame(label_encode_x) 37 | print("特征purpose的one-hot编码结果: \n", label_encode_df) 38 | 39 | 40 | if __name__ == "__main__": 41 | main() -------------------------------------------------------------------------------- /chapter3/ch3_09_word2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # 文本特征挖掘:word2vec 8 | import numpy as np 9 | import 
pandas as pd 10 | from utils.text_utils import sentences_prepare 11 | from gensim.models import word2vec 12 | 13 | 14 | def sent2vec(words, w2v_model): 15 | """ 16 | 转换成句向量 17 | :param words: 词列表 18 | :param w2v_model: word2vec模型 19 | :return: 20 | """ 21 | if words == '': 22 | return np.array([0] * model.wv.vector_size) 23 | 24 | vector_list = [] 25 | for w in words: 26 | try: 27 | vector_list.append(w2v_model.wv[w]) 28 | except: 29 | continue 30 | vector_list = np.array(vector_list) 31 | v = vector_list.sum(axis=0) 32 | return v / np.sqrt((v ** 2).sum()) 33 | 34 | 35 | if __name__ == '__main__': 36 | # 加载语料 37 | sentences = sentences_prepare() 38 | 39 | # 获取词向量 40 | model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=2, workers=2) 41 | fea_vec = pd.DataFrame([sent2vec(x, model).tolist() for x in sentences]) 42 | fea_vec.columns = ['fea_%s' % i for i in range(model.wv.vector_size)] 43 | print('词向量维度:', fea_vec.shape) 44 | -------------------------------------------------------------------------------- /chapter3/ch3_14_node2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # 使用Node2Vec算法生成特征 8 | import networkx as nx 9 | import pandas as pd 10 | from node2vec import Node2Vec 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | def adj_to_graph(adj_table): 15 | # 根据邻接表生成图G 16 | graph = nx.Graph() 17 | # 添加边 18 | for i in range(0, len(adj_table)): 19 | node_edgs = adj_table[i] 20 | for j in range(0, len(node_edgs)): 21 | graph.add_edge(node_edgs[0], node_edgs[j]) 22 | return graph 23 | 24 | 25 | def gen_node2vec_fea(graph, dimensions=8): 26 | # 生成随机游走序列 27 | node2vec = Node2Vec(graph, dimensions=dimensions, walk_length=30, num_walks=100, workers=4) 28 | # 向量化 29 | model = node2vec.fit(window=10, min_count=1, batch_words=4) 30 | return model.wv.vectors 31 | 32 | 33 | if __name__ == '__main__': 34 | # 数据读取 35 | adj_tbl = [] 36 | with open('data/graph_data/graph_demo.adjlist') as f: 37 | for line in f.readlines(): 38 | adj_tbl.append(line.replace('\n', '').split(' ')) 39 | G = adj_to_graph(adj_tbl) 40 | # 使用networkx展示图结构 41 | nx.draw(G, with_labels=True) 42 | plt.show() 43 | feas = gen_node2vec_fea(G, dimensions=8) 44 | print(pd.DataFrame(feas)) 45 | -------------------------------------------------------------------------------- /chapter2/ch2_40_DNN_credit_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # https://keras.io 8 | 9 | from utils import data_utils 10 | import tensorflow as tf 11 | from sklearn.metrics import roc_auc_score 12 | from tensorflow.keras import layers, models, callbacks 13 | 14 | # 加载数据集 15 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(transform_method='standard') 16 | 17 | # 设置随机数种子 18 | tf.random.set_seed(1) 19 | # 设置早停 20 | callback = callbacks.EarlyStopping(monitor='val_loss', patience=30, mode='min') 21 | # 构建DNN模型结构 22 | model = models.Sequential() 23 | model.add(layers.Flatten(input_shape=(train_x.shape[1], 1))) 24 | model.add(layers.Dense(32, activation=tf.nn.relu)) 25 | model.add(layers.Dropout(0.3, seed=1)) 26 | model.add(layers.Dense(16, activation=tf.nn.relu)) 27 | model.add(layers.Dense(1, activation=tf.nn.sigmoid)) 28 | # 显示模型的结构 29 | model.summary() 30 | # 设置模型训练参数 31 | model.compile(optimizer='SGD', 32 | metrics=[tf.metrics.AUC()], 33 
| loss='binary_crossentropy') 34 | # 模型训练 35 | model.fit(train_x, train_y, validation_data=(test_x, test_y), batch_size=16, epochs=240, callbacks=[callback], verbose=2) 36 | 37 | # 效果评估 38 | auc_score = roc_auc_score(train_y, model.predict(train_x)) 39 | print("训练集AUC", auc_score) 40 | auc_score = roc_auc_score(test_y, model.predict(test_x)) 41 | print("测试集AUC", auc_score) 42 | -------------------------------------------------------------------------------- /chapter3/ch3_11_text_classifier_bayes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # 文本分类算法:朴素贝叶斯 8 | import pandas as pd 9 | from utils.text_utils import sentences_prepare_x_y 10 | from sklearn.feature_extraction.text import TfidfVectorizer 11 | from sklearn.naive_bayes import GaussianNB 12 | from sklearn.metrics import roc_auc_score 13 | 14 | 15 | def get_model(x, y): 16 | # 训练朴素贝叶斯分类器 17 | clf = GaussianNB() 18 | bayes_model = clf.fit(x, y) 19 | return bayes_model 20 | 21 | 22 | def text_sample_split(texts, y, rate=0.75): 23 | # 文本向量化 24 | cv = TfidfVectorizer(binary=True) 25 | sentence_vec = cv.fit_transform(texts) 26 | 27 | # 划分训练集和测试集 28 | split_size = int(len(texts) * rate) 29 | x_train = sentence_vec[:split_size].toarray() 30 | y_train = y[:split_size] 31 | x_test = sentence_vec[split_size:].toarray() 32 | y_test = y[split_size:] 33 | return x_train, y_train, x_test, y_test 34 | 35 | 36 | if __name__ == '__main__': 37 | # 加载语料 38 | sentences, target = sentences_prepare_x_y() 39 | print("文本数目: %s" % len(sentences)) 40 | # 训练模型 41 | x_train, y_train, x_test, y_test = text_sample_split(pd.Series(sentences), pd.Series(target)) 42 | model = get_model(x_train, y_train) 43 | # 预测 44 | y_pred = model.predict_proba(x_test)[:, 1] 45 | auc_score = roc_auc_score(y_test, y_pred) 46 | print("AUC结果: ", auc_score) 47 | -------------------------------------------------------------------------------- /chapter2/ch2_34_svm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import numpy as np 5 | import pandas as pd 6 | sys.path.append("./") 7 | sys.path.append("../") 8 | 9 | from sklearn.svm import LinearSVC 10 | from sklearn.pipeline import make_pipeline 11 | from sklearn.preprocessing import StandardScaler 12 | from sklearn.svm import SVC 13 | from utils import data_utils 14 | from sklearn.metrics import roc_auc_score 15 | from sklearn.metrics import accuracy_score 16 | from category_encoders.woe import WOEEncoder 17 | 18 | # 导入数值型样例数据 19 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2) 20 | 21 | # woe特征处理 22 | encoder = WOEEncoder(cols=train_x.columns) 23 | train_x = encoder.fit_transform(train_x, train_y) 24 | test_x = encoder.transform(test_x) 25 | 26 | # 线性SVM, Linear Support Vector Classification 27 | line_svm = LinearSVC(penalty='l2', 28 | loss='hinge', 29 | C=0.2, 30 | tol=0.001) 31 | clf = make_pipeline(StandardScaler(), line_svm) 32 | clf.fit(train_x, train_y) 33 | acc_score = accuracy_score(test_y, clf.predict(test_x)) 34 | print("线性SVM模型 ACC: ", acc_score) 35 | 36 | 37 | # 支持核函数的SVM, C-Support Vector Classification 38 | svm = SVC(C=0.2, 39 | kernel='rbf', 40 | tol=0.001, 41 | probability=True) 42 | clf = make_pipeline(StandardScaler(), svm) 43 | clf.fit(train_x, train_y) 44 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1]) 45 | print("支持核函数SVM模型 AUC: ", 
auc_score) 46 | -------------------------------------------------------------------------------- /chapter2/ch2_07_ordinal_encoder_based_sklearn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import pandas as pd 8 | from utils import data_utils 9 | from sklearn.preprocessing import LabelEncoder 10 | from sklearn.preprocessing import OrdinalEncoder 11 | 12 | def label_encode(x): 13 | """ 14 | 将原始分类变量用数字编码 15 | :param str x: 需要编码的原始变量 16 | :returns: x_encoded 数字编码后的变量 17 | """ 18 | le = LabelEncoder() 19 | x_encoded = le.fit_transform(x.astype(str)) 20 | class_ = le.classes_ 21 | return class_, pd.DataFrame(x_encoded, columns=x.columns) 22 | 23 | def ordinal_encode(x): 24 | """ 25 | 将原始分类变量用数字编码 26 | :param str x: 需要编码的原始变量,shape为[m,n] 27 | :returns: x_encoded 数字编码后的变量 28 | """ 29 | enc = OrdinalEncoder() 30 | x_encoded = enc.fit_transform(x.astype(str)) 31 | return pd.DataFrame(x_encoded).values 32 | 33 | 34 | def main(): 35 | """ 36 | 主函数 37 | """ 38 | # 加载数据 39 | german_credit_data = data_utils.get_data() 40 | # 以特征purpose为例,进行类别编码 41 | class_, label_encode_x = label_encode(german_credit_data[['purpose']]) 42 | print("特征'purpose'的类别编码结果: \n", label_encode_x) 43 | print("特征'purpose'编码顺序为: \n", class_) 44 | # 以特征purpose、credit.history为例,进行类别编码 45 | ordinal_encode_x = ordinal_encode(german_credit_data[['purpose', 'credit.history']]) 46 | print("特征'purpose'和'credit.history'的类别编码结果: \n", ordinal_encode_x) 47 | 48 | 49 | if __name__ == "__main__": 50 | main() 51 | 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 《智能风控实践指南:从模型、特征到决策》 2 | 今年6月,我和团队一起创作的书籍《智能风控实践指南:从模型、特征到决策》终于出版,完成自己的一个心愿。过去10年在金融科技、数据挖掘领域的工作中,我逐步积累起一套智能风控方法体系,尝试将碎片化的模型、特征、策略知识点,融入到这套完整的框架中。希望通过本书将自己和团队在智能风控方面的思考、探索和实践进行分享,与同行的朋友或者将要入行的朋友进行交流探讨。 3 | 4 | 书籍的购买链接也放这里提供给有缘的朋友: 5 | 6 | * 京东官方店购买地址:[店铺地址](https://item.jd.com/13197919.html) 7 | * 当当官方店购买地址:[店铺地址](http://product.dangdang.com/29418079.html) 8 | * 京东临时活动地址【5折】:[促销活动](https://item.m.jd.com/product/13221799.html?_fd=jdm&PTAG=17053.1.1&utm_source=weixin&utm_medium=weixin&utm_campaign=t_1000072672_17053_001) 9 | * 淘宝临时活动地址【5折】:[促销活动](https://detail.tmall.com/item.htm?spm=a230r.1.14.37.7aa37dcaaXk7Ow&id=676830218907&ns=1&abbucket=13) 10 | 11 | ## 配套代码说明 12 | * 请建立Python3环境运行本书代码 13 | * chapter2到4为对应章节的代码 14 | * utils为公共基础代码 15 | * data为数据文件 16 | * requirements.txt为本代码所依赖的包说明 17 | 18 | ## 书籍主要章节 19 | 本书整体贯穿了智能风控模型、特征和策略以及智能风控管理,读者可以按顺序阅读或者根据自身知识背景有选择地阅读相应章节。 20 | 以下是《智能风控实践指南:从模型、特征到决策》主要章节: 21 | 22 | * 第1章介绍了智能风控技术的发展历史和智能风控的相关概念和应用; 23 | * 第2章介绍了搭建智能风控模型的方法、智能算法、模型优化、模型体系等,并融合模型开发实践经验; 24 | * 第3章介绍了搭建特征画像的方法、智能算法、特征挖掘、特征画像体系等,并融合特征挖掘实践经验; 25 | * 第4章介绍了搭建智能风控策略的方法、智能算法、策略体系、策略监控等,并融合策略实践经验; 26 | * 第5章介绍了智能决策与人的经验结合,剖析智能风控中的局限以及如何发挥人的价值; 27 | * 第6章介绍了智能风控相关的管理经验,解读智能风控中的一系列管理原则。 28 | 29 | ## 作者简介 30 | **蒋宏** 31 | 资深风控算法专家、数据科学家,长期从事风控模型算法和应用方面工作,带领模型团队建立智能风控体系,在风控模型、智能算法、数据挖掘、科学决策方向有深入研究和实践。拥有德勤咨询、百融云创等知名企业工作经验。上海交通大学学士、清华大学MBA。 32 | **王欢** 33 | 高级数据算法工程师,中国科学院软件研究所计算机硕士,参与国内及海外多个业务线的风控搭建、建模及特征工作,在风控模型和特征挖掘方面有丰富的实践经验。 34 | **王超** 35 | 高级风控算法工程师,历任多家知名金融科技公司算法工程师、建模咨询师,擅长风控模型、风控策略等智能风控研究方向,致力于应用深度学习等前沿技术推动智能风控的发展。 36 | **马海彪** 37 | 风控算法专家,北京航空航天大学硕士,擅长机器学习模型、数据挖掘、风控模型开发等,对风控业务有丰富的实践经验和深刻的理解。 38 | -------------------------------------------------------------------------------- 
/chapter2/ch2_30_validation_curve.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 绘制验证曲线 3 | 4 | import sys 5 | import numpy as np 6 | import pandas as pd 7 | sys.path.append("./") 8 | sys.path.append("../") 9 | 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from sklearn.svm import SVC 13 | from sklearn.datasets import load_digits 14 | from sklearn.model_selection import validation_curve 15 | 16 | X, y = load_digits(return_X_y=True) 17 | 18 | param_range = np.logspace(-6, -1, 5) 19 | train_scores, test_scores = validation_curve( 20 | SVC(), X, y, param_name="gamma", param_range=param_range, 21 | scoring="accuracy", n_jobs=1) 22 | train_scores_mean = np.mean(train_scores, axis=1) 23 | train_scores_std = np.std(train_scores, axis=1) 24 | test_scores_mean = np.mean(test_scores, axis=1) 25 | test_scores_std = np.std(test_scores, axis=1) 26 | 27 | plt.title("Validation Curve with SVM") 28 | plt.xlabel(r"$\gamma$") 29 | plt.ylabel("Score") 30 | plt.ylim(0.0, 1.1) 31 | lw = 2 32 | plt.semilogx(param_range, train_scores_mean, label="Training score", 33 | color="darkorange", lw=lw) 34 | plt.fill_between(param_range, train_scores_mean - train_scores_std, 35 | train_scores_mean + train_scores_std, alpha=0.2, 36 | color="darkorange", lw=lw) 37 | plt.semilogx(param_range, test_scores_mean, label="Cross-validation score", 38 | color="navy", lw=lw) 39 | plt.fill_between(param_range, test_scores_mean - test_scores_std, 40 | test_scores_mean + test_scores_std, alpha=0.2, 41 | color="navy", lw=lw) 42 | plt.legend(loc="best") 43 | plt.show() 44 | -------------------------------------------------------------------------------- /chapter2/ch2_33_lr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import numpy as np 5 | import pandas as pd 6 | sys.path.append("./") 7 | sys.path.append("../") 8 | 9 | from utils import data_utils 10 | from sklearn.linear_model import SGDClassifier 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.metrics import roc_auc_score 13 | from sklearn.pipeline import make_pipeline 14 | from sklearn.preprocessing import StandardScaler 15 | from category_encoders.woe import WOEEncoder 16 | 17 | # 导入数值型样例数据 18 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2) 19 | 20 | # woe特征处理 21 | encoder = WOEEncoder(cols=train_x.columns) 22 | train_x = encoder.fit_transform(train_x, train_y) 23 | test_x = encoder.transform(test_x) 24 | 25 | # 利用梯度下降法训练逻辑回归模型 26 | lr = SGDClassifier(loss="log", 27 | penalty="l2", 28 | learning_rate='optimal', 29 | max_iter=100, 30 | tol=0.001, 31 | epsilon=0.1, 32 | random_state=1) 33 | clf = make_pipeline(StandardScaler(), lr) 34 | clf.fit(train_x, train_y) 35 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1]) 36 | print("梯度下降法训练逻辑回归模型 AUC: ", auc_score) 37 | 38 | # 利用牛顿法训练逻辑回归模型 39 | lr = LogisticRegression(penalty="l2", 40 | solver='lbfgs', 41 | max_iter=100, 42 | tol=0.001, 43 | random_state=1) 44 | clf = make_pipeline(StandardScaler(), lr) 45 | clf.fit(train_x, train_y) 46 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1]) 47 | print("牛顿法训练逻辑回归模型 AUC: ", auc_score) 48 | -------------------------------------------------------------------------------- /chapter3/ch3_05_gbdt_construct_feature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 
import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # 使用GBDT算法做特征衍生 8 | import pandas as pd 9 | from sklearn.preprocessing import OneHotEncoder 10 | from sklearn.ensemble import GradientBoostingClassifier 11 | 12 | 13 | def gbdt_fea_gen(train_data, label, n_estimators=100): 14 | # 训练GBDT模型 15 | gbc_model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=1) 16 | gbc_model.fit(train_data, label) 17 | 18 | # 得到样本元素落在叶节点中的位置 19 | train_leaf_fea = gbc_model.apply(train_data).reshape(-1, n_estimators) 20 | 21 | # 借用编码将位置信息转化为0,1 22 | one_hot_encoder = OneHotEncoder() 23 | one_hot_encoder.fit(train_leaf_fea) 24 | return gbc_model, one_hot_encoder 25 | 26 | 27 | def gbdt_fea_appy(data, model, encoder): 28 | # 获得GBDT特征 29 | new_feature_train = encoder.transform(model.apply(data).reshape(-1, model.n_estimators)).toarray() 30 | # new_feas为生成的新特征 31 | new_fea = pd.DataFrame(new_feature_train) 32 | new_fea.index = data.index 33 | new_fea.columns = ['fea_%s' % i for i in range(1, new_fea.shape[1] + 1)] 34 | return new_fea 35 | 36 | 37 | if __name__ == '__main__': 38 | # 读取原始特征数据 39 | all_x_y = pd.read_excel('data/order_feas.xlsx') 40 | all_x_y.set_index('order_no', inplace=True) 41 | # 生成训练数据 42 | x_train = all_x_y.drop(columns='label') 43 | x_train.fillna(0, inplace=True) 44 | y = all_x_y['label'] 45 | # 获取特征 46 | gbr, encode = gbdt_fea_gen(x_train, y, n_estimators=100) 47 | new_features = gbdt_fea_appy(x_train, gbr, encode) 48 | print("使用GBDT算法衍生特征结果: \n", new_features.head()) 49 | -------------------------------------------------------------------------------- /chapter2/ch2_24_fs_badrate_by_month.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import toad 8 | import pandas as pd 9 | from utils import data_utils 10 | 11 | 12 | # 导入添加month列的数据 13 | model_data = data_utils.get_data() 14 | 15 | x = model_data[data_utils.x_cols] 16 | y = model_data[data_utils.label] 17 | 18 | # 分箱 19 | Combiner = toad.transform.Combiner() 20 | x_cat = Combiner.fit_transform(x, y, n_bins=6, method='quantile', empty_separate=True) 21 | 22 | # 合并标签和month 23 | x_cat_with_month = x_cat.merge(model_data[['month', 'creditability']], left_index=True, right_index=True) 24 | 25 | # 单个特征对比逾期率 26 | feature_col = 'age.in.years' 27 | x_cat_one = x_cat_with_month[[feature_col, 'month', 'creditability']] 28 | feature_var = x_cat_one.pivot_table(index=feature_col, 29 | columns='month', 30 | values='creditability', 31 | aggfunc=['mean']) 32 | print("特征'age.in.years'的按月分箱逾期率统计结果: \n", feature_var) 33 | 34 | 35 | # 计算特征按月逾期率波动值 36 | def variation_by_month(df, time_col, columns, label_col): 37 | variation_dict = {} 38 | for col in columns: 39 | feature_v = df.pivot_table( 40 | index=col, columns=time_col, values=label_col, aggfunc=['mean']) 41 | variation_dict[col] = feature_v.rank().std(axis=1).mean() 42 | 43 | return pd.DataFrame([variation_dict], index=['variation']).T 44 | 45 | 46 | var_badrate = variation_by_month(x_cat_with_month, 'month', data_utils.x_cols, 'creditability') 47 | print("各特征按月逾期率的标准差: \n", var_badrate) 48 | 49 | selected_cols = var_badrate[var_badrate['variation'] < 0.8].index.tolist() 50 | print("设置标准差阈值为0.8, 筛选得到%s个特征: \n" % len(selected_cols), selected_cols) 51 | -------------------------------------------------------------------------------- /chapter2/ch2_41_CNN_credit_data.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import tensorflow as tf 8 | from tensorflow.keras import layers, models, callbacks 9 | from utils import data_utils 10 | from sklearn.metrics import roc_auc_score 11 | from tensorflow.keras import layers, models, callbacks 12 | 13 | # 加载数据集 14 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(transform_method='standard') 15 | 16 | # 数据预处理 17 | train_x = train_x.to_numpy().reshape((train_x.shape[0], train_x.shape[1], 1)) 18 | test_x = test_x.to_numpy().reshape((test_x.shape[0], test_x.shape[1], 1)) 19 | train_y = train_y.values.reshape((train_y.shape[0], 1)) 20 | test_y = test_y.values.reshape((test_y.shape[0], 1)) 21 | 22 | # 设置随机数种子,保证每次运行结果一致 23 | tf.random.set_seed(1) 24 | callback = callbacks.EarlyStopping(monitor='val_loss', patience=30, mode='min') 25 | 26 | # 构建CNN模型结构 27 | model = models.Sequential() 28 | model.add(layers.Conv1D(filters=16, kernel_size=4, activation='relu', input_shape=(train_x.shape[1], 1))) 29 | model.add(layers.Conv1D(filters=8, kernel_size=1, activation='relu')) 30 | model.add(layers.Flatten()) 31 | model.add(layers.Dropout(0.3, seed=1)) 32 | model.add(layers.Dense(16, activation='relu')) 33 | model.add(layers.Dense(1, activation='sigmoid')) 34 | # 显示模型的结构 35 | model.summary() 36 | # 设置模型训练参数 37 | model.compile(optimizer='SGD', 38 | metrics=[tf.metrics.AUC()], 39 | loss='binary_crossentropy') 40 | # 模型训练 41 | model.fit(train_x, train_y, validation_data=(test_x, test_y), batch_size=16, epochs=240, callbacks=[callback], verbose=2) 42 | 43 | # 测试集效果评估 44 | auc_score = roc_auc_score(train_y, model.predict(train_x)) 45 | print("训练集AUC", auc_score) 46 | auc_score = roc_auc_score(test_y, model.predict(test_x)) 47 | print("测试集AUC", auc_score) 48 | -------------------------------------------------------------------------------- /chapter3/ch3_04_feature_evaluation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | import time 5 | import numpy as np 6 | import pandas as pd 7 | from scipy.stats import variation 8 | sys.path.append("./") 9 | sys.path.append("../") 10 | 11 | def cover_ratio(x): 12 | """ 13 | 计算特征覆盖度 14 | :param x: 特征向量 15 | :return: cover_ratio, 特征覆盖度 16 | """ 17 | len_x = len(x) 18 | len_nan = sum(pd.isnull(x)) 19 | ratio = 1 - len_nan / float(len_x) 20 | return ratio 21 | 22 | 23 | def get_datestamps(begin_date, end_date): 24 | """ 25 | 返回[begin_date,end_date]之间日期的时间戳 26 | :param begin_date: 开始时间 27 | :param end_date: 结束时间 28 | :return: [begin_date,end_date]日期的时间戳 29 | """ 30 | date_arr = [int(time.mktime(x.timetuple())) for x in list(pd.date_range(start=begin_date, end=end_date))] 31 | return date_arr 32 | 33 | 34 | if __name__ == '__main__': 35 | # 模拟生成几个特征 36 | fea_1 = [-1, -1, -1, 0, 1, 1, 1] # 特征均值为0 37 | fea_2 = [1, 1, 1, 1, 1, 1, 1] # 所有特征均为唯一指 38 | fea_3 = [1, 2, 3, 4, 5, 6, 7] # 与时间正相关 39 | fea_4 = [7, 6, 5, 4, 3, 2, 1] # 与时间负相关 40 | fea_5 = [1, 2, 1, 2, np.nan, 2, np.nan] # 与时间无线性关系 41 | 42 | x_all = pd.DataFrame([fea_1, fea_2, fea_3, fea_4, fea_5]).T 43 | x_all.columns = ['fea_1', 'fea_2', 'fea_3', 'fea_4', 'fea_5'] 44 | 45 | # 特征覆盖度 46 | fea_cover = x_all.apply(cover_ratio).to_frame('cover_ratio') 47 | print("特征覆盖度: ", fea_cover) 48 | 49 | # 特征离散度 50 | fea_variation = variation(fea_2) 51 | print("特征离散度: ", fea_variation) 52 | 53 | # 计算时间相关性 54 | x_all['tm_col'] = 
get_datestamps('2020-10-01', '2020-10-07') 55 | 56 | # 计算三个特征与时间的Peason系数 57 | fea_time_corr = x_all.loc[:, ['fea_3', 'fea_4', 'fea_5', 'tm_col']].corr().loc[:, ['tm_col']] 58 | 59 | print("构造的特征为: \n", x_all) 60 | print("特征与时间的Peason系数计算结果: \n", fea_time_corr) 61 | -------------------------------------------------------------------------------- /utils/time_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import time 4 | import pytz 5 | import numpy as np 6 | import datetime as dt 7 | from dateutil.parser import parse 8 | 9 | 10 | def stamp_to_date(time_stamp, timezone=None): 11 | """ 12 | 时间戳转日期函数 13 | :param time_stamp:int,时间戳 14 | :param timezone:string,时区 15 | :return: datetime 16 | """ 17 | try: 18 | if timezone is None: 19 | stamp_str = str(time_stamp) 20 | if len(stamp_str) >= 10: 21 | stamp_str = stamp_str[:10] 22 | else: 23 | stamp_str = stamp_str 24 | time_stamp = int(stamp_str) 25 | date = dt.datetime.fromtimestamp(time_stamp) 26 | return date 27 | else: 28 | stamp_str = str(time_stamp) 29 | if len(stamp_str) >= 10: 30 | stamp_str = stamp_str[:10] 31 | else: 32 | stamp_str = stamp_str 33 | time_stamp = int(stamp_str) 34 | tz = pytz.timezone(timezone) 35 | date = dt.datetime.fromtimestamp(time_stamp, tz).strftime('%Y-%m-%d %H:%M:%S') 36 | date = parse(date) 37 | return date 38 | except: 39 | return parse('2100-01-01') 40 | 41 | 42 | def date_to_stamp(date_time): 43 | """ 44 | 将日期转换为时间戳 45 | :param date_time: string,datetime 46 | :return: int 47 | """ 48 | try: 49 | if isinstance(date_time, str): 50 | date_time = parse(date_time) 51 | return int(time.mktime(date_time.timetuple())) 52 | except: 53 | return int(631123200) 54 | 55 | 56 | def date_to_week(date): 57 | ''' 58 | 日期转换为星期 59 | :param date:datetime,string 60 | :return: int 61 | ''' 62 | try: 63 | if isinstance(date, str): 64 | date = parse(date) 65 | if_weekend = date.weekday() 66 | return if_weekend 67 | except: 68 | return np.nan 69 | -------------------------------------------------------------------------------- /chapter4/ch4_00_rules_for_iv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import toad 8 | import numpy as np 9 | import pandas as pd 10 | from utils import data_utils 11 | from toad.plot import bin_plot 12 | from matplotlib import pyplot as plt 13 | 14 | 15 | def cal_iv(x, y): 16 | """ 17 | IV计算函数 18 | :param x: feature 19 | :param y: label 20 | :return: 21 | """ 22 | crtab = pd.crosstab(x, y, margins=True) 23 | crtab.columns = ['good', 'bad', 'total'] 24 | crtab['factor_per'] = crtab['total'] / len(y) 25 | crtab['bad_per'] = crtab['bad'] / crtab['total'] 26 | crtab['p'] = crtab['bad'] / crtab.loc['All', 'bad'] 27 | crtab['q'] = crtab['good'] / crtab.loc['All', 'good'] 28 | crtab['woe'] = np.log(crtab['p'] / crtab['q']) 29 | crtab2 = crtab[abs(crtab.woe) != np.inf] 30 | 31 | crtab['IV'] = sum( 32 | (crtab2['p'] - crtab2['q']) * np.log(crtab2['p'] / crtab2['q'])) 33 | crtab.reset_index(inplace=True) 34 | crtab['varname'] = crtab.columns[0] 35 | crtab.rename(columns={crtab.columns[0]: 'var_level'}, inplace=True) 36 | crtab.var_level = crtab.var_level.apply(str) 37 | return crtab 38 | 39 | 40 | german_credit_data = data_utils.get_data() 41 | 42 | # 生成分箱初始化对象 43 | bin_transformer = toad.transform.Combiner() 44 | 45 | # 采用等距分箱训练 46 | bin_transformer.fit(german_credit_data, 47 | 
y='creditability', 48 | n_bins=6, 49 | method='step', 50 | empty_separate=True) 51 | 52 | # 分箱数据 53 | trans_data = bin_transformer.transform(german_credit_data, labels=True) 54 | 55 | # 查看Credit amount分箱结果 56 | bin_plot(trans_data, x='credit.amount', target='creditability') 57 | plt.show() 58 | 59 | # 查看Credit amount分箱数据 60 | cal_iv(trans_data['credit.amount'], trans_data['creditability']) 61 | 62 | # 构建单规则 63 | german_credit_data['credit.amount.rule'] = np.where(german_credit_data['credit.amount'] > 12366.0, 1, 0) 64 | -------------------------------------------------------------------------------- /chapter3/ch3_00_order_data_preprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from utils.data_utils import stamp_to_date 10 | from utils.data_utils import date_to_week 11 | 12 | 13 | def data_preprocess(data, time_col, back_time, dtypes_dict): 14 | """ 15 | 数据预处理函数 16 | :param data: 待处理的数据 17 | :param time_col: 回溯依据的时间列名称 18 | :param back_time: 特征计算时间,datetime.datetime时间格式 19 | :param dtypes_dict: 指定列字段类型的字典,如{'col1':int} 20 | :return: 清洗完成的数据 21 | """ 22 | # 删除time_col为空的行 23 | data = data[~data[time_col].isin(['nan', np.nan, 'NAN', 'null', 'NULL', 'Null'])] 24 | # 将时间列的时间戳转为日期格式 25 | data[time_col] = data[time_col].apply(stamp_to_date) 26 | # 过滤订单创建时间在back_time之后的数据,避免特征穿越 27 | data = data[data[time_col] <= back_time] 28 | # 删除整条缺失的数据 29 | data.dropna(how='all', inplace=True) 30 | # 空字符串替换为np.nan 31 | data.replace('', np.nan, inplace=True) 32 | # 单个字段缺失填充为0 33 | data.fillna(0, inplace=True) 34 | # 去重 35 | data.drop_duplicates(keep='first', inplace=True) 36 | # 字段格式转换 37 | data = data.astype(dtypes_dict) 38 | # 补充字段 39 | data['create_time_week'] = data[time_col].apply(date_to_week) 40 | data['is_weekend'] = data['create_time_week'].apply(lambda x: 1 if x > 5 else 0) 41 | 42 | return data 43 | 44 | 45 | if __name__ == '__main__': 46 | # 原始数据读入 47 | orders = pd.read_excel('data/order_data.xlsx') 48 | # 取一个用户的历史订单数据 49 | raw_data = pd.DataFrame(eval(orders['data'][1])) 50 | # 数据预处理 51 | data_processed = data_preprocess(raw_data, time_col='create_time', 52 | back_time='2020-12-14', 53 | dtypes_dict={'has_overdue': int, 54 | 'application_term': float, 55 | 'application_amount': float}) 56 | print(data_processed.shape) 57 | -------------------------------------------------------------------------------- /chapter3/ch3_06_cluster_alg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # 使用聚类算法衍生特征 8 | import pandas as pd 9 | from sklearn.cluster import KMeans 10 | 11 | 12 | def cluster_fea_gen(data, selected_cols, n_clusters): 13 | """ 14 | 使用聚类算法生成特征 15 | :param data: 用作输入的x,y 16 | :param selected_cols: 选取用来做聚类的特征列 17 | :param n_clusters: 聚类类别数 18 | :return: 聚类算法生成的特征 19 | """ 20 | x_cluster_feas = data.loc[:, selected_cols] 21 | # 拟合聚类模型 22 | clf = KMeans(n_clusters=n_clusters, random_state=1) 23 | clf.fit(x_cluster_feas) 24 | return clf 25 | 26 | 27 | def cluster_fea_apply(data, selected_cols, clf): 28 | """ 29 | 使用聚类算法生成特征 30 | :param data: 用作输入的x,y 31 | :param selected_cols: 选取用来做聚类的特征列 32 | :param clf: 聚类模型 33 | :return: 聚类算法生成的特征 34 | """ 35 | # 对原数据表进行类别标记 36 | data['group'] = clf.predict(data[selected_cols]) 37 | 38 | # 距质心距离特征的计算 39 | centers_df = 
pd.DataFrame(clf.cluster_centers_) 40 | centers_df.columns = [x + '_center' for x in selected_cols] 41 | 42 | for item in selected_cols: 43 | data[item + '_center'] = data['group'].apply( 44 | lambda x: centers_df.iloc[x, :][item + '_center']) 45 | data[item + '_distance'] = data[item] - data[item + '_center'] 46 | 47 | fea_cols = ['group'] 48 | fea_cols.extend([x + '_distance' for x in selected_cols]) 49 | 50 | return data.loc[:, fea_cols] 51 | 52 | 53 | if __name__ == '__main__': 54 | # 数据读取 55 | all_x_y = pd.read_excel('data/order_feas.xlsx') 56 | all_x_y.set_index('order_no', inplace=True) 57 | # 取以下几个特征做聚类 58 | chose_cols = ['orderv1_age', 'orderv1_90_workday_application_amount_mean', 'orderv1_history_order_num', 59 | 'orderv1_max_overdue_days'] 60 | all_x_y.fillna(0, inplace=True) 61 | 62 | # 生成聚类特征 63 | model = cluster_fea_gen(all_x_y, chose_cols, n_clusters=5) 64 | fea_cluster = cluster_fea_apply(all_x_y, chose_cols, model) 65 | print("使用聚类算法衍生特征数: \n", fea_cluster.shape[1]) 66 | print("使用聚类算法衍生特征结果: \n", fea_cluster.head()) 67 | -------------------------------------------------------------------------------- /chapter3/ch3_12_text_classifier_fasttext.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # 文本分类算法:fasttext 8 | import fasttext 9 | import pandas as pd 10 | from utils.text_utils import sentences_prepare_with_y 11 | from sklearn.metrics import roc_auc_score 12 | 13 | 14 | def process_sentences(train_path, test_path, rate=0.8): 15 | sentences = sentences_prepare_with_y() 16 | # 预处理之后的数据写入文件train_data.txt 17 | num = int(len(sentences) * rate) 18 | train_out = open(train_path, 'w') 19 | test_out = open(test_path, 'w') 20 | for sentence in sentences[:num]: 21 | train_out.write(sentence) 22 | train_out.write("\n") 23 | for sentence in sentences[num:]: 24 | test_out.write(sentence) 25 | test_out.write("\n") 26 | print("预处理之后的数据已写入文件train_data.txt, test_data.txt") 27 | print("train文本数目: %s, test文本数目: %s" % (num, len(sentences) - num)) 28 | 29 | 30 | if __name__ == '__main__': 31 | # 处理文本数据 32 | process_sentences(train_path='data/train_data.txt', test_path='data/test_data.txt', rate=0.8) 33 | 34 | # 训练、保存模型 35 | classifier = fasttext.train_supervised('data/train_data.txt', label='__label__', wordNgrams=3, loss='softmax') 36 | classifier.save_model('data/fasttext_demo.model') 37 | 38 | # 加载模型 39 | classifier = fasttext.load_model('data/fasttext_demo.model') 40 | texts = "系列 票房 不差 口碑 生化危机 资深 玩家 张艳 告诉 玩家 很难 承认 一系列 电影 " \ 41 | "电影 原著 面目全非 女主角 爱丽丝 游戏 角色 电影 渐渐 脱离 游戏 打着 游戏 名号 发展 票房 " \ 42 | "号召力 观众 影响力 电影 系列 具备 剧情 世界观 游戏 生硬 强加 角色 背景 " 43 | print("当前文本所属类别: ", classifier.predict(texts)) 44 | 45 | # 测试集 46 | test_data = pd.read_csv('data/test_data.txt', header=None) 47 | texts_new = test_data[1].tolist() 48 | y_true = [1 if x.strip() == '__label__sports' else 0 for x in test_data[0].tolist()] 49 | 50 | # 预测效果评估 51 | result = classifier.predict(texts_new) 52 | y_pre = [] 53 | for i in range(len(result[0])): 54 | if result[0][i][0] == '__label__sports': 55 | y_pre.append(result[1][i][0]) 56 | else: 57 | y_pre.append(1 - result[1][i][0]) 58 | auc_score = roc_auc_score(y_true, y_pre) 59 | print("测试集AUC为: ", auc_score) 60 | -------------------------------------------------------------------------------- /chapter3/ch3_08_bag_of_words.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 
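# 补充思路注释(示意性说明):下方脚本对比了四种词袋类文本向量化方式:
# CountVectorizer(binary=True) 只记录词是否出现(0/1);TfidfVectorizer 输出 TF-IDF 权重;
# HashingVectorizer 用哈希把词映射到固定的 n_features 维,无需保存词表;
# CountVectorizer(ngram_range=(2, 2)) 以相邻两个词为一个计数单元,保留部分词序信息。
# 极简示意(假设输入为两条已分词、空格连接的文本,词表按字典序排列):
#   CountVectorizer(binary=True).fit_transform(["信用 逾期 逾期", "信用 还款"]).toarray()
#   # 约为 [[1, 0, 1], [1, 1, 0]],实际列顺序以 fit 得到的词表为准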
import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # 文本特征挖掘:词袋模型示例 8 | import pandas as pd 9 | from utils.text_utils import sentences_prepare 10 | from sklearn.feature_extraction.text import CountVectorizer 11 | from sklearn.feature_extraction.text import HashingVectorizer 12 | from sklearn.feature_extraction.text import TfidfVectorizer 13 | 14 | 15 | def gen_count_doc_vec(text): 16 | """ 17 | 基于词频统计生成文本的向量表示 18 | :param text: 输入文本 19 | :return: 生成的文本向量表示 20 | """ 21 | cv = CountVectorizer(binary=True) 22 | document_vec = cv.fit_transform(text) 23 | return pd.DataFrame(document_vec.toarray()) 24 | 25 | 26 | def gen_tfidf_doc_vec(text): 27 | """ 28 | 基于TfidfVectorizer生成文本向量表示 29 | :param text: 输入文本 30 | :return: 生成的文本向量表示 31 | """ 32 | cv = TfidfVectorizer() 33 | document_vec = cv.fit_transform(text) 34 | return pd.DataFrame(document_vec.toarray()) 35 | 36 | 37 | def gen_hash_doc_vec(text, n_features=8): 38 | """ 39 | 基于HashingVectorizer生成文本向量表示 40 | :param text: 输入文本 41 | :param n_features: 指定输出特征的维数 42 | :return: 生成的文本向量表示 43 | """ 44 | cv = HashingVectorizer(n_features=n_features) 45 | document_vec = cv.fit_transform(text) 46 | return pd.DataFrame(document_vec.toarray()) 47 | 48 | 49 | def gen_ngram_doc_vec(text): 50 | ngram_cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore", 51 | token_pattern=r'\b\w+\b', min_df=1) 52 | document_vec = ngram_cv.fit_transform(text) 53 | return pd.DataFrame(document_vec.toarray()) 54 | 55 | 56 | if __name__ == '__main__': 57 | sentences = sentences_prepare() 58 | # 词袋模型应用示例 59 | # 取前三条文本用于展示 60 | texts = sentences[0:5] 61 | fea_vec_count = gen_count_doc_vec(texts) 62 | print("CountVectorizer词向量:") 63 | print(fea_vec_count) 64 | 65 | fea_vec_tfidf = gen_tfidf_doc_vec(texts) 66 | print("TfidfVectorizer词向量:") 67 | print(fea_vec_tfidf) 68 | 69 | fea_vec_hash = gen_hash_doc_vec(texts, n_features=8) 70 | print("HashingVectorizer词向量:") 71 | print(fea_vec_hash) 72 | 73 | fea_vec_ngram = gen_ngram_doc_vec(texts) 74 | print("CountVectorizer词向量(ngram):") 75 | print(fea_vec_ngram) 76 | -------------------------------------------------------------------------------- /chapter4/ch4_02_rules_for_decisiontree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import sklearn.tree as st 8 | import graphviz 9 | from utils import data_utils 10 | 11 | 12 | def decision_tree_resolve(train_x, train_y, class_names=None, max_depth=3, fig_path=''): 13 | """ 14 | 基于决策树可视化 15 | :param train_x: data of train 16 | :param train_y: data of y 17 | :param class_names: 标签名称 18 | :param max_depth: 树最大深度 19 | :param fig_path: 图片路径和名称 20 | :return: 21 | """ 22 | if class_names is None: 23 | class_names = ['good', 'bad'] 24 | clf = st.DecisionTreeClassifier(max_depth=max_depth, 25 | min_samples_leaf=0.01, 26 | min_samples_split=0.01, 27 | criterion='gini', 28 | splitter='best', 29 | max_features=None) 30 | clf = clf.fit(train_x, train_y) 31 | 32 | # 比例图 33 | dot_data = st.export_graphviz(clf, out_file=None, 34 | feature_names=train_x.columns.tolist(), 35 | class_names=class_names, 36 | filled=True, 37 | rounded=True, 38 | node_ids=True, 39 | special_characters=True, 40 | proportion=True, 41 | leaves_parallel=True) 42 | graph = graphviz.Source(dot_data, filename=fig_path) 43 | return graph 44 | 45 | 46 | # 加载数据 47 | german_credit_data = data_utils.get_data() 48 | 49 | # 构造数据集 50 | X = 
german_credit_data[data_utils.numeric_cols].copy() 51 | y = german_credit_data['creditability'] 52 | 53 | graph = decision_tree_resolve(X, y, fig_path='data/tree') 54 | graph.view() 55 | 56 | # 转化为规则 57 | X['node_5'] = X.apply(lambda x: 1 if x['duration.in.month'] <= 34.5 and x['credit.amount'] > 8630.5 else 0, axis=1) 58 | X['node_9'] = X.apply( 59 | lambda x: 1 if x['duration.in.month'] > 34.5 and x['age.in.years'] <= 29.5 and x['credit.amount'] > 4100.0 else 0, 60 | axis=1) 61 | X['node_12'] = X.apply(lambda x: 1 if x['duration.in.month'] > 34.5 and x['age.in.years'] > 56.5 else 0, axis=1) 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | *.xml 131 | *.iml 132 | *.DS_Store -------------------------------------------------------------------------------- /chapter2/ch2_32_model_deployment_pmml.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # PMML方式保存和读取模型 8 | from sklearn2pmml import sklearn2pmml, PMMLPipeline 9 | from sklearn_pandas import DataFrameMapper 10 | from pypmml import Model 11 | from xgboost.sklearn import XGBClassifier 12 | from utils import data_utils 13 | from chapter2.ch2_31_model_deployment_pickle import load_model_from_pkl 14 | 15 | 16 | # 以xgb模型为例,方式1: 17 | # sklearn接口的xgboost,可使用sklearn2pmml生成pmml文件 18 | def save_model_as_pmml(x, y, save_file_path): 19 | """ 20 | 保存模型到路径save_file_path 21 | :param x: 训练数据特征 22 | :param y: 训练数据标签 23 | :param save_file_path: 保存的目标路径 24 | """ 25 | # 设置pmml的pipeline 26 | xgb = XGBClassifier(random_state=88) 27 | mapper = DataFrameMapper([([i], None) for i in x.columns]) 28 | pipeline = PMMLPipeline([('mapper', mapper), ('classifier', xgb)]) 29 | # 模型训练 30 | pipeline.fit(x, y) 31 | # 模型结果保存 32 | sklearn2pmml(pipeline, pmml=save_file_path, with_repr=True) 33 | 34 | 35 | # PMML格式读取 36 | def load_model_from_pmml(load_file_path): 37 | """ 38 | 从路径load_file_path加载模型 39 | :param load_file_path: pmml文件路径 40 | """ 41 | model = Model.fromFile(load_file_path) 42 | return model 43 | 44 | 45 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2) 46 | save_model_as_pmml(train_x, train_y, 'data/model/xgb_model.pmml') 47 | model = load_model_from_pmml('data/model/xgb_model.pmml') 48 | pre = model.predict(test_x) 49 | print(pre.head()) 50 | 51 | # 方式2: 52 | # 原生xgboost.core库生成的XGBoost模型,不能使用sklearn2pmml生成pmml文件,只能通过jpmml-xgboost包,将已有的.bin或.model 53 | # 格式模型文件转为pmml文件 54 | 55 | # step1.获取到xgb模型文件 56 | xgb_model = load_model_from_pkl("data/model/xgb_model.pkl") 57 | 58 | 59 | # step2.生成fmap文件 60 | def create_feature_map(file_name, features): 61 | outfile = open(file_name, 'w') 62 | for i, feat in enumerate(features): 63 | outfile.write('{0}\t{1}\tq\n'.format(i, feat)) 64 | 65 | 66 | create_feature_map('data/model/xgb_model.fmap', xgb_model.feature_names) 67 | 68 | # step3.jpmml-xgboost的环境配置及pmml转换: 69 | # step3.1. 下载jpmml-xgboost 70 | # step3.2. 命令行切换到jpmml-xgboost的项目文件夹,输入代码编译 71 | # mvn clean install 72 | # 该步执行完后,jpmml-xgboost的项目文件夹下会多出一个target文件夹,里面包含生成好的jar包 73 | # step3.3. 
jar包转换为pmml文件 74 | # java -jar jpmml-xgboost_path/target/jpmml-xgboost-executable-1.5-SNAPSHOT.jar --X-nan-as-missing False 75 | # --model-input data/model/xgb.model --fmap-input data/model/xgb.fmap --target-name target 76 | # --pmml-output data/model/xgb_pmml.pmml 77 | -------------------------------------------------------------------------------- /chapter3/ch3_01_order_fea_gen_manual.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import pandas as pd 8 | import datetime as dt 9 | from dateutil.parser import parse 10 | from chapter3.ch3_00_order_data_preprocess import data_preprocess 11 | 12 | 13 | def calculate_age(born_day, back_time=None): 14 | """ 15 | 根据出生日期解析年龄 16 | :param born_day: 出生日期 17 | :param back_time: 回溯时间,默认当前日期 18 | :return: 年龄 19 | """ 20 | if back_time is None: 21 | today = dt.date.today() 22 | else: 23 | today = back_time 24 | if isinstance(born_day, str): 25 | born_day = parse(born_day) 26 | if isinstance(today, str): 27 | today = parse(today) 28 | return today.year - born_day.year - ((today.month, today.day) < (born_day.month, born_day.day)) 29 | 30 | 31 | def gen_order_feature_manual(data, time_col, back_time, dtypes_dict, fea_prefix='f'): 32 | """ 33 | 根据业务逻辑生成特征 34 | :param data: 业务订单原始数据 35 | :param time_col: 回溯依据的时间列名称 36 | :param back_time: 回溯时间点 37 | :param dtypes_dict: 指定列字段类型的字典,如{'col1':int} 38 | :param fea_prefix: 特征前缀 39 | :return: features,根据业务逻辑生成的特征 40 | """ 41 | # 数据预处理函数,见文件ch3_01_order_data_preprocess.py 42 | data_processed = data_preprocess(data, time_col, back_time, dtypes_dict=dtypes_dict) 43 | features = {} 44 | # 从生日解析年龄 45 | features['%s_age' % fea_prefix] = calculate_age(data_processed.get('birthday')[0], back_time) 46 | # 用户历史订单数 47 | features['%s_history_order_num' % fea_prefix] = data_processed.shape[0] 48 | # 用户历史逾期次数 49 | features['%s_overdue_num' % fea_prefix] = data_processed['has_overdue'].sum() 50 | # 用户历史最大逾期天数 51 | features['%s_max_overdue_days' % fea_prefix] = data_processed['overdue_days'].max() 52 | # 用户历史平均逾期天数 53 | features['%s_mean_overdue_days' % fea_prefix] = data_processed['overdue_days'].mean() 54 | 55 | return features 56 | 57 | 58 | if __name__ == '__main__': 59 | # 原始数据读入 60 | orders = pd.read_excel('data/order_data.xlsx') 61 | # 取一个用户的历史订单数据 62 | raw_data = pd.DataFrame(eval(orders['data'][1])) 63 | back_time_value = orders['back_time'][1] 64 | cols_dtypes_dict = {'has_overdue': int, 'application_term': float, 'application_amount': float} 65 | 66 | # 根据业务逻辑生成用户历史订单特征 67 | features_manual = gen_order_feature_manual(raw_data, 'create_time', back_time_value, cols_dtypes_dict) 68 | print(features_manual) 69 | 70 | # 批量生成特征 71 | feature_dict = {} 72 | for i, row in orders.iterrows(): 73 | feature_dict[i] = gen_order_feature_manual(pd.DataFrame(eval(row['data'])), 'create_time', row['back_time'], 74 | cols_dtypes_dict, fea_prefix='orderv1') 75 | feature_df = pd.DataFrame(feature_dict).T 76 | # feature_df.to_excel('data/features_manual.xlsx', index=True) 77 | -------------------------------------------------------------------------------- /data/text_data/test.txt: -------------------------------------------------------------------------------- 1 | 昨天晚上,亚冠小组赛全部结束,中超两支球队山东泰山和广州队的本届亚冠之旅就此划上句号。5月1日媒体人潘伟力发声总结了两支球队的表现。 2 | 潘伟力表示:“泰山队1平5负打进2球丢24球、广州队6战全败打进0球同样丢了24球,6场比赛山东队平均控球率为25%,排名所有亚冠球队的倒数第二,广州队平均控球率20%,排名所有球队倒数第一。 3 | 6场比赛山东泰山队一共有20脚射门,广州队只有11脚射门。” 4 | 
“关于本届亚冠在国内争议最大的话题就是,这样的班底、阵容参加亚冠,到底是对孩子们信心的摧毁,还是会让他们得到经验,未来得到比较大的提升?这样的观点见仁见智,我们不做定论,我更关注的事情是孩子回国之后,他们未来的发展之路会是什么样。” 5 | 潘伟力继续说道:“显然两支球队00后的球员,想要进入新赛季广州队、泰山队一线队征战中超联赛概率非常低。 6 | 两个月前,足协和职业联盟计划推出U21联赛,就是为这个年龄段球员准备的。要求中超18支球队、中甲18支球队必须参加,36支球队分为四个阶段打赛会制的比赛。 7 | 如果第一阶段、第二阶段小组出线,一年最多可以打21场;如果前两个阶段出局了,一年只有10场比赛。” 8 | “试想一下20岁的年轻球员,在最需要比赛来提高的年纪,一年如果只有10场、20场的比赛机会,他们未来成为优秀球员的概率又有多大呢?即便这样,咱们U21联赛也没有板上钉钉一定能推行,因为中超联赛都还没有定论。 9 | 过去两年,咱们已经停办了预备队联赛,很多年轻球员长期没有比赛可踢,这才是中国足球最让人担心的现实处境。” 10 | 北京时间4月21日下午,国际乒联世界排名工作小组进行线上交流讨论。大满贯运动员李晓霞受邀参加,首次以新身份在国际组织工作中亮相。 11 | 该小组由2021年国际乒联年度代表大会批准设立,成员包含国际乒联执委会代表、运动员委员会代表、洲级和会员协会专家代表、国际乒联具有排名事务经验以及商业事务经验的雇员和国际乒联个人技术委员。 12 | 主要工作涵盖审议世界排名体系政策、为运动员提供获得提高世界排名的公平机会并向国际乒联执委会提供专业建议等重要内容。李晓霞由中国乒协推荐并经国际乒联执委会批准,以专家代表身份成为该小组正式成员。 13 | 工作小组就现有乒乓球世界排名积分构成、积分赛事、积分保护等关乎运动员核心利益的重要议题展开讨论。小组成员们踊跃发表专业见解,李晓霞也积极参与其中。 14 | 在表达对小组后续工作和自身职责期待时,李晓霞认为这是一项非常有意义的工作,其本人很高兴能以全新的身份参与其中,并主动提到她将继续努力学习英文,从而更好地进行国际交流,在为运动员发声的同时,也为世界排名政策制定提供更多专业建议。 15 | 在中国乒协的大力推荐下,运动员许昕于去年成功当选亚乒联盟第一副主席。 16 | 如此短时间内又有大满贯运动员在国际组织担任重要职务,充分体现了协会对国际组织工作的高度重视。 17 | 未来,中国乒协将争取选派更多具有丰富乒乓球专业经验、在世界乒坛享有良好声誉并具有较强沟通能力的优秀运动员加入国际组织,进一步提升国际话语权。 18 | 4月5日,国际乒联(ITTF)公布了新一期世界排名。由于上周重要比赛仅有世界乒乓球职业大联盟(WTT)球星挑战赛多哈站,且除雨果-卡尔德拉诺、冯天薇之外的大部分高排位选手没有参赛,所以相较于上一周,本周男女排名前10位都没有变化。 19 | 男子方面,中国选手樊振东、马龙分居前两位,梁靖崑排名第4、许昕第8、林高远第10;巴西人雨果-卡尔德拉诺处于第三位,日本球员张本智和、中国台北运动员林昀儒、德国名将奥恰洛夫分列5至7位,德国老将波尔位居第9。 20 | 女子方面,中国球员陈梦、孙颖莎、王曼昱位居三甲,王艺迪排在第5;日本名将伊藤美诚居第4位,早田希娜、石川佳纯位列6、7位。 21 | 排在8至10位的是中国香港选手杜凯琹、新加坡老将冯天薇和波多黎各一姐阿德里亚纳-迪亚兹。 22 | 在本期世界排名公布之后,国乒参加今年杭州亚运会的阵容框架也基本确定。根据3月7日中国乒协公布的《乒乓球项目参加2022年杭州亚运会选拔办法(征求意见稿)》规定,在体能测试达标的前提下: 23 | 1、在2022年3月7日至20日期间举办的WTT大满贯(新加坡站)获得男女单打冠军的中国运动员; 24 | 2、以2022年第14周期间(2022年4月4日-10日)国际乒联公布的世界排名为标准,单打排名最高的男女各3名中国运动员; 25 | 3、1999年1月1日以后出生,以2022年第14周期间(2022年4月4日-10日)国际乒联公布的世界排名为标准,单打排名最高的男女各1名中国运动员 26 | 共五名运动员,将获得参加杭州亚运会团体比赛的资格。 27 | 则为: 28 | WTT大满贯单打冠军——樊振东、陈梦; 29 | 世界排名前三位球员——樊振东、马龙、梁靖崑/陈梦、孙颖莎、王曼昱; 30 | 1999年以后出生世界排名最高运动员——王楚钦(2000年出生,世界排名13位)/孙颖莎; 31 | 目前已经确定具备参加杭州亚运会乒乓球团体比赛资格的球员为: 32 | 男子:樊振东、马龙、梁靖崑、王楚钦; 33 | 女子:陈梦、孙颖莎、王曼昱。 34 | 前述选拔办法表示: 35 | 如根据上述3个条件入选的男女运动员不足5名,则由国家队男女教练组集体研究讨论,以2021年及2022年杭州亚运会前参加国际和国内大赛成绩为基本依据,以有利于完成杭州亚运会参赛任务、有利于2024年巴黎奥运会备战练兵,着眼于2028年洛杉矶奥运会梯队建设,根据近期比赛成绩和积分排名,提名国际大赛成绩突出、心理素质和抗压能力强且体能测试达标者,进入杭州亚运会团体项目建议名单;之后选拔工作领导小组对杭州亚运会团体项目建议名单进行综合评定,最终确认团体项目参赛名单。 36 | 单打方面,选拔办法规定: 37 | 1.在2022年3月7日至20日期间举办的WTT大满贯(新加坡站)获得男女单打冠军的中国运动员; 38 | 2.以2022年第14周期间(2022年4月4日-10日)国际乒联公布的世界排名为标准,单打排名最高的男女各1名中国运动员,如与WTT大满贯(新加坡站)人选相同,则顺延为单打排名第2的中国运动员。 39 | 以此,已经获得杭州亚运会乒乓球单打比赛资格的运动员为: 40 | 男子:樊振东、马龙; 41 | 女子:陈梦、孙颖莎; 42 | 双打方面,选拔办法规定,将根据已经确认的团体、单打参赛名单,结合相关运动员技战术特点、专长并综合考虑体能情况,由教练组提名双打配对建议名单;此后选拔工作领导小组对杭州亚运会团体项目建议名单进行综合评定,最终确认双打项目参赛名单。 43 | 选拔办法中规定,杭州亚运会各分项名单报选拔工作领导小组研究确认后,将经中国乒乓球协会报国家体育总局。 44 | 选拔办法同时指出,赛前如因伤病等特殊情况造成参赛人选替换,将由国家队男女教练组综合提出运动员调整意见,报选拔工作领导小组综合评估、研究确认后,经中国乒乓球协会报国家体育总局。(搜狐体育郭健/文) 45 | -------------------------------------------------------------------------------- /utils/text_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import random 5 | import jieba 6 | import pandas as pd 7 | 8 | # 读取停用词 9 | stopwords = pd.read_csv("data/text_data/stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], 10 | encoding='utf-8') 11 | stopwords = stopwords['stopword'].values 12 | 13 | 14 | def cut_words(line, words_min=2): 15 | line_segments = jieba.lcut(line) 16 | line_segments = filter(lambda x: len(x) >= words_min, line_segments) 17 | line_segments = filter(lambda x: x not in stopwords, line_segments) 18 | return list(line_segments) 19 | 20 | 21 | def 
load_corpus(): 22 | """ 23 | 加载语料库:取自搜狗新闻语料库(https://www.sogou.com/labs/resource/cs.php) 24 | :return: sentences 语料库 25 | """ 26 | # 取样后的文本存储 27 | df_entertainment = pd.read_csv(os.path.join('data/text_data/entertainment_news.csv')) 28 | df_sports = pd.read_csv(os.path.join('data/text_data/sports_news.csv')) 29 | 30 | entertainment = df_entertainment.content.values.tolist() 31 | sports = df_sports.content.values.tolist() 32 | content_file = {'entertainment': entertainment, 'sports': sports} 33 | 34 | return content_file 35 | 36 | 37 | def sentences_prepare(): 38 | """ 39 | 语料库预处理(无标签) 40 | """ 41 | sentences = [] 42 | content_file = load_corpus() 43 | for category in content_file.keys(): 44 | for line in content_file[category]: 45 | try: 46 | words_list = cut_words(line) 47 | sentences.append(" ".join(words_list)) 48 | except Exception as e: 49 | sentences.append("") 50 | print(e) 51 | continue 52 | random.seed(1) 53 | random.shuffle(sentences) 54 | return sentences 55 | 56 | 57 | def sentences_prepare_with_y(): 58 | """ 59 | 语料库预处理(含标签) 60 | """ 61 | sentences = [] 62 | content_file = load_corpus() 63 | for category in content_file.keys(): 64 | for line in content_file[category]: 65 | try: 66 | words_list = cut_words(line) 67 | sentences.append("__label__" + str(category) + " , " + " ".join(words_list)) 68 | except Exception as e: 69 | sentences.append("") 70 | print(line) 71 | continue 72 | random.seed(1) 73 | random.shuffle(sentences) 74 | return sentences 75 | 76 | 77 | def sentences_prepare_x_y(): 78 | """ 79 | 语料库预处理(语料和标签分别输出) 80 | """ 81 | cate_dic = {'entertainment': 0, 'sports': 1} 82 | content_file = load_corpus() 83 | # 生成训练数据 84 | sentences = [] 85 | y = [] 86 | 87 | for category in content_file.keys(): 88 | # 文本预处理 89 | for line in content_file[category]: 90 | try: 91 | words_list = cut_words(line) 92 | sentences.append(" ".join(words_list)) 93 | y.append(str(cate_dic.get(category))) 94 | except Exception as e: 95 | print(line) 96 | continue 97 | sentences_df = pd.DataFrame({'sentences': sentences, 'target': y}) 98 | sentences_df = sentences_df.sample(frac=1, random_state=1) 99 | return sentences_df.sentences.tolist(), sentences_df.target.tolist() 100 | -------------------------------------------------------------------------------- /chapter4/ch4_01_rules_for_outliers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import pandas as pd 8 | from utils import data_utils 9 | 10 | def rule_evaluate(selected_df, total_df, target, rate=0.15, amount=10000): 11 | """ 12 | :param selected_df: 子特征列表 13 | :param total_df: 特征宽表 14 | :param target: 目标变量 15 | :param rate: 息费(%) 16 | :param amount: 平均每笔借款金额 17 | :return: 18 | """ 19 | # 命中规则的子群体指标统计 20 | hit_size = selected_df.shape[0] 21 | hit_bad_size = selected_df[target].sum() 22 | hit_bad_rate = selected_df[target].mean() 23 | # 总体指标统计 24 | total_size = total_df.shape[0] 25 | total_bad_size = total_df[target].sum() 26 | total_bad_rate = total_df[target].mean() 27 | # 命中率 28 | hit_rate = hit_size / total_size 29 | # 提升度 30 | lift = hit_bad_rate / total_bad_rate 31 | # 收益 32 | profit = hit_bad_size * amount - (hit_size - hit_bad_size) * rate * amount 33 | res = [total_size, total_bad_size, total_bad_rate, 34 | hit_rate, hit_size, hit_bad_size, hit_bad_rate, lift, profit] 35 | return res 36 | 37 | 38 | def rule_discover(data_df, var, target, rule_term, rate=0.15, amount=10000): 39 | """ 40 | 
:param data_df: 特征宽表 41 | :param var: 特征名称 42 | :param target: 目标变量 43 | :param rule_term: 分位数列表或规则条件 44 | :param rate: 息费(%) 45 | :param amount: 平均每笔借款金额 46 | :return: 47 | """ 48 | res_list = [] 49 | if rule_term is None: 50 | rule_term = [0.005, 0.01, 0.02, 0.05, 0.95, 0.98, 0.99, 0.995] 51 | if isinstance(rule_term, list): 52 | for q in rule_term: 53 | threshold = data_df[var].quantile(q).round(2) 54 | if q < 0.5: 55 | temp = data_df.query("`{0}` <= @threshold".format(var)) 56 | rule = "<= {0}".format(threshold) 57 | else: 58 | temp = data_df.query("`{0}` >= @threshold".format(var)) 59 | rule = ">= {0}".format(threshold) 60 | res = rule_evaluate(temp, data_df, target, rate, amount) 61 | res_list.append([var, rule] + res) 62 | else: 63 | temp = data_df.query("`{0}` {1}".format(var, rule_term)) 64 | rule = rule_term 65 | res = rule_evaluate(temp, data_df, target, rate, amount) 66 | res_list.append([var, rule] + res) 67 | columns = ['var', 'rule', 'total_size', 'total_bad_size', 'total_bad_rate', 68 | 'hit_rate', 'hit_size', 'hit_bad_size', 'hit_bad_rate', 'lift', 69 | 'profit'] 70 | result_df = pd.DataFrame(res_list, columns=columns) 71 | return result_df 72 | 73 | 74 | if __name__ == '__main__': 75 | # 数据读入 76 | german_credit_data = data_utils.get_data() 77 | german_credit_data.loc[german_credit_data.sample( 78 | frac=0.2, random_state=0).index, 'sample_set'] = 'Train' 79 | german_credit_data['sample_set'].fillna('OOT', inplace=True) 80 | # 使用分位数列表构建规则集 81 | rule_table = rule_discover(data_df=german_credit_data, var='credit.amount', 82 | target='creditability', 83 | rule_term=[0.005, 0.01, 0.02, 0.05, 0.95, 0.98, 0.99, 0.995]) 84 | print(rule_table) 85 | # 规则效果评估 86 | rule_analyze = german_credit_data.groupby('sample_set').apply( 87 | lambda x: rule_discover(data_df=x, var='credit.amount', 88 | target='creditability', rule_term='>12366.0')) 89 | print(rule_analyze) 90 | -------------------------------------------------------------------------------- /chapter4/ch4_04_modelstrategy_for_optimization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | from numpy import polyfit, poly1d 11 | from sklearn.metrics import r2_score 12 | from scipy.optimize import minimize 13 | 14 | 15 | def calculate_pass_loss_decile(score_series, y_series): 16 | """ 17 | 模型分取值变化时通过率与坏账率关系 18 | :param score_series: 模型分 19 | :param y_series: Y标签 20 | :return: 21 | """ 22 | decile_df = pd.crosstab(score_series, y_series).rename(columns={0: 'N_nonEvent', 1: 'N_Event'}) 23 | decile_df.loc[:, 'N_sample'] = score_series.value_counts() 24 | 25 | decile_df.loc[:, 'EventRate'] = decile_df.N_Event * 1.0 / decile_df.N_sample 26 | decile_df.loc[:, 'BadPct'] = decile_df.N_Event * 1.0 / sum(decile_df.N_Event) 27 | decile_df.loc[:, 'GoodPct'] = decile_df.N_nonEvent * 1.0 / sum(decile_df.N_nonEvent) 28 | decile_df.loc[:, 'CumBadPct'] = decile_df.BadPct.cumsum() 29 | decile_df.loc[:, 'CumGoodPct'] = decile_df.GoodPct.cumsum() 30 | 31 | decile_df = decile_df.sort_index(ascending=False) 32 | decile_df.loc[:, 'ApprovalRate'] = decile_df.N_sample.cumsum() / decile_df.N_sample.sum() 33 | decile_df.loc[:, 'ApprovedEventRate'] = decile_df.N_Event.cumsum() / decile_df.N_sample.cumsum() 34 | decile_df = decile_df.sort_index(ascending=True) 35 | return decile_df 36 | 37 | 38 | def poly_regression(x_series, y_series, 
degree, plot=True): 39 | """ 40 | 多项式回归拟合 41 | :param x_series: x数据 42 | :param y_series: y数据 43 | :param degree: 指定多项式次数 44 | :param plot: 是否作图 45 | :return: 46 | """ 47 | coeff = polyfit(x_series, y_series, degree) 48 | f = poly1d(coeff) 49 | R2 = r2_score(y_series.values, f(x_series)) 50 | 51 | print(f'coef:{coeff},R2: {R2}') 52 | 53 | if plot: 54 | # 用来正常显示中文标签 55 | plt.rcParams['font.sans-serif'] = ['Microsoft YaHei'] 56 | plt.rcParams['axes.unicode_minus'] = False 57 | 58 | plt.figure(figsize=(10, 5)) 59 | plt.plot(x_series, y_series, 'rx') 60 | plt.plot(x_series, f(x_series)) 61 | plt.xlabel('通过率', {'size': 15}) 62 | plt.ylabel('坏账率', {'size': 15}) 63 | plt.show() 64 | return coeff 65 | 66 | 67 | german_score = pd.read_csv('data/german_score.csv') 68 | german_score.head() 69 | 70 | decile_df = calculate_pass_loss_decile(german_score['score'], 71 | german_score['creditability']) 72 | print(decile_df.head()) 73 | 74 | # 数据准备 75 | x = decile_df['ApprovalRate'] 76 | # 逾期率折算为坏账率 77 | y = decile_df['ApprovedEventRate'] / 2.5 78 | 79 | poly_coef = poly_regression(x, y, 2, plot=True) 80 | # 坏账率L(x)与通过率x的关系 81 | l_x = poly1d(poly_coef) 82 | print(l_x) 83 | 84 | 85 | def find_best_approval_rate(x_to_loss_func, score_df): 86 | """ 87 | 定义最优化函数 88 | 坏账率L(x)与通过率x的关系函数 89 | :param x_to_loss_func: 坏账率与通过率的函数关系 90 | :param score_df: 模型分与通过率的对应关系,index为模型分,"ApprovalRate"列为对应的通过率 91 | :return: 92 | """ 93 | 94 | # 定义目标函数,求解最大值即为负的最小值 95 | def fun(x_array): 96 | # 其中x_list[0]为通过率x,x_array[1]为对应的坏账率L(x) 97 | return -10000 * (0.16 * (1 - x_array[1]) - x_array[1] 98 | - 30 / (x_array[0] * 0.6) / 10000) 99 | 100 | # eq表示 函数结果等于0 ; ineq 表示 表达式大于等于0, 下面式子1e-6项确保相应变量不等于0或1 101 | cons = ({'type': 'eq', 'fun': lambda x_array: x_to_loss_func(x_array[0]) - x_array[1]}, 102 | {'type': 'ineq', 'fun': lambda x_array: x_array[0] - 1e-6}, 103 | {'type': 'ineq', 'fun': lambda x_array: x_array[1] - 1e-6}, 104 | {'type': 'ineq', 'fun': lambda x_array: 1 - x_array[0] - 1e-6}, 105 | {'type': 'ineq', 'fun': lambda x_array: 1 - x_array[0] - 1e-6} 106 | ) 107 | 108 | # 设置初始值 109 | x_base = np.array((0.10, 0.10)) 110 | # 采用SLSQP进行最优化求解 111 | res = minimize(fun, x_base, method='SLSQP', constraints=cons) 112 | print('利润最优:', "{:.2f}".format(-res.fun)) 113 | print('最优解对应通过率:', "{:.2%}".format(res.x[0]), '坏账率:', "{:.2%}".format(res.x[1])) 114 | print("模型分阈值:", score_df[score_df['ApprovalRate'] >= res.x[0]].index.max()) 115 | print('迭代终止是否成功:', res.success) 116 | print('迭代终止原因:', res.message) 117 | 118 | 119 | find_best_approval_rate(l_x, decile_df) 120 | -------------------------------------------------------------------------------- /chapter3/ch3_02_order_fea_gen_rfm_auto.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | 5 | sys.path.append("./") 6 | sys.path.append("../") 7 | 8 | # 根据业务逻辑自动生成用户历史订单特征 9 | import pandas as pd 10 | import numpy as np 11 | from dateutil.parser import parse 12 | from utils.data_utils import stamp_to_date 13 | from chapter3.ch3_00_order_data_preprocess import data_preprocess 14 | 15 | func_trans = {'sum': np.sum, 16 | 'mean': np.mean, 17 | 'cnt': np.size, 18 | 'max': np.max, 19 | 'min': np.min, 20 | 'std': np.std, 21 | } 22 | 23 | 24 | def apply_func(f, *args): 25 | return f(*args) 26 | 27 | 28 | def rfm_cut(data, time_col, back_time, type_dict, comp_dict, time_arr, fea_prefix='f'): 29 | """ 30 | 基于RFM思想切分数据,生成特征 31 | :param DataFrame data: 待切分的数据,时间列为create_time(timestamp),距今天数列为gap_days 32 | :param str time_col: 
回溯依据的时间列名称 33 | :param datetime.datetime back_time: 回溯时间点,datetime.datetime时间格式 34 | :param dict type_dict: 类别变量,以及其对应的取值类别,用于划分数据,类别列名必须在data中 35 | :param dict comp_dict: 指定计算字段以及对该字段采用的计算方法, 计算变量名必须在data中 36 | :param list time_arr: 切分时间列表(近N天) 37 | :param fea_prefix: 特征前缀 38 | :return dict: 特征 39 | """ 40 | data[time_col] = data[time_col].apply(stamp_to_date) 41 | # 业务时间距back_time天数 42 | data['gap_days'] = data[time_col].apply(lambda x: (back_time - x).days) 43 | 44 | res_feas = {} 45 | for col_time in time_arr: 46 | for col_comp in comp_dict.keys(): 47 | for type_k, type_v in type_dict.items(): 48 | # 按类别和时间维度切分,筛选数据 49 | for item in type_v: 50 | data_cut = data[(data['gap_days'] < col_time) & (data[type_k] == item)] 51 | for func_k in comp_dict[col_comp]: 52 | func_v = func_trans.get(func_k, np.size) 53 | # 对筛选出的数据, 在各统计指标上做聚合操作生成特征 54 | fea_name = '%s_%s_%s_%s_%s' % ( 55 | fea_prefix, col_time, '%s_%s' % (type_k, item), col_comp, func_k) 56 | if data_cut.empty: 57 | res_feas[fea_name] = np.nan 58 | else: 59 | res_feas[fea_name] = apply_func(func_v, data_cut[col_comp]) 60 | return res_feas 61 | 62 | 63 | def gen_order_feature_auto(raw_data, time_col, back_time, dtypes_dict, type_dict, comp_dict, time_arr, 64 | fea_prefix='f'): 65 | """ 66 | 基于RFM切分,自动生成订单特征 67 | :param pd.DataFrame raw_data: 原始数据 68 | :param str time_col: 回溯依据的时间列名称 69 | :param str back_time: 回溯时间点,字符串格式 70 | :param dict dtypes_dict: 指定列字段类型的字典,如{'col1':int} 71 | :param list time_arr: 切分时间列表(近N天) 72 | :param dict type_dict: 类别变量,以及其对应的取值类别,用于划分数据,类别列名必须在data中 73 | :param dict comp_dict: 指定计算字段以及对该字段采用的计算方法,计算变量名必须在data中 74 | :param fea_prefix: 特征前缀 75 | :return: res_feas 最终生成的特征 76 | """ 77 | if raw_data.empty: 78 | return {} 79 | back_time = parse(str(back_time)) 80 | 81 | order_df = data_preprocess(raw_data, time_col=time_col, back_time=back_time, dtypes_dict=dtypes_dict) 82 | if order_df.empty: 83 | return {} 84 | 85 | # 特征衍生:使用rfm切分 86 | res_feas = rfm_cut(order_df, time_col, back_time, type_dict, comp_dict, time_arr, fea_prefix) 87 | return res_feas 88 | 89 | 90 | if __name__ == '__main__': 91 | # 原始数据读入 92 | orders = pd.read_excel('data/order_data.xlsx') 93 | # 取一个用户的历史订单数据 94 | raw_orders = pd.DataFrame(eval(orders['data'][1])) 95 | 96 | # 设置自动特征的参数 97 | # 类别字段及其取值 98 | type_dict_param = { 99 | 'has_overdue': [0, 1], 100 | 'is_weekend': [0, 1] 101 | } 102 | # 计算字段及其计算函数 103 | comp_dict_param = { 104 | 'order_no': ['cnt'], 105 | 'application_amount': ['sum', 'mean', 'max', 'min'] 106 | } 107 | time_cut = [30, 90, 180, 365] 108 | 109 | cols_dtypes_dict = {'has_overdue': int, 'application_term': float, 'application_amount': float} 110 | 111 | # 根据业务逻辑生成用户历史订单特征 112 | features_auto = gen_order_feature_auto(raw_orders, 'create_time', '2020-12-14', cols_dtypes_dict, 113 | type_dict_param, comp_dict_param, time_cut) 114 | print("特征维度: ", len(features_auto.keys())) 115 | print(features_auto) 116 | 117 | # 批量生成特征 118 | feature_dict = {} 119 | for i, row in orders.iterrows(): 120 | feature_dict[i] = gen_order_feature_auto(pd.DataFrame(eval(row['data'])), 'create_time', row['back_time'], 121 | cols_dtypes_dict, type_dict_param, comp_dict_param, time_cut, 122 | 'order_auto') 123 | feature_df_auto = pd.DataFrame(feature_dict).T 124 | # feature_df_auto.to_excel('data/features_auto.xlsx', index=True) 125 | -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import toad 2 | import numpy as np 3 
| import pandas as pd 4 | import scorecardpy as sc 5 | import datetime as dt 6 | import pytz 7 | from sklearn.preprocessing import OrdinalEncoder 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.preprocessing import MinMaxScaler 10 | from sklearn.preprocessing import StandardScaler 11 | from dateutil.parser import parse 12 | 13 | numeric_cols = ['duration.in.month', 14 | 'credit.amount', 15 | 'age.in.years', 16 | 'present.residence.since', 17 | 'number.of.existing.credits.at.this.bank', 18 | 'installment.rate.in.percentage.of.disposable.income', 19 | 'number.of.people.being.liable.to.provide.maintenance.for'] 20 | 21 | category_cols = ['status.of.existing.checking.account', 'credit.history', 22 | 'savings.account.and.bonds', 'present.employment.since', 23 | 'personal.status.and.sex', 'other.debtors.or.guarantors', 24 | 'property', 'other.installment.plans', 'housing', 'job', 25 | 'telephone', 'foreign.worker', 'purpose'] 26 | 27 | x_cols = numeric_cols + category_cols 28 | 29 | label = 'creditability' 30 | 31 | 32 | def get_data(): 33 | """ 34 | 导入原始数据集 35 | """ 36 | german_credit_data = sc.germancredit() 37 | german_credit_data[label] = np.where( 38 | german_credit_data[label] == 'bad', 1, 0) 39 | # 设置随机数种子, 确保结果可复现 40 | np.random.seed(0) 41 | month_list = ['2020-01', '2020-02', '2020-03', '2020-04', '2020-05'] 42 | # 随机分配月份 43 | german_credit_data['month'] = np.random.choice( 44 | month_list, len(german_credit_data)) 45 | return german_credit_data 46 | 47 | 48 | def get_all_x_y(transform_method='minmax'): 49 | """ 50 | 加载数据 51 | :param transform_method: 数据标准化方式 52 | """ 53 | german_credit_data = sc.germancredit() 54 | # 类别型变量转化成数值型索引变量 55 | encoder = OrdinalEncoder() 56 | category_result = encoder.fit_transform(german_credit_data[category_cols]) 57 | category_result = pd.DataFrame(data=category_result, columns=category_cols) 58 | numeric_result = german_credit_data[numeric_cols + [label]].copy() 59 | # 将标签creditability映射为数值 60 | numeric_result[label] = np.where(numeric_result[label] == 'bad', 1, 0) 61 | all_x_y = pd.merge(category_result, numeric_result, left_index=True, right_index=True) 62 | x_cols = [f for f in all_x_y.columns if f != label] 63 | # 数据标准化 64 | if transform_method == 'minmax': 65 | encoder = MinMaxScaler() 66 | all_x_y[x_cols] = encoder.fit_transform(all_x_y[x_cols]) 67 | elif transform_method == 'standard': 68 | encoder = StandardScaler() 69 | all_x_y[x_cols] = encoder.fit_transform(all_x_y[x_cols]) 70 | elif transform_method == 'origin': 71 | pass 72 | return all_x_y 73 | 74 | 75 | def get_data_after_fs(empty=0.5, iv=0.02, corr=0.7): 76 | """ 77 | 加载特征选择后的数据 78 | :param empty: 缺失率阈值 79 | :param iv: iv阈值 80 | :param corr: 相关性阈值 81 | """ 82 | all_x_y = get_all_x_y() 83 | selected_data, drop_lst = toad.selection.select( 84 | all_x_y, target=label, empty=0.5, 85 | iv=0.02, corr=0.7, return_drop=True) 86 | return selected_data 87 | 88 | 89 | def get_x_y_split(test_rate=0.2, transform_method='minmax'): 90 | """ 91 | 划分训练集和测试集 92 | :param test_rate: 测试集样本占比 93 | :param transform_method: 数据标准化方式 94 | """ 95 | german_credit_data = get_all_x_y(transform_method) 96 | y = german_credit_data.pop(label) 97 | x = german_credit_data 98 | x_train, x_valid, y_train, y_valid = train_test_split( 99 | x, y, test_size=test_rate, random_state=88) 100 | return x_train, x_valid, y_train, y_valid 101 | 102 | 103 | def stamp_to_date(time_stamp, timezone=None): 104 | """ 105 | 时间戳转日期函数 106 | :param time_stamp:int,时间戳 107 | :param timezone:string,时区 108 | :return:datetime 
109 | """ 110 | try: 111 | if timezone is None: 112 | stamp_str = str(time_stamp) 113 | if len(stamp_str) >= 10: 114 | stamp_str = stamp_str[:10] 115 | else: 116 | stamp_str = stamp_str 117 | time_stamp = int(stamp_str) 118 | date = dt.datetime.fromtimestamp(time_stamp) 119 | return date 120 | else: 121 | stamp_str = str(time_stamp) 122 | if len(stamp_str) >= 10: 123 | stamp_str = stamp_str[:10] 124 | else: 125 | stamp_str = stamp_str 126 | time_stamp = int(stamp_str) 127 | tz = pytz.timezone(timezone) 128 | date = dt.datetime.fromtimestamp(time_stamp, tz).strftime('%Y-%m-%d %H:%M:%S') 129 | date = parse(date) 130 | return date 131 | except: 132 | return parse('2100-01-01') 133 | 134 | 135 | def date_to_week(date): 136 | """ 137 | 日期转换为星期 138 | :param date:datetime,string 139 | :return:int 140 | """ 141 | try: 142 | if isinstance(date, str): 143 | date = parse(date) 144 | if_weekend = date.weekday() 145 | return if_weekend 146 | except: 147 | return np.nan 148 | -------------------------------------------------------------------------------- /chapter4/ch4_02_rules_for_decisiontree.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "ename": "ModuleNotFoundError", 10 | "evalue": "No module named 'toad'", 11 | "output_type": "error", 12 | "traceback": [ 13 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 14 | "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", 15 | "Input \u001b[1;32mIn [3]\u001b[0m, in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mst\u001b[39;00m\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mgraphviz\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m data_utils\n", 16 | "File \u001b[1;32md:\\GitHub\\practice_of_intelligent_risk_control\\chapter4\\..\\utils\\data_utils.py:1\u001b[0m, in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtoad\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n", 17 | "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'toad'" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "# -*- coding: utf-8 -*-\n", 23 | "\n", 24 | "import sys\n", 25 | "sys.path.append(\"./\")\n", 26 | "sys.path.append(\"../\")\n", 27 | "\n", 28 | "import sklearn.tree as st\n", 29 | "import graphviz\n", 30 | "from utils import data_utils\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 4, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "Collecting toad\n", 43 | " Downloading toad-0.1.3-cp39-cp39-win_amd64.whl (14.3 MB)\n", 44 | "Installing collected packages: toad\n", 45 | "Successfully installed toad-0.1.3\n" 46 | ] 47 | } 48 | ], 49 | 
"source": [ 50 | "# !pip install graphviz\n", 51 | "!pip install toad" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "\n", 61 | "def decision_tree_resolve(train_x, train_y, class_names=None, max_depth=3, fig_path=''):\n", 62 | " \"\"\"\n", 63 | " 基于决策树可视化\n", 64 | " :param train_x: data of train\n", 65 | " :param train_y: data of y\n", 66 | " :param class_names: 标签名称\n", 67 | " :param max_depth: 树最大深度\n", 68 | " :param fig_path: 图片路径和名称\n", 69 | " :return:\n", 70 | " \"\"\"\n", 71 | " if class_names is None:\n", 72 | " class_names = ['good', 'bad']\n", 73 | " clf = st.DecisionTreeClassifier(max_depth=max_depth,\n", 74 | " min_samples_leaf=0.01,\n", 75 | " min_samples_split=0.01,\n", 76 | " criterion='gini',\n", 77 | " splitter='best',\n", 78 | " max_features=None)\n", 79 | " clf = clf.fit(train_x, train_y)\n", 80 | "\n", 81 | " # 比例图\n", 82 | " dot_data = st.export_graphviz(clf, out_file=None,\n", 83 | " feature_names=train_x.columns.tolist(),\n", 84 | " class_names=class_names,\n", 85 | " filled=True,\n", 86 | " rounded=True,\n", 87 | " node_ids=True,\n", 88 | " special_characters=True,\n", 89 | " proportion=True,\n", 90 | " leaves_parallel=True)\n", 91 | " graph = graphviz.Source(dot_data, filename=fig_path)\n", 92 | " return graph\n", 93 | "\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# 加载数据\n", 103 | "german_credit_data = data_utils.get_data()\n", 104 | "\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# 构造数据集\n", 114 | "X = german_credit_data[data_utils.numeric_cols].copy()\n", 115 | "y = german_credit_data['creditability']\n", 116 | "\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "graph = decision_tree_resolve(X, y, fig_path='data/tree')\n", 126 | "graph.view()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "\n", 136 | "# 转化为规则\n", 137 | "X['node_5'] = X.apply(lambda x: 1 if x['duration.in.month'] <= 34.5 and x['credit.amount'] > 8630.5 else 0, axis=1)\n", 138 | "X['node_9'] = X.apply(\n", 139 | " lambda x: 1 if x['duration.in.month'] > 34.5 and x['age.in.years'] <= 29.5 and x['credit.amount'] > 4100.0 else 0,\n", 140 | " axis=1)\n", 141 | "X['node_12'] = X.apply(lambda x: 1 if x['duration.in.month'] > 34.5 and x['age.in.years'] > 56.5 else 0, axis=1)\n" 142 | ] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.9.12" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 2 166 | } 167 | -------------------------------------------------------------------------------- /chapter2/ch2_38_xgboost.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | 
import shap 8 | import numpy as np 9 | import pandas as pd 10 | import xgboost as xgb 11 | import bayes_opt as bo 12 | import sklearn.model_selection as sk_ms 13 | from sklearn.model_selection import ParameterGrid 14 | from sklearn.metrics import roc_auc_score 15 | from utils import data_utils 16 | import shap 17 | from chapter2.ch2_31_model_deployment_pickle import save_model_as_pkl 18 | 19 | 20 | # 确定最优树的颗数 21 | def xgb_cv(param, x, y, num_boost_round=10000): 22 | dtrain = xgb.DMatrix(x, label=y) 23 | cv_res = xgb.cv(param, dtrain, num_boost_round=num_boost_round, early_stopping_rounds=30) 24 | num_boost_round = cv_res.shape[0] 25 | return num_boost_round 26 | 27 | def train_xgb(params, x_train, y_train, x_test=None, y_test=None, num_boost_round=10000, early_stopping_rounds=30, verbose_eval=50): 28 | """ 29 | 训练xgb模型 30 | """ 31 | dtrain = xgb.DMatrix(x_train, label=y_train) 32 | if x_test is None: 33 | num_boost_round = xgb_cv(params, x_train, y_train) 34 | early_stopping_rounds = None 35 | eval_sets = () 36 | else: 37 | dtest = xgb.DMatrix(x_test, label=y_test) 38 | eval_sets = [(dtest, 'test')] 39 | model = xgb.train(params, dtrain, num_boost_round, evals=eval_sets, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose_eval) 40 | return model 41 | 42 | 43 | def xgboost_grid_search(params_space, x_train, y_train, x_test=None, y_test=None, num_boost_round=10000): 44 | """ 45 | 网格调参, 确定其他参数 46 | """ 47 | # 设置训练参数 48 | if x_test is None: 49 | x_train, x_test, y_train, y_test = sk_ms.train_test_split(x_train, y_train, test_size=0.2, random_state=1) 50 | score_list = [] 51 | test_params = list(ParameterGrid(params_space)) 52 | for params_try in test_params: 53 | params_try['eval_metric'] = "auc" 54 | params_try['random_state'] = 1 55 | clf_obj = train_xgb(params_try, x_train, y_train, x_test, y_test, num_boost_round=num_boost_round, 56 | early_stopping_rounds=30, verbose_eval=0) 57 | score_list.append(roc_auc_score(y_test, clf_obj.predict(xgb.DMatrix(x_test)))) 58 | result = pd.DataFrame(dict(zip(score_list, test_params))).T 59 | print(result) 60 | # 取测试集上效果最好的参数组合 61 | params = test_params[np.array(score_list).argmax()] 62 | return params 63 | 64 | 65 | def xgboost_bayesian_optimization(params_space, x_train, y_train, x_test=None, y_test=None, num_boost_round=10000, nfold=5, init_points=2, n_iter=5, verbose_eval=0, early_stopping_rounds=30): 66 | """ 67 | 贝叶斯调参, 确定其他参数 68 | """ 69 | # 设置需要调节的参数及效果评价指标 70 | def xgboost_cv_for_bo(eta, gamma, max_depth, min_child_weight, 71 | subsample, colsample_bytree): 72 | params = { 73 | 'eval_metric': 'auc', 74 | 'booster': 'gbtree', 75 | 'objective': 'binary:logistic', 76 | 'eta': eta, 77 | 'gamma': gamma, 78 | 'max_depth': int(max_depth), 79 | 'min_child_weight': int(min_child_weight), 80 | 'subsample': subsample, 81 | 'colsample_bytree': colsample_bytree, 82 | 'seed': 1 83 | } 84 | if x_test is None: 85 | dtrain = xgb.DMatrix(x_train, label=y_train) 86 | xgb_cross = xgb.cv(params, 87 | dtrain, 88 | nfold=nfold, 89 | metrics='auc', 90 | early_stopping_rounds=early_stopping_rounds, 91 | num_boost_round=num_boost_round) 92 | test_auc = xgb_cross['test-auc-mean'].iloc[-1] 93 | else: 94 | clf_obj = train_xgb(params, x_train, y_train, x_test, y_test, num_boost_round=num_boost_round, 95 | early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose_eval) 96 | test_auc = roc_auc_score(y_test, clf_obj.predict(xgb.DMatrix(x_test))) 97 | return test_auc 98 | 99 | # 指定需要调节参数的取值范围 100 | xgb_bo_obj = bo.BayesianOptimization(xgboost_cv_for_bo, 
params_space, random_state=1) 101 | xgb_bo_obj.maximize(init_points=init_points, n_iter=n_iter) 102 | best_params = xgb_bo_obj.max['params'] 103 | best_params['max_depth'] = int(best_params['max_depth']) 104 | best_params['min_child_weight'] = int(best_params['min_child_weight']) 105 | best_params['eval_metric'] = 'auc' 106 | best_params['booster'] = 'gbtree' 107 | best_params['objective'] = 'binary:logistic' 108 | best_params['seed'] = 1 109 | return best_params 110 | 111 | 112 | # 导入数值型样例数据 113 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2) 114 | 115 | # 经验参数 116 | exp_params = { 117 | 'eval_metric': 'auc', 118 | 'booster': 'gbtree', 119 | 'objective': 'binary:logistic', 120 | 'eta': 0.1, 121 | 'gamma': 0.01, 122 | 'max_depth': 4, 123 | 'min_child_weight': 1, 124 | 'subsample': 1, 125 | 'colsample_bytree': 1, 126 | 'seed': 1 127 | } 128 | final_xgb_model = train_xgb(exp_params, train_x, train_y, test_x, test_y) 129 | auc_score = roc_auc_score(test_y, final_xgb_model.predict(xgb.DMatrix(test_x))) 130 | print("经验参数模型AUC: ", auc_score) 131 | 132 | # 随机搜索调参 133 | choose_tuner = 'bayesian' # bayesian grid_search 134 | if choose_tuner == 'grid_search': 135 | params_test = { 136 | 'learning_rate': [0.1, 0.15], 137 | 'gamma': [0.01, 0], 138 | 'max_depth': [4, 3], 139 | 'min_child_weight': [1, 2], 140 | 'subsample': [0.95, 1], 141 | 'colsample_bytree': [1] 142 | } 143 | optimal_params = xgboost_grid_search(params_test, train_x, train_y, test_x, test_y) 144 | elif choose_tuner == 'bayesian': 145 | # 贝叶斯调参 146 | params_test = {'eta': (0.05, 0.2), 147 | 'gamma': (0.005, 0.05), 148 | 'max_depth': (3, 5), 149 | 'min_child_weight': (0, 3), 150 | 'subsample': (0.9, 1.0), 151 | 'colsample_bytree': (0.9, 1.0)} 152 | optimal_params = xgboost_bayesian_optimization(params_test, train_x, train_y, test_x, test_y, init_points=5, n_iter=8) 153 | 154 | print("随机搜索调参最优参数: ", optimal_params) 155 | 156 | final_xgb_model = train_xgb(optimal_params, train_x, train_y, test_x, test_y) 157 | auc_score = roc_auc_score(test_y, final_xgb_model.predict(xgb.DMatrix(test_x))) 158 | print("随机搜索调参模型AUC: ", auc_score) 159 | 160 | # 保存模型 161 | save_model_as_pkl(final_xgb_model, "./data/xgb_model.pkl") 162 | 163 | # SHAP计算 164 | explainer = shap.TreeExplainer(final_xgb_model) 165 | shap_values = explainer.shap_values(train_x) 166 | # SHAP可视化 167 | shap.summary_plot(shap_values, train_x, max_display=5) 168 | -------------------------------------------------------------------------------- /chapter3/ch3_15_gcn_order.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("./") 5 | sys.path.append("../") 6 | 7 | # GCN关系网络节点预测 8 | import pickle 9 | import os 10 | import itertools 11 | import numpy as np 12 | import scipy.sparse as sp 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | import torch.nn.init as init 17 | import torch.optim as optim 18 | import matplotlib.pyplot as plt 19 | from collections import namedtuple 20 | 21 | os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" 22 | cpu_type = "cuda" if torch.cuda.is_available() else "cpu" 23 | 24 | 25 | def numpy_to_tensor(x): 26 | return torch.from_numpy(x).to(cpu_type) 27 | 28 | 29 | def build_adjacency(adj_dict): 30 | """ 31 | 根据邻接表创建邻接矩阵 32 | :param adj_dict: 输入的邻接表 33 | :return: 邻接矩阵 34 | """ 35 | edge_index = [] 36 | node_counts = len(adj_dict) 37 | for src, dst in adj_dict.items(): 38 | edge_index.extend([src, v] for v in 
dst) 39 | edge_index.extend([v, src] for v in dst) 40 | # 去重 41 | edge_index = list(k for k, _ in itertools.groupby(sorted(edge_index))) 42 | edge_index = np.asarray(edge_index) 43 | # 构建邻接矩阵,相接的节点值为1 44 | adjacency = sp.coo_matrix((np.ones(len(edge_index)), 45 | (edge_index[:, 0], edge_index[:, 1])), 46 | shape=(node_counts, node_counts), dtype="double") 47 | return adjacency 48 | 49 | 50 | def read_data(path_of_data): 51 | """ 52 | 数据读取 53 | :param path_of_data: 文件路径 54 | :return: 55 | """ 56 | out = pickle.load(open(path_of_data, "rb"), encoding="latin1") 57 | out = out.toarray() if hasattr(out, "toarray") else out 58 | return out 59 | 60 | 61 | def data_preprocess(): 62 | print("Start data preprocess.") 63 | filenames = ["order.{}".format(name) for name in ['x', 'y', 'graph']] 64 | # 图有2000个节点,每个节点有104维特征,y值为0或1,graph用字典表示,字典key为节点编号,value为关联的节点编号list 65 | root_path = 'data/graph_data' 66 | x, y, graph = [read_data(os.path.join(root_path, name)) for name in filenames] 67 | 68 | # 划分train,validation和test节点编号 69 | train_index = list(range(0, 700)) 70 | val_index = list(range(700, 1000)) 71 | test_index = list(range(1000, 2000)) 72 | 73 | num_nodes = x.shape[0] 74 | train_mask = np.zeros(num_nodes, dtype=bool) 75 | val_mask = np.zeros(num_nodes, dtype=bool) 76 | test_mask = np.zeros(num_nodes, dtype=bool) 77 | 78 | train_mask[train_index] = True 79 | val_mask[val_index] = True 80 | test_mask[test_index] = True 81 | 82 | adjacency = build_adjacency(graph) 83 | print("特征维度: ", x.shape) 84 | print("标签长度: ", y.shape) 85 | print("邻接矩阵维度: ", adjacency.shape) 86 | # 构建带字段名的元组 87 | Data = namedtuple('Data', ['x', 'y', 'adjacency', 88 | 'train_mask', 'val_mask', 'test_mask']) 89 | return Data(x=x, y=y, adjacency=adjacency, 90 | train_mask=train_mask, val_mask=val_mask, test_mask=test_mask) 91 | 92 | 93 | def adj_norm(adjacency): 94 | """ 95 | 正则化:公式L=D^-0.5 * (A+I) * D^-0.5 96 | :param torch.sparse.FloatTensor adjacency: 97 | :return: 98 | """ 99 | adjacency += sp.eye(adjacency.shape[0]) 100 | degree = np.array(adjacency.sum(1)) 101 | d_hat = sp.diags(np.power(degree, -0.5).flatten()) 102 | return d_hat.dot(adjacency).dot(d_hat).tocoo() 103 | 104 | 105 | class GraphConv(nn.Module): 106 | def __init__(self, input_dim, output_dim, use_bias=True): 107 | """ 108 | # 图卷积层定义 109 | :param int input_dim: 输入特征维度 110 | :param int output_dim: 输出特征维度 111 | :param bool use_bias: 偏置 112 | :return: 113 | """ 114 | super(GraphConv, self).__init__() 115 | self.input_dim = input_dim 116 | self.output_dim = output_dim 117 | self.use_bias = use_bias 118 | self.weight = nn.Parameter(torch.Tensor(input_dim, output_dim)) 119 | if self.use_bias: 120 | self.bias = nn.Parameter(torch.Tensor(output_dim)) 121 | else: 122 | self.register_parameter('bias', None) 123 | self.reset_parameters() 124 | 125 | def reset_parameters(self): 126 | init.kaiming_uniform_(self.weight) 127 | if self.use_bias: 128 | init.zeros_(self.bias) 129 | 130 | def forward(self, adjacency, fea_input): 131 | """ 132 | :param torch.sparse.FloatTensor adjacency : 邻接矩阵 133 | :param torch.Tensor fea_input: 输入特征 134 | :return: 135 | """ 136 | support = torch.mm(fea_input, self.weight) 137 | output = torch.sparse.mm(adjacency, support) 138 | if self.use_bias: 139 | output += self.bias 140 | return output 141 | 142 | 143 | class GcnNet(nn.Module): 144 | def __init__(self, input_dim): 145 | """ 146 | 模型定义 147 | :param int input_dim: 输入特征维度 148 | """ 149 | super(GcnNet, self).__init__() 150 | self.gcn1 = GraphConv(input_dim, 16) 151 | self.gcn2 = GraphConv(16, 2) 
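    # gcn1 and gcn2, together with forward() below, implement a standard two-layer GCN:
    # Z = A_hat · ReLU(A_hat · X · W0 + b0) · W1 + b1, where A_hat is the renormalized adjacency
    # D^-0.5 · (A + I) · D^-0.5 produced by adj_norm(); the 104-dim node features are mapped to
    # 16 hidden units and then to 2-class logits for node classification.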
152 | 153 | def forward(self, adjacency, feature): 154 | h = F.relu(self.gcn1(adjacency, feature)) 155 | lg = self.gcn2(adjacency, h) 156 | return lg 157 | 158 | 159 | def model_predict(model, tensor, tensor_adj, mask): 160 | model.eval() 161 | with torch.no_grad(): 162 | lg = model(tensor_adj, tensor) 163 | lg_mask = lg[mask] 164 | y_pred = lg_mask.max(1)[1] 165 | return y_pred 166 | 167 | 168 | def cal_accuracy(y_true, y_pred): 169 | accuracy = torch.eq(y_pred, y_true).double().mean() 170 | return accuracy 171 | 172 | 173 | def model_train(tensor_x, tensor_y, tensor_adjacency, train_mask, val_mask, epochs, learning_rate, 174 | weight_decay): 175 | # 模型定义:Model, Loss, Optimizer 176 | model = GcnNet(tensor_x.shape[1]).to(cpu_type) 177 | optimizer = optim.Adam(model.parameters(), 178 | lr=learning_rate, 179 | weight_decay=weight_decay) 180 | 181 | loss_list = [] 182 | test_accuracy_list = [] 183 | model.train() 184 | train_y = tensor_y[train_mask].long() 185 | 186 | for epoch in range(epochs): 187 | # 前向传播 188 | lg = model(tensor_adjacency, tensor_x) 189 | train_mask_logits = lg[train_mask] 190 | loss = nn.CrossEntropyLoss().to(cpu_type)(train_mask_logits, train_y) 191 | optimizer.zero_grad() 192 | # 反向传播 193 | loss.backward() 194 | optimizer.step() 195 | # 准确率 196 | train_accuracy = cal_accuracy(tensor_y[train_mask], 197 | model_predict(model, tensor_x, tensor_adjacency, train_mask)) 198 | test_accuracy = cal_accuracy(tensor_y[val_mask], 199 | model_predict(model, tensor_x, tensor_adjacency, val_mask)) 200 | 201 | loss_list.append(loss.item()) 202 | test_accuracy_list.append(test_accuracy.item()) 203 | if epoch % 10 == 1: 204 | print("epoch {:04d}: loss {:.4f}, train accuracy {:.4}, test accuracy {:.4f}".format( 205 | epoch, loss.item(), train_accuracy.item(), test_accuracy.item())) 206 | return model, loss_list, test_accuracy_list 207 | 208 | 209 | def plot_loss_with_acc(loss_history, val_acc_history): 210 | fig = plt.figure() 211 | # 坐标系ax1画曲线1 212 | ax1 = fig.add_subplot(111) # 指的是将plot界面分成1行1列,此子图占据从左到右从上到下的1位置 213 | ax1.plot(range(len(loss_history)), loss_history, 214 | c=np.array([255, 71, 90]) / 255.) # c为颜色 215 | plt.ylabel('Loss') 216 | 217 | # 坐标系ax2画曲线2 218 | ax2 = fig.add_subplot(111, sharex=ax1, frameon=False) # 其本质就是添加坐标系,设置共享ax1的x轴,ax2背景透明 219 | ax2.plot(range(len(val_acc_history)), val_acc_history, 220 | c=np.array([79, 179, 255]) / 255.) 
221 | ax2.yaxis.tick_right() # 开启右边的y坐标 222 | 223 | ax2.yaxis.set_label_position("right") 224 | plt.ylabel('ValAcc') 225 | 226 | plt.xlabel('Epoch') 227 | plt.title('Training Loss & Validation Accuracy') 228 | plt.show() 229 | 230 | 231 | if __name__ == '__main__': 232 | # 数据预处理 233 | dataset = data_preprocess() 234 | 235 | # x、y规范化 236 | node_feature = (dataset.x - dataset.x.mean()) / dataset.x.std() 237 | tensor_x_all = numpy_to_tensor(node_feature).to(torch.float32) 238 | tensor_y_all = numpy_to_tensor(dataset.y) 239 | 240 | tensor_train_mask = numpy_to_tensor(dataset.train_mask) 241 | tensor_val_mask = numpy_to_tensor(dataset.val_mask) 242 | tensor_test_mask = numpy_to_tensor(dataset.test_mask) 243 | 244 | # 邻接矩阵规范化 245 | normed_adj = adj_norm(dataset.adjacency) 246 | 247 | indices = torch.from_numpy(np.asarray([normed_adj.row, 248 | normed_adj.col]).astype('int64')).long() 249 | values = torch.from_numpy(normed_adj.data.astype(np.float32)) 250 | 251 | tensor_adjacency_all = torch.sparse.FloatTensor(indices, values, 252 | (node_feature.shape[0], node_feature.shape[0])).to(cpu_type) 253 | 254 | # 训练模型并做预测 255 | gcn_model, loss_arr, test_accuracy_arr = model_train(tensor_x_all, tensor_y_all, tensor_adjacency_all, 256 | tensor_train_mask, 257 | tensor_val_mask, epochs=300, 258 | learning_rate=0.04, weight_decay=5e-4) 259 | y_predict = model_predict(gcn_model, tensor_x_all, tensor_adjacency_all, tensor_test_mask) 260 | test_acc = cal_accuracy(tensor_y_all[tensor_test_mask], y_predict) 261 | print(test_acc.item()) 262 | 263 | plot_loss_with_acc(loss_arr, test_accuracy_arr) 264 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /data/german_score.csv: -------------------------------------------------------------------------------- 1 | score,creditability 2 | 557.0,0 3 | 337.0,1 4 | 504.0,0 5 | 404.0,0 6 | 495.0,1 7 | 389.0,0 8 | 460.0,0 9 | 385.0,0 10 | 510.0,0 11 | 403.0,1 12 | 432.0,1 13 | 341.0,1 14 | 441.0,0 15 | 505.0,1 16 | 440.0,0 17 | 415.0,1 18 | 483.0,0 19 | 428.0,0 20 | 384.0,1 21 | 425.0,0 22 | 534.0,0 23 | 502.0,0 24 | 542.0,0 25 | 471.0,0 26 | 472.0,0 27 | 485.0,0 28 | 471.0,0 29 | 503.0,0 30 | 456.0,0 31 | 417.0,1 32 | 441.0,0 33 | 407.0,0 34 | 446.0,0 35 | 493.0,0 36 | 440.0,0 37 | 368.0,1 38 | 364.0,0 39 | 434.0,1 40 | 471.0,0 41 | 432.0,0 42 | 393.0,0 43 | 435.0,0 44 | 460.0,0 45 | 415.0,0 46 | 426.0,1 47 | 476.0,0 48 | 400.0,0 49 | 457.0,0 50 | 481.0,0 51 | 427.0,0 52 | 403.0,0 53 | 435.0,0 54 | 433.0,0 55 | 431.0,0 56 | 468.0,1 57 | 481.0,0 58 | 473.0,1 59 | 392.0,0 60 | 412.0,0 61 | 375.0,1 62 | 451.0,0 63 | 500.0,0 64 | 449.0,1 65 | 301.0,1 66 | 392.0,0 67 | 526.0,0 68 | 445.0,0 69 | 419.0,0 70 | 398.0,1 71 | 369.0,0 72 | 376.0,0 73 | 512.0,0 74 | 542.0,0 75 | 420.0,0 76 | 404.0,1 77 | 541.0,0 78 | 368.0,1 79 | 481.0,0 80 | 359.0,0 81 | 385.0,0 82 | 472.0,1 83 | 466.0,0 84 | 416.0,0 85 | 468.0,0 86 | 490.0,0 87 | 461.0,0 88 | 445.0,0 89 | 406.0,1 90 | 433.0,0 91 | 461.0,1 92 | 494.0,0 93 | 486.0,0 94 | 459.0,1 95 | 406.0,0 96 | 487.0,0 97 | 344.0,1 98 | 498.0,0 99 | 424.0,0 100 | 393.0,0 101 | 455.0,0 102 | 433.0,0 103 | 367.0,0 104 | 444.0,0 105 | 449.0,0 106 | 437.0,0 107 | 437.0,1 108 | 477.0,1 109 | 432.0,0 110 | 403.0,0 111 | 466.0,0 112 | 518.0,0 113 | 418.0,0 114 | 402.0,0 115 | 371.0,1 116 | 452.0,0 117 | 386.0,0 118 | 344.0,1 119 | 474.0,0 120 | 416.0,1 121 | 451.0,0 122 | 440.0,1 123 | 450.0,0 124 | 428.0,0 125 | 542.0,0 126 | 413.0,1 127 | 460.0,0 128 | 459.0,0 129 | 438.0,1 130 | 470.0,0 131 | 459.0,1 132 | 337.0,0 133 | 355.0,1 134 | 461.0,0 135 | 485.0,0 136 | 289.0,0 137 | 482.0,0 138 | 407.0,0 139 | 515.0,1 140 | 491.0,0 141 | 479.0,0 142 | 460.0,0 143 | 368.0,0 144 | 395.0,0 145 | 416.0,1 146 | 396.0,0 147 | 349.0,0 148 | 523.0,0 149 | 513.0,0 150 | 398.0,0 151 | 485.0,0 152 | 504.0,0 153 | 478.0,0 154 | 350.0,0 155 | 392.0,0 156 | 395.0,0 157 | 430.0,1 158 | 532.0,0 159 | 472.0,0 160 | 462.0,0 161 | 522.0,0 162 | 439.0,0 163 | 453.0,0 164 | 471.0,0 165 | 512.0,0 166 | 400.0,0 167 | 468.0,0 168 | 430.0,1 169 | 414.0,0 170 | 403.0,0 171 | 438.0,1 172 | 481.0,1 173 | 429.0,0 174 | 423.0,1 175 | 449.0,0 176 | 443.0,1 177 | 417.0,1 178 | 471.0,0 179 | 529.0,0 180 | 478.0,0 181 | 525.0,0 182 | 394.0,1 183 | 416.0,1 184 | 480.0,1 185 | 441.0,0 186 | 452.0,1 187 | 460.0,0 188 | 549.0,1 189 | 585.0,0 190 | 416.0,1 191 | 473.0,0 192 | 521.0,1 193 | 372.0,1 194 | 419.0,1 195 | 458.0,0 196 | 339.0,1 197 | 495.0,1 198 | 497.0,0 199 | 518.0,1 200 | 416.0,0 201 | 429.0,1 202 | 520.0,0 203 | 429.0,0 204 | 414.0,0 205 | 417.0,1 206 | 495.0,0 207 | 450.0,0 208 | 518.0,0 209 | 426.0,0 210 | 381.0,0 211 | 497.0,0 212 | 508.0,0 213 | 422.0,0 214 | 473.0,1 215 | 472.0,1 216 | 421.0,0 217 | 523.0,0 218 | 424.0,0 219 | 374.0,0 220 | 405.0,0 221 | 529.0,0 222 | 430.0,0 223 | 420.0,0 224 | 467.0,0 225 | 450.0,0 226 | 420.0,0 227 | 402.0,0 228 | 360.0,1 229 | 452.0,1 230 | 423.0,1 231 | 384.0,0 232 | 362.0,1 233 | 497.0,0 234 | 450.0,0 235 | 414.0,0 236 | 566.0,0 237 | 423.0,1 238 | 391.0,1 239 | 505.0,1 240 | 475.0,0 241 | 451.0,0 242 | 410.0,1 243 | 514.0,0 244 | 384.0,1 245 | 467.0,0 246 | 450.0,0 247 | 400.0,0 
248 | 544.0,0 249 | 458.0,0 250 | 415.0,0 251 | 418.0,1 252 | 507.0,0 253 | 490.0,0 254 | 381.0,1 255 | 453.0,0 256 | 441.0,0 257 | 324.0,0 258 | 443.0,0 259 | 428.0,1 260 | 426.0,0 261 | 554.0,0 262 | 443.0,0 263 | 463.0,0 264 | 491.0,0 265 | 561.0,0 266 | 496.0,0 267 | 461.0,1 268 | 403.0,0 269 | 426.0,0 270 | 450.0,1 271 | 430.0,0 272 | 420.0,0 273 | 458.0,0 274 | 319.0,0 275 | 377.0,1 276 | 378.0,1 277 | 478.0,0 278 | 454.0,0 279 | 493.0,0 280 | 460.0,1 281 | 405.0,0 282 | 441.0,0 283 | 477.0,0 284 | 463.0,0 285 | 499.0,0 286 | 412.0,0 287 | 352.0,0 288 | 350.0,0 289 | 348.0,0 290 | 506.0,0 291 | 450.0,1 292 | 444.0,0 293 | 366.0,1 294 | 476.0,0 295 | 412.0,0 296 | 408.0,0 297 | 334.0,1 298 | 421.0,0 299 | 471.0,0 300 | 453.0,0 301 | 451.0,0 302 | 527.0,0 303 | 399.0,1 304 | 469.0,1 305 | 511.0,0 306 | 364.0,1 307 | 452.0,0 308 | 384.0,0 309 | 444.0,1 310 | 439.0,1 311 | 429.0,0 312 | 370.0,0 313 | 378.0,0 314 | 406.0,0 315 | 444.0,1 316 | 488.0,0 317 | 380.0,1 318 | 487.0,0 319 | 459.0,0 320 | 471.0,0 321 | 465.0,0 322 | 408.0,1 323 | 411.0,1 324 | 427.0,0 325 | 411.0,0 326 | 466.0,0 327 | 506.0,0 328 | 470.0,0 329 | 433.0,0 330 | 372.0,0 331 | 459.0,0 332 | 527.0,0 333 | 449.0,1 334 | 293.0,1 335 | 342.0,1 336 | 434.0,1 337 | 492.0,1 338 | 430.0,0 339 | 416.0,1 340 | 391.0,0 341 | 440.0,0 342 | 421.0,0 343 | 421.0,0 344 | 427.0,0 345 | 438.0,0 346 | 481.0,0 347 | 456.0,0 348 | 449.0,0 349 | 408.0,0 350 | 508.0,0 351 | 490.0,1 352 | 451.0,0 353 | 465.0,1 354 | 447.0,0 355 | 435.0,1 356 | 476.0,0 357 | 395.0,1 358 | 496.0,0 359 | 390.0,1 360 | 433.0,0 361 | 377.0,1 362 | 490.0,0 363 | 456.0,0 364 | 489.0,0 365 | 450.0,0 366 | 406.0,1 367 | 463.0,0 368 | 441.0,0 369 | 418.0,0 370 | 416.0,1 371 | 452.0,0 372 | 389.0,0 373 | 443.0,0 374 | 474.0,0 375 | 362.0,0 376 | 371.0,1 377 | 361.0,1 378 | 472.0,0 379 | 469.0,0 380 | 379.0,1 381 | 529.0,0 382 | 433.0,0 383 | 390.0,1 384 | 403.0,0 385 | 426.0,0 386 | 420.0,0 387 | 453.0,0 388 | 404.0,0 389 | 443.0,0 390 | 434.0,0 391 | 471.0,0 392 | 437.0,0 393 | 438.0,0 394 | 427.0,0 395 | 471.0,0 396 | 454.0,0 397 | 348.0,0 398 | 491.0,0 399 | 452.0,0 400 | 524.0,1 401 | 466.0,0 402 | 449.0,0 403 | 444.0,0 404 | 413.0,1 405 | 456.0,0 406 | 458.0,0 407 | 414.0,1 408 | 478.0,0 409 | 423.0,0 410 | 400.0,0 411 | 491.0,1 412 | 385.0,0 413 | 410.0,0 414 | 485.0,1 415 | 482.0,0 416 | 421.0,1 417 | 435.0,0 418 | 450.0,1 419 | 426.0,0 420 | 436.0,0 421 | 430.0,1 422 | 414.0,0 423 | 431.0,0 424 | 534.0,0 425 | 453.0,0 426 | 442.0,1 427 | 421.0,0 428 | 422.0,0 429 | 487.0,0 430 | 444.0,0 431 | 567.0,1 432 | 558.0,0 433 | 404.0,1 434 | 516.0,0 435 | 442.0,0 436 | 434.0,0 437 | 440.0,1 438 | 452.0,0 439 | 455.0,0 440 | 468.0,0 441 | 430.0,1 442 | 451.0,0 443 | 450.0,0 444 | 455.0,0 445 | 476.0,1 446 | 358.0,1 447 | 494.0,0 448 | 391.0,1 449 | 466.0,0 450 | 496.0,0 451 | 532.0,1 452 | 337.0,0 453 | 457.0,0 454 | 482.0,0 455 | 414.0,0 456 | 411.0,1 457 | 401.0,0 458 | 498.0,0 459 | 443.0,1 460 | 445.0,0 461 | 419.0,0 462 | 429.0,0 463 | 456.0,0 464 | 443.0,0 465 | 484.0,0 466 | 457.0,0 467 | 497.0,0 468 | 406.0,1 469 | 388.0,0 470 | 422.0,0 471 | 442.0,0 472 | 392.0,1 473 | 436.0,1 474 | 440.0,1 475 | 475.0,0 476 | 429.0,1 477 | 430.0,1 478 | 360.0,0 479 | 380.0,0 480 | 464.0,0 481 | 502.0,0 482 | 433.0,0 483 | 402.0,0 484 | 474.0,0 485 | 468.0,0 486 | 466.0,0 487 | 484.0,1 488 | 490.0,0 489 | 467.0,0 490 | 457.0,0 491 | 444.0,0 492 | 469.0,0 493 | 441.0,1 494 | 495.0,0 495 | 469.0,0 496 | 503.0,0 497 | 397.0,1 498 | 345.0,1 499 | 460.0,0 500 | 427.0,0 501 | 
504.0,0 502 | 395.0,1 503 | 422.0,0 504 | 499.0,0 505 | 472.0,1 506 | 398.0,1 507 | 432.0,1 508 | 453.0,0 509 | 450.0,1 510 | 405.0,0 511 | 375.0,0 512 | 429.0,1 513 | 390.0,0 514 | 430.0,0 515 | 446.0,0 516 | 457.0,0 517 | 495.0,0 518 | 500.0,0 519 | 387.0,1 520 | 484.0,0 521 | 492.0,0 522 | 457.0,0 523 | 417.0,1 524 | 434.0,1 525 | 392.0,0 526 | 430.0,0 527 | 419.0,0 528 | 430.0,0 529 | 537.0,0 530 | 383.0,1 531 | 497.0,0 532 | 438.0,0 533 | 464.0,1 534 | 458.0,0 535 | 401.0,0 536 | 420.0,0 537 | 434.0,1 538 | 544.0,0 539 | 434.0,0 540 | 354.0,1 541 | 471.0,0 542 | 444.0,1 543 | 501.0,0 544 | 375.0,1 545 | 455.0,1 546 | 531.0,0 547 | 482.0,1 548 | 422.0,0 549 | 437.0,0 550 | 425.0,1 551 | 380.0,0 552 | 450.0,0 553 | 509.0,0 554 | 395.0,1 555 | 425.0,0 556 | 551.0,0 557 | 434.0,1 558 | 451.0,1 559 | 450.0,1 560 | 401.0,1 561 | 467.0,1 562 | 458.0,0 563 | 396.0,1 564 | 465.0,0 565 | 370.0,1 566 | 438.0,0 567 | 431.0,0 568 | 464.0,1 569 | 429.0,0 570 | 416.0,0 571 | 344.0,1 572 | 385.0,1 573 | 400.0,0 574 | 397.0,0 575 | 414.0,0 576 | 465.0,0 577 | 454.0,0 578 | 437.0,0 579 | 451.0,0 580 | 401.0,1 581 | 435.0,0 582 | 461.0,1 583 | 536.0,0 584 | 432.0,0 585 | 387.0,1 586 | 478.0,0 587 | 422.0,1 588 | 507.0,0 589 | 415.0,0 590 | 460.0,1 591 | 532.0,1 592 | 588.0,0 593 | 462.0,0 594 | 428.0,0 595 | 390.0,1 596 | 431.0,1 597 | 478.0,1 598 | 424.0,1 599 | 492.0,1 600 | 463.0,1 601 | 427.0,0 602 | 497.0,0 603 | 443.0,1 604 | 416.0,1 605 | 371.0,1 606 | 424.0,0 607 | 385.0,0 608 | 497.0,0 609 | 422.0,1 610 | 425.0,0 611 | 476.0,0 612 | 420.0,1 613 | 518.0,1 614 | 404.0,0 615 | 406.0,0 616 | 403.0,1 617 | 362.0,0 618 | 307.0,0 619 | 549.0,0 620 | 384.0,1 621 | 479.0,0 622 | 444.0,0 623 | 463.0,1 624 | 365.0,1 625 | 415.0,0 626 | 521.0,1 627 | 436.0,0 628 | 483.0,0 629 | 455.0,1 630 | 402.0,0 631 | 527.0,0 632 | 411.0,0 633 | 451.0,1 634 | 413.0,0 635 | 460.0,1 636 | 407.0,1 637 | 545.0,0 638 | 405.0,0 639 | 291.0,0 640 | 457.0,0 641 | 402.0,1 642 | 419.0,1 643 | 475.0,0 644 | 442.0,1 645 | 443.0,0 646 | 453.0,0 647 | 375.0,1 648 | 432.0,0 649 | 442.0,1 650 | 444.0,1 651 | 474.0,1 652 | 373.0,0 653 | 447.0,1 654 | 438.0,1 655 | 482.0,1 656 | 445.0,0 657 | 425.0,0 658 | 476.0,1 659 | 331.0,0 660 | 425.0,0 661 | 438.0,0 662 | 428.0,0 663 | 422.0,1 664 | 484.0,0 665 | 489.0,0 666 | 508.0,0 667 | 408.0,0 668 | 410.0,0 669 | 368.0,0 670 | 483.0,1 671 | 426.0,0 672 | 427.0,0 673 | 411.0,0 674 | 333.0,0 675 | 455.0,0 676 | 449.0,1 677 | 373.0,0 678 | 397.0,0 679 | 294.0,1 680 | 478.0,0 681 | 421.0,0 682 | 527.0,0 683 | 445.0,0 684 | 463.0,0 685 | 495.0,0 686 | 414.0,0 687 | 334.0,0 688 | 462.0,0 689 | 378.0,0 690 | 453.0,0 691 | 465.0,0 692 | 465.0,0 693 | 420.0,0 694 | 417.0,0 695 | 530.0,0 696 | 401.0,0 697 | 507.0,0 698 | 463.0,0 699 | 479.0,0 700 | 436.0,0 701 | 447.0,0 702 | 433.0,1 703 | 399.0,1 704 | 446.0,0 705 | 445.0,0 706 | 417.0,0 707 | 444.0,0 708 | 330.0,1 709 | 445.0,1 710 | 431.0,0 711 | 484.0,0 712 | 459.0,0 713 | 458.0,1 714 | 445.0,0 715 | 460.0,0 716 | 266.0,1 717 | 492.0,0 718 | 463.0,0 719 | 426.0,0 720 | 441.0,0 721 | 434.0,0 722 | 479.0,1 723 | 454.0,1 724 | 422.0,1 725 | 521.0,0 726 | 386.0,1 727 | 522.0,0 728 | 511.0,0 729 | 438.0,1 730 | 398.0,1 731 | 469.0,0 732 | 416.0,0 733 | 419.0,1 734 | 476.0,0 735 | 416.0,0 736 | 496.0,0 737 | 378.0,0 738 | 398.0,1 739 | 441.0,0 740 | 522.0,0 741 | 404.0,1 742 | 426.0,0 743 | 425.0,0 744 | 431.0,0 745 | 387.0,0 746 | 344.0,0 747 | 464.0,0 748 | 432.0,0 749 | 458.0,1 750 | 413.0,0 751 | 443.0,0 752 | 509.0,0 753 | 434.0,1 754 | 439.0,0 755 
| 394.0,0 756 | 532.0,1 757 | 414.0,1 758 | 641.0,0 759 | 486.0,1 760 | 428.0,0 761 | 478.0,1 762 | 498.0,0 763 | 435.0,1 764 | 417.0,0 765 | 359.0,1 766 | 427.0,0 767 | 495.0,0 768 | 408.0,1 769 | 458.0,0 770 | 502.0,0 771 | 550.0,0 772 | 411.0,0 773 | 379.0,1 774 | 444.0,0 775 | 482.0,0 776 | 587.0,0 777 | 399.0,1 778 | 419.0,0 779 | 401.0,0 780 | 410.0,0 781 | 506.0,0 782 | 396.0,1 783 | 472.0,0 784 | 453.0,0 785 | 422.0,1 786 | 482.0,0 787 | 430.0,0 788 | 437.0,0 789 | 415.0,0 790 | 379.0,1 791 | 347.0,1 792 | 468.0,1 793 | 432.0,0 794 | 495.0,0 795 | 456.0,0 796 | 406.0,0 797 | 435.0,0 798 | 477.0,1 799 | 464.0,0 800 | 494.0,0 801 | 451.0,0 802 | 505.0,0 803 | 496.0,0 804 | 445.0,0 805 | 476.0,0 806 | 416.0,0 807 | 349.0,1 808 | 450.0,0 809 | 600.0,0 810 | 412.0,0 811 | 464.0,1 812 | 445.0,0 813 | 455.0,0 814 | 360.0,1 815 | 412.0,1 816 | 397.0,1 817 | 423.0,0 818 | 541.0,0 819 | 487.0,0 820 | 358.0,0 821 | 414.0,1 822 | 467.0,0 823 | 422.0,0 824 | 434.0,1 825 | 512.0,0 826 | 459.0,0 827 | 444.0,0 828 | 499.0,1 829 | 482.0,1 830 | 395.0,1 831 | 366.0,0 832 | 479.0,0 833 | 408.0,1 834 | 358.0,1 835 | 434.0,0 836 | 429.0,1 837 | 504.0,1 838 | 417.0,0 839 | 480.0,0 840 | 502.0,0 841 | 467.0,0 842 | 364.0,1 843 | 441.0,0 844 | 404.0,1 845 | 452.0,0 846 | 522.0,0 847 | 430.0,0 848 | 523.0,1 849 | 406.0,0 850 | 512.0,0 851 | 496.0,1 852 | 449.0,1 853 | 462.0,0 854 | 468.0,0 855 | 470.0,1 856 | 431.0,0 857 | 416.0,0 858 | 463.0,0 859 | 414.0,0 860 | 422.0,1 861 | 461.0,0 862 | 410.0,0 863 | 414.0,1 864 | 415.0,1 865 | 436.0,0 866 | 440.0,1 867 | 419.0,0 868 | 411.0,0 869 | 467.0,0 870 | 408.0,0 871 | 425.0,0 872 | 421.0,0 873 | 551.0,0 874 | 430.0,0 875 | 418.0,0 876 | 453.0,0 877 | 488.0,0 878 | 441.0,0 879 | 370.0,0 880 | 442.0,1 881 | 430.0,0 882 | 403.0,0 883 | 426.0,0 884 | 436.0,0 885 | 528.0,0 886 | 433.0,1 887 | 487.0,1 888 | 455.0,0 889 | 291.0,1 890 | 397.0,0 891 | 444.0,0 892 | 529.0,0 893 | 490.0,0 894 | 477.0,0 895 | 409.0,0 896 | 450.0,0 897 | 373.0,0 898 | 405.0,0 899 | 452.0,0 900 | 456.0,0 901 | 451.0,1 902 | 465.0,1 903 | 484.0,0 904 | 411.0,0 905 | 450.0,0 906 | 423.0,0 907 | 446.0,0 908 | 408.0,0 909 | 384.0,0 910 | 506.0,0 911 | 463.0,0 912 | 408.0,0 913 | 399.0,1 914 | 395.0,0 915 | 453.0,0 916 | 403.0,1 917 | 305.0,1 918 | 475.0,0 919 | 486.0,1 920 | 436.0,1 921 | 419.0,1 922 | 445.0,0 923 | 319.0,0 924 | 432.0,1 925 | 455.0,0 926 | 453.0,1 927 | 502.0,1 928 | 404.0,0 929 | 409.0,1 930 | 455.0,0 931 | 507.0,0 932 | 395.0,0 933 | 422.0,1 934 | 479.0,0 935 | 509.0,0 936 | 419.0,0 937 | 424.0,1 938 | 448.0,1 939 | 443.0,0 940 | 337.0,1 941 | 478.0,0 942 | 488.0,0 943 | 463.0,0 944 | 414.0,0 945 | 567.0,0 946 | 461.0,0 947 | 380.0,0 948 | 415.0,1 949 | 444.0,0 950 | 464.0,1 951 | 447.0,1 952 | 515.0,0 953 | 414.0,1 954 | 400.0,1 955 | 358.0,1 956 | 430.0,0 957 | 498.0,0 958 | 456.0,0 959 | 532.0,0 960 | 429.0,1 961 | 401.0,0 962 | 491.0,0 963 | 502.0,0 964 | 424.0,0 965 | 453.0,1 966 | 442.0,0 967 | 387.0,0 968 | 412.0,1 969 | 469.0,0 970 | 358.0,0 971 | 515.0,0 972 | 410.0,0 973 | 448.0,0 974 | 463.0,1 975 | 319.0,1 976 | 397.0,0 977 | 477.0,0 978 | 540.0,0 979 | 472.0,0 980 | 485.0,1 981 | 435.0,1 982 | 420.0,1 983 | 358.0,1 984 | 429.0,0 985 | 374.0,1 986 | 452.0,0 987 | 446.0,0 988 | 401.0,0 989 | 522.0,0 990 | 382.0,0 991 | 476.0,0 992 | 500.0,0 993 | 451.0,0 994 | 450.0,0 995 | 372.0,0 996 | 473.0,0 997 | 443.0,0 998 | 407.0,0 999 | 455.0,0 1000 | 349.0,1 1001 | 352.0,0 1002 | -------------------------------------------------------------------------------- 
/data/text_data/stopwords.txt: -------------------------------------------------------------------------------- 1 | ! 2 | " 3 | # 4 | $ 5 | % 6 | & 7 | ' 8 | ( 9 | ) 10 | * 11 | + 12 | , 13 | - 14 | -- 15 | . 16 | .. 17 | ... 18 | ...... 19 | ................... 20 | ./ 21 | .一 22 | 记者 23 | 数 24 | 年 25 | 月 26 | 日 27 | 时 28 | 分 29 | 秒 30 | / 31 | // 32 | 0 33 | 1 34 | 2 35 | 3 36 | 4 37 | 5 38 | 6 39 | 7 40 | 8 41 | 9 42 | : 43 | :// 44 | :: 45 | ; 46 | < 47 | = 48 | > 49 | >> 50 | ? 51 | @ 52 | A 53 | Lex 54 | [ 55 | \ 56 | ] 57 | 【 58 | 】 59 | ^ 60 | _ 61 | ` 62 | exp 63 | sub 64 | sup 65 | | 66 | } 67 | ~ 68 | ~~~~ 69 | · 70 | × 71 | ××× 72 | Δ 73 | Ψ 74 | γ 75 | μ 76 | φ 77 | φ. 78 | В 79 | — 80 | —— 81 | ——— 82 | ‘ 83 | ’ 84 | ’‘ 85 | “ 86 | ” 87 | ”, 88 | … 89 | …… 90 | …………………………………………………③ 91 | ′∈ 92 | ′| 93 | ℃ 94 | Ⅲ 95 | ↑ 96 | → 97 | ∈[ 98 | ∪φ∈ 99 | ≈ 100 | ① 101 | ② 102 | ②c 103 | ③ 104 | ③] 105 | ④ 106 | ⑤ 107 | ⑥ 108 | ⑦ 109 | ⑧ 110 | ⑨ 111 | ⑩ 112 | ── 113 | ■ 114 | ▲ 115 |   116 | 、 117 | 。 118 | 〈 119 | 〉 120 | 《 121 | 》 122 | 》), 123 | 」 124 | 『 125 | 』 126 | 〔 127 | 〕 128 | 〕〔 129 | ㈧ 130 | 一 131 | 一. 132 | 一一 133 | 一下 134 | 一个 135 | 一些 136 | 一何 137 | 一切 138 | 一则 139 | 一则通过 140 | 一天 141 | 一定 142 | 一方面 143 | 一旦 144 | 一时 145 | 一来 146 | 一样 147 | 一次 148 | 一片 149 | 一番 150 | 一直 151 | 一致 152 | 一般 153 | 一起 154 | 一转眼 155 | 一边 156 | 一面 157 | 男子 158 | 女子 159 | 七 160 | 万一 161 | 三 162 | 三天两头 163 | 三番两次 164 | 三番五次 165 | 上 166 | 上下 167 | 上升 168 | 上去 169 | 上来 170 | 上述 171 | 上面 172 | 下 173 | 下列 174 | 下去 175 | 下来 176 | 下面 177 | 不 178 | 不一 179 | 不下 180 | 不久 181 | 不了 182 | 不亦乐乎 183 | 不仅 184 | 不仅...而且 185 | 不仅仅 186 | 不仅仅是 187 | 不会 188 | 不但 189 | 不但...而且 190 | 不光 191 | 不免 192 | 不再 193 | 不力 194 | 不单 195 | 不变 196 | 不只 197 | 不可 198 | 不可开交 199 | 不可抗拒 200 | 不同 201 | 不外 202 | 不外乎 203 | 不够 204 | 不大 205 | 不如 206 | 不妨 207 | 不定 208 | 不对 209 | 不少 210 | 不尽 211 | 不尽然 212 | 不巧 213 | 不已 214 | 不常 215 | 不得 216 | 不得不 217 | 不得了 218 | 不得已 219 | 不必 220 | 不怎么 221 | 不怕 222 | 不惟 223 | 不成 224 | 不拘 225 | 不择手段 226 | 不敢 227 | 不料 228 | 不断 229 | 不日 230 | 不时 231 | 不是 232 | 不曾 233 | 不止 234 | 不止一次 235 | 不比 236 | 不消 237 | 不满 238 | 不然 239 | 不然的话 240 | 不特 241 | 不独 242 | 不由得 243 | 不知不觉 244 | 不管 245 | 不管怎样 246 | 不经意 247 | 不胜 248 | 不能 249 | 不能不 250 | 不至于 251 | 不若 252 | 不要 253 | 不论 254 | 不起 255 | 不足 256 | 不过 257 | 不迭 258 | 不问 259 | 不限 260 | 与 261 | 与其 262 | 与其说 263 | 与否 264 | 与此同时 265 | 专门 266 | 且 267 | 且不说 268 | 且说 269 | 两者 270 | 严格 271 | 严重 272 | 个 273 | 个人 274 | 个别 275 | 中小 276 | 中间 277 | 丰富 278 | 串行 279 | 临 280 | 临到 281 | 为 282 | 为主 283 | 为了 284 | 为什么 285 | 为什麽 286 | 为何 287 | 为止 288 | 为此 289 | 为着 290 | 主张 291 | 主要 292 | 举凡 293 | 举行 294 | 乃 295 | 乃至 296 | 乃至于 297 | 么 298 | 之 299 | 之一 300 | 之前 301 | 之后 302 | 之後 303 | 之所以 304 | 之类 305 | 乌乎 306 | 乎 307 | 乒 308 | 乘 309 | 乘势 310 | 乘机 311 | 乘胜 312 | 乘虚 313 | 乘隙 314 | 九 315 | 也 316 | 也好 317 | 也就是说 318 | 也是 319 | 也罢 320 | 了 321 | 了解 322 | 争取 323 | 二 324 | 二来 325 | 二话不说 326 | 二话没说 327 | 于 328 | 于是 329 | 于是乎 330 | 云云 331 | 云尔 332 | 互 333 | 互相 334 | 五 335 | 些 336 | 交口 337 | 亦 338 | 产生 339 | 亲口 340 | 亲手 341 | 亲眼 342 | 亲自 343 | 亲身 344 | 人 345 | 人人 346 | 人们 347 | 人家 348 | 人民 349 | 什么 350 | 什么样 351 | 什麽 352 | 仅 353 | 仅仅 354 | 今 355 | 今后 356 | 今天 357 | 今年 358 | 今後 359 | 介于 360 | 仍 361 | 仍旧 362 | 仍然 363 | 从 364 | 从不 365 | 从严 366 | 从中 367 | 从事 368 | 从今以后 369 | 从优 370 | 从古到今 371 | 从古至今 372 | 从头 373 | 从宽 374 | 从小 375 | 从新 376 | 从无到有 377 | 从早到晚 378 | 从未 379 | 从来 380 | 从此 381 | 从此以后 382 | 从而 383 | 从轻 384 | 从速 385 | 从重 386 | 他 387 | 他人 388 | 他们 389 | 他是 390 | 他的 391 | 代替 392 | 以 393 | 以上 394 | 以下 395 
| 以为 396 | 以便 397 | 以免 398 | 以前 399 | 以及 400 | 以后 401 | 以外 402 | 以後 403 | 以故 404 | 以期 405 | 以来 406 | 以至 407 | 以至于 408 | 以致 409 | 们 410 | 任 411 | 任何 412 | 任凭 413 | 任务 414 | 企图 415 | 伙同 416 | 会 417 | 伟大 418 | 传 419 | 传说 420 | 传闻 421 | 似乎 422 | 似的 423 | 但 424 | 但凡 425 | 但愿 426 | 但是 427 | 何 428 | 何乐而不为 429 | 何以 430 | 何况 431 | 何处 432 | 何妨 433 | 何尝 434 | 何必 435 | 何时 436 | 何止 437 | 何苦 438 | 何须 439 | 余外 440 | 作为 441 | 你 442 | 你们 443 | 你是 444 | 你的 445 | 使 446 | 使得 447 | 使用 448 | 例如 449 | 依 450 | 依据 451 | 依照 452 | 依靠 453 | 便 454 | 便于 455 | 促进 456 | 保持 457 | 保管 458 | 保险 459 | 俺 460 | 俺们 461 | 倍加 462 | 倍感 463 | 倒不如 464 | 倒不如说 465 | 倒是 466 | 倘 467 | 倘使 468 | 倘或 469 | 倘然 470 | 倘若 471 | 借 472 | 借以 473 | 借此 474 | 假使 475 | 假如 476 | 假若 477 | 偏偏 478 | 做到 479 | 偶尔 480 | 偶而 481 | 傥然 482 | 像 483 | 儿 484 | 允许 485 | 元/吨 486 | 充其极 487 | 充其量 488 | 充分 489 | 先不先 490 | 先后 491 | 先後 492 | 先生 493 | 光 494 | 光是 495 | 全体 496 | 全力 497 | 全年 498 | 全然 499 | 全身心 500 | 全部 501 | 全都 502 | 全面 503 | 八 504 | 八成 505 | 公然 506 | 六 507 | 兮 508 | 共 509 | 共同 510 | 共总 511 | 关于 512 | 其 513 | 其一 514 | 其中 515 | 其二 516 | 其他 517 | 其余 518 | 其后 519 | 其它 520 | 其实 521 | 其次 522 | 具体 523 | 具体地说 524 | 具体来说 525 | 具体说来 526 | 具有 527 | 兼之 528 | 内 529 | 再 530 | 再其次 531 | 再则 532 | 再有 533 | 再次 534 | 再者 535 | 再者说 536 | 再说 537 | 冒 538 | 冲 539 | 决不 540 | 决定 541 | 决非 542 | 况且 543 | 准备 544 | 凑巧 545 | 凝神 546 | 几 547 | 几乎 548 | 几度 549 | 几时 550 | 几番 551 | 几经 552 | 凡 553 | 凡是 554 | 凭 555 | 凭借 556 | 出 557 | 出于 558 | 出去 559 | 出来 560 | 出现 561 | 分别 562 | 分头 563 | 分期 564 | 分期分批 565 | 切 566 | 切不可 567 | 切切 568 | 切勿 569 | 切莫 570 | 则 571 | 则甚 572 | 刚 573 | 刚好 574 | 刚巧 575 | 刚才 576 | 初 577 | 别 578 | 别人 579 | 别处 580 | 别是 581 | 别的 582 | 别管 583 | 别说 584 | 到 585 | 到了儿 586 | 到处 587 | 到头 588 | 到头来 589 | 到底 590 | 到目前为止 591 | 前后 592 | 前此 593 | 前者 594 | 前进 595 | 前面 596 | 加上 597 | 加之 598 | 加以 599 | 加入 600 | 加强 601 | 动不动 602 | 动辄 603 | 勃然 604 | 匆匆 605 | 十分 606 | 千 607 | 千万 608 | 千万千万 609 | 半 610 | 单 611 | 单单 612 | 单纯 613 | 即 614 | 即令 615 | 即使 616 | 即便 617 | 即刻 618 | 即如 619 | 即将 620 | 即或 621 | 即是说 622 | 即若 623 | 却 624 | 却不 625 | 历 626 | 原来 627 | 去 628 | 又 629 | 又及 630 | 及 631 | 及其 632 | 及时 633 | 及至 634 | 双方 635 | 反之 636 | 反之亦然 637 | 反之则 638 | 反倒 639 | 反倒是 640 | 反应 641 | 反手 642 | 反映 643 | 反而 644 | 反过来 645 | 反过来说 646 | 取得 647 | 取道 648 | 受到 649 | 变成 650 | 古来 651 | 另 652 | 另一个 653 | 另一方面 654 | 另外 655 | 另悉 656 | 另方面 657 | 另行 658 | 只 659 | 只当 660 | 只怕 661 | 只是 662 | 只有 663 | 只消 664 | 只要 665 | 只限 666 | 叫 667 | 叫做 668 | 召开 669 | 叮咚 670 | 叮当 671 | 可 672 | 可以 673 | 可好 674 | 可是 675 | 可能 676 | 可见 677 | 各 678 | 各个 679 | 各人 680 | 各位 681 | 各地 682 | 各式 683 | 各种 684 | 各级 685 | 各自 686 | 合理 687 | 同 688 | 同一 689 | 同时 690 | 同样 691 | 后 692 | 后来 693 | 后者 694 | 后面 695 | 向 696 | 向使 697 | 向着 698 | 吓 699 | 吗 700 | 否则 701 | 吧 702 | 吧哒 703 | 吱 704 | 呀 705 | 呃 706 | 呆呆地 707 | 呐 708 | 呕 709 | 呗 710 | 呜 711 | 呜呼 712 | 呢 713 | 周围 714 | 呵 715 | 呵呵 716 | 呸 717 | 呼哧 718 | 呼啦 719 | 咋 720 | 和 721 | 咚 722 | 咦 723 | 咧 724 | 咱 725 | 咱们 726 | 咳 727 | 哇 728 | 哈 729 | 哈哈 730 | 哉 731 | 哎 732 | 哎呀 733 | 哎哟 734 | 哗 735 | 哗啦 736 | 哟 737 | 哦 738 | 哩 739 | 哪 740 | 哪个 741 | 哪些 742 | 哪儿 743 | 哪天 744 | 哪年 745 | 哪怕 746 | 哪样 747 | 哪边 748 | 哪里 749 | 哼 750 | 哼唷 751 | 唉 752 | 唯有 753 | 啊 754 | 啊呀 755 | 啊哈 756 | 啊哟 757 | 啐 758 | 啥 759 | 啦 760 | 啪达 761 | 啷当 762 | 喀 763 | 喂 764 | 喏 765 | 喔唷 766 | 喽 767 | 嗡 768 | 嗡嗡 769 | 嗬 770 | 嗯 771 | 嗳 772 | 嘎 773 | 嘎嘎 774 | 嘎登 775 | 嘘 776 | 嘛 777 | 嘻 778 | 嘿 779 | 嘿嘿 780 | 四 781 | 因 782 | 因为 783 | 因了 784 | 因此 785 | 因着 786 | 因而 787 | 固 788 | 固然 789 | 在 790 | 在下 791 | 在于 792 | 地 793 | 均 794 | 坚决 795 | 坚持 796 | 基于 
797 | 基本 798 | 基本上 799 | 处在 800 | 处处 801 | 处理 802 | 复杂 803 | 多 804 | 多么 805 | 多亏 806 | 多多 807 | 多多少少 808 | 多多益善 809 | 多少 810 | 多年前 811 | 多年来 812 | 多数 813 | 多次 814 | 够瞧的 815 | 大 816 | 大不了 817 | 大举 818 | 大事 819 | 大体 820 | 大体上 821 | 大凡 822 | 大力 823 | 大多 824 | 大多数 825 | 大大 826 | 大家 827 | 大张旗鼓 828 | 大批 829 | 大抵 830 | 大概 831 | 大略 832 | 大约 833 | 大致 834 | 大都 835 | 大量 836 | 大面儿上 837 | 失去 838 | 奇 839 | 奈 840 | 奋勇 841 | 她 842 | 她们 843 | 她是 844 | 她的 845 | 好 846 | 好在 847 | 好的 848 | 好象 849 | 如 850 | 如上 851 | 如上所述 852 | 如下 853 | 如今 854 | 如何 855 | 如其 856 | 如前所述 857 | 如同 858 | 如常 859 | 如是 860 | 如期 861 | 如果 862 | 如次 863 | 如此 864 | 如此等等 865 | 如若 866 | 始而 867 | 姑且 868 | 存在 869 | 存心 870 | 孰料 871 | 孰知 872 | 宁 873 | 宁可 874 | 宁愿 875 | 宁肯 876 | 它 877 | 它们 878 | 它们的 879 | 它是 880 | 它的 881 | 安全 882 | 完全 883 | 完成 884 | 定 885 | 实现 886 | 实际 887 | 宣布 888 | 容易 889 | 密切 890 | 对 891 | 对于 892 | 对应 893 | 对待 894 | 对方 895 | 对比 896 | 将 897 | 将才 898 | 将要 899 | 将近 900 | 小 901 | 少数 902 | 尔 903 | 尔后 904 | 尔尔 905 | 尔等 906 | 尚且 907 | 尤其 908 | 就 909 | 就地 910 | 就是 911 | 就是了 912 | 就是说 913 | 就此 914 | 就算 915 | 就要 916 | 尽 917 | 尽可能 918 | 尽如人意 919 | 尽心尽力 920 | 尽心竭力 921 | 尽快 922 | 尽早 923 | 尽然 924 | 尽管 925 | 尽管如此 926 | 尽量 927 | 局外 928 | 居然 929 | 届时 930 | 属于 931 | 屡 932 | 屡屡 933 | 屡次 934 | 屡次三番 935 | 岂 936 | 岂但 937 | 岂止 938 | 岂非 939 | 川流不息 940 | 左右 941 | 巨大 942 | 巩固 943 | 差一点 944 | 差不多 945 | 己 946 | 已 947 | 已矣 948 | 已经 949 | 巴 950 | 巴巴 951 | 带 952 | 帮助 953 | 常 954 | 常常 955 | 常言说 956 | 常言说得好 957 | 常言道 958 | 平素 959 | 年复一年 960 | 并 961 | 并不 962 | 并不是 963 | 并且 964 | 并排 965 | 并无 966 | 并没 967 | 并没有 968 | 并肩 969 | 并非 970 | 广大 971 | 广泛 972 | 应当 973 | 应用 974 | 应该 975 | 庶乎 976 | 庶几 977 | 开外 978 | 开始 979 | 开展 980 | 引起 981 | 弗 982 | 弹指之间 983 | 强烈 984 | 强调 985 | 归 986 | 归根到底 987 | 归根结底 988 | 归齐 989 | 当 990 | 当下 991 | 当中 992 | 当儿 993 | 当前 994 | 当即 995 | 当口儿 996 | 当地 997 | 当场 998 | 当头 999 | 当庭 1000 | 当时 1001 | 当然 1002 | 当真 1003 | 当着 1004 | 形成 1005 | 彻夜 1006 | 彻底 1007 | 彼 1008 | 彼时 1009 | 彼此 1010 | 往 1011 | 往往 1012 | 待 1013 | 待到 1014 | 很 1015 | 很多 1016 | 很少 1017 | 後来 1018 | 後面 1019 | 得 1020 | 得了 1021 | 得出 1022 | 得到 1023 | 得天独厚 1024 | 得起 1025 | 心里 1026 | 必 1027 | 必定 1028 | 必将 1029 | 必然 1030 | 必要 1031 | 必须 1032 | 快 1033 | 快要 1034 | 忽地 1035 | 忽然 1036 | 怎 1037 | 怎么 1038 | 怎么办 1039 | 怎么样 1040 | 怎奈 1041 | 怎样 1042 | 怎麽 1043 | 怕 1044 | 急匆匆 1045 | 怪 1046 | 怪不得 1047 | 总之 1048 | 总是 1049 | 总的来看 1050 | 总的来说 1051 | 总的说来 1052 | 总结 1053 | 总而言之 1054 | 恍然 1055 | 恐怕 1056 | 恰似 1057 | 恰好 1058 | 恰如 1059 | 恰巧 1060 | 恰恰 1061 | 恰恰相反 1062 | 恰逢 1063 | 您 1064 | 您们 1065 | 您是 1066 | 惟其 1067 | 惯常 1068 | 意思 1069 | 愤然 1070 | 愿意 1071 | 慢说 1072 | 成为 1073 | 成年 1074 | 成年累月 1075 | 成心 1076 | 我 1077 | 我们 1078 | 我是 1079 | 我的 1080 | 或 1081 | 或则 1082 | 或多或少 1083 | 或是 1084 | 或曰 1085 | 或者 1086 | 或许 1087 | 战斗 1088 | 截然 1089 | 截至 1090 | 所 1091 | 所以 1092 | 所在 1093 | 所幸 1094 | 所有 1095 | 所谓 1096 | 才 1097 | 才能 1098 | 扑通 1099 | 打 1100 | 打从 1101 | 打开天窗说亮话 1102 | 扩大 1103 | 把 1104 | 抑或 1105 | 抽冷子 1106 | 拦腰 1107 | 拿 1108 | 按 1109 | 按时 1110 | 按期 1111 | 按照 1112 | 按理 1113 | 按说 1114 | 挨个 1115 | 挨家挨户 1116 | 挨次 1117 | 挨着 1118 | 挨门挨户 1119 | 挨门逐户 1120 | 换句话说 1121 | 换言之 1122 | 据 1123 | 据实 1124 | 据悉 1125 | 据我所知 1126 | 据此 1127 | 据称 1128 | 据说 1129 | 掌握 1130 | 接下来 1131 | 接着 1132 | 接著 1133 | 接连不断 1134 | 放量 1135 | 故 1136 | 故意 1137 | 故此 1138 | 故而 1139 | 敞开儿 1140 | 敢 1141 | 敢于 1142 | 敢情 1143 | 数/ 1144 | 整个 1145 | 断然 1146 | 方 1147 | 方便 1148 | 方才 1149 | 方能 1150 | 方面 1151 | 旁人 1152 | 无 1153 | 无宁 1154 | 无法 1155 | 无论 1156 | 既 1157 | 既...又 1158 | 既往 1159 | 既是 1160 | 既然 1161 | 日复一日 1162 | 日渐 1163 | 日益 1164 | 日臻 1165 | 日见 1166 | 时候 1167 | 
昂然 1168 | 明显 1169 | 明确 1170 | 是 1171 | 是不是 1172 | 是以 1173 | 是否 1174 | 是的 1175 | 显然 1176 | 显著 1177 | 普通 1178 | 普遍 1179 | 暗中 1180 | 暗地里 1181 | 暗自 1182 | 更 1183 | 更为 1184 | 更加 1185 | 更进一步 1186 | 曾 1187 | 曾经 1188 | 替 1189 | 替代 1190 | 最 1191 | 最后 1192 | 最大 1193 | 最好 1194 | 最後 1195 | 最近 1196 | 最高 1197 | 有 1198 | 有些 1199 | 有关 1200 | 有利 1201 | 有力 1202 | 有及 1203 | 有所 1204 | 有效 1205 | 有时 1206 | 有点 1207 | 有的 1208 | 有的是 1209 | 有着 1210 | 有著 1211 | 望 1212 | 朝 1213 | 朝着 1214 | 末##末 1215 | 本 1216 | 本人 1217 | 本地 1218 | 本着 1219 | 本身 1220 | 权时 1221 | 来 1222 | 来不及 1223 | 来得及 1224 | 来看 1225 | 来着 1226 | 来自 1227 | 来讲 1228 | 来说 1229 | 极 1230 | 极为 1231 | 极了 1232 | 极其 1233 | 极力 1234 | 极大 1235 | 极度 1236 | 极端 1237 | 构成 1238 | 果然 1239 | 果真 1240 | 某 1241 | 某个 1242 | 某些 1243 | 某某 1244 | 根据 1245 | 根本 1246 | 格外 1247 | 梆 1248 | 概 1249 | 次第 1250 | 欢迎 1251 | 欤 1252 | 正值 1253 | 正在 1254 | 正如 1255 | 正巧 1256 | 正常 1257 | 正是 1258 | 此 1259 | 此中 1260 | 此后 1261 | 此地 1262 | 此处 1263 | 此外 1264 | 此时 1265 | 此次 1266 | 此间 1267 | 殆 1268 | 毋宁 1269 | 每 1270 | 每个 1271 | 每天 1272 | 每年 1273 | 每当 1274 | 每时每刻 1275 | 每每 1276 | 每逢 1277 | 比 1278 | 比及 1279 | 比如 1280 | 比如说 1281 | 比方 1282 | 比照 1283 | 比起 1284 | 比较 1285 | 毕竟 1286 | 毫不 1287 | 毫无 1288 | 毫无例外 1289 | 毫无保留地 1290 | 汝 1291 | 沙沙 1292 | 没 1293 | 没奈何 1294 | 没有 1295 | 沿 1296 | 沿着 1297 | 注意 1298 | 活 1299 | 深入 1300 | 清楚 1301 | 满 1302 | 满足 1303 | 漫说 1304 | 焉 1305 | 然 1306 | 然则 1307 | 然后 1308 | 然後 1309 | 然而 1310 | 照 1311 | 照着 1312 | 牢牢 1313 | 特别是 1314 | 特殊 1315 | 特点 1316 | 犹且 1317 | 犹自 1318 | 独 1319 | 独自 1320 | 猛然 1321 | 猛然间 1322 | 率尔 1323 | 率然 1324 | 现代 1325 | 现在 1326 | 理应 1327 | 理当 1328 | 理该 1329 | 瑟瑟 1330 | 甚且 1331 | 甚么 1332 | 甚或 1333 | 甚而 1334 | 甚至 1335 | 甚至于 1336 | 用 1337 | 用来 1338 | 甫 1339 | 甭 1340 | 由 1341 | 由于 1342 | 由是 1343 | 由此 1344 | 由此可见 1345 | 略 1346 | 略为 1347 | 略加 1348 | 略微 1349 | 白 1350 | 白白 1351 | 的 1352 | 的确 1353 | 的话 1354 | 皆可 1355 | 目前 1356 | 直到 1357 | 直接 1358 | 相似 1359 | 相信 1360 | 相反 1361 | 相同 1362 | 相对 1363 | 相对而言 1364 | 相应 1365 | 相当 1366 | 相等 1367 | 省得 1368 | 看 1369 | 看上去 1370 | 看出 1371 | 看到 1372 | 看来 1373 | 看样子 1374 | 看看 1375 | 看见 1376 | 看起来 1377 | 真是 1378 | 真正 1379 | 眨眼 1380 | 着 1381 | 着呢 1382 | 矣 1383 | 矣乎 1384 | 矣哉 1385 | 知道 1386 | 砰 1387 | 确定 1388 | 碰巧 1389 | 社会主义 1390 | 离 1391 | 种 1392 | 积极 1393 | 移动 1394 | 究竟 1395 | 穷年累月 1396 | 突出 1397 | 突然 1398 | 窃 1399 | 立 1400 | 立刻 1401 | 立即 1402 | 立地 1403 | 立时 1404 | 立马 1405 | 竟 1406 | 竟然 1407 | 竟而 1408 | 第 1409 | 第二 1410 | 等 1411 | 等到 1412 | 等等 1413 | 策略地 1414 | 简直 1415 | 简而言之 1416 | 简言之 1417 | 管 1418 | 类如 1419 | 粗 1420 | 精光 1421 | 紧接着 1422 | 累年 1423 | 累次 1424 | 纯 1425 | 纯粹 1426 | 纵 1427 | 纵令 1428 | 纵使 1429 | 纵然 1430 | 练习 1431 | 组成 1432 | 经 1433 | 经常 1434 | 经过 1435 | 结合 1436 | 结果 1437 | 给 1438 | 绝 1439 | 绝不 1440 | 绝对 1441 | 绝非 1442 | 绝顶 1443 | 继之 1444 | 继后 1445 | 继续 1446 | 继而 1447 | 维持 1448 | 综上所述 1449 | 缕缕 1450 | 罢了 1451 | 老 1452 | 老大 1453 | 老是 1454 | 老老实实 1455 | 考虑 1456 | 者 1457 | 而 1458 | 而且 1459 | 而况 1460 | 而又 1461 | 而后 1462 | 而外 1463 | 而已 1464 | 而是 1465 | 而言 1466 | 而论 1467 | 联系 1468 | 联袂 1469 | 背地里 1470 | 背靠背 1471 | 能 1472 | 能否 1473 | 能够 1474 | 腾 1475 | 自 1476 | 自个儿 1477 | 自从 1478 | 自各儿 1479 | 自后 1480 | 自家 1481 | 自己 1482 | 自打 1483 | 自身 1484 | 臭 1485 | 至 1486 | 至于 1487 | 至今 1488 | 至若 1489 | 致 1490 | 般的 1491 | 良好 1492 | 若 1493 | 若夫 1494 | 若是 1495 | 若果 1496 | 若非 1497 | 范围 1498 | 莫 1499 | 莫不 1500 | 莫不然 1501 | 莫如 1502 | 莫若 1503 | 莫非 1504 | 获得 1505 | 藉以 1506 | 虽 1507 | 虽则 1508 | 虽然 1509 | 虽说 1510 | 蛮 1511 | 行为 1512 | 行动 1513 | 表明 1514 | 表示 1515 | 被 1516 | 要 1517 | 要不 1518 | 要不是 1519 | 要不然 1520 | 要么 1521 | 要是 1522 | 要求 1523 | 见 1524 | 规定 
1525 | 觉得 1526 | 譬喻 1527 | 譬如 1528 | 认为 1529 | 认真 1530 | 认识 1531 | 让 1532 | 许多 1533 | 论 1534 | 论说 1535 | 设使 1536 | 设或 1537 | 设若 1538 | 诚如 1539 | 诚然 1540 | 话说 1541 | 该 1542 | 该当 1543 | 说明 1544 | 说来 1545 | 说说 1546 | 请勿 1547 | 诸 1548 | 诸位 1549 | 诸如 1550 | 谁 1551 | 谁人 1552 | 谁料 1553 | 谁知 1554 | 谨 1555 | 豁然 1556 | 贼死 1557 | 赖以 1558 | 赶 1559 | 赶快 1560 | 赶早不赶晚 1561 | 起 1562 | 起先 1563 | 起初 1564 | 起头 1565 | 起来 1566 | 起见 1567 | 起首 1568 | 趁 1569 | 趁便 1570 | 趁势 1571 | 趁早 1572 | 趁机 1573 | 趁热 1574 | 趁着 1575 | 越是 1576 | 距 1577 | 跟 1578 | 路经 1579 | 转动 1580 | 转变 1581 | 转贴 1582 | 轰然 1583 | 较 1584 | 较为 1585 | 较之 1586 | 较比 1587 | 边 1588 | 达到 1589 | 达旦 1590 | 迄 1591 | 迅速 1592 | 过 1593 | 过于 1594 | 过去 1595 | 过来 1596 | 运用 1597 | 近 1598 | 近几年来 1599 | 近年来 1600 | 近来 1601 | 还 1602 | 还是 1603 | 还有 1604 | 还要 1605 | 这 1606 | 这一来 1607 | 这个 1608 | 这么 1609 | 这么些 1610 | 这么样 1611 | 这么点儿 1612 | 这些 1613 | 这会儿 1614 | 这儿 1615 | 这就是说 1616 | 这时 1617 | 这样 1618 | 这次 1619 | 这点 1620 | 这种 1621 | 这般 1622 | 这边 1623 | 这里 1624 | 这麽 1625 | 进入 1626 | 进去 1627 | 进来 1628 | 进步 1629 | 进而 1630 | 进行 1631 | 连 1632 | 连同 1633 | 连声 1634 | 连日 1635 | 连日来 1636 | 连袂 1637 | 连连 1638 | 迟早 1639 | 迫于 1640 | 适应 1641 | 适当 1642 | 适用 1643 | 逐步 1644 | 逐渐 1645 | 通常 1646 | 通过 1647 | 造成 1648 | 逢 1649 | 遇到 1650 | 遭到 1651 | 遵循 1652 | 遵照 1653 | 避免 1654 | 那 1655 | 那个 1656 | 那么 1657 | 那么些 1658 | 那么样 1659 | 那些 1660 | 那会儿 1661 | 那儿 1662 | 那时 1663 | 那末 1664 | 那样 1665 | 那般 1666 | 那边 1667 | 那里 1668 | 那麽 1669 | 部分 1670 | 都 1671 | 鄙人 1672 | 采取 1673 | 里面 1674 | 重大 1675 | 重新 1676 | 重要 1677 | 鉴于 1678 | 针对 1679 | 长期以来 1680 | 长此下去 1681 | 长线 1682 | 长话短说 1683 | 问题 1684 | 间或 1685 | 防止 1686 | 阿 1687 | 附近 1688 | 陈年 1689 | 限制 1690 | 陡然 1691 | 除 1692 | 除了 1693 | 除却 1694 | 除去 1695 | 除外 1696 | 除开 1697 | 除此 1698 | 除此之外 1699 | 除此以外 1700 | 除此而外 1701 | 除非 1702 | 随 1703 | 随后 1704 | 随时 1705 | 随着 1706 | 随著 1707 | 隔夜 1708 | 隔日 1709 | 难得 1710 | 难怪 1711 | 难说 1712 | 难道 1713 | 难道说 1714 | 集中 1715 | 零 1716 | 需要 1717 | 非但 1718 | 非常 1719 | 非徒 1720 | 非得 1721 | 非特 1722 | 非独 1723 | 靠 1724 | 顶多 1725 | 顷 1726 | 顷刻 1727 | 顷刻之间 1728 | 顷刻间 1729 | 顺 1730 | 顺着 1731 | 顿时 1732 | 颇 1733 | 风雨无阻 1734 | 饱 1735 | 首先 1736 | 马上 1737 | 高低 1738 | 高兴 1739 | 默然 1740 | 默默地 1741 | 齐 1742 | ︿ 1743 | ! 1744 | # 1745 | $ 1746 | % 1747 | & 1748 | ' 1749 | ( 1750 | ) 1751 | )÷(1- 1752 | )、 1753 | * 1754 | + 1755 | +ξ 1756 | ++ 1757 | , 1758 | ,也 1759 | - 1760 | -β 1761 | -- 1762 | -[*]- 1763 | . 1764 | / 1765 | 0 1766 | 0:2 1767 | 1 1768 | 1. 1769 | 12% 1770 | 2 1771 | 2.3% 1772 | 3 1773 | 4 1774 | 5 1775 | 5:0 1776 | 6 1777 | 7 1778 | 8 1779 | 9 1780 | : 1781 | ; 1782 | < 1783 | <± 1784 | <Δ 1785 | <λ 1786 | <φ 1787 | << 1788 | = 1789 | =″ 1790 | =☆ 1791 | =( 1792 | =- 1793 | =[ 1794 | ={ 1795 | > 1796 | >λ 1797 | ? 1798 | @ 1799 | A 1800 | LI 1801 | R.L. 
1802 | ZXFITL 1803 | 1804 | [*] 1805 | [- 1806 | [] 1807 | ] 1808 | ]∧′=[ 1809 | ][ 1810 | _ 1811 | a] 1812 | b] 1813 | c] 1814 | e] 1815 | f] 1816 | ng昉 1817 | { 1818 | {- 1819 | | 1820 | } 1821 | }> 1822 | ~ 1823 | ~± 1824 | ~+ 1825 | ¥ 1826 | secondly 1827 | all 1828 | whose 1829 | under 1830 | sorry 1831 | four 1832 | we'll 1833 | somewhere 1834 | likely 1835 | even 1836 | above 1837 | ever 1838 | never 1839 | ZZ 1840 | hers 1841 | i'd 1842 | howbeit 1843 | i'm 1844 | theres 1845 | changes 1846 | anyhow 1847 | would 1848 | therefore 1849 | is 1850 | hereby 1851 | must 1852 | me 1853 | my 1854 | indicated 1855 | indicates 1856 | keep 1857 | far 1858 | after 1859 | hereupon 1860 | keeps 1861 | every 1862 | over 1863 | before 1864 | better 1865 | then 1866 | them 1867 | they 1868 | reasonably 1869 | each 1870 | went 1871 | mean 1872 | we'd 1873 | rd 1874 | re 1875 | got 1876 | forth 1877 | you're 1878 | little 1879 | whereupon 1880 | uses 1881 | already 1882 | another 1883 | took 1884 | second 1885 | seen 1886 | seem 1887 | relatively 1888 | thoroughly 1889 | latter 1890 | that 1891 | thorough 1892 | nobody 1893 | definitely 1894 | came 1895 | saying 1896 | specify 1897 | do 1898 | next 1899 | despite 1900 | unfortunately 1901 | twice 1902 | best 1903 | said 1904 | away 1905 | there's 1906 | unto 1907 | hopefully 1908 | seven 1909 | we 1910 | ltd 1911 | here 1912 | against 1913 | com 1914 | ZT 1915 | aren't 1916 | been 1917 | much 1918 | concerning 1919 | wish 1920 | say 1921 | near 1922 | unlikely 1923 | cant 1924 | in 1925 | ie 1926 | if 1927 | containing 1928 | beside 1929 | several 1930 | kept 1931 | whereby 1932 | whoever 1933 | the 1934 | yours 1935 | just 1936 | yes 1937 | yet 1938 | had 1939 | has 1940 | t's 1941 | possible 1942 | apart 1943 | right 1944 | old 1945 | somehow 1946 | for 1947 | everything 1948 | asking 1949 | who 1950 | of 1951 | theirs 1952 | plus 1953 | formerly 1954 | down 1955 | c's 1956 | accordingly 1957 | way 1958 | was 1959 | becoming 1960 | tell 1961 | sometime 1962 | no 1963 | whereas 1964 | nd 1965 | welcome 1966 | let's 1967 | certainly 1968 | a's 1969 | did 1970 | it'll 1971 | says 1972 | appear 1973 | alone 1974 | wherever 1975 | example 1976 | usually 1977 | nowhere 1978 | hither 1979 | regardless 1980 | everybody 1981 | thru 1982 | everywhere 1983 | can 1984 | following 1985 | want 1986 | didn't 1987 | may 1988 | such 1989 | whenever 1990 | maybe 1991 | ones 1992 | so 1993 | seeing 1994 | indeed 1995 | course 1996 | still 1997 | thank 1998 | he's 1999 | selves 2000 | ours 2001 | outside 2002 | non 2003 | within 2004 | thereby 2005 | not 2006 | now 2007 | nor 2008 | entirely 2009 | eg 2010 | ex 2011 | et 2012 | hadn't 2013 | furthermore 2014 | looking 2015 | seriously 2016 | shouldn't 2017 | she 2018 | quite 2019 | besides 2020 | think 2021 | first 2022 | ignored 2023 | awfully 2024 | given 2025 | anyone 2026 | indicate 2027 | gives 2028 | mostly 2029 | than 2030 | here's 2031 | were 2032 | and 2033 | appreciate 2034 | himself 2035 | saw 2036 | any 2037 | downwards 2038 | take 2039 | sure 2040 | especially 2041 | later 2042 | that's 2043 | fifth 2044 | don't 2045 | aside 2046 | only 2047 | going 2048 | get 2049 | truly 2050 | cannot 2051 | nearly 2052 | regarding 2053 | us 2054 | where 2055 | up 2056 | namely 2057 | anyways 2058 | wonder 2059 | behind 2060 | between 2061 | it 2062 | across 2063 | come 2064 | many 2065 | whereafter 2066 | according 2067 | comes 2068 | afterwards 2069 | couldn't 2070 | moreover 2071 | considering 2072 | sensible 2073 | 
hardly 2074 | wants 2075 | former 2076 | those 2077 | these 2078 | [ 2079 | somebody 2080 | different 2081 | etc 2082 | insofar 2083 | same 2084 | without 2085 | can't 2086 | very 2087 | you've 2088 | among 2089 | being 2090 | we've 2091 | seems 2092 | around 2093 | using 2094 | specified 2095 | on 2096 | ok 2097 | oh 2098 | whence 2099 | it's 2100 | or 2101 | everyone 2102 | your 2103 | her 2104 | there 2105 | amongst 2106 | trying 2107 | with 2108 | they're 2109 | wasn't 2110 | gone 2111 | certain 2112 | am 2113 | an 2114 | as 2115 | at 2116 | again 2117 | serious 2118 | hello 2119 | since 2120 | consider 2121 | causes 2122 | to 2123 | th 2124 | myself 2125 | i'll 2126 | zero 2127 | further 2128 | what 2129 | brief 2130 | seemed 2131 | c'mon 2132 | allows 2133 | followed 2134 | ask 2135 | viz 2136 | contains 2137 | two 2138 | taken 2139 | more 2140 | knows 2141 | ain't 2142 | particular 2143 | known 2144 | none 2145 | nine 2146 | needs 2147 | rather 2148 | [ 2149 | okay 2150 | tried 2151 | tries 2152 | onto 2153 | perhaps 2154 | specifying 2155 | ] 2156 | help 2157 | soon 2158 | through 2159 | its 2160 | seeming 2161 | inward 2162 | actually 2163 | might 2164 | haven't 2165 | someone 2166 | hereafter 2167 | always 2168 | isn't 2169 | beyond 2170 | really 2171 | they'll 2172 | enough 2173 | thereafter 2174 | done 2175 | together 2176 | least 2177 | too 2178 | immediate 2179 | believe 2180 | gotten 2181 | toward 2182 | self 2183 | also 2184 | towards 2185 | most 2186 | nothing 2187 | they'd 2188 | sometimes 2189 | lest 2190 | particularly 2191 | somewhat 2192 | his 2193 | goes 2194 | meanwhile 2195 | during 2196 | him 2197 | greetings 2198 | see 2199 | are 2200 | currently 2201 | please 2202 | various 2203 | probably 2204 | available 2205 | both 2206 | last 2207 | wouldn't 2208 | became 2209 | whole 2210 | liked 2211 | whatever 2212 | except 2213 | throughout 2214 | along 2215 | described 2216 | though 2217 | whom 2218 | beforehand 2219 | what's 2220 | new 2221 | else 2222 | look 2223 | while 2224 | herein 2225 | itself 2226 | wherein 2227 | used 2228 | anybody 2229 | obviously 2230 | thats 2231 | from 2232 | useful 2233 | merely 2234 | follows 2235 | often 2236 | some 2237 | ourselves 2238 | shall 2239 | per 2240 | tends 2241 | either 2242 | be 2243 | by 2244 | anything 2245 | consequently 2246 | into 2247 | appropriate 2248 | we're 2249 | elsewhere 2250 | hasn't 2251 | un 2252 | noone 2253 | associated 2254 | thanks 2255 | having 2256 | once 2257 | edu 2258 | go 2259 | sent 2260 | provides 2261 | yourselves 2262 | they've 2263 | try 2264 | this 2265 | you'd 2266 | yourself 2267 | zz 2268 | zt 2269 | respectively 2270 | let 2271 | others 2272 | until 2273 | weren't 2274 | use 2275 | few 2276 | themselves 2277 | becomes 2278 | anywhere 2279 | something 2280 | six 2281 | allow 2282 | won't 2283 | thence 2284 | willing 2285 | instead 2286 | whither 2287 | doing 2288 | how 2289 | cause 2290 | thereupon 2291 | que 2292 | via 2293 | could 2294 | hence 2295 | third 2296 | doesn't 2297 | their 2298 | exactly 2299 | regards 2300 | herself 2301 | have 2302 | need 2303 | clearly 2304 | i've 2305 | able 2306 | which 2307 | unless 2308 | where's 2309 | eight 2310 | why 2311 | you'll 2312 | normally 2313 | anyway 2314 | one 2315 | should 2316 | mainly 2317 | overall 2318 | qv 2319 | contain 2320 | looks 2321 | neither 2322 | however 2323 | otherwise 2324 | co 2325 | it'd 2326 | corresponding 2327 | thanx 2328 | novel 2329 | value 2330 | will 2331 | almost 2332 | thus 2333 | vs 2334 | when 2335 | gets 
2336 | upon 2337 | off 2338 | nevertheless 2339 | well 2340 | less 2341 | presumably 2342 | ought 2343 | who's 2344 | five 2345 | know 2346 | you 2347 | name 2348 | necessary 2349 | like 2350 | become 2351 | therein 2352 | because 2353 | happens 2354 | does 2355 | although 2356 | about 2357 | getting 2358 | own 2359 | three 2360 | inasmuch 2361 | inner 2362 | but 2363 | hi 2364 | he 2365 | whether 2366 | placed 2367 | below 2368 | our 2369 | 上去-- 2370 | inc 2371 | lately 2372 | other 2373 | latterly 2374 | out 2375 | 是什么 2376 | 什么时候 2377 | 是什么意思 2378 | 什么意思 2379 | 多少钱 2380 | 有没有 2381 | 更有趣 2382 | 更有甚者 2383 | 更有效 2384 | 更有意义 2385 | 更远的 2386 | 更重要的是 2387 | 正确 2388 | 错误 2389 | 第二把 2390 | 第二波 2391 | 第二大节 2392 | 第二单元 2393 | 第二关 2394 | 第二行 2395 | 第二集 2396 | 第二讲 2397 | 第二款 2398 | 第二类 2399 | 第二盘 2400 | 第二任 2401 | 第二声 2402 | 第二十 2403 | 第二首 2404 | 第二项 2405 | 第三遍 2406 | 第三册 2407 | 第三层 2408 | 第三产业 2409 | 第三大 2410 | 第三单元 2411 | 第三行 2412 | 第三回 2413 | 第三集 2414 | 第三件 2415 | 第三句 2416 | 第三卷 2417 | 第三课 2418 | 第三类 2419 | 第三篇 2420 | 第三期 2421 | 第三日 2422 | 第三声 2423 | 地三鲜 2424 | 第三项 2425 | 第三站 2426 | 第三张 2427 | 第十八 2428 | 第十次 2429 | 第十二 2430 | 的士高 2431 | 第十集 2432 | 第十届 2433 | 第十九 2434 | 第十六 2435 | 第十名 2436 | 第十三 2437 | 第十四 2438 | 第十天 2439 | 第十一 2440 | 第十一个 2441 | 第四版 2442 | 第四册 2443 | 第四场 2444 | 第四代 2445 | 第四单元 2446 | 第四集 2447 | 第四届 2448 | 第四年 2449 | 第四期 2450 | 第四声 2451 | 第四套 2452 | 第四位 2453 | 第四张 2454 | 第四者 2455 | 第四种 2456 | 第五部 2457 | 第五大道 2458 | 第五单元 2459 | 第五集 2460 | 第五卷 2461 | 第五课 2462 | 第五年 2463 | 第五期 2464 | 第五位 2465 | 第五元素 2466 | 第五组 2467 | 召唤 2468 | 最后一班 2469 | 最后一遍 2470 | 最后一关 2471 | 最后一集 2472 | 最后一科 2473 | 最后一颗子弹 2474 | 最后一派 2475 | 最后一题 2476 | 最后一眼 2477 | 最后一页 2478 | 10 2479 | 11 2480 | 12 2481 | 35 2482 | 25 2483 | 2016 2484 | 2015 2485 | 2014 2486 | 又为什么 2487 | 有问题吗 2488 | 有问题么 2489 | 又喜欢 2490 | 有喜欢 2491 | 又小 2492 | 又笑 2493 | 有笑 2494 | 有效地 2495 | 有一百 2496 | 又一遍 2497 | 有一部 2498 | 又一城 2499 | 又一村 2500 | 有一道 2501 | 有意的 2502 | 有一堆 2503 | 有一对 2504 | 有一方 2505 | 有一根 2506 | 有一会了 2507 | 有一批 2508 | 有一片 2509 | 有一期 2510 | 有一起 2511 | 有一群 2512 | 又又 2513 | 由由 2514 | 财新网 2515 | 上午 2516 | 下午 2517 | NULL 2518 | 新华社 2519 | 消息 2520 | 13 2521 | 14 2522 | 15 2523 | 16 2524 | 17 2525 | 18 2526 | 19 2527 | 20 2528 | 21 2529 | 22 2530 | 23 2531 | 24 2532 | 26 2533 | 27 2534 | 28 2535 | 29 2536 | 30 2537 | 31 2538 | 32 2539 | 33 2540 | 34 2541 | 36 2542 | 37 2543 | 38 2544 | 39 2545 | 40 2546 | 41 2547 | 42 2548 | 43 2549 | 44 2550 | 45 2551 | 46 2552 | 47 2553 | 48 2554 | 49 2555 | 50 2556 | 51 2557 | 52 2558 | 53 2559 | 54 2560 | 55 2561 | 56 2562 | 57 2563 | 58 2564 | 59 2565 | 60 2566 | 61 2567 | 62 2568 | 63 2569 | 64 2570 | 65 2571 | 66 2572 | 67 2573 | 68 2574 | 69 2575 | 70 2576 | 71 2577 | 72 2578 | 73 2579 | 74 2580 | 75 2581 | 76 2582 | 77 2583 | 78 2584 | 79 2585 | 80 2586 | 81 2587 | 82 2588 | 83 2589 | 84 2590 | 85 2591 | 86 2592 | 87 2593 | 88 2594 | 89 2595 | 90 2596 | 91 2597 | 92 2598 | 93 2599 | 94 2600 | 95 2601 | 96 2602 | 97 2603 | 98 2604 | 99 2605 | 100 2606 | 01 2607 | 02 2608 | 03 2609 | 04 2610 | 05 2611 | 06 2612 | 07 2613 | 08 2614 | 09 2615 | --------------------------------------------------------------------------------
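Note (illustrative sketch, not a file in this repository): the stopword entries above are typically consumed as a lookup set when tokenizing text for the chapter3 text-feature scripts. The snippet below is one hedged example of such usage; the relative path and the helper name tokenize are assumptions for illustration, and jieba is the tokenizer already listed in the project's dependencies.

# -*- coding: utf-8 -*-
import jieba

# Load the stopword list into a set for fast membership tests;
# the path assumes the repository layout shown in the tree above.
with open('data/text_data/stopwords.txt', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f if line.strip()}

def tokenize(text):
    """Cut text with jieba and drop stopwords and whitespace-only tokens (hypothetical helper)."""
    return [tok for tok in jieba.lcut(text) if tok.strip() and tok not in stopwords]

# Example: tokens such as numerals and filler words listed in stopwords.txt are removed.
print(tokenize('这是2016年的一个订单文本示例'))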