├── utils
    ├── __init__.py
    ├── time_utils.py
    ├── text_utils.py
    └── data_utils.py
├── data
    ├── graph_data
    │   ├── graph_demo.adjlist
    │   ├── readme.md
    │   ├── order.x
    │   ├── order.y
    │   ├── order.graph
    │   └── graph_demo.embeddings
    ├── order_data.xlsx
    ├── order_feas.xlsx
    ├── model
    │   ├── xgb_model.pkl
    │   └── xgb_model.fmap
    ├── README.md
    ├── text_data
    │   ├── test.txt
    │   └── stopwords.txt
    └── german_score.csv
├── chapter2
    ├── ch2_00_german_credit.py
    ├── ch2_13_fs_variation.py
    ├── ch2_28_feature_extraction_mds.py
    ├── ch2_12_woe_encoder.py
    ├── ch2_25_feature_extraction_pca.py
    ├── ch2_11_target_encoder.py
    ├── ch2_27_feature_extraction_lle.py
    ├── ch2_15_fs_corr_scipy.py
    ├── ch2_02_toad_eda_detect.py
    ├── ch2_04_preprocess_missing_value.py
    ├── ch2_26_feature_extraction_lda.py
    ├── ch2_18_fs_chi.py
    ├── ch2_17_fs_iv.py
    ├── ch2_16_fs_vif.py
    ├── ch2_06_preprocess_value_bining.py
    ├── ch2_14_fs_corr_pandas.py
    ├── ch2_10_one_hot_based_category_encoders.py
    ├── ch2_22_fs_select_from_model.py
    ├── ch2_19_fs_stepwise.py
    ├── ch2_21_fs_l1_norm.py
    ├── ch2_31_model_deployment_pickle.py
    ├── ch2_20_fs_rfe.py
    ├── ch2_35_decision_tree.py
    ├── ch2_29_p_to_score.py
    ├── ch2_01_train_test_split.py
    ├── ch2_05_preprocess_value_scaler.py
    ├── ch2_08_ordinal_encode_based_category_encoders.py
    ├── ch2_39_lightgbm.py
    ├── ch2_23_fs_psi.py
    ├── ch2_36_randomforest.py
    ├── ch2_37_gbdt.py
    ├── ch2_03_missrate_by_month.py
    ├── ch2_09_one_hot_based_sklearn.py
    ├── ch2_40_DNN_credit_data.py
    ├── ch2_34_svm.py
    ├── ch2_07_ordinal_encoder_based_sklearn.py
    ├── ch2_30_validation_curve.py
    ├── ch2_33_lr.py
    ├── ch2_24_fs_badrate_by_month.py
    ├── ch2_41_CNN_credit_data.py
    ├── ch2_32_model_deployment_pmml.py
    └── ch2_38_xgboost.py
├── chapter3
    ├── ch3_07_jieba_demo.py
    ├── ch3_13_random_walk.py
    ├── ch3_10_fasttext_vec.py
    ├── ch3_03_tsfresh_orders.py
    ├── ch3_09_word2vec.py
    ├── ch3_14_node2vec.py
    ├── ch3_11_text_classifier_bayes.py
    ├── ch3_05_gbdt_construct_feature.py
    ├── ch3_04_feature_evaluation.py
    ├── ch3_00_order_data_preprocess.py
    ├── ch3_06_cluster_alg.py
    ├── ch3_12_text_classifier_fasttext.py
    ├── ch3_08_bag_of_words.py
    ├── ch3_01_order_fea_gen_manual.py
    ├── ch3_02_order_fea_gen_rfm_auto.py
    └── ch3_15_gcn_order.py
├── 勘误.md
├── requirements.txt
├── chapter4
    ├── ch4_03_rules_for_isolationforest.py
    ├── ch4_00_rules_for_iv.py
    ├── ch4_02_rules_for_decisiontree.py
    ├── ch4_01_rules_for_outliers.py
    ├── ch4_04_modelstrategy_for_optimization.py
    └── ch4_02_rules_for_decisiontree.ipynb
├── README.md
├── .gitignore
└── LICENSE


/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/graph_data/graph_demo.adjlist:
--------------------------------------------------------------------------------
1 | 1 2 4
2 | 2 1 3 4
3 | 3 2 5
4 | 4 1 2
5 | 5 3


--------------------------------------------------------------------------------
/data/graph_data/readme.md:
--------------------------------------------------------------------------------
1 | - order.x: 特征
2 | - order.y: label
3 | - order.graph:图结构
4 | 


--------------------------------------------------------------------------------
/data/order_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/order_data.xlsx


--------------------------------------------------------------------------------
/data/order_feas.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/order_feas.xlsx


--------------------------------------------------------------------------------
/data/graph_data/order.x:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/graph_data/order.x


--------------------------------------------------------------------------------
/data/graph_data/order.y:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/graph_data/order.y


--------------------------------------------------------------------------------
/data/model/xgb_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/model/xgb_model.pkl


--------------------------------------------------------------------------------
/data/graph_data/order.graph:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ybNero/practice_of_intelligent_risk_control/main/data/graph_data/order.graph


--------------------------------------------------------------------------------
/chapter2/ch2_00_german_credit.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*- 
2 | 
3 | import scorecardpy as sc
4 | 
5 | # 加载数据集
6 | german_credit_data = sc.germancredit()
7 | # 打印前5行, 前4列和最后一列
8 | print(german_credit_data.iloc[:5, list(range(-1, 4))])


--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | 数据说明:     
2 | 1. text\_data: 文本相关数据    
3 | 2. graph\_data: 图特征相关数据    
4 | 3. model: 模型相关数据
5 | 4. order\_data.xlsx: 订单原始数据，RFM生成订单特征挖掘的输入    
6 | 5. order\_feas.xlsx: 订单生产的特征，以及label逾期标签数据     
7 | 6. german\_score.csv: 每个用户的模型得分及是否逾期    


--------------------------------------------------------------------------------
/chapter3/ch3_07_jieba_demo.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # 结巴分词使用示例
 4 | from utils.text_utils import cut_words
 5 | 
 6 | text_demo = "通过资料审核与电话沟通用户审批通过借款金额10000元操作人小明审批时间2020年10月5日 经过电话核实用户确认所有资料均为本人提交提交时间2020年11月5日用户当前未逾期"
 7 | segs = cut_words(text_demo)
 8 | print("原文: ", text_demo)
 9 | print("切词后的结果:", list(segs))
10 | 


--------------------------------------------------------------------------------
/chapter2/ch2_13_fs_variation.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from utils import data_utils
 8 | from scipy.stats import variation
 9 | 
10 | # 导入数值型样例数据
11 | all_x_y = data_utils.get_all_x_y()
12 | x = all_x_y.drop(data_utils.label, axis=1)
13 | # 计算各个特征的变异系数
14 | x_var = variation(x, nan_policy='omit')
15 | result = dict(zip(x.columns ,x_var))
16 | print("变异系数结果: \n", result)


--------------------------------------------------------------------------------
/勘误.md:
--------------------------------------------------------------------------------
 1 | # 书籍勘误
 2 | * P16：修正图2-7和2-8，删除201903和MOB12对应的数字，以及201906和MOB9对应的数字，曲线同时修改
 3 | * P51：图2-24，“字数”修订为“总客户数”
 4 | * P74：第一行"分类任务采用逻辑损失函数"修改为"学习任务为二分类任务"；第二行"构建1000可决策树"修改为"采用传统梯度提升树方法构建弱学习器"；第四、五行"10%"修改"100%"
 5 | * P108：data_preprocess下注释，data和back_time后面"，"修改为": "
 6 | * P114：表3-9邻接矩阵第2行第1列数字"0"修改为数字"1"，拉普拉斯矩阵中第2行第1列"0"以及这一行的"1"均修改为"-1"，第3行最后一个"1"修改为"-1"
 7 | * P116：代码优化，采用本代码库中代码
 8 | * P119-122：代码优化，采用本代码库中代码
 9 | * P126：表格3-12中的"0.11"修改为"0.1"
10 | * P138：代码块下方第一句应删除“在Word2Vec”
11 | 


--------------------------------------------------------------------------------
/data/graph_data/graph_demo.embeddings:
--------------------------------------------------------------------------------
1 | 5 8
2 | 2 -0.1042217 0.20679143 0.2427775 0.72288877 0.31595996 0.39856002 -0.58908206 0.17186889
3 | 4 -0.030496102 0.10907486 0.1395583 0.6750262 0.29104736 0.3746776 -0.6623706 0.23489219
4 | 1 0.01014287 0.13469146 0.18281066 0.77614796 0.22247209 0.28817466 -0.7041028 0.21778071
5 | 3 0.0357248 0.16088022 0.12562852 0.7775058 0.34339327 0.32849503 -0.74697566 0.15016618
6 | 5 -0.05321218 0.1133668 0.20365518 0.81072015 0.28992698 0.28150764 -0.71600515 0.17657782
7 | 


--------------------------------------------------------------------------------
/chapter2/ch2_28_feature_extraction_mds.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import pandas as pd
 5 | sys.path.append("./")
 6 | sys.path.append("../")
 7 | 
 8 | from utils import data_utils
 9 | from sklearn.manifold import MDS
10 | 
11 | 
12 | # 导入数值型样例数据
13 | all_x_y = data_utils.get_all_x_y()
14 | x = all_x_y.drop(data_utils.label, axis=1)
15 | mds = MDS(n_components=10)
16 | x_new = mds.fit_transform(x)
17 | x_new_df = pd.DataFrame(x_new)
18 | print("利用sklearn进行MDS特征提取结果: \n", x_new_df)
19 | 


--------------------------------------------------------------------------------
/chapter2/ch2_12_woe_encoder.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from utils import data_utils
 8 | from category_encoders.woe import WOEEncoder
 9 | 
10 | # 加载数据
11 | german_credit_data = data_utils.get_data()
12 | y = german_credit_data['creditability']
13 | x = german_credit_data[['purpose', 'personal.status.and.sex']]
14 | 
15 | # WOE编码
16 | encoder = WOEEncoder(cols=x.columns)
17 | result = encoder.fit_transform(x, y)
18 | print("WOE编码结果: \n", result)
19 | 


--------------------------------------------------------------------------------
/chapter2/ch2_25_feature_extraction_pca.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import toad
 5 | import pandas as pd
 6 | sys.path.append("./")
 7 | sys.path.append("../")
 8 | 
 9 | from utils import data_utils
10 | from sklearn.decomposition import PCA
11 | 
12 | 
13 | # 导入数值型样例数据
14 | all_x_y = data_utils.get_all_x_y()
15 | x = all_x_y.drop(data_utils.label, axis=1)
16 | pca = PCA(n_components=0.9)
17 | x_new = pca.fit_transform(x)
18 | x_new_df = pd.DataFrame(x_new)
19 | print("利用sklearn进行PCA特征提取, 保留90%信息后结果: \n", x_new_df)
20 | 


--------------------------------------------------------------------------------
/chapter2/ch2_11_target_encoder.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from utils import data_utils
 8 | from category_encoders.target_encoder import TargetEncoder
 9 | 
10 | 
11 | # 加载数据
12 | german_credit_data = data_utils.get_data()
13 | y = german_credit_data['creditability']
14 | x = german_credit_data[['purpose', 'personal.status.and.sex']]
15 | # 目标编码
16 | enc = TargetEncoder(cols=x.columns)
17 | result = enc.fit_transform(x, y)
18 | print("目标编码结果: \n", result)
19 | 


--------------------------------------------------------------------------------
/chapter2/ch2_27_feature_extraction_lle.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import pandas as pd
 5 | sys.path.append("./")
 6 | sys.path.append("../")
 7 | 
 8 | from utils import data_utils
 9 | from sklearn.manifold import LocallyLinearEmbedding
10 | 
11 | # 导入数值型样例数据
12 | all_x_y = data_utils.get_all_x_y()
13 | x = all_x_y.drop(data_utils.label, axis=1)
14 | lle = LocallyLinearEmbedding(n_neighbors=5, n_components=10)
15 | x_new = lle.fit_transform(x)
16 | x_new_df = pd.DataFrame(x_new)
17 | print("利用sklearn进行LLE特征提取结果: \n", x_new_df)
18 | 


--------------------------------------------------------------------------------
/chapter2/ch2_15_fs_corr_scipy.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from utils import data_utils
 8 | from scipy.stats import pearsonr
 9 | 
10 | 
11 | # 导入数值型样例数据
12 | all_x_y = data_utils.get_all_x_y()
13 | x = all_x_y.drop(data_utils.label, axis=1)
14 | x1, x2 = x.loc[:, 'age.in.years'], x.loc[:, 'credit.history',]
15 | r, p_value = pearsonr(x1, x2)
16 | print("scipy库计算 特征'age.in.years'和'credit.history'的pearson相关系数 \n", 
17 |     "pearson相关系数: %s, \n" % r, "p_value: %s" % p_value)
18 | 


--------------------------------------------------------------------------------
/chapter2/ch2_02_toad_eda_detect.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import toad
 8 | from utils import data_utils
 9 | 
10 | # 加载数据集
11 | german_credit_data = data_utils.get_data()
12 | detect_res = toad.detector.detect(german_credit_data)
13 | # 打印前5行, 前4列
14 | 
15 | print("前5行, 前4列:")
16 | print(detect_res.iloc[:5, :4])
17 | print("前5行, 第5至9列:")
18 | # 打印前5行, 第5至9列
19 | print(detect_res.iloc[:5, 4:9])
20 | # 打印前5行, 第10至14列
21 | print("前5行, 第10至14列:")
22 | print(detect_res.iloc[:5, 9:])
23 | 
24 | 


--------------------------------------------------------------------------------
/chapter2/ch2_04_preprocess_missing_value.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import numpy as np
 8 | import pandas as pd
 9 | from utils import data_utils
10 | from sklearn.impute import SimpleImputer
11 | 
12 | # 导入数值型样例数据
13 | data = data_utils.get_data()
14 | # 缺失值处理
15 | imp = SimpleImputer(missing_values=np.nan, strategy='mean')
16 | imped_data = imp.fit_transform(data[data_utils.numeric_cols])
17 | imped_df = pd.DataFrame(imped_data, columns=data_utils.numeric_cols)
18 | print("缺失值填充结果: \n", imped_df)
19 | 


--------------------------------------------------------------------------------
/chapter2/ch2_26_feature_extraction_lda.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import pandas as pd
 5 | sys.path.append("./")
 6 | sys.path.append("../")
 7 | 
 8 | from utils import data_utils
 9 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
10 | 
11 | # 导入数值型样例数据
12 | all_x_y = data_utils.get_all_x_y()
13 | x = all_x_y.drop(data_utils.label, axis=1)
14 | y = all_x_y[data_utils.label]
15 | lda = LinearDiscriminantAnalysis(n_components=1)
16 | x_new = lda.fit_transform(x, y)
17 | x_new_df = pd.DataFrame(x_new)
18 | print("利用sklearn进行LDA特征提取结果: \n", x_new_df)
19 | 


--------------------------------------------------------------------------------
/chapter2/ch2_18_fs_chi.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from utils import data_utils
 8 | from sklearn.feature_selection import chi2
 9 | from sklearn.feature_selection import SelectKBest
10 | 
11 | # 导入数值型样例数据
12 | all_x_y = data_utils.get_all_x_y()
13 | y = all_x_y.pop(data_utils.label)
14 | # 选择K个最好的特征，返回选择特征后的数据
15 | fs_chi = SelectKBest(chi2, k=5)
16 | fs_chi.fit(all_x_y, y)
17 | x_new = fs_chi.transform(all_x_y)
18 | 
19 | selected_cols = all_x_y.columns[fs_chi.get_support()].tolist()
20 | print("卡方检验筛选得到%s个特征: \n" % len(selected_cols), selected_cols)
21 | 


--------------------------------------------------------------------------------
/data/model/xgb_model.fmap:
--------------------------------------------------------------------------------
 1 | 0	status.of.existing.checking.account	q
 2 | 1	credit.history	q
 3 | 2	savings.account.and.bonds	q
 4 | 3	present.employment.since	q
 5 | 4	personal.status.and.sex	q
 6 | 5	other.debtors.or.guarantors	q
 7 | 6	property	q
 8 | 7	other.installment.plans	q
 9 | 8	housing	q
10 | 9	job	q
11 | 10	telephone	q
12 | 11	foreign.worker	q
13 | 12	purpose	q
14 | 13	duration.in.month	q
15 | 14	credit.amount	q
16 | 15	age.in.years	q
17 | 16	present.residence.since	q
18 | 17	number.of.existing.credits.at.this.bank	q
19 | 18	installment.rate.in.percentage.of.disposable.income	q
20 | 19	number.of.people.being.liable.to.provide.maintenance.for	q
21 | 


--------------------------------------------------------------------------------
/chapter2/ch2_17_fs_iv.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import toad
 5 | sys.path.append("./")
 6 | sys.path.append("../")
 7 | 
 8 | from utils import data_utils
 9 | 
10 | # 导入数值型样例数据
11 | all_x_y = data_utils.get_all_x_y()
12 | # 利用toad库quality()方法计算IV
13 | var_iv = toad.quality(all_x_y,
14 |                       target='creditability',
15 |                       method='quantile',
16 |                       n_bins=6,
17 |                       iv_only=True)
18 | 
19 | selected_cols = var_iv[var_iv.iv > 0.1].index.tolist()
20 | print("各特征的iv值计算结果: \n", var_iv)
21 | print("设置iv阈值为0.1, 筛选得到%s个特征: \n" % len(selected_cols), selected_cols)
22 | 


--------------------------------------------------------------------------------
/chapter2/ch2_16_fs_vif.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from utils import data_utils
 8 | from statsmodels.stats.outliers_influence import variance_inflation_factor
 9 | 
10 | # 导入数值型样例数据
11 | all_x_y = data_utils.get_all_x_y()
12 | x = all_x_y.drop(data_utils.label, axis=1)
13 | vif = [variance_inflation_factor(x.values, ix) for ix in range(x.shape[1])]
14 | print("各特征的vif值计算结果: \n", dict(zip(x.columns, vif)))
15 | 
16 | # 筛选阈值小于10的特征
17 | selected_cols = x.iloc[:, [f < 10 for f in vif]].columns.tolist()
18 | print("设置vif阈值为10, 筛选得到%s个特征: \n" % len(selected_cols), selected_cols)
19 | 


--------------------------------------------------------------------------------
/chapter2/ch2_06_preprocess_value_bining.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import toad
 8 | from toad.plot import bin_plot
 9 | from utils import data_utils
10 | 
11 | german_credit_data = data_utils.get_data()
12 | # 利用toad库等频分箱
13 | # 初始化分箱对象
14 | c = toad.transform.Combiner()
15 | c.fit(german_credit_data[data_utils.x_cols],
16 |       y=german_credit_data[data_utils.label], n_bins=6, method='quantile', empty_separate=True)
17 | # 特征age.in.years分箱结果画图
18 | data_binned = c.transform(german_credit_data, labels=True)
19 | bin_plot(data_binned, x='age.in.years', target=data_utils.label)
20 | 


--------------------------------------------------------------------------------
/chapter2/ch2_14_fs_corr_pandas.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from utils import data_utils
 8 | 
 9 | 
10 | # 导入数值型样例数据
11 | all_x_y = data_utils.get_all_x_y()
12 | x = all_x_y.drop(data_utils.label, axis=1)
13 | # 利用pandas库计算相关系数
14 | # pearson相关系数
15 | pearson_corr = x.corr(method='pearson')
16 | print("pandas库计算 pearson相关系数: \n", pearson_corr)
17 | # spearman相关系数
18 | spearman_corr = x.corr(method='spearman')  
19 | print("pandas库计算 spearman相关系数: \n", spearman_corr)
20 | # kendall相关系数
21 | kendall_corr = x.corr(method='kendall')  
22 | print("pandas库计算 kendall相关系数: \n", kendall_corr)
23 | 


--------------------------------------------------------------------------------
/chapter2/ch2_10_one_hot_based_category_encoders.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from utils import data_utils
 8 | from category_encoders.one_hot import OneHotEncoder
 9 | 
10 | 
11 | # 加载数据
12 | german_credit_data = data_utils.get_data()
13 | # 初始化OneHotEncoder类
14 | encoder = OneHotEncoder(cols=['purpose', 'personal.status.and.sex'],
15 |                         handle_unknown='indicator',
16 |                         handle_missing='indicator',
17 |                         use_cat_names=True)
18 | # 转换数据集
19 | result = encoder.fit_transform(german_credit_data)
20 | print("one-hot编码结果: \n", result)


--------------------------------------------------------------------------------
/chapter2/ch2_22_fs_select_from_model.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import toad
 5 | sys.path.append("./")
 6 | sys.path.append("../")
 7 | 
 8 | from utils import data_utils
 9 | from sklearn.feature_selection import SelectFromModel
10 | from sklearn.ensemble import GradientBoostingClassifier
11 | 
12 | # 导入数值型样例数据
13 | all_x_y = data_utils.get_all_x_y()
14 | y = all_x_y.pop(data_utils.label)
15 | x = all_x_y
16 | # GBDT作为基模型的特征选择
17 | sf = SelectFromModel(GradientBoostingClassifier())
18 | x_new = sf.fit_transform(x, y)
19 | 
20 | selected_cols = x.columns[sf.get_support()].tolist()
21 | print("基于树模型筛选得到%s个特征: \n" % len(selected_cols), selected_cols)
22 | 


--------------------------------------------------------------------------------
/chapter3/ch3_13_random_walk.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | 使用DeepWalk算法生成特征(可以直接在shell命令窗口中运行deepwalk命令)
 4 | """
 5 | 
 6 | import os
 7 | import sys
 8 | import pandas as pd
 9 | sys.path.append("./")
10 | sys.path.append("../")
11 | 
12 | size = 8
13 | os.system(
14 |     "deepwalk --input data/graph_data/graph_demo.adjlist "
15 |     f"--output data/graph_data/graph_demo.embeddings --representation-size {size}")
16 | 
17 | fea_vec = pd.read_csv('data/graph_data/graph_demo.embeddings', sep=' ', skiprows=1, index_col=0,
18 |                       names=['fea_%s' % i for i in range(size)]).sort_index()
19 | print('词向量维度：', fea_vec.shape)
20 | print('词向量结果：', fea_vec)
21 | 


--------------------------------------------------------------------------------
/chapter2/ch2_19_fs_stepwise.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import toad
 5 | sys.path.append("./")
 6 | sys.path.append("../")
 7 | 
 8 | from utils import data_utils
 9 | 
10 | # 导入数值型样例数据
11 | all_x_y = data_utils.get_all_x_y()
12 | final_data = toad.selection.stepwise(all_x_y,
13 |                                      target=data_utils.label,
14 |                                      estimator='lr',
15 |                                      direction='both',
16 |                                      criterion='aic',
17 |                                      return_drop=False)
18 | selected_cols = final_data.columns
19 | print("通过stepwise筛选得到%s个特征: \n" % len(selected_cols), selected_cols)
20 | 


--------------------------------------------------------------------------------
/chapter2/ch2_21_fs_l1_norm.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import toad
 5 | sys.path.append("./")
 6 | sys.path.append("../")
 7 | 
 8 | from utils import data_utils
 9 | from sklearn.linear_model import LogisticRegression
10 | from sklearn.feature_selection import SelectFromModel
11 | 
12 | # 导入数值型样例数据
13 | all_x_y = data_utils.get_all_x_y()
14 | y = all_x_y.pop(data_utils.label)
15 | x = all_x_y
16 | # 带L1惩罚项的逻辑回归作为基模型的特征选择
17 | LR = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
18 | sf = SelectFromModel(LR)
19 | x_new = sf.fit_transform(x, y)
20 | 
21 | selected_cols = x.columns[sf.get_support()].tolist()
22 | print("基于L1范数筛选得到%s个特征: \n" % len(selected_cols), selected_cols)
23 | 


--------------------------------------------------------------------------------
/chapter2/ch2_31_model_deployment_pickle.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import numpy as np
 5 | import pandas as pd
 6 | sys.path.append("./")
 7 | sys.path.append("../")
 8 | 
 9 | 
10 | # Pickle方式保存和读取模型
11 | def save_model_as_pkl(model, path):
12 |     """
13 |     保存模型到路径path
14 |     :param model: 训练完成的模型
15 |     :param path: 保存的目标路径
16 |     """
17 |     import pickle
18 |     with open(path, 'wb') as f:
19 |         pickle.dump(model, f, protocol=2)
20 | 
21 | 
22 | def load_model_from_pkl(path):
23 |     """
24 |     从路径path加载模型
25 |     :param path: 保存的目标路径
26 |     """
27 |     import pickle
28 |     with open(path, 'rb') as f:
29 |         model = pickle.load(f)
30 |     return model
31 | 
32 | 


--------------------------------------------------------------------------------
/chapter2/ch2_20_fs_rfe.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import toad
 5 | sys.path.append("./")
 6 | sys.path.append("../")
 7 | 
 8 | from utils import data_utils
 9 | from sklearn.feature_selection import RFE
10 | from sklearn.linear_model import LogisticRegression
11 | 
12 | 
13 | # 导入数值型样例数据
14 | all_x_y = data_utils.get_all_x_y()
15 | y = all_x_y.pop(data_utils.label)
16 | x = all_x_y
17 | # 递归特征消除法，返回特征选择后的数据
18 | # 参数estimator为基模型
19 | # 参数n_features_to_select为选择的特征个数
20 | rfe = RFE(estimator=LogisticRegression(), n_features_to_select=10)
21 | x_new = rfe.fit_transform(x, y)
22 | 
23 | selected_cols = x.columns[rfe.get_support()].tolist()
24 | print("通过递归特征消除法筛选得到%s个特征: \n" % len(selected_cols), selected_cols)
25 | 


--------------------------------------------------------------------------------
/chapter2/ch2_35_decision_tree.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | 
 8 | from sklearn.tree import DecisionTreeClassifier
 9 | from utils import data_utils
10 | from sklearn.metrics import roc_auc_score
11 | from sklearn.tree import DecisionTreeClassifier
12 | 
13 | # 导入数值型样例数据
14 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)
15 | # 导入数值型样例数据
16 | clf = DecisionTreeClassifier(criterion='gini',
17 |                              max_depth=8,
18 |                              min_samples_leaf=15,
19 |                              random_state=88)
20 | clf.fit(train_x, train_y)
21 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
22 | print("决策树模型 AUC: ", auc_score)
23 | 


--------------------------------------------------------------------------------
/chapter2/ch2_29_p_to_score.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | import sys
 3 | import numpy as np
 4 | import pandas as pd
 5 | sys.path.append("./")
 6 | sys.path.append("../")
 7 | 
 8 | def p_to_score(p, pdo, base, odds):
 9 |     """ 
10 |     逾期概率转换分数 
11 |     :param p: 逾期概率 
12 |     :param pdo: points double odds. default = 60 
13 |     :param base: base points. default = 600 
14 |     :param odds: odds. default = 1.0/15.0 
15 |     :returns: 模型分数 
16 |     """
17 |     B = pdo / np.log(2)
18 |     A = base + B * np.log(odds)
19 |     score = A - B * np.log(p / (1 - p))
20 |     return round(score, 0)
21 | 
22 | pros = pd.Series(np.random.rand(100))
23 | pros_score = p_to_score(pros, pdo=60.0, base=600, odds=1.0 / 15.0)
24 | print("随机产生100个概率并转化为score结果: \n", dict(zip(pros, pros_score)))
25 | 


--------------------------------------------------------------------------------
/chapter2/ch2_01_train_test_split.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from utils import data_utils
 8 | from sklearn.model_selection import train_test_split
 9 | 
10 | # 导入添加month列的数据
11 | model_data = data_utils.get_data()
12 | # 选取OOT样本  
13 | oot_set = model_data[model_data['month'] == '2020-05']
14 | # 划分训练集和测试集
15 | train_valid_set = model_data[model_data['month'] != '2020-05']
16 | X = train_valid_set[data_utils.x_cols]
17 | Y = train_valid_set['creditability']
18 | X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.3, random_state=88)
19 | model_data.loc[oot_set.index, 'sample_set'] = 'oot'
20 | model_data.loc[X_train.index, 'sample_set'] = 'train'
21 | model_data.loc[X_valid.index, 'sample_set'] = 'valid'
22 | 


--------------------------------------------------------------------------------
/chapter2/ch2_05_preprocess_value_scaler.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import pandas as pd
 8 | from utils import data_utils
 9 | from sklearn.preprocessing import MinMaxScaler
10 | from sklearn.preprocessing import StandardScaler
11 | 
12 | # 导入数值型样例数据
13 | data = data_utils.get_data()
14 | # max-min标准化
15 | X_MinMaxScaler = MinMaxScaler().fit_transform(data[data_utils.numeric_cols])
16 | max_min_df = pd.DataFrame(X_MinMaxScaler, columns=data_utils.numeric_cols)
17 | print("max-min标准化结果: \n", max_min_df)
18 | # z-score标准化
19 | X_StandardScaler = StandardScaler().fit_transform(data[data_utils.numeric_cols])
20 | standard_df = pd.DataFrame(X_StandardScaler, columns=data_utils.numeric_cols)
21 | print("z-score标准化结果: \n", standard_df)
22 | 


--------------------------------------------------------------------------------
/chapter2/ch2_08_ordinal_encode_based_category_encoders.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from utils import data_utils
 8 | from category_encoders.ordinal import OrdinalEncoder
 9 | 
10 | # 加载数据
11 | german_credit_data = data_utils.get_data()
12 | # 初始化OrdinalEncoder类
13 | encoder = OrdinalEncoder(cols=['purpose', 'personal.status.and.sex'],
14 |                          handle_unknown='value',
15 |                          handle_missing='value')
16 | # 将 handle_unknown设为"value"，即测试集中的未知特征值将被标记为-1
17 | # 将 handle_missing设为"value"，即测试集中的缺失值将被标记为-2
18 | # 当设为"error"，即报错；当设为"return_nan"，即未知值/缺失值被标记为nan
19 | result = encoder.fit_transform(german_credit_data)
20 | category_mapping = encoder.category_mapping
21 | print("类别编码结果: \n", result)
22 | print("类别编码映射关系: \n", category_mapping)
23 | 


--------------------------------------------------------------------------------
/chapter2/ch2_39_lightgbm.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import lightgbm as lgb
 8 | from utils import data_utils
 9 | from sklearn.metrics import roc_auc_score
10 | 
11 | # 导入数值型样例数据
12 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)
13 | clf = lgb.LGBMClassifier(objective='binary',
14 |                          boosting_type='gbdt',
15 |                          max_depth=3,
16 |                          n_estimators=1000,
17 |                          subsample=1,
18 |                          colsample_bytree=1)
19 | lgb_model = clf.fit(train_x, train_y, eval_set=[(test_x, test_y)], eval_metric='auc', early_stopping_rounds=30)
20 | auc_score = roc_auc_score(test_y, lgb_model.predict_proba(test_x)[:, 1])
21 | print("LightGBM模型 AUC: ", auc_score)
22 | 


--------------------------------------------------------------------------------
/chapter2/ch2_23_fs_psi.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import toad
 5 | sys.path.append("./")
 6 | sys.path.append("../")
 7 | 
 8 | from utils import data_utils
 9 | 
10 | # 加载数据
11 | all_x_y = data_utils.get_all_x_y()
12 | # 定义分箱方法
13 | Combiner = toad.transform.Combiner()
14 | Combiner.fit(all_x_y,
15 |              y=data_utils.label,
16 |              n_bins=6,
17 |              method='quantile',
18 |              empty_separate=True)
19 | # 计算psi
20 | var_psi = toad.metrics.PSI(all_x_y.iloc[:500, :],
21 |                            all_x_y.iloc[500:, :],
22 |                            combiner=Combiner)
23 | var_psi_df = var_psi.to_frame(name='psi')
24 | 
25 | selected_cols = var_psi[var_psi_df.psi < 0.1].index.tolist()
26 | print("各特征的psi值计算结果: \n", var_psi_df)
27 | print("设置psi阈值为0.1, 筛选得到%s个特征: \n" % len(selected_cols), selected_cols)
28 | 


--------------------------------------------------------------------------------
/chapter2/ch2_36_randomforest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from sklearn.ensemble import RandomForestClassifier
 8 | from utils import data_utils
 9 | from sklearn.metrics import roc_auc_score
10 | 
11 | # 导入数值型样例数据
12 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)
13 | clf = RandomForestClassifier(n_estimators=200,
14 |                              criterion='gini',
15 |                              max_depth=6,
16 |                              min_samples_leaf=15,
17 |                              bootstrap=True,
18 |                              oob_score=True,
19 |                              random_state=88)
20 | clf.fit(train_x, train_y)
21 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
22 | print("随机森林模型 AUC: ", auc_score)
23 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | graphviz==0.16
 2 | gensim==3.8.3
 3 | deepwalk==1.0.3
 4 | fasttext==0.9.2
 5 | xgboost==1.4.2
 6 | pyod==0.8.8
 7 | simplejson==3.16.0
 8 | scorecardpy==0.1.9.2
 9 | toad==0.1.0
10 | pytz==2021.1
11 | bayesian_optimization==1.2.0
12 | pydantic==1.8.2
13 | tensorflow==2.5.0
14 | pypmml==0.9.11
15 | jieba==0.42.1
16 | sklearn2pmml==0.71.1
17 | torch==1.7.1
18 | mlxtend==0.18.0
19 | category_encoders==2.2.2
20 | matplotlib==3.4.2
21 | scipy==1.6.3
22 | tsfresh==0.18.0
23 | pandas==1.2.4
24 | gevent==21.1.2
25 | requests==2.25.1
26 | shap==0.39.0
27 | lightgbm==3.2.1
28 | sklearn_pandas==2.2.0
29 | scikit-image==0.17.2
30 | statsmodels==0.12.2
31 | python_dateutil==2.8.1
32 | node2vec==0.4.3
33 | pyltp==0.2.1
34 | openpyxl==3.0.10
35 | networkx==2.8.2
36 | numpy==1.19.5
37 | scikit-learn==0.24.2
38 | python-dateutil==2.8.1
39 | nltk==3.7
40 | textrank4zh==0.3


--------------------------------------------------------------------------------
/chapter2/ch2_37_gbdt.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | from sklearn.ensemble import GradientBoostingClassifier
 8 | from utils import data_utils
 9 | from sklearn.metrics import roc_auc_score
10 | from sklearn.ensemble import GradientBoostingClassifier
11 | 
12 | # 导入数值型样例数据
13 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)
14 | clf = GradientBoostingClassifier(n_estimators=100,
15 |                                  learning_rate=0.1,
16 |                                  subsample=0.9,
17 |                                  max_depth=4,
18 |                                  min_samples_leaf=20,
19 |                                  random_state=88)
20 | clf.fit(train_x, train_y)
21 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
22 | print("GBDT模型 AUC: ", auc_score)
23 | 


--------------------------------------------------------------------------------
/chapter2/ch2_03_missrate_by_month.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | from utils import data_utils
 7 | 
 8 | def missrate_by_month(x_with_month, month_col, x_cols):
 9 |     """
10 |     按月统计缺失率
11 |     :param x_cols: x变量列名
12 |     :param month_col: 月份时间列名
13 |     :param x_with_month: 包含月份的数据
14 |     :return:
15 |     """
16 |     df = x_with_month.groupby(month_col)[x_cols].apply(lambda x: x.isna().sum() / len(x))
17 |     df = df.T
18 |     df['miss_rate_std'] = df.std(axis=1)
19 |     return df
20 | 
21 | def main():
22 |     """
23 |     主函数
24 |     """
25 |     # 导入添加month列的数据
26 |     model_data = data_utils.get_data()
27 |     miss_rate_by_month = missrate_by_month(model_data, month_col='month', x_cols=data_utils.numeric_cols)
28 |     print("按月统计缺失率结果: \n", miss_rate_by_month)
29 | 
30 | if __name__ == "__main__":
31 |     main()
32 | 
33 | 


--------------------------------------------------------------------------------
/chapter3/ch3_10_fasttext_vec.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | # 文本特征挖掘：fasttext
 8 | import pandas as pd
 9 | from utils.text_utils import sentences_prepare
10 | import fasttext
11 | 
12 | if __name__ == '__main__':
13 |     # 加载语料
14 |     sentences = sentences_prepare()
15 | 
16 |     # 预处理过后的文本写入文件unsupervised_train_data
17 |     with open('data/text_data/unsupervised_train_data.txt', 'w') as f:
18 |         for sentence in sentences:
19 |             f.write(sentence)
20 |             f.write('\n')
21 | 
22 |     # 获取fasttext词向量
23 |     model = fasttext.train_unsupervised('data/text_data/unsupervised_train_data.txt', model='skipgram', dim=8)
24 |     fea_vec = pd.DataFrame([model.get_sentence_vector(x).tolist() for x in sentences])
25 |     fea_vec.columns = ['fea_%s' % i for i in range(model.get_dimension())]
26 |     print('词向量维度：', fea_vec.shape)
27 | 


--------------------------------------------------------------------------------
/chapter4/ch4_03_rules_for_isolationforest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import numpy as np
 8 | from chapter4.ch4_01_rules_for_outliers import rule_discover
 9 | from pyod.models.iforest import IForest
10 | from utils import data_utils
11 | 
12 | # 加载数据
13 | german_credit_data = data_utils.get_data()
14 | 
15 | # 构造数据集
16 | X = german_credit_data[data_utils.numeric_cols]
17 | y = german_credit_data['creditability']
18 | 
19 | # 初始化模型
20 | clf = IForest(behaviour='new', bootstrap=False, contamination=0.1, max_features=1.0, max_samples='auto', n_estimators=500, random_state=20, verbose=0)
21 | 
22 | # 训练模型  
23 | clf.fit(X)
24 | 
25 | # 预测结果  
26 | german_credit_data['out_pred'] = clf.predict_proba(X)[:, 1]
27 | # 将预测概率大于0.7以上的设为异常值  
28 | german_credit_data['iforest_rule'] = np.where(german_credit_data['out_pred'] > 0.7, 1, 0)
29 | 
30 | # 效果评估  
31 | rule_iforest = rule_discover(data_df=german_credit_data, var='iforest_rule', target='creditability', rule_term='==1')
32 | print("孤立森林评估结果: \n", rule_iforest.T)
33 | 


--------------------------------------------------------------------------------
/chapter3/ch3_03_tsfresh_orders.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | # 时间序列特征挖掘
 8 | import pandas as pd
 9 | from tsfresh.feature_extraction import extract_features
10 | 
11 | if __name__ == '__main__':
12 |     # 读取数据
13 |     orders = pd.read_excel('data/order_data.xlsx')
14 |     orders_new = []
15 |     for i in range(len(orders)):
16 |         sub_data = pd.DataFrame.from_records(eval(orders['data'][i]))
17 |         sub_data['uid'] = orders['uid'][i]
18 |         orders_new.append(sub_data)
19 |     orders_new_df = pd.concat(orders_new)
20 |     # 数据格式
21 |     orders_new_df['application_amount'] = orders_new_df['application_amount'].astype(float)
22 |     orders_new_df['has_overdue'] = orders_new_df['has_overdue'].astype(float)
23 | 
24 |     # 调用extract_features生成时间序列特征:order_feas
25 |     order_feas = extract_features(orders_new_df[['uid', 'create_time', 'application_amount', 'has_overdue']], column_id="uid", column_sort="create_time")
26 |     print("时间序列挖掘特征数: \n", order_feas.shape[1])
27 |     print("时间序列特征挖掘结果: \n", order_feas.head())
28 | 


--------------------------------------------------------------------------------
/chapter2/ch2_09_one_hot_based_sklearn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import pandas as pd
 8 | from utils import data_utils
 9 | from sklearn.preprocessing import OneHotEncoder
10 | from sklearn.preprocessing import OrdinalEncoder
11 | 
12 | 
13 | def one_hot_encode(x):
14 |     """
15 |     将原始类别变量进行one-hot编码
16 |     :param str x: 需要编码的原始变量
17 |     :returns: x_oht one-hot编码后的变量
18 |     """
19 |     # 首先将类别值进行数值化
20 |     re = OrdinalEncoder()
21 |     x_encoded = re.fit_transform(x.astype(str))
22 |     x_encoded = pd.DataFrame(x_encoded).values
23 |     # 在对数值化后的类别变量进行one-hot编码
24 |     ohe = OneHotEncoder(handle_unknown='ignore')
25 |     x_oht = ohe.fit_transform(x_encoded).toarray()
26 |     return x_oht
27 | 
28 | def main():
29 |     """
30 |     主函数
31 |     """
32 |     # 加载数据
33 |     german_credit_data = data_utils.get_data()
34 |     # 以特征purpose为例，进行one-hot编码
35 |     label_encode_x = one_hot_encode(german_credit_data[['purpose']])
36 |     label_encode_df = pd.DataFrame(label_encode_x)
37 |     print("特征purpose的one-hot编码结果: \n", label_encode_df)
38 | 
39 | 
40 | if __name__ == "__main__":
41 |     main()


--------------------------------------------------------------------------------
/chapter3/ch3_09_word2vec.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | # 文本特征挖掘：word2vec
 8 | import numpy as np
 9 | import pandas as pd
10 | from utils.text_utils import sentences_prepare
11 | from gensim.models import word2vec
12 | 
13 | 
14 | def sent2vec(words, w2v_model):
15 |     """
16 |     转换成句向量
17 |     :param words: 词列表
18 |     :param w2v_model: word2vec模型
19 |     :return:
20 |     """
21 |     if words == '':
22 |         return np.array([0] * model.wv.vector_size)
23 | 
24 |     vector_list = []
25 |     for w in words:
26 |         try:
27 |             vector_list.append(w2v_model.wv[w])
28 |         except:
29 |             continue
30 |     vector_list = np.array(vector_list)
31 |     v = vector_list.sum(axis=0)
32 |     return v / np.sqrt((v ** 2).sum())
33 | 
34 | 
35 | if __name__ == '__main__':
36 |     # 加载语料
37 |     sentences = sentences_prepare()
38 | 
39 |     # 获取词向量
40 |     model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=2, workers=2)
41 |     fea_vec = pd.DataFrame([sent2vec(x, model).tolist() for x in sentences])
42 |     fea_vec.columns = ['fea_%s' % i for i in range(model.wv.vector_size)]
43 |     print('词向量维度：', fea_vec.shape)
44 | 


--------------------------------------------------------------------------------
/chapter3/ch3_14_node2vec.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | # 使用Node2Vec算法生成特征
 8 | import networkx as nx
 9 | import pandas as pd
10 | from node2vec import Node2Vec
11 | import matplotlib.pyplot as plt
12 | 
13 | 
14 | def adj_to_graph(adj_table):
15 |     # 根据邻接表生成图G
16 |     graph = nx.Graph()
17 |     # 添加边
18 |     for i in range(0, len(adj_table)):
19 |         node_edgs = adj_table[i]
20 |         for j in range(0, len(node_edgs)):
21 |             graph.add_edge(node_edgs[0], node_edgs[j])
22 |     return graph
23 | 
24 | 
25 | def gen_node2vec_fea(graph, dimensions=8):
26 |     # 生成随机游走序列
27 |     node2vec = Node2Vec(graph, dimensions=dimensions, walk_length=30, num_walks=100, workers=4)
28 |     # 向量化
29 |     model = node2vec.fit(window=10, min_count=1, batch_words=4)
30 |     return model.wv.vectors
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     # 数据读取
35 |     adj_tbl = []
36 |     with open('data/graph_data/graph_demo.adjlist') as f:
37 |         for line in f.readlines():
38 |             adj_tbl.append(line.replace('\n', '').split(' '))
39 |     G = adj_to_graph(adj_tbl)
40 |     # 使用networkx展示图结构
41 |     nx.draw(G, with_labels=True)
42 |     plt.show()
43 |     feas = gen_node2vec_fea(G, dimensions=8)
44 |     print(pd.DataFrame(feas))
45 | 


--------------------------------------------------------------------------------
/chapter2/ch2_40_DNN_credit_data.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | # https://keras.io
 8 | 
 9 | from utils import data_utils
10 | import tensorflow as tf
11 | from sklearn.metrics import roc_auc_score
12 | from tensorflow.keras import layers, models, callbacks
13 | 
14 | # 加载数据集
15 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(transform_method='standard')
16 | 
17 | # 设置随机数种子
18 | tf.random.set_seed(1)
19 | # 设置早停
20 | callback = callbacks.EarlyStopping(monitor='val_loss', patience=30, mode='min')
21 | # 构建DNN模型结构
22 | model = models.Sequential()
23 | model.add(layers.Flatten(input_shape=(train_x.shape[1], 1)))
24 | model.add(layers.Dense(32, activation=tf.nn.relu))
25 | model.add(layers.Dropout(0.3, seed=1))
26 | model.add(layers.Dense(16, activation=tf.nn.relu))
27 | model.add(layers.Dense(1, activation=tf.nn.sigmoid))
28 | # 显示模型的结构
29 | model.summary()
30 | # 设置模型训练参数
31 | model.compile(optimizer='SGD',
32 |               metrics=[tf.metrics.AUC()],
33 |               loss='binary_crossentropy')
34 | # 模型训练
35 | model.fit(train_x, train_y, validation_data=(test_x, test_y), batch_size=16, epochs=240, callbacks=[callback], verbose=2)
36 | 
37 | # 效果评估
38 | auc_score = roc_auc_score(train_y, model.predict(train_x))
39 | print("训练集AUC", auc_score)
40 | auc_score = roc_auc_score(test_y, model.predict(test_x))
41 | print("测试集AUC", auc_score)
42 | 


--------------------------------------------------------------------------------
/chapter3/ch3_11_text_classifier_bayes.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | # 文本分类算法：朴素贝叶斯
 8 | import pandas as pd
 9 | from utils.text_utils import sentences_prepare_x_y
10 | from sklearn.feature_extraction.text import TfidfVectorizer
11 | from sklearn.naive_bayes import GaussianNB
12 | from sklearn.metrics import roc_auc_score
13 | 
14 | 
15 | def get_model(x, y):
16 |     # 训练朴素贝叶斯分类器
17 |     clf = GaussianNB()
18 |     bayes_model = clf.fit(x, y)
19 |     return bayes_model
20 | 
21 | 
22 | def text_sample_split(texts, y, rate=0.75):
23 |     # 文本向量化
24 |     cv = TfidfVectorizer(binary=True)
25 |     sentence_vec = cv.fit_transform(texts)
26 | 
27 |     # 划分训练集和测试集
28 |     split_size = int(len(texts) * rate)
29 |     x_train = sentence_vec[:split_size].toarray()
30 |     y_train = y[:split_size]
31 |     x_test = sentence_vec[split_size:].toarray()
32 |     y_test = y[split_size:]
33 |     return x_train, y_train, x_test, y_test
34 | 
35 | 
36 | if __name__ == '__main__':
37 |     # 加载语料
38 |     sentences, target = sentences_prepare_x_y()
39 |     print("文本数目: %s" % len(sentences))
40 |     # 训练模型
41 |     x_train, y_train, x_test, y_test = text_sample_split(pd.Series(sentences), pd.Series(target))
42 |     model = get_model(x_train, y_train)
43 |     # 预测
44 |     y_pred = model.predict_proba(x_test)[:, 1]
45 |     auc_score = roc_auc_score(y_test, y_pred)
46 |     print("AUC结果: ", auc_score)
47 | 


--------------------------------------------------------------------------------
/chapter2/ch2_34_svm.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import numpy as np
 5 | import pandas as pd
 6 | sys.path.append("./")
 7 | sys.path.append("../")
 8 | 
 9 | from sklearn.svm import LinearSVC
10 | from sklearn.pipeline import make_pipeline
11 | from sklearn.preprocessing import StandardScaler
12 | from sklearn.svm import SVC
13 | from utils import data_utils
14 | from sklearn.metrics import roc_auc_score
15 | from sklearn.metrics import accuracy_score
16 | from category_encoders.woe import WOEEncoder
17 | 
18 | # 导入数值型样例数据
19 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)
20 | 
21 | # woe特征处理
22 | encoder = WOEEncoder(cols=train_x.columns)
23 | train_x = encoder.fit_transform(train_x, train_y)
24 | test_x = encoder.transform(test_x)
25 | 
26 | # 线性SVM, Linear Support Vector Classification
27 | line_svm = LinearSVC(penalty='l2',
28 |                      loss='hinge',
29 |                      C=0.2,
30 |                      tol=0.001)
31 | clf = make_pipeline(StandardScaler(), line_svm)
32 | clf.fit(train_x, train_y)
33 | acc_score = accuracy_score(test_y, clf.predict(test_x))
34 | print("线性SVM模型 ACC: ", acc_score)
35 | 
36 | 
37 | # 支持核函数的SVM, C-Support Vector Classification
38 | svm = SVC(C=0.2,
39 |           kernel='rbf',
40 |           tol=0.001,
41 |           probability=True)
42 | clf = make_pipeline(StandardScaler(), svm)
43 | clf.fit(train_x, train_y)
44 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
45 | print("支持核函数SVM模型 AUC: ", auc_score)
46 | 


--------------------------------------------------------------------------------
/chapter2/ch2_07_ordinal_encoder_based_sklearn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import pandas as pd
 8 | from utils import data_utils
 9 | from sklearn.preprocessing import LabelEncoder
10 | from sklearn.preprocessing import OrdinalEncoder
11 | 
12 | def label_encode(x):
13 |     """
14 |     将原始分类变量用数字编码
15 |     :param str x: 需要编码的原始变量
16 |     :returns: x_encoded 数字编码后的变量
17 |     """
18 |     le = LabelEncoder()
19 |     x_encoded = le.fit_transform(x.astype(str))
20 |     class_ = le.classes_
21 |     return class_, pd.DataFrame(x_encoded, columns=x.columns)
22 | 
23 | def ordinal_encode(x):
24 |     """
25 |     将原始分类变量用数字编码
26 |     :param str x: 需要编码的原始变量，shape为[m,n]
27 |     :returns: x_encoded 数字编码后的变量
28 |     """
29 |     enc = OrdinalEncoder()
30 |     x_encoded = enc.fit_transform(x.astype(str))
31 |     return pd.DataFrame(x_encoded).values
32 | 
33 | 
34 | def main():
35 |     """
36 |     主函数
37 |     """
38 |     # 加载数据
39 |     german_credit_data = data_utils.get_data()
40 |     # 以特征purpose为例，进行类别编码
41 |     class_, label_encode_x = label_encode(german_credit_data[['purpose']])
42 |     print("特征'purpose'的类别编码结果: \n", label_encode_x)
43 |     print("特征'purpose'编码顺序为: \n", class_)
44 |     # 以特征purpose、credit.history为例，进行类别编码
45 |     ordinal_encode_x = ordinal_encode(german_credit_data[['purpose', 'credit.history']])
46 |     print("特征'purpose'和'credit.history'的类别编码结果: \n", ordinal_encode_x)
47 | 
48 | 
49 | if __name__ == "__main__":
50 |     main()
51 | 
52 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 《智能风控实践指南：从模型、特征到决策》
 2 | 今年6月，我和团队一起创作的书籍《智能风控实践指南：从模型、特征到决策》终于出版，完成自己的一个心愿。过去10年在金融科技、数据挖掘领域的工作中，我逐步积累起一套智能风控方法体系，尝试将碎片化的模型、特征、策略知识点，融入到这套完整的框架中。希望通过本书将自己和团队在智能风控方面的思考、探索和实践进行分享，与同行的朋友或者将要入行的朋友进行交流探讨。
 3 | 
 4 | 书籍的购买链接也放这里提供给有缘的朋友： 
 5 | 
 6 | * 京东官方店购买地址：[店铺地址](https://item.jd.com/13197919.html)  
 7 | * 当当官方店购买地址：[店铺地址](http://product.dangdang.com/29418079.html)  
 8 | * 京东临时活动地址【5折】：[促销活动](https://item.m.jd.com/product/13221799.html?_fd=jdm&PTAG=17053.1.1&utm_source=weixin&utm_medium=weixin&utm_campaign=t_1000072672_17053_001)  
 9 | * 淘宝临时活动地址【5折】：[促销活动](https://detail.tmall.com/item.htm?spm=a230r.1.14.37.7aa37dcaaXk7Ow&id=676830218907&ns=1&abbucket=13)  
10 | 
11 | ## 配套代码说明
12 | * 请建立Python3环境运行本书代码
13 | * chapter2到4为对应章节的代码
14 | * utils为公共基础代码
15 | * data为数据文件
16 | * requirements.txt为本代码所依赖的包说明
17 | 
18 | ## 书籍主要章节
19 | 本书整体贯穿了智能风控模型、特征和策略以及智能风控管理，读者可以按顺序阅读或者根据自身知识背景有选择地阅读相应章节。
20 | 以下是《智能风控实践指南：从模型、特征到决策》主要章节：
21 | 
22 | * 第1章介绍了智能风控技术的发展历史和智能风控的相关概念和应用；
23 | * 第2章介绍了搭建智能风控模型的方法、智能算法、模型优化、模型体系等，并融合模型开发实践经验；
24 | * 第3章介绍了搭建特征画像的方法、智能算法、特征挖掘、特征画像体系等，并融合特征挖掘实践经验；
25 | * 第4章介绍了搭建智能风控策略的方法、智能算法、策略体系、策略监控等，并融合策略实践经验；
26 | * 第5章介绍了智能决策与人的经验结合，剖析智能风控中的局限以及如何发挥人的价值；
27 | * 第6章介绍了智能风控相关的管理经验，解读智能风控中的一系列管理原则。
28 | 
29 | ## 作者简介
30 | **蒋宏**  
31 | 资深风控算法专家、数据科学家，长期从事风控模型算法和应用方面工作，带领模型团队建立智能风控体系，在风控模型、智能算法、数据挖掘、科学决策方向有深入研究和实践。拥有德勤咨询、百融云创等知名企业工作经验。上海交通大学学士、清华大学MBA。  
32 | **王欢**  
33 | 高级数据算法工程师，中国科学院软件研究所计算机硕士，参与国内及海外多个业务线的风控搭建、建模及特征工作，在风控模型和特征挖掘方面有丰富的实践经验。  
34 | **王超**  
35 | 高级风控算法工程师，历任多家知名金融科技公司算法工程师、建模咨询师，擅长风控模型、风控策略等智能风控研究方向，致力于应用深度学习等前沿技术推动智能风控的发展。  
36 | **马海彪**  
37 | 风控算法专家，北京航空航天大学硕士，擅长机器学习模型、数据挖掘、风控模型开发等，对风控业务有丰富的实践经验和深刻的理解。  
38 | 


--------------------------------------------------------------------------------
/chapter2/ch2_30_validation_curve.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | # 绘制验证曲线
 3 | 
 4 | import sys
 5 | import numpy as np
 6 | import pandas as pd
 7 | sys.path.append("./")
 8 | sys.path.append("../")
 9 | 
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 | from sklearn.svm import SVC
13 | from sklearn.datasets import load_digits
14 | from sklearn.model_selection import validation_curve
15 | 
16 | X, y = load_digits(return_X_y=True)
17 | 
18 | param_range = np.logspace(-6, -1, 5)
19 | train_scores, test_scores = validation_curve(
20 |     SVC(), X, y, param_name="gamma", param_range=param_range,
21 |     scoring="accuracy", n_jobs=1)
22 | train_scores_mean = np.mean(train_scores, axis=1)
23 | train_scores_std = np.std(train_scores, axis=1)
24 | test_scores_mean = np.mean(test_scores, axis=1)
25 | test_scores_std = np.std(test_scores, axis=1)
26 | 
27 | plt.title("Validation Curve with SVM")
28 | plt.xlabel(r"$\gamma$")
29 | plt.ylabel("Score")
30 | plt.ylim(0.0, 1.1)
31 | lw = 2
32 | plt.semilogx(param_range, train_scores_mean, label="Training score",
33 |              color="darkorange", lw=lw)
34 | plt.fill_between(param_range, train_scores_mean - train_scores_std,
35 |                  train_scores_mean + train_scores_std, alpha=0.2,
36 |                  color="darkorange", lw=lw)
37 | plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
38 |              color="navy", lw=lw)
39 | plt.fill_between(param_range, test_scores_mean - test_scores_std,
40 |                  test_scores_mean + test_scores_std, alpha=0.2,
41 |                  color="navy", lw=lw)
42 | plt.legend(loc="best")
43 | plt.show()
44 | 


--------------------------------------------------------------------------------
/chapter2/ch2_33_lr.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | import numpy as np
 5 | import pandas as pd
 6 | sys.path.append("./")
 7 | sys.path.append("../")
 8 | 
 9 | from utils import data_utils
10 | from sklearn.linear_model import SGDClassifier
11 | from sklearn.linear_model import LogisticRegression
12 | from sklearn.metrics import roc_auc_score
13 | from sklearn.pipeline import make_pipeline
14 | from sklearn.preprocessing import StandardScaler
15 | from category_encoders.woe import WOEEncoder
16 | 
17 | # 导入数值型样例数据
18 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)
19 | 
20 | # woe特征处理
21 | encoder = WOEEncoder(cols=train_x.columns)
22 | train_x = encoder.fit_transform(train_x, train_y)
23 | test_x = encoder.transform(test_x)
24 | 
25 | # 利用梯度下降法训练逻辑回归模型
26 | lr = SGDClassifier(loss="log",
27 |                    penalty="l2",
28 |                    learning_rate='optimal',
29 |                    max_iter=100,
30 |                    tol=0.001,
31 |                    epsilon=0.1,
32 |                    random_state=1)
33 | clf = make_pipeline(StandardScaler(), lr)
34 | clf.fit(train_x, train_y)
35 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
36 | print("梯度下降法训练逻辑回归模型 AUC: ", auc_score)
37 | 
38 | # 利用牛顿法训练逻辑回归模型
39 | lr = LogisticRegression(penalty="l2",
40 |                         solver='lbfgs',
41 |                         max_iter=100,
42 |                         tol=0.001,
43 |                         random_state=1)
44 | clf = make_pipeline(StandardScaler(), lr)
45 | clf.fit(train_x, train_y)
46 | auc_score = roc_auc_score(test_y, clf.predict_proba(test_x)[:, 1])
47 | print("牛顿法训练逻辑回归模型 AUC: ", auc_score)
48 | 


--------------------------------------------------------------------------------
/chapter3/ch3_05_gbdt_construct_feature.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | # 使用GBDT算法做特征衍生
 8 | import pandas as pd
 9 | from sklearn.preprocessing import OneHotEncoder
10 | from sklearn.ensemble import GradientBoostingClassifier
11 | 
12 | 
13 | def gbdt_fea_gen(train_data, label, n_estimators=100):
14 |     # 训练GBDT模型
15 |     gbc_model = GradientBoostingClassifier(n_estimators=n_estimators, random_state=1)
16 |     gbc_model.fit(train_data, label)
17 | 
18 |     # 得到样本元素落在叶节点中的位置
19 |     train_leaf_fea = gbc_model.apply(train_data).reshape(-1, n_estimators)
20 | 
21 |     # 借用编码将位置信息转化为0，1
22 |     one_hot_encoder = OneHotEncoder()
23 |     one_hot_encoder.fit(train_leaf_fea)
24 |     return gbc_model, one_hot_encoder
25 | 
26 | 
27 | def gbdt_fea_appy(data, model, encoder):
28 |     # 获得GBDT特征
29 |     new_feature_train = encoder.transform(model.apply(data).reshape(-1, model.n_estimators)).toarray()
30 |     # new_feas为生成的新特征
31 |     new_fea = pd.DataFrame(new_feature_train)
32 |     new_fea.index = data.index
33 |     new_fea.columns = ['fea_%s' % i for i in range(1, new_fea.shape[1] + 1)]
34 |     return new_fea
35 | 
36 | 
37 | if __name__ == '__main__':
38 |     # 读取原始特征数据
39 |     all_x_y = pd.read_excel('data/order_feas.xlsx')
40 |     all_x_y.set_index('order_no', inplace=True)
41 |     # 生成训练数据
42 |     x_train = all_x_y.drop(columns='label')
43 |     x_train.fillna(0, inplace=True)
44 |     y = all_x_y['label']
45 |     # 获取特征
46 |     gbr, encode = gbdt_fea_gen(x_train, y, n_estimators=100)
47 |     new_features = gbdt_fea_appy(x_train, gbr, encode)
48 |     print("使用GBDT算法衍生特征结果: \n", new_features.head())
49 | 


--------------------------------------------------------------------------------
/chapter2/ch2_24_fs_badrate_by_month.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import toad
 8 | import pandas as pd
 9 | from utils import data_utils
10 | 
11 | 
12 | # 导入添加month列的数据
13 | model_data = data_utils.get_data()
14 | 
15 | x = model_data[data_utils.x_cols]
16 | y = model_data[data_utils.label]
17 | 
18 | # 分箱
19 | Combiner = toad.transform.Combiner()
20 | x_cat = Combiner.fit_transform(x, y, n_bins=6, method='quantile', empty_separate=True)
21 | 
22 | # 合并标签和month
23 | x_cat_with_month = x_cat.merge(model_data[['month', 'creditability']], left_index=True, right_index=True)
24 | 
25 | # 单个特征对比逾期率
26 | feature_col = 'age.in.years'
27 | x_cat_one = x_cat_with_month[[feature_col, 'month', 'creditability']]
28 | feature_var = x_cat_one.pivot_table(index=feature_col,
29 |                                 columns='month',
30 |                                 values='creditability',
31 |                                 aggfunc=['mean'])
32 | print("特征'age.in.years'的按月分箱逾期率统计结果: \n", feature_var)
33 | 
34 | 
35 | # 计算特征按月逾期率波动值
36 | def variation_by_month(df, time_col, columns, label_col):
37 |     variation_dict = {}
38 |     for col in columns:
39 |         feature_v = df.pivot_table(
40 |             index=col, columns=time_col, values=label_col, aggfunc=['mean'])
41 |         variation_dict[col] = feature_v.rank().std(axis=1).mean()
42 | 
43 |     return pd.DataFrame([variation_dict], index=['variation']).T
44 | 
45 | 
46 | var_badrate = variation_by_month(x_cat_with_month, 'month', data_utils.x_cols, 'creditability')
47 | print("各特征按月逾期率的标准差: \n", var_badrate)
48 | 
49 | selected_cols = var_badrate[var_badrate['variation'] < 0.8].index.tolist()
50 | print("设置标准差阈值为0.8, 筛选得到%s个特征: \n" % len(selected_cols), selected_cols)
51 | 


--------------------------------------------------------------------------------
/chapter2/ch2_41_CNN_credit_data.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import tensorflow as tf
 8 | from tensorflow.keras import layers, models, callbacks
 9 | from utils import data_utils
10 | from sklearn.metrics import roc_auc_score
11 | from tensorflow.keras import layers, models, callbacks
12 | 
13 | # 加载数据集
14 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(transform_method='standard')
15 | 
16 | # 数据预处理
17 | train_x = train_x.to_numpy().reshape((train_x.shape[0], train_x.shape[1], 1))
18 | test_x = test_x.to_numpy().reshape((test_x.shape[0], test_x.shape[1], 1))
19 | train_y = train_y.values.reshape((train_y.shape[0], 1))
20 | test_y = test_y.values.reshape((test_y.shape[0], 1))
21 | 
22 | # 设置随机数种子，保证每次运行结果一致
23 | tf.random.set_seed(1)
24 | callback = callbacks.EarlyStopping(monitor='val_loss', patience=30, mode='min')
25 | 
26 | # 构建CNN模型结构
27 | model = models.Sequential()
28 | model.add(layers.Conv1D(filters=16, kernel_size=4, activation='relu', input_shape=(train_x.shape[1], 1)))
29 | model.add(layers.Conv1D(filters=8, kernel_size=1, activation='relu'))
30 | model.add(layers.Flatten())
31 | model.add(layers.Dropout(0.3, seed=1))
32 | model.add(layers.Dense(16, activation='relu'))
33 | model.add(layers.Dense(1, activation='sigmoid'))
34 | # 显示模型的结构
35 | model.summary()
36 | # 设置模型训练参数
37 | model.compile(optimizer='SGD',
38 |               metrics=[tf.metrics.AUC()],
39 |               loss='binary_crossentropy')
40 | # 模型训练
41 | model.fit(train_x, train_y, validation_data=(test_x, test_y), batch_size=16, epochs=240, callbacks=[callback], verbose=2)
42 | 
43 | # 测试集效果评估
44 | auc_score = roc_auc_score(train_y, model.predict(train_x))
45 | print("训练集AUC", auc_score)
46 | auc_score = roc_auc_score(test_y, model.predict(test_x))
47 | print("测试集AUC", auc_score)
48 | 


--------------------------------------------------------------------------------
/chapter3/ch3_04_feature_evaluation.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | import time
 5 | import numpy as np
 6 | import pandas as pd
 7 | from scipy.stats import variation
 8 | sys.path.append("./")
 9 | sys.path.append("../")
10 | 
11 | def cover_ratio(x):
12 |     """
13 |     计算特征覆盖度
14 |     :param x: 特征向量
15 |     :return: cover_ratio, 特征覆盖度
16 |     """
17 |     len_x = len(x)
18 |     len_nan = sum(pd.isnull(x))
19 |     ratio = 1 - len_nan / float(len_x)
20 |     return ratio
21 | 
22 | 
23 | def get_datestamps(begin_date, end_date):
24 |     """
25 |     返回[begin_date,end_date]之间日期的时间戳
26 |     :param begin_date: 开始时间
27 |     :param end_date: 结束时间
28 |     :return: [begin_date,end_date]日期的时间戳
29 |     """
30 |     date_arr = [int(time.mktime(x.timetuple())) for x in list(pd.date_range(start=begin_date, end=end_date))]
31 |     return date_arr
32 | 
33 | 
34 | if __name__ == '__main__':
35 |     # 模拟生成几个特征
36 |     fea_1 = [-1, -1, -1, 0, 1, 1, 1]  # 特征均值为0
37 |     fea_2 = [1, 1, 1, 1, 1, 1, 1]  # 所有特征均为唯一指
38 |     fea_3 = [1, 2, 3, 4, 5, 6, 7]  # 与时间正相关
39 |     fea_4 = [7, 6, 5, 4, 3, 2, 1]  # 与时间负相关
40 |     fea_5 = [1, 2, 1, 2, np.nan, 2, np.nan]  # 与时间无线性关系
41 | 
42 |     x_all = pd.DataFrame([fea_1, fea_2, fea_3, fea_4, fea_5]).T
43 |     x_all.columns = ['fea_1', 'fea_2', 'fea_3', 'fea_4', 'fea_5']
44 | 
45 |     # 特征覆盖度
46 |     fea_cover = x_all.apply(cover_ratio).to_frame('cover_ratio')
47 |     print("特征覆盖度: ", fea_cover)
48 | 
49 |     # 特征离散度
50 |     fea_variation = variation(fea_2)
51 |     print("特征离散度: ", fea_variation)
52 | 
53 |     # 计算时间相关性
54 |     x_all['tm_col'] = get_datestamps('2020-10-01', '2020-10-07')
55 | 
56 |     # 计算三个特征与时间的Peason系数
57 |     fea_time_corr = x_all.loc[:, ['fea_3', 'fea_4', 'fea_5', 'tm_col']].corr().loc[:, ['tm_col']]
58 | 
59 |     print("构造的特征为: \n", x_all)
60 |     print("特征与时间的Peason系数计算结果: \n", fea_time_corr)
61 | 


--------------------------------------------------------------------------------
/utils/time_utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import time
 4 | import pytz
 5 | import numpy as np
 6 | import datetime as dt
 7 | from dateutil.parser import parse
 8 | 
 9 | 
10 | def stamp_to_date(time_stamp, timezone=None):
11 |     """
12 |     时间戳转日期函数
13 |     :param time_stamp:int，时间戳
14 |     :param timezone:string，时区
15 |     :return: datetime
16 |     """
17 |     try:
18 |         if timezone is None:
19 |             stamp_str = str(time_stamp)
20 |             if len(stamp_str) >= 10:
21 |                 stamp_str = stamp_str[:10]
22 |             else:
23 |                 stamp_str = stamp_str
24 |             time_stamp = int(stamp_str)
25 |             date = dt.datetime.fromtimestamp(time_stamp)
26 |             return date
27 |         else:
28 |             stamp_str = str(time_stamp)
29 |             if len(stamp_str) >= 10:
30 |                 stamp_str = stamp_str[:10]
31 |             else:
32 |                 stamp_str = stamp_str
33 |             time_stamp = int(stamp_str)
34 |             tz = pytz.timezone(timezone)
35 |             date = dt.datetime.fromtimestamp(time_stamp, tz).strftime('%Y-%m-%d %H:%M:%S')
36 |             date = parse(date)
37 |             return date
38 |     except:
39 |         return parse('2100-01-01')
40 | 
41 | 
42 | def date_to_stamp(date_time):
43 |     """
44 |     将日期转换为时间戳
45 |     :param date_time: string，datetime
46 |     :return: int
47 |     """
48 |     try:
49 |         if isinstance(date_time, str):
50 |             date_time = parse(date_time)
51 |         return int(time.mktime(date_time.timetuple()))
52 |     except:
53 |         return int(631123200)
54 | 
55 | 
56 | def date_to_week(date):
57 |     '''
58 |     日期转换为星期
59 |     :param date:datetime，string
60 |     :return: int
61 |     '''
62 |     try:
63 |         if isinstance(date, str):
64 |             date = parse(date)
65 |         if_weekend = date.weekday()
66 |         return if_weekend
67 |     except:
68 |         return np.nan
69 | 


--------------------------------------------------------------------------------
/chapter4/ch4_00_rules_for_iv.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import toad
 8 | import numpy as np
 9 | import pandas as pd
10 | from utils import data_utils
11 | from toad.plot import bin_plot
12 | from matplotlib import pyplot as plt
13 | 
14 | 
15 | def cal_iv(x, y):
16 |     """ 
17 |     IV计算函数  
18 |     :param x: feature 
19 |     :param y: label 
20 |     :return: 
21 |     """
22 |     crtab = pd.crosstab(x, y, margins=True)
23 |     crtab.columns = ['good', 'bad', 'total']
24 |     crtab['factor_per'] = crtab['total'] / len(y)
25 |     crtab['bad_per'] = crtab['bad'] / crtab['total']
26 |     crtab['p'] = crtab['bad'] / crtab.loc['All', 'bad']
27 |     crtab['q'] = crtab['good'] / crtab.loc['All', 'good']
28 |     crtab['woe'] = np.log(crtab['p'] / crtab['q'])
29 |     crtab2 = crtab[abs(crtab.woe) != np.inf]
30 | 
31 |     crtab['IV'] = sum(
32 |         (crtab2['p'] - crtab2['q']) * np.log(crtab2['p'] / crtab2['q']))
33 |     crtab.reset_index(inplace=True)
34 |     crtab['varname'] = crtab.columns[0]
35 |     crtab.rename(columns={crtab.columns[0]: 'var_level'}, inplace=True)
36 |     crtab.var_level = crtab.var_level.apply(str)
37 |     return crtab
38 | 
39 | 
40 | german_credit_data = data_utils.get_data()
41 | 
42 | # 生成分箱初始化对象  
43 | bin_transformer = toad.transform.Combiner()
44 | 
45 | # 采用等距分箱训练  
46 | bin_transformer.fit(german_credit_data,
47 |                     y='creditability',
48 |                     n_bins=6,
49 |                     method='step',
50 |                     empty_separate=True)
51 | 
52 | # 分箱数据  
53 | trans_data = bin_transformer.transform(german_credit_data, labels=True)
54 | 
55 | # 查看Credit amount分箱结果  
56 | bin_plot(trans_data, x='credit.amount', target='creditability')
57 | plt.show()
58 | 
59 | # 查看Credit amount分箱数据  
60 | cal_iv(trans_data['credit.amount'], trans_data['creditability'])
61 | 
62 | # 构建单规则
63 | german_credit_data['credit.amount.rule'] = np.where(german_credit_data['credit.amount'] > 12366.0, 1, 0)
64 | 


--------------------------------------------------------------------------------
/chapter3/ch3_00_order_data_preprocess.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import numpy as np
 8 | import pandas as pd
 9 | from utils.data_utils import stamp_to_date
10 | from utils.data_utils import date_to_week
11 | 
12 | 
13 | def data_preprocess(data, time_col, back_time, dtypes_dict):
14 |     """
15 |     数据预处理函数
16 |     :param data: 待处理的数据
17 |     :param time_col: 回溯依据的时间列名称
18 |     :param back_time: 特征计算时间，datetime.datetime时间格式
19 |     :param dtypes_dict: 指定列字段类型的字典，如{'col1':int}
20 |     :return: 清洗完成的数据
21 |     """
22 |     # 删除time_col为空的行
23 |     data = data[~data[time_col].isin(['nan', np.nan, 'NAN', 'null', 'NULL', 'Null'])]
24 |     # 将时间列的时间戳转为日期格式
25 |     data[time_col] = data[time_col].apply(stamp_to_date)
26 |     # 过滤订单创建时间在back_time之后的数据，避免特征穿越
27 |     data = data[data[time_col] <= back_time]
28 |     # 删除整条缺失的数据
29 |     data.dropna(how='all', inplace=True)
30 |     # 空字符串替换为np.nan
31 |     data.replace('', np.nan, inplace=True)
32 |     # 单个字段缺失填充为0
33 |     data.fillna(0, inplace=True)
34 |     # 去重
35 |     data.drop_duplicates(keep='first', inplace=True)
36 |     # 字段格式转换
37 |     data = data.astype(dtypes_dict)
38 |     # 补充字段
39 |     data['create_time_week'] = data[time_col].apply(date_to_week)
40 |     data['is_weekend'] = data['create_time_week'].apply(lambda x: 1 if x > 5 else 0)
41 | 
42 |     return data
43 | 
44 | 
45 | if __name__ == '__main__':
46 |     # 原始数据读入
47 |     orders = pd.read_excel('data/order_data.xlsx')
48 |     # 取一个用户的历史订单数据
49 |     raw_data = pd.DataFrame(eval(orders['data'][1]))
50 |     # 数据预处理
51 |     data_processed = data_preprocess(raw_data, time_col='create_time',
52 |                                      back_time='2020-12-14',
53 |                                      dtypes_dict={'has_overdue': int,
54 |                                                   'application_term': float,
55 |                                                   'application_amount': float})
56 |     print(data_processed.shape)
57 | 


--------------------------------------------------------------------------------
/chapter3/ch3_06_cluster_alg.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | # 使用聚类算法衍生特征
 8 | import pandas as pd
 9 | from sklearn.cluster import KMeans
10 | 
11 | 
12 | def cluster_fea_gen(data, selected_cols, n_clusters):
13 |     """
14 |     使用聚类算法生成特征
15 |     :param data: 用作输入的x,y
16 |     :param selected_cols: 选取用来做聚类的特征列
17 |     :param n_clusters: 聚类类别数
18 |     :return: 聚类算法生成的特征
19 |     """
20 |     x_cluster_feas = data.loc[:, selected_cols]
21 |     # 拟合聚类模型
22 |     clf = KMeans(n_clusters=n_clusters, random_state=1)
23 |     clf.fit(x_cluster_feas)
24 |     return clf
25 | 
26 | 
27 | def cluster_fea_apply(data, selected_cols, clf):
28 |     """
29 |     使用聚类算法生成特征
30 |     :param data: 用作输入的x,y
31 |     :param selected_cols: 选取用来做聚类的特征列
32 |     :param clf: 聚类模型
33 |     :return: 聚类算法生成的特征
34 |     """
35 |     # 对原数据表进行类别标记
36 |     data['group'] = clf.predict(data[selected_cols])
37 | 
38 |     # 距质心距离特征的计算
39 |     centers_df = pd.DataFrame(clf.cluster_centers_)
40 |     centers_df.columns = [x + '_center' for x in selected_cols]
41 | 
42 |     for item in selected_cols:
43 |         data[item + '_center'] = data['group'].apply(
44 |             lambda x: centers_df.iloc[x, :][item + '_center'])
45 |         data[item + '_distance'] = data[item] - data[item + '_center']
46 | 
47 |     fea_cols = ['group']
48 |     fea_cols.extend([x + '_distance' for x in selected_cols])
49 | 
50 |     return data.loc[:, fea_cols]
51 | 
52 | 
53 | if __name__ == '__main__':
54 |     # 数据读取
55 |     all_x_y = pd.read_excel('data/order_feas.xlsx')
56 |     all_x_y.set_index('order_no', inplace=True)
57 |     # 取以下几个特征做聚类
58 |     chose_cols = ['orderv1_age', 'orderv1_90_workday_application_amount_mean', 'orderv1_history_order_num',
59 |                   'orderv1_max_overdue_days']
60 |     all_x_y.fillna(0, inplace=True)
61 | 
62 |     # 生成聚类特征
63 |     model = cluster_fea_gen(all_x_y, chose_cols, n_clusters=5)
64 |     fea_cluster = cluster_fea_apply(all_x_y, chose_cols, model)
65 |     print("使用聚类算法衍生特征数: \n", fea_cluster.shape[1])
66 |     print("使用聚类算法衍生特征结果: \n", fea_cluster.head())
67 | 


--------------------------------------------------------------------------------
/chapter3/ch3_12_text_classifier_fasttext.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | # 文本分类算法：fasttext
 8 | import fasttext
 9 | import pandas as pd
10 | from utils.text_utils import sentences_prepare_with_y
11 | from sklearn.metrics import roc_auc_score
12 | 
13 | 
14 | def process_sentences(train_path, test_path, rate=0.8):
15 |     sentences = sentences_prepare_with_y()
16 |     # 预处理之后的数据写入文件train_data.txt
17 |     num = int(len(sentences) * rate)
18 |     train_out = open(train_path, 'w')
19 |     test_out = open(test_path, 'w')
20 |     for sentence in sentences[:num]:
21 |         train_out.write(sentence)
22 |         train_out.write("\n")
23 |     for sentence in sentences[num:]:
24 |         test_out.write(sentence)
25 |         test_out.write("\n")
26 |     print("预处理之后的数据已写入文件train_data.txt, test_data.txt")
27 |     print("train文本数目: %s, test文本数目: %s" % (num, len(sentences) - num))
28 | 
29 | 
30 | if __name__ == '__main__':
31 |     # 处理文本数据
32 |     process_sentences(train_path='data/train_data.txt', test_path='data/test_data.txt', rate=0.8)
33 | 
34 |     # 训练、保存模型
35 |     classifier = fasttext.train_supervised('data/train_data.txt', label='__label__', wordNgrams=3, loss='softmax')
36 |     classifier.save_model('data/fasttext_demo.model')
37 | 
38 |     # 加载模型
39 |     classifier = fasttext.load_model('data/fasttext_demo.model')
40 |     texts = "系列 票房 不差 口碑 生化危机 资深 玩家 张艳 告诉 玩家 很难 承认 一系列 电影 " \
41 |             "电影 原著 面目全非 女主角 爱丽丝 游戏 角色 电影 渐渐 脱离 游戏 打着 游戏 名号 发展 票房 " \
42 |             "号召力 观众 影响力 电影 系列 具备 剧情 世界观 游戏 生硬 强加 角色 背景 "
43 |     print("当前文本所属类别: ", classifier.predict(texts))
44 | 
45 |     # 测试集
46 |     test_data = pd.read_csv('data/test_data.txt', header=None)
47 |     texts_new = test_data[1].tolist()
48 |     y_true = [1 if x.strip() == '__label__sports' else 0 for x in test_data[0].tolist()]
49 | 
50 |     # 预测效果评估
51 |     result = classifier.predict(texts_new)
52 |     y_pre = []
53 |     for i in range(len(result[0])):
54 |         if result[0][i][0] == '__label__sports':
55 |             y_pre.append(result[1][i][0])
56 |         else:
57 |             y_pre.append(1 - result[1][i][0])
58 |     auc_score = roc_auc_score(y_true, y_pre)
59 |     print("测试集AUC为: ", auc_score)
60 | 


--------------------------------------------------------------------------------
/chapter3/ch3_08_bag_of_words.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | # 文本特征挖掘：词袋模型示例
 8 | import pandas as pd
 9 | from utils.text_utils import sentences_prepare
10 | from sklearn.feature_extraction.text import CountVectorizer
11 | from sklearn.feature_extraction.text import HashingVectorizer
12 | from sklearn.feature_extraction.text import TfidfVectorizer
13 | 
14 | 
15 | def gen_count_doc_vec(text):
16 |     """
17 |     基于词频统计生成文本的向量表示
18 |     :param text: 输入文本
19 |     :return: 生成的文本向量表示
20 |     """
21 |     cv = CountVectorizer(binary=True)
22 |     document_vec = cv.fit_transform(text)
23 |     return pd.DataFrame(document_vec.toarray())
24 | 
25 | 
26 | def gen_tfidf_doc_vec(text):
27 |     """
28 |     基于TfidfVectorizer生成文本向量表示
29 |     :param text: 输入文本
30 |     :return: 生成的文本向量表示
31 |     """
32 |     cv = TfidfVectorizer()
33 |     document_vec = cv.fit_transform(text)
34 |     return pd.DataFrame(document_vec.toarray())
35 | 
36 | 
37 | def gen_hash_doc_vec(text, n_features=8):
38 |     """
39 |     基于HashingVectorizer生成文本向量表示
40 |     :param text: 输入文本
41 |     :param n_features: 指定输出特征的维数
42 |     :return: 生成的文本向量表示
43 |     """
44 |     cv = HashingVectorizer(n_features=n_features)
45 |     document_vec = cv.fit_transform(text)
46 |     return pd.DataFrame(document_vec.toarray())
47 | 
48 | 
49 | def gen_ngram_doc_vec(text):
50 |     ngram_cv = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",
51 |                                token_pattern=r'\b\w+\b', min_df=1)
52 |     document_vec = ngram_cv.fit_transform(text)
53 |     return pd.DataFrame(document_vec.toarray())
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     sentences = sentences_prepare()
58 |     # 词袋模型应用示例
59 |     # 取前三条文本用于展示
60 |     texts = sentences[0:5]
61 |     fea_vec_count = gen_count_doc_vec(texts)
62 |     print("CountVectorizer词向量:")
63 |     print(fea_vec_count)
64 | 
65 |     fea_vec_tfidf = gen_tfidf_doc_vec(texts)
66 |     print("TfidfVectorizer词向量:")
67 |     print(fea_vec_tfidf)
68 | 
69 |     fea_vec_hash = gen_hash_doc_vec(texts, n_features=8)
70 |     print("HashingVectorizer词向量:")
71 |     print(fea_vec_hash)
72 | 
73 |     fea_vec_ngram = gen_ngram_doc_vec(texts)
74 |     print("CountVectorizer词向量(ngram):")
75 |     print(fea_vec_ngram)
76 | 


--------------------------------------------------------------------------------
/chapter4/ch4_02_rules_for_decisiontree.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import sklearn.tree as st
 8 | import graphviz
 9 | from utils import data_utils
10 | 
11 | 
12 | def decision_tree_resolve(train_x, train_y, class_names=None, max_depth=3, fig_path=''):
13 |     """
14 |     基于决策树可视化
15 |     :param train_x: data of train
16 |     :param train_y: data of y
17 |     :param class_names:  标签名称
18 |     :param max_depth: 树最大深度
19 |     :param fig_path: 图片路径和名称
20 |     :return:
21 |     """
22 |     if class_names is None:
23 |         class_names = ['good', 'bad']
24 |     clf = st.DecisionTreeClassifier(max_depth=max_depth,
25 |                                     min_samples_leaf=0.01,
26 |                                     min_samples_split=0.01,
27 |                                     criterion='gini',
28 |                                     splitter='best',
29 |                                     max_features=None)
30 |     clf = clf.fit(train_x, train_y)
31 | 
32 |     # 比例图
33 |     dot_data = st.export_graphviz(clf, out_file=None,
34 |                                   feature_names=train_x.columns.tolist(),
35 |                                   class_names=class_names,
36 |                                   filled=True,
37 |                                   rounded=True,
38 |                                   node_ids=True,
39 |                                   special_characters=True,
40 |                                   proportion=True,
41 |                                   leaves_parallel=True)
42 |     graph = graphviz.Source(dot_data, filename=fig_path)
43 |     return graph
44 | 
45 | 
46 | # 加载数据
47 | german_credit_data = data_utils.get_data()
48 | 
49 | # 构造数据集
50 | X = german_credit_data[data_utils.numeric_cols].copy()
51 | y = german_credit_data['creditability']
52 | 
53 | graph = decision_tree_resolve(X, y, fig_path='data/tree')
54 | graph.view()
55 | 
56 | # 转化为规则
57 | X['node_5'] = X.apply(lambda x: 1 if x['duration.in.month'] <= 34.5 and x['credit.amount'] > 8630.5 else 0, axis=1)
58 | X['node_9'] = X.apply(
59 |     lambda x: 1 if x['duration.in.month'] > 34.5 and x['age.in.years'] <= 29.5 and x['credit.amount'] > 4100.0 else 0,
60 |     axis=1)
61 | X['node_12'] = X.apply(lambda x: 1 if x['duration.in.month'] > 34.5 and x['age.in.years'] > 56.5 else 0, axis=1)
62 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | *.xml
131 | *.iml
132 | *.DS_Store


--------------------------------------------------------------------------------
/chapter2/ch2_32_model_deployment_pmml.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*- 
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | # PMML方式保存和读取模型
 8 | from sklearn2pmml import sklearn2pmml, PMMLPipeline
 9 | from sklearn_pandas import DataFrameMapper
10 | from pypmml import Model
11 | from xgboost.sklearn import XGBClassifier
12 | from utils import data_utils
13 | from chapter2.ch2_31_model_deployment_pickle import load_model_from_pkl
14 | 
15 | 
16 | # 以xgb模型为例，方式1：
17 | # sklearn接口的xgboost，可使用sklearn2pmml生成pmml文件
18 | def save_model_as_pmml(x, y, save_file_path):
19 |     """
20 |     保存模型到路径save_file_path
21 |     :param x: 训练数据特征
22 |     :param y: 训练数据标签
23 |     :param save_file_path: 保存的目标路径
24 |     """
25 |     # 设置pmml的pipeline
26 |     xgb = XGBClassifier(random_state=88)
27 |     mapper = DataFrameMapper([([i], None) for i in x.columns])
28 |     pipeline = PMMLPipeline([('mapper', mapper), ('classifier', xgb)])
29 |     # 模型训练
30 |     pipeline.fit(x, y)
31 |     # 模型结果保存
32 |     sklearn2pmml(pipeline, pmml=save_file_path, with_repr=True)
33 | 
34 | 
35 | # PMML格式读取
36 | def load_model_from_pmml(load_file_path):
37 |     """
38 |     从路径load_file_path加载模型
39 |     :param load_file_path: pmml文件路径
40 |     """
41 |     model = Model.fromFile(load_file_path)
42 |     return model
43 | 
44 | 
45 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)
46 | save_model_as_pmml(train_x, train_y, 'data/model/xgb_model.pmml')
47 | model = load_model_from_pmml('data/model/xgb_model.pmml')
48 | pre = model.predict(test_x)
49 | print(pre.head())
50 | 
51 | # 方式2：
52 | # 原生xgboost.core库生成的XGBoost模型，不能使用sklearn2pmml生成pmml文件，只能通过jpmml-xgboost包，将已有的.bin或.model
53 | # 格式模型文件转为pmml文件
54 | 
55 | # step1.获取到xgb模型文件
56 | xgb_model = load_model_from_pkl("data/model/xgb_model.pkl")
57 | 
58 | 
59 | # step2.生成fmap文件
60 | def create_feature_map(file_name, features):
61 |     outfile = open(file_name, 'w')
62 |     for i, feat in enumerate(features):
63 |         outfile.write('{0}\t{1}\tq\n'.format(i, feat))
64 | 
65 | 
66 | create_feature_map('data/model/xgb_model.fmap', xgb_model.feature_names)
67 | 
68 | # step3.jpmml-xgboost的环境配置及pmml转换：
69 | # step3.1. 下载jpmml-xgboost
70 | # step3.2. 命令行切换到jpmml-xgboost的项目文件夹，输入代码编译
71 | # mvn clean install
72 | # 该步执行完后，jpmml-xgboost的项目文件夹下会多出一个target文件夹，里面包含生成好的jar包
73 | # step3.3. jar包转换为pmml文件
74 | # java -jar jpmml-xgboost_path/target/jpmml-xgboost-executable-1.5-SNAPSHOT.jar  --X-nan-as-missing False
75 | # --model-input data/model/xgb.model --fmap-input data/model/xgb.fmap --target-name target
76 | # --pmml-output data/model/xgb_pmml.pmml
77 | 


--------------------------------------------------------------------------------
/chapter3/ch3_01_order_fea_gen_manual.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import pandas as pd
 8 | import datetime as dt
 9 | from dateutil.parser import parse
10 | from chapter3.ch3_00_order_data_preprocess import data_preprocess
11 | 
12 | 
13 | def calculate_age(born_day, back_time=None):
14 |     """
15 |     根据出生日期解析年龄
16 |     :param born_day: 出生日期
17 |     :param back_time: 回溯时间，默认当前日期
18 |     :return: 年龄
19 |     """
20 |     if back_time is None:
21 |         today = dt.date.today()
22 |     else:
23 |         today = back_time
24 |     if isinstance(born_day, str):
25 |         born_day = parse(born_day)
26 |     if isinstance(today, str):
27 |         today = parse(today)
28 |     return today.year - born_day.year - ((today.month, today.day) < (born_day.month, born_day.day))
29 | 
30 | 
31 | def gen_order_feature_manual(data, time_col, back_time, dtypes_dict, fea_prefix='f'):
32 |     """
33 |     根据业务逻辑生成特征
34 |     :param data: 业务订单原始数据
35 |     :param time_col: 回溯依据的时间列名称
36 |     :param back_time: 回溯时间点
37 |     :param dtypes_dict: 指定列字段类型的字典，如{'col1':int}
38 |     :param fea_prefix: 特征前缀
39 |     :return: features，根据业务逻辑生成的特征
40 |     """
41 |     # 数据预处理函数，见文件ch3_01_order_data_preprocess.py
42 |     data_processed = data_preprocess(data, time_col, back_time, dtypes_dict=dtypes_dict)
43 |     features = {}
44 |     # 从生日解析年龄
45 |     features['%s_age' % fea_prefix] = calculate_age(data_processed.get('birthday')[0], back_time)
46 |     # 用户历史订单数
47 |     features['%s_history_order_num' % fea_prefix] = data_processed.shape[0]
48 |     # 用户历史逾期次数
49 |     features['%s_overdue_num' % fea_prefix] = data_processed['has_overdue'].sum()
50 |     # 用户历史最大逾期天数
51 |     features['%s_max_overdue_days' % fea_prefix] = data_processed['overdue_days'].max()
52 |     # 用户历史平均逾期天数
53 |     features['%s_mean_overdue_days' % fea_prefix] = data_processed['overdue_days'].mean()
54 | 
55 |     return features
56 | 
57 | 
58 | if __name__ == '__main__':
59 |     # 原始数据读入
60 |     orders = pd.read_excel('data/order_data.xlsx')
61 |     # 取一个用户的历史订单数据
62 |     raw_data = pd.DataFrame(eval(orders['data'][1]))
63 |     back_time_value = orders['back_time'][1]
64 |     cols_dtypes_dict = {'has_overdue': int, 'application_term': float, 'application_amount': float}
65 | 
66 |     # 根据业务逻辑生成用户历史订单特征
67 |     features_manual = gen_order_feature_manual(raw_data, 'create_time', back_time_value, cols_dtypes_dict)
68 |     print(features_manual)
69 | 
70 |     # 批量生成特征
71 |     feature_dict = {}
72 |     for i, row in orders.iterrows():
73 |         feature_dict[i] = gen_order_feature_manual(pd.DataFrame(eval(row['data'])), 'create_time', row['back_time'],
74 |                                                    cols_dtypes_dict, fea_prefix='orderv1')
75 |     feature_df = pd.DataFrame(feature_dict).T
76 |     # feature_df.to_excel('data/features_manual.xlsx', index=True)
77 | 


--------------------------------------------------------------------------------
/data/text_data/test.txt:
--------------------------------------------------------------------------------
 1 | 昨天晚上，亚冠小组赛全部结束，中超两支球队山东泰山和广州队的本届亚冠之旅就此划上句号。5月1日媒体人潘伟力发声总结了两支球队的表现。
 2 | 潘伟力表示：“泰山队1平5负打进2球丢24球、广州队6战全败打进0球同样丢了24球，6场比赛山东队平均控球率为25%，排名所有亚冠球队的倒数第二，广州队平均控球率20%，排名所有球队倒数第一。
 3 | 6场比赛山东泰山队一共有20脚射门，广州队只有11脚射门。”
 4 | “关于本届亚冠在国内争议最大的话题就是，这样的班底、阵容参加亚冠，到底是对孩子们信心的摧毁，还是会让他们得到经验，未来得到比较大的提升？这样的观点见仁见智，我们不做定论，我更关注的事情是孩子回国之后，他们未来的发展之路会是什么样。”
 5 | 潘伟力继续说道：“显然两支球队00后的球员，想要进入新赛季广州队、泰山队一线队征战中超联赛概率非常低。
 6 | 两个月前，足协和职业联盟计划推出U21联赛，就是为这个年龄段球员准备的。要求中超18支球队、中甲18支球队必须参加，36支球队分为四个阶段打赛会制的比赛。
 7 | 如果第一阶段、第二阶段小组出线，一年最多可以打21场；如果前两个阶段出局了，一年只有10场比赛。”
 8 | “试想一下20岁的年轻球员，在最需要比赛来提高的年纪，一年如果只有10场、20场的比赛机会，他们未来成为优秀球员的概率又有多大呢？即便这样，咱们U21联赛也没有板上钉钉一定能推行，因为中超联赛都还没有定论。
 9 | 过去两年，咱们已经停办了预备队联赛，很多年轻球员长期没有比赛可踢，这才是中国足球最让人担心的现实处境。”
10 | 北京时间4月21日下午，国际乒联世界排名工作小组进行线上交流讨论。大满贯运动员李晓霞受邀参加，首次以新身份在国际组织工作中亮相。
11 | 该小组由2021年国际乒联年度代表大会批准设立，成员包含国际乒联执委会代表、运动员委员会代表、洲级和会员协会专家代表、国际乒联具有排名事务经验以及商业事务经验的雇员和国际乒联个人技术委员。
12 | 主要工作涵盖审议世界排名体系政策、为运动员提供获得提高世界排名的公平机会并向国际乒联执委会提供专业建议等重要内容。李晓霞由中国乒协推荐并经国际乒联执委会批准，以专家代表身份成为该小组正式成员。
13 | 工作小组就现有乒乓球世界排名积分构成、积分赛事、积分保护等关乎运动员核心利益的重要议题展开讨论。小组成员们踊跃发表专业见解，李晓霞也积极参与其中。
14 | 在表达对小组后续工作和自身职责期待时，李晓霞认为这是一项非常有意义的工作，其本人很高兴能以全新的身份参与其中，并主动提到她将继续努力学习英文，从而更好地进行国际交流，在为运动员发声的同时，也为世界排名政策制定提供更多专业建议。
15 | 在中国乒协的大力推荐下，运动员许昕于去年成功当选亚乒联盟第一副主席。
16 | 如此短时间内又有大满贯运动员在国际组织担任重要职务，充分体现了协会对国际组织工作的高度重视。
17 | 未来，中国乒协将争取选派更多具有丰富乒乓球专业经验、在世界乒坛享有良好声誉并具有较强沟通能力的优秀运动员加入国际组织，进一步提升国际话语权。
18 | 4月5日，国际乒联（ITTF）公布了新一期世界排名。由于上周重要比赛仅有世界乒乓球职业大联盟（WTT）球星挑战赛多哈站，且除雨果-卡尔德拉诺、冯天薇之外的大部分高排位选手没有参赛，所以相较于上一周，本周男女排名前10位都没有变化。
19 | 男子方面，中国选手樊振东、马龙分居前两位，梁靖崑排名第4、许昕第8、林高远第10；巴西人雨果-卡尔德拉诺处于第三位，日本球员张本智和、中国台北运动员林昀儒、德国名将奥恰洛夫分列5至7位，德国老将波尔位居第9。
20 | 女子方面，中国球员陈梦、孙颖莎、王曼昱位居三甲，王艺迪排在第5；日本名将伊藤美诚居第4位，早田希娜、石川佳纯位列6、7位。
21 | 排在8至10位的是中国香港选手杜凯琹、新加坡老将冯天薇和波多黎各一姐阿德里亚纳-迪亚兹。
22 | 在本期世界排名公布之后，国乒参加今年杭州亚运会的阵容框架也基本确定。根据3月7日中国乒协公布的《乒乓球项目参加2022年杭州亚运会选拔办法（征求意见稿）》规定，在体能测试达标的前提下：
23 | 1、在2022年3月7日至20日期间举办的WTT大满贯（新加坡站）获得男女单打冠军的中国运动员；
24 | 2、以2022年第14周期间（2022年4月4日-10日）国际乒联公布的世界排名为标准，单打排名最高的男女各3名中国运动员；
25 | 3、1999年1月1日以后出生，以2022年第14周期间（2022年4月4日-10日）国际乒联公布的世界排名为标准，单打排名最高的男女各1名中国运动员
26 | 共五名运动员，将获得参加杭州亚运会团体比赛的资格。
27 | 则为：
28 | WTT大满贯单打冠军——樊振东、陈梦；
29 | 世界排名前三位球员——樊振东、马龙、梁靖崑/陈梦、孙颖莎、王曼昱；
30 | 1999年以后出生世界排名最高运动员——王楚钦（2000年出生，世界排名13位）/孙颖莎；
31 | 目前已经确定具备参加杭州亚运会乒乓球团体比赛资格的球员为：
32 | 男子：樊振东、马龙、梁靖崑、王楚钦；
33 | 女子：陈梦、孙颖莎、王曼昱。
34 | 前述选拔办法表示：
35 | 如根据上述3个条件入选的男女运动员不足5名，则由国家队男女教练组集体研究讨论，以2021年及2022年杭州亚运会前参加国际和国内大赛成绩为基本依据，以有利于完成杭州亚运会参赛任务、有利于2024年巴黎奥运会备战练兵，着眼于2028年洛杉矶奥运会梯队建设，根据近期比赛成绩和积分排名，提名国际大赛成绩突出、心理素质和抗压能力强且体能测试达标者，进入杭州亚运会团体项目建议名单；之后选拔工作领导小组对杭州亚运会团体项目建议名单进行综合评定，最终确认团体项目参赛名单。
36 | 单打方面，选拔办法规定：
37 | 1.在2022年3月7日至20日期间举办的WTT大满贯（新加坡站）获得男女单打冠军的中国运动员；
38 | 2.以2022年第14周期间（2022年4月4日-10日）国际乒联公布的世界排名为标准，单打排名最高的男女各1名中国运动员，如与WTT大满贯（新加坡站）人选相同，则顺延为单打排名第2的中国运动员。
39 | 以此，已经获得杭州亚运会乒乓球单打比赛资格的运动员为：
40 | 男子：樊振东、马龙；
41 | 女子：陈梦、孙颖莎；
42 | 双打方面，选拔办法规定，将根据已经确认的团体、单打参赛名单，结合相关运动员技战术特点、专长并综合考虑体能情况，由教练组提名双打配对建议名单；此后选拔工作领导小组对杭州亚运会团体项目建议名单进行综合评定，最终确认双打项目参赛名单。
43 | 选拔办法中规定，杭州亚运会各分项名单报选拔工作领导小组研究确认后，将经中国乒乓球协会报国家体育总局。
44 | 选拔办法同时指出，赛前如因伤病等特殊情况造成参赛人选替换，将由国家队男女教练组综合提出运动员调整意见，报选拔工作领导小组综合评估、研究确认后，经中国乒乓球协会报国家体育总局。（搜狐体育郭健/文）
45 | 


--------------------------------------------------------------------------------
/utils/text_utils.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import os
  4 | import random
  5 | import jieba
  6 | import pandas as pd
  7 | 
  8 | # 读取停用词
  9 | stopwords = pd.read_csv("data/text_data/stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'],
 10 |                         encoding='utf-8')
 11 | stopwords = stopwords['stopword'].values
 12 | 
 13 | 
 14 | def cut_words(line, words_min=2):
 15 |     line_segments = jieba.lcut(line)
 16 |     line_segments = filter(lambda x: len(x) >= words_min, line_segments)
 17 |     line_segments = filter(lambda x: x not in stopwords, line_segments)
 18 |     return list(line_segments)
 19 | 
 20 | 
 21 | def load_corpus():
 22 |     """
 23 |     加载语料库：取自搜狗新闻语料库(https://www.sogou.com/labs/resource/cs.php)
 24 |     :return: sentences 语料库
 25 |     """
 26 |     # 取样后的文本存储
 27 |     df_entertainment = pd.read_csv(os.path.join('data/text_data/entertainment_news.csv'))
 28 |     df_sports = pd.read_csv(os.path.join('data/text_data/sports_news.csv'))
 29 | 
 30 |     entertainment = df_entertainment.content.values.tolist()
 31 |     sports = df_sports.content.values.tolist()
 32 |     content_file = {'entertainment': entertainment, 'sports': sports}
 33 | 
 34 |     return content_file
 35 | 
 36 | 
 37 | def sentences_prepare():
 38 |     """
 39 |     语料库预处理（无标签）
 40 |     """
 41 |     sentences = []
 42 |     content_file = load_corpus()
 43 |     for category in content_file.keys():
 44 |         for line in content_file[category]:
 45 |             try:
 46 |                 words_list = cut_words(line)
 47 |                 sentences.append(" ".join(words_list))
 48 |             except Exception as e:
 49 |                 sentences.append("")
 50 |                 print(e)
 51 |                 continue
 52 |     random.seed(1)
 53 |     random.shuffle(sentences)
 54 |     return sentences
 55 | 
 56 | 
 57 | def sentences_prepare_with_y():
 58 |     """
 59 |     语料库预处理（含标签）
 60 |     """
 61 |     sentences = []
 62 |     content_file = load_corpus()
 63 |     for category in content_file.keys():
 64 |         for line in content_file[category]:
 65 |             try:
 66 |                 words_list = cut_words(line)
 67 |                 sentences.append("__label__" + str(category) + " , " + " ".join(words_list))
 68 |             except Exception as e:
 69 |                 sentences.append("")
 70 |                 print(line)
 71 |                 continue
 72 |     random.seed(1)
 73 |     random.shuffle(sentences)
 74 |     return sentences
 75 | 
 76 | 
 77 | def sentences_prepare_x_y():
 78 |     """
 79 |     语料库预处理（语料和标签分别输出）
 80 |     """
 81 |     cate_dic = {'entertainment': 0, 'sports': 1}
 82 |     content_file = load_corpus()
 83 |     # 生成训练数据
 84 |     sentences = []
 85 |     y = []
 86 | 
 87 |     for category in content_file.keys():
 88 |         # 文本预处理
 89 |         for line in content_file[category]:
 90 |             try:
 91 |                 words_list = cut_words(line)
 92 |                 sentences.append(" ".join(words_list))
 93 |                 y.append(str(cate_dic.get(category)))
 94 |             except Exception as e:
 95 |                 print(line)
 96 |                 continue
 97 |     sentences_df = pd.DataFrame({'sentences': sentences, 'target': y})
 98 |     sentences_df = sentences_df.sample(frac=1, random_state=1)
 99 |     return sentences_df.sentences.tolist(), sentences_df.target.tolist()
100 | 


--------------------------------------------------------------------------------
/chapter4/ch4_01_rules_for_outliers.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | sys.path.append("./")
 5 | sys.path.append("../")
 6 | 
 7 | import pandas as pd
 8 | from utils import data_utils
 9 | 
10 | def rule_evaluate(selected_df, total_df, target, rate=0.15, amount=10000):
11 |     """
12 |     :param selected_df: 子特征列表
13 |     :param total_df: 特征宽表
14 |     :param target: 目标变量
15 |     :param rate: 息费（%）
16 |     :param amount: 平均每笔借款金额
17 |     :return:
18 |     """
19 |     # 命中规则的子群体指标统计
20 |     hit_size = selected_df.shape[0]
21 |     hit_bad_size = selected_df[target].sum()
22 |     hit_bad_rate = selected_df[target].mean()
23 |     # 总体指标统计
24 |     total_size = total_df.shape[0]
25 |     total_bad_size = total_df[target].sum()
26 |     total_bad_rate = total_df[target].mean()
27 |     # 命中率
28 |     hit_rate = hit_size / total_size
29 |     # 提升度
30 |     lift = hit_bad_rate / total_bad_rate
31 |     # 收益
32 |     profit = hit_bad_size * amount - (hit_size - hit_bad_size) * rate * amount
33 |     res = [total_size, total_bad_size, total_bad_rate,
34 |            hit_rate, hit_size, hit_bad_size, hit_bad_rate, lift, profit]
35 |     return res
36 | 
37 | 
38 | def rule_discover(data_df, var, target, rule_term, rate=0.15, amount=10000):
39 |     """
40 |     :param data_df: 特征宽表
41 |     :param var: 特征名称
42 |     :param target: 目标变量
43 |     :param rule_term: 分位数列表或规则条件
44 |     :param rate: 息费（%）
45 |     :param amount: 平均每笔借款金额
46 |     :return:
47 |     """
48 |     res_list = []
49 |     if rule_term is None:
50 |         rule_term = [0.005, 0.01, 0.02, 0.05, 0.95, 0.98, 0.99, 0.995]
51 |     if isinstance(rule_term, list):
52 |         for q in rule_term:
53 |             threshold = data_df[var].quantile(q).round(2)
54 |             if q < 0.5:
55 |                 temp = data_df.query("`{0}` <= @threshold".format(var))
56 |                 rule = "<= {0}".format(threshold)
57 |             else:
58 |                 temp = data_df.query("`{0}` >= @threshold".format(var))
59 |                 rule = ">= {0}".format(threshold)
60 |             res = rule_evaluate(temp, data_df, target, rate, amount)
61 |             res_list.append([var, rule] + res)
62 |     else:
63 |         temp = data_df.query("`{0}` {1}".format(var, rule_term))
64 |         rule = rule_term
65 |         res = rule_evaluate(temp, data_df, target, rate, amount)
66 |         res_list.append([var, rule] + res)
67 |     columns = ['var', 'rule', 'total_size', 'total_bad_size', 'total_bad_rate',
68 |                'hit_rate', 'hit_size', 'hit_bad_size', 'hit_bad_rate', 'lift',
69 |                'profit']
70 |     result_df = pd.DataFrame(res_list, columns=columns)
71 |     return result_df
72 | 
73 | 
74 | if __name__ == '__main__':
75 |     # 数据读入
76 |     german_credit_data = data_utils.get_data()
77 |     german_credit_data.loc[german_credit_data.sample(
78 |         frac=0.2, random_state=0).index, 'sample_set'] = 'Train'
79 |     german_credit_data['sample_set'].fillna('OOT', inplace=True)
80 |     # 使用分位数列表构建规则集
81 |     rule_table = rule_discover(data_df=german_credit_data, var='credit.amount',
82 |                                target='creditability',
83 |                                rule_term=[0.005, 0.01, 0.02, 0.05, 0.95, 0.98, 0.99, 0.995])
84 |     print(rule_table)
85 |     # 规则效果评估
86 |     rule_analyze = german_credit_data.groupby('sample_set').apply(
87 |         lambda x: rule_discover(data_df=x, var='credit.amount',
88 |                                 target='creditability', rule_term='>12366.0'))
89 |     print(rule_analyze)
90 | 


--------------------------------------------------------------------------------
/chapter4/ch4_04_modelstrategy_for_optimization.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import sys
  4 | sys.path.append("./")
  5 | sys.path.append("../")
  6 | 
  7 | import pandas as pd
  8 | import numpy as np
  9 | import matplotlib.pyplot as plt
 10 | from numpy import polyfit, poly1d
 11 | from sklearn.metrics import r2_score
 12 | from scipy.optimize import minimize
 13 | 
 14 | 
 15 | def calculate_pass_loss_decile(score_series, y_series):
 16 |     """
 17 |     模型分取值变化时通过率与坏账率关系
 18 |     :param score_series: 模型分
 19 |     :param y_series: Y标签
 20 |     :return:  
 21 |     """
 22 |     decile_df = pd.crosstab(score_series, y_series).rename(columns={0: 'N_nonEvent', 1: 'N_Event'})
 23 |     decile_df.loc[:, 'N_sample'] = score_series.value_counts()
 24 | 
 25 |     decile_df.loc[:, 'EventRate'] = decile_df.N_Event * 1.0 / decile_df.N_sample
 26 |     decile_df.loc[:, 'BadPct'] = decile_df.N_Event * 1.0 / sum(decile_df.N_Event)
 27 |     decile_df.loc[:, 'GoodPct'] = decile_df.N_nonEvent * 1.0 / sum(decile_df.N_nonEvent)
 28 |     decile_df.loc[:, 'CumBadPct'] = decile_df.BadPct.cumsum()
 29 |     decile_df.loc[:, 'CumGoodPct'] = decile_df.GoodPct.cumsum()
 30 | 
 31 |     decile_df = decile_df.sort_index(ascending=False)
 32 |     decile_df.loc[:, 'ApprovalRate'] = decile_df.N_sample.cumsum() / decile_df.N_sample.sum()
 33 |     decile_df.loc[:, 'ApprovedEventRate'] = decile_df.N_Event.cumsum() / decile_df.N_sample.cumsum()
 34 |     decile_df = decile_df.sort_index(ascending=True)
 35 |     return decile_df
 36 | 
 37 | 
 38 | def poly_regression(x_series, y_series, degree, plot=True):
 39 |     """
 40 |     多项式回归拟合
 41 |     :param x_series: x数据
 42 |     :param y_series: y数据
 43 |     :param degree: 指定多项式次数
 44 |     :param plot: 是否作图
 45 |     :return:
 46 |     """
 47 |     coeff = polyfit(x_series, y_series, degree)
 48 |     f = poly1d(coeff)
 49 |     R2 = r2_score(y_series.values, f(x_series))
 50 | 
 51 |     print(f'coef:{coeff},R2: {R2}')
 52 | 
 53 |     if plot:
 54 |         # 用来正常显示中文标签
 55 |         plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
 56 |         plt.rcParams['axes.unicode_minus'] = False
 57 | 
 58 |         plt.figure(figsize=(10, 5))
 59 |         plt.plot(x_series, y_series, 'rx')
 60 |         plt.plot(x_series, f(x_series))
 61 |         plt.xlabel('通过率', {'size': 15})
 62 |         plt.ylabel('坏账率', {'size': 15})
 63 |         plt.show()
 64 |     return coeff
 65 | 
 66 | 
 67 | german_score = pd.read_csv('data/german_score.csv')
 68 | german_score.head()
 69 | 
 70 | decile_df = calculate_pass_loss_decile(german_score['score'],
 71 |                                        german_score['creditability'])
 72 | print(decile_df.head())
 73 | 
 74 | # 数据准备
 75 | x = decile_df['ApprovalRate']
 76 | # 逾期率折算为坏账率
 77 | y = decile_df['ApprovedEventRate'] / 2.5
 78 | 
 79 | poly_coef = poly_regression(x, y, 2, plot=True)
 80 | # 坏账率L(x)与通过率x的关系
 81 | l_x = poly1d(poly_coef)
 82 | print(l_x)
 83 | 
 84 | 
 85 | def find_best_approval_rate(x_to_loss_func, score_df):
 86 |     """
 87 |     定义最优化函数
 88 |     坏账率L(x)与通过率x的关系函数
 89 |     :param x_to_loss_func: 坏账率与通过率的函数关系
 90 |     :param score_df: 模型分与通过率的对应关系，index为模型分，"ApprovalRate"列为对应的通过率
 91 |     :return:
 92 |     """
 93 | 
 94 |     # 定义目标函数，求解最大值即为负的最小值
 95 |     def fun(x_array):
 96 |         # 其中x_list[0]为通过率x，x_array[1]为对应的坏账率L(x)
 97 |         return -10000 * (0.16 * (1 - x_array[1]) - x_array[1]
 98 |                          - 30 / (x_array[0] * 0.6) / 10000)
 99 | 
100 |     # eq表示 函数结果等于0 ； ineq 表示 表达式大于等于0， 下面式子1e-6项确保相应变量不等于0或1
101 |     cons = ({'type': 'eq', 'fun': lambda x_array: x_to_loss_func(x_array[0]) - x_array[1]},
102 |             {'type': 'ineq', 'fun': lambda x_array: x_array[0] - 1e-6},
103 |             {'type': 'ineq', 'fun': lambda x_array: x_array[1] - 1e-6},
104 |             {'type': 'ineq', 'fun': lambda x_array: 1 - x_array[0] - 1e-6},
105 |             {'type': 'ineq', 'fun': lambda x_array: 1 - x_array[0] - 1e-6}
106 |             )
107 | 
108 |     # 设置初始值
109 |     x_base = np.array((0.10, 0.10))
110 |     # 采用SLSQP进行最优化求解
111 |     res = minimize(fun, x_base, method='SLSQP', constraints=cons)
112 |     print('利润最优：', "{:.2f}".format(-res.fun))
113 |     print('最优解对应通过率：', "{:.2%}".format(res.x[0]), '坏账率：', "{:.2%}".format(res.x[1]))
114 |     print("模型分阈值：", score_df[score_df['ApprovalRate'] >= res.x[0]].index.max())
115 |     print('迭代终止是否成功：', res.success)
116 |     print('迭代终止原因：', res.message)
117 | 
118 | 
119 | find_best_approval_rate(l_x, decile_df)
120 | 


--------------------------------------------------------------------------------
/chapter3/ch3_02_order_fea_gen_rfm_auto.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import sys
  4 | 
  5 | sys.path.append("./")
  6 | sys.path.append("../")
  7 | 
  8 | # 根据业务逻辑自动生成用户历史订单特征
  9 | import pandas as pd
 10 | import numpy as np
 11 | from dateutil.parser import parse
 12 | from utils.data_utils import stamp_to_date
 13 | from chapter3.ch3_00_order_data_preprocess import data_preprocess
 14 | 
 15 | func_trans = {'sum': np.sum,
 16 |               'mean': np.mean,
 17 |               'cnt': np.size,
 18 |               'max': np.max,
 19 |               'min': np.min,
 20 |               'std': np.std,
 21 |               }
 22 | 
 23 | 
 24 | def apply_func(f, *args):
 25 |     return f(*args)
 26 | 
 27 | 
 28 | def rfm_cut(data, time_col, back_time, type_dict, comp_dict, time_arr, fea_prefix='f'):
 29 |     """
 30 |     基于RFM思想切分数据，生成特征
 31 |     :param DataFrame data: 待切分的数据，时间列为create_time(timestamp)，距今天数列为gap_days
 32 |     :param str time_col: 回溯依据的时间列名称
 33 |     :param datetime.datetime back_time: 回溯时间点，datetime.datetime时间格式
 34 |     :param dict type_dict: 类别变量，以及其对应的取值类别，用于划分数据，类别列名必须在data中
 35 |     :param dict comp_dict: 指定计算字段以及对该字段采用的计算方法, 计算变量名必须在data中
 36 |     :param list time_arr: 切分时间列表(近N天)
 37 |     :param fea_prefix: 特征前缀
 38 |     :return dict: 特征
 39 |     """
 40 |     data[time_col] = data[time_col].apply(stamp_to_date)
 41 |     # 业务时间距back_time天数
 42 |     data['gap_days'] = data[time_col].apply(lambda x: (back_time - x).days)
 43 | 
 44 |     res_feas = {}
 45 |     for col_time in time_arr:
 46 |         for col_comp in comp_dict.keys():
 47 |             for type_k, type_v in type_dict.items():
 48 |                 # 按类别和时间维度切分,筛选数据
 49 |                 for item in type_v:
 50 |                     data_cut = data[(data['gap_days'] < col_time) & (data[type_k] == item)]
 51 |                     for func_k in comp_dict[col_comp]:
 52 |                         func_v = func_trans.get(func_k, np.size)
 53 |                         # 对筛选出的数据, 在各统计指标上做聚合操作生成特征
 54 |                         fea_name = '%s_%s_%s_%s_%s' % (
 55 |                             fea_prefix, col_time, '%s_%s' % (type_k, item), col_comp, func_k)
 56 |                         if data_cut.empty:
 57 |                             res_feas[fea_name] = np.nan
 58 |                         else:
 59 |                             res_feas[fea_name] = apply_func(func_v, data_cut[col_comp])
 60 |     return res_feas
 61 | 
 62 | 
 63 | def gen_order_feature_auto(raw_data, time_col, back_time, dtypes_dict, type_dict, comp_dict, time_arr,
 64 |                            fea_prefix='f'):
 65 |     """
 66 |     基于RFM切分，自动生成订单特征
 67 |     :param pd.DataFrame raw_data: 原始数据
 68 |     :param str time_col: 回溯依据的时间列名称
 69 |     :param str back_time: 回溯时间点，字符串格式
 70 |     :param dict dtypes_dict: 指定列字段类型的字典，如{'col1':int}
 71 |     :param list time_arr: 切分时间列表(近N天)
 72 |     :param dict type_dict: 类别变量，以及其对应的取值类别，用于划分数据，类别列名必须在data中
 73 |     :param dict comp_dict: 指定计算字段以及对该字段采用的计算方法,计算变量名必须在data中
 74 |     :param fea_prefix: 特征前缀
 75 |     :return: res_feas 最终生成的特征
 76 |     """
 77 |     if raw_data.empty:
 78 |         return {}
 79 |     back_time = parse(str(back_time))
 80 | 
 81 |     order_df = data_preprocess(raw_data, time_col=time_col, back_time=back_time, dtypes_dict=dtypes_dict)
 82 |     if order_df.empty:
 83 |         return {}
 84 | 
 85 |     # 特征衍生：使用rfm切分
 86 |     res_feas = rfm_cut(order_df, time_col, back_time, type_dict, comp_dict, time_arr, fea_prefix)
 87 |     return res_feas
 88 | 
 89 | 
 90 | if __name__ == '__main__':
 91 |     # 原始数据读入
 92 |     orders = pd.read_excel('data/order_data.xlsx')
 93 |     # 取一个用户的历史订单数据
 94 |     raw_orders = pd.DataFrame(eval(orders['data'][1]))
 95 | 
 96 |     # 设置自动特征的参数
 97 |     # 类别字段及其取值
 98 |     type_dict_param = {
 99 |         'has_overdue': [0, 1],
100 |         'is_weekend': [0, 1]
101 |     }
102 |     # 计算字段及其计算函数
103 |     comp_dict_param = {
104 |         'order_no': ['cnt'],
105 |         'application_amount': ['sum', 'mean', 'max', 'min']
106 |     }
107 |     time_cut = [30, 90, 180, 365]
108 | 
109 |     cols_dtypes_dict = {'has_overdue': int, 'application_term': float, 'application_amount': float}
110 | 
111 |     # 根据业务逻辑生成用户历史订单特征
112 |     features_auto = gen_order_feature_auto(raw_orders, 'create_time', '2020-12-14', cols_dtypes_dict,
113 |                                            type_dict_param, comp_dict_param, time_cut)
114 |     print("特征维度: ", len(features_auto.keys()))
115 |     print(features_auto)
116 | 
117 |     # 批量生成特征
118 |     feature_dict = {}
119 |     for i, row in orders.iterrows():
120 |         feature_dict[i] = gen_order_feature_auto(pd.DataFrame(eval(row['data'])), 'create_time', row['back_time'],
121 |                                                  cols_dtypes_dict, type_dict_param, comp_dict_param, time_cut,
122 |                                                  'order_auto')
123 |     feature_df_auto = pd.DataFrame(feature_dict).T
124 |     # feature_df_auto.to_excel('data/features_auto.xlsx', index=True)
125 | 


--------------------------------------------------------------------------------
/utils/data_utils.py:
--------------------------------------------------------------------------------
  1 | import toad
  2 | import numpy as np
  3 | import pandas as pd
  4 | import scorecardpy as sc
  5 | import datetime as dt
  6 | import pytz
  7 | from sklearn.preprocessing import OrdinalEncoder
  8 | from sklearn.model_selection import train_test_split
  9 | from sklearn.preprocessing import MinMaxScaler
 10 | from sklearn.preprocessing import StandardScaler
 11 | from dateutil.parser import parse
 12 | 
 13 | numeric_cols = ['duration.in.month',
 14 |                 'credit.amount',
 15 |                 'age.in.years',
 16 |                 'present.residence.since',
 17 |                 'number.of.existing.credits.at.this.bank',
 18 |                 'installment.rate.in.percentage.of.disposable.income',
 19 |                 'number.of.people.being.liable.to.provide.maintenance.for']
 20 | 
 21 | category_cols = ['status.of.existing.checking.account', 'credit.history',
 22 |                  'savings.account.and.bonds', 'present.employment.since',
 23 |                  'personal.status.and.sex', 'other.debtors.or.guarantors',
 24 |                  'property', 'other.installment.plans', 'housing', 'job',
 25 |                  'telephone', 'foreign.worker', 'purpose']
 26 | 
 27 | x_cols = numeric_cols + category_cols
 28 | 
 29 | label = 'creditability'
 30 | 
 31 | 
 32 | def get_data():
 33 |     """
 34 |     导入原始数据集
 35 |     """
 36 |     german_credit_data = sc.germancredit()
 37 |     german_credit_data[label] = np.where(
 38 |         german_credit_data[label] == 'bad', 1, 0)
 39 |     # 设置随机数种子, 确保结果可复现
 40 |     np.random.seed(0)
 41 |     month_list = ['2020-01', '2020-02', '2020-03', '2020-04', '2020-05']
 42 |     # 随机分配月份
 43 |     german_credit_data['month'] = np.random.choice(
 44 |         month_list, len(german_credit_data))
 45 |     return german_credit_data
 46 | 
 47 | 
 48 | def get_all_x_y(transform_method='minmax'):
 49 |     """
 50 |     加载数据
 51 |     :param transform_method: 数据标准化方式
 52 |     """
 53 |     german_credit_data = sc.germancredit()
 54 |     # 类别型变量转化成数值型索引变量
 55 |     encoder = OrdinalEncoder()
 56 |     category_result = encoder.fit_transform(german_credit_data[category_cols])
 57 |     category_result = pd.DataFrame(data=category_result, columns=category_cols)
 58 |     numeric_result = german_credit_data[numeric_cols + [label]].copy()
 59 |     # 将标签creditability映射为数值
 60 |     numeric_result[label] = np.where(numeric_result[label] == 'bad', 1, 0)
 61 |     all_x_y = pd.merge(category_result, numeric_result, left_index=True, right_index=True)
 62 |     x_cols = [f for f in all_x_y.columns if f != label]
 63 |     # 数据标准化
 64 |     if transform_method == 'minmax':
 65 |         encoder = MinMaxScaler()
 66 |         all_x_y[x_cols] = encoder.fit_transform(all_x_y[x_cols])
 67 |     elif transform_method == 'standard':
 68 |         encoder = StandardScaler()
 69 |         all_x_y[x_cols] = encoder.fit_transform(all_x_y[x_cols])
 70 |     elif transform_method == 'origin':
 71 |         pass
 72 |     return all_x_y
 73 | 
 74 | 
 75 | def get_data_after_fs(empty=0.5, iv=0.02, corr=0.7):
 76 |     """
 77 |     加载特征选择后的数据
 78 |     :param empty: 缺失率阈值
 79 |     :param iv: iv阈值
 80 |     :param corr: 相关性阈值
 81 |     """
 82 |     all_x_y = get_all_x_y()
 83 |     selected_data, drop_lst = toad.selection.select(
 84 |         all_x_y, target=label, empty=0.5,
 85 |         iv=0.02, corr=0.7, return_drop=True)
 86 |     return selected_data
 87 | 
 88 | 
 89 | def get_x_y_split(test_rate=0.2, transform_method='minmax'):
 90 |     """
 91 |     划分训练集和测试集
 92 |     :param test_rate: 测试集样本占比
 93 |     :param transform_method: 数据标准化方式
 94 |     """
 95 |     german_credit_data = get_all_x_y(transform_method)
 96 |     y = german_credit_data.pop(label)
 97 |     x = german_credit_data
 98 |     x_train, x_valid, y_train, y_valid = train_test_split(
 99 |         x, y, test_size=test_rate, random_state=88)
100 |     return x_train, x_valid, y_train, y_valid
101 | 
102 | 
103 | def stamp_to_date(time_stamp, timezone=None):
104 |     """
105 |     时间戳转日期函数
106 |     :param time_stamp:int，时间戳
107 |     :param timezone:string，时区
108 |     :return:datetime
109 |     """
110 |     try:
111 |         if timezone is None:
112 |             stamp_str = str(time_stamp)
113 |             if len(stamp_str) >= 10:
114 |                 stamp_str = stamp_str[:10]
115 |             else:
116 |                 stamp_str = stamp_str
117 |             time_stamp = int(stamp_str)
118 |             date = dt.datetime.fromtimestamp(time_stamp)
119 |             return date
120 |         else:
121 |             stamp_str = str(time_stamp)
122 |             if len(stamp_str) >= 10:
123 |                 stamp_str = stamp_str[:10]
124 |             else:
125 |                 stamp_str = stamp_str
126 |             time_stamp = int(stamp_str)
127 |             tz = pytz.timezone(timezone)
128 |             date = dt.datetime.fromtimestamp(time_stamp, tz).strftime('%Y-%m-%d %H:%M:%S')
129 |             date = parse(date)
130 |             return date
131 |     except:
132 |         return parse('2100-01-01')
133 | 
134 | 
135 | def date_to_week(date):
136 |     """
137 |     日期转换为星期
138 |     :param date:datetime，string
139 |     :return:int
140 |     """
141 |     try:
142 |         if isinstance(date, str):
143 |             date = parse(date)
144 |         if_weekend = date.weekday()
145 |         return if_weekend
146 |     except:
147 |         return np.nan
148 | 


--------------------------------------------------------------------------------
/chapter4/ch4_02_rules_for_decisiontree.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 3,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "ename": "ModuleNotFoundError",
 10 |      "evalue": "No module named 'toad'",
 11 |      "output_type": "error",
 12 |      "traceback": [
 13 |       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
 14 |       "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
 15 |       "Input \u001b[1;32mIn [3]\u001b[0m, in \u001b[0;36m<cell line: 9>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      7\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtree\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mst\u001b[39;00m\n\u001b[0;32m      8\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mgraphviz\u001b[39;00m\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m data_utils\n",
 16 |       "File \u001b[1;32md:\\GitHub\\practice_of_intelligent_risk_control\\chapter4\\..\\utils\\data_utils.py:1\u001b[0m, in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtoad\u001b[39;00m\n\u001b[0;32m      2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mnumpy\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mnp\u001b[39;00m\n\u001b[0;32m      3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n",
 17 |       "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'toad'"
 18 |      ]
 19 |     }
 20 |    ],
 21 |    "source": [
 22 |     "# -*- coding: utf-8 -*-\n",
 23 |     "\n",
 24 |     "import sys\n",
 25 |     "sys.path.append(\"./\")\n",
 26 |     "sys.path.append(\"../\")\n",
 27 |     "\n",
 28 |     "import sklearn.tree as st\n",
 29 |     "import graphviz\n",
 30 |     "from utils import data_utils\n"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 4,
 36 |    "metadata": {},
 37 |    "outputs": [
 38 |     {
 39 |      "name": "stdout",
 40 |      "output_type": "stream",
 41 |      "text": [
 42 |       "Collecting toad\n",
 43 |       "  Downloading toad-0.1.3-cp39-cp39-win_amd64.whl (14.3 MB)\n",
 44 |       "Installing collected packages: toad\n",
 45 |       "Successfully installed toad-0.1.3\n"
 46 |      ]
 47 |     }
 48 |    ],
 49 |    "source": [
 50 |     "# !pip install graphviz\n",
 51 |     "!pip install toad"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "\n",
 61 |     "def decision_tree_resolve(train_x, train_y, class_names=None, max_depth=3, fig_path=''):\n",
 62 |     "    \"\"\"\n",
 63 |     "    基于决策树可视化\n",
 64 |     "    :param train_x: data of train\n",
 65 |     "    :param train_y: data of y\n",
 66 |     "    :param class_names:  标签名称\n",
 67 |     "    :param max_depth: 树最大深度\n",
 68 |     "    :param fig_path: 图片路径和名称\n",
 69 |     "    :return:\n",
 70 |     "    \"\"\"\n",
 71 |     "    if class_names is None:\n",
 72 |     "        class_names = ['good', 'bad']\n",
 73 |     "    clf = st.DecisionTreeClassifier(max_depth=max_depth,\n",
 74 |     "                                    min_samples_leaf=0.01,\n",
 75 |     "                                    min_samples_split=0.01,\n",
 76 |     "                                    criterion='gini',\n",
 77 |     "                                    splitter='best',\n",
 78 |     "                                    max_features=None)\n",
 79 |     "    clf = clf.fit(train_x, train_y)\n",
 80 |     "\n",
 81 |     "    # 比例图\n",
 82 |     "    dot_data = st.export_graphviz(clf, out_file=None,\n",
 83 |     "                                  feature_names=train_x.columns.tolist(),\n",
 84 |     "                                  class_names=class_names,\n",
 85 |     "                                  filled=True,\n",
 86 |     "                                  rounded=True,\n",
 87 |     "                                  node_ids=True,\n",
 88 |     "                                  special_characters=True,\n",
 89 |     "                                  proportion=True,\n",
 90 |     "                                  leaves_parallel=True)\n",
 91 |     "    graph = graphviz.Source(dot_data, filename=fig_path)\n",
 92 |     "    return graph\n",
 93 |     "\n"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": null,
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "# 加载数据\n",
103 |     "german_credit_data = data_utils.get_data()\n",
104 |     "\n"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": null,
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "# 构造数据集\n",
114 |     "X = german_credit_data[data_utils.numeric_cols].copy()\n",
115 |     "y = german_credit_data['creditability']\n",
116 |     "\n"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "graph = decision_tree_resolve(X, y, fig_path='data/tree')\n",
126 |     "graph.view()"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": null,
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "\n",
136 |     "# 转化为规则\n",
137 |     "X['node_5'] = X.apply(lambda x: 1 if x['duration.in.month'] <= 34.5 and x['credit.amount'] > 8630.5 else 0, axis=1)\n",
138 |     "X['node_9'] = X.apply(\n",
139 |     "    lambda x: 1 if x['duration.in.month'] > 34.5 and x['age.in.years'] <= 29.5 and x['credit.amount'] > 4100.0 else 0,\n",
140 |     "    axis=1)\n",
141 |     "X['node_12'] = X.apply(lambda x: 1 if x['duration.in.month'] > 34.5 and x['age.in.years'] > 56.5 else 0, axis=1)\n"
142 |    ]
143 |   }
144 |  ],
145 |  "metadata": {
146 |   "kernelspec": {
147 |    "display_name": "Python 3",
148 |    "language": "python",
149 |    "name": "python3"
150 |   },
151 |   "language_info": {
152 |    "codemirror_mode": {
153 |     "name": "ipython",
154 |     "version": 3
155 |    },
156 |    "file_extension": ".py",
157 |    "mimetype": "text/x-python",
158 |    "name": "python",
159 |    "nbconvert_exporter": "python",
160 |    "pygments_lexer": "ipython3",
161 |    "version": "3.9.12"
162 |   }
163 |  },
164 |  "nbformat": 4,
165 |  "nbformat_minor": 2
166 | }
167 | 


--------------------------------------------------------------------------------
/chapter2/ch2_38_xgboost.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*- 
  2 | 
  3 | import sys
  4 | sys.path.append("./")
  5 | sys.path.append("../")
  6 | 
  7 | import shap
  8 | import numpy as np
  9 | import pandas as pd
 10 | import xgboost as xgb
 11 | import bayes_opt as bo
 12 | import sklearn.model_selection as sk_ms
 13 | from sklearn.model_selection import ParameterGrid
 14 | from sklearn.metrics import roc_auc_score
 15 | from utils import data_utils
 16 | import shap
 17 | from chapter2.ch2_31_model_deployment_pickle import save_model_as_pkl
 18 | 
 19 | 
 20 | # 确定最优树的颗数
 21 | def xgb_cv(param, x, y, num_boost_round=10000):
 22 |     dtrain = xgb.DMatrix(x, label=y)
 23 |     cv_res = xgb.cv(param, dtrain, num_boost_round=num_boost_round, early_stopping_rounds=30)
 24 |     num_boost_round = cv_res.shape[0]
 25 |     return num_boost_round
 26 | 
 27 | def train_xgb(params, x_train, y_train, x_test=None, y_test=None, num_boost_round=10000, early_stopping_rounds=30, verbose_eval=50):
 28 |     """
 29 |     训练xgb模型
 30 |     """
 31 |     dtrain = xgb.DMatrix(x_train, label=y_train)
 32 |     if x_test is None:
 33 |         num_boost_round = xgb_cv(params, x_train, y_train)
 34 |         early_stopping_rounds = None
 35 |         eval_sets = ()
 36 |     else:
 37 |         dtest = xgb.DMatrix(x_test, label=y_test)
 38 |         eval_sets = [(dtest, 'test')]
 39 |     model = xgb.train(params, dtrain, num_boost_round, evals=eval_sets, early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose_eval)
 40 |     return model
 41 | 
 42 | 
 43 | def xgboost_grid_search(params_space, x_train, y_train, x_test=None, y_test=None, num_boost_round=10000):
 44 |     """
 45 |     网格调参, 确定其他参数
 46 |     """
 47 |     # 设置训练参数
 48 |     if x_test is None:
 49 |         x_train, x_test, y_train, y_test = sk_ms.train_test_split(x_train, y_train, test_size=0.2, random_state=1)
 50 |     score_list = []
 51 |     test_params = list(ParameterGrid(params_space))
 52 |     for params_try in test_params:
 53 |         params_try['eval_metric'] = "auc"
 54 |         params_try['random_state'] = 1
 55 |         clf_obj = train_xgb(params_try, x_train, y_train, x_test, y_test, num_boost_round=num_boost_round,
 56 |                             early_stopping_rounds=30, verbose_eval=0)
 57 |         score_list.append(roc_auc_score(y_test, clf_obj.predict(xgb.DMatrix(x_test))))
 58 |     result = pd.DataFrame(dict(zip(score_list, test_params))).T
 59 |     print(result)
 60 |     # 取测试集上效果最好的参数组合
 61 |     params = test_params[np.array(score_list).argmax()]
 62 |     return params
 63 | 
 64 | 
 65 | def xgboost_bayesian_optimization(params_space, x_train, y_train, x_test=None, y_test=None, num_boost_round=10000, nfold=5, init_points=2, n_iter=5, verbose_eval=0, early_stopping_rounds=30):
 66 |     """
 67 |     贝叶斯调参, 确定其他参数
 68 |     """
 69 |     # 设置需要调节的参数及效果评价指标
 70 |     def xgboost_cv_for_bo(eta, gamma, max_depth, min_child_weight,
 71 |                           subsample, colsample_bytree):
 72 |         params = {
 73 |             'eval_metric': 'auc',
 74 |             'booster': 'gbtree',
 75 |             'objective': 'binary:logistic',
 76 |             'eta': eta,
 77 |             'gamma': gamma,
 78 |             'max_depth': int(max_depth),
 79 |             'min_child_weight': int(min_child_weight),
 80 |             'subsample': subsample,
 81 |             'colsample_bytree': colsample_bytree,
 82 |             'seed': 1
 83 |         }
 84 |         if x_test is None:
 85 |             dtrain = xgb.DMatrix(x_train, label=y_train)
 86 |             xgb_cross = xgb.cv(params,
 87 |                                dtrain,
 88 |                                nfold=nfold,
 89 |                                metrics='auc',
 90 |                                early_stopping_rounds=early_stopping_rounds,
 91 |                                num_boost_round=num_boost_round)
 92 |             test_auc = xgb_cross['test-auc-mean'].iloc[-1]
 93 |         else:
 94 |             clf_obj = train_xgb(params, x_train, y_train, x_test, y_test, num_boost_round=num_boost_round,
 95 |                                 early_stopping_rounds=early_stopping_rounds, verbose_eval=verbose_eval)
 96 |             test_auc = roc_auc_score(y_test, clf_obj.predict(xgb.DMatrix(x_test)))
 97 |         return test_auc
 98 | 
 99 |     # 指定需要调节参数的取值范围
100 |     xgb_bo_obj = bo.BayesianOptimization(xgboost_cv_for_bo, params_space, random_state=1)
101 |     xgb_bo_obj.maximize(init_points=init_points, n_iter=n_iter)
102 |     best_params = xgb_bo_obj.max['params']
103 |     best_params['max_depth'] = int(best_params['max_depth'])
104 |     best_params['min_child_weight'] = int(best_params['min_child_weight'])
105 |     best_params['eval_metric'] = 'auc'
106 |     best_params['booster'] = 'gbtree'
107 |     best_params['objective'] = 'binary:logistic'
108 |     best_params['seed'] = 1
109 |     return best_params
110 | 
111 | 
112 | # 导入数值型样例数据
113 | train_x, test_x, train_y, test_y = data_utils.get_x_y_split(test_rate=0.2)
114 | 
115 | # 经验参数
116 | exp_params = {
117 |     'eval_metric': 'auc',
118 |     'booster': 'gbtree',
119 |     'objective': 'binary:logistic',
120 |     'eta': 0.1,
121 |     'gamma': 0.01,
122 |     'max_depth': 4,
123 |     'min_child_weight': 1,
124 |     'subsample': 1,
125 |     'colsample_bytree': 1,
126 |     'seed': 1
127 | }
128 | final_xgb_model = train_xgb(exp_params, train_x, train_y, test_x, test_y)
129 | auc_score = roc_auc_score(test_y, final_xgb_model.predict(xgb.DMatrix(test_x)))
130 | print("经验参数模型AUC: ", auc_score)
131 | 
132 | # 随机搜索调参
133 | choose_tuner = 'bayesian'  # bayesian grid_search
134 | if choose_tuner == 'grid_search':
135 |     params_test = {
136 |         'learning_rate': [0.1, 0.15],
137 |         'gamma': [0.01, 0],
138 |         'max_depth': [4, 3],
139 |         'min_child_weight': [1, 2],
140 |         'subsample': [0.95, 1],
141 |         'colsample_bytree': [1]
142 |     }
143 |     optimal_params = xgboost_grid_search(params_test, train_x, train_y, test_x, test_y)
144 | elif choose_tuner == 'bayesian':
145 |     # 贝叶斯调参
146 |     params_test = {'eta': (0.05, 0.2),
147 |                    'gamma': (0.005, 0.05),
148 |                    'max_depth': (3, 5),
149 |                    'min_child_weight': (0, 3),
150 |                    'subsample': (0.9, 1.0),
151 |                    'colsample_bytree': (0.9, 1.0)}
152 |     optimal_params = xgboost_bayesian_optimization(params_test, train_x, train_y, test_x, test_y, init_points=5, n_iter=8)
153 | 
154 | print("随机搜索调参最优参数: ", optimal_params)
155 | 
156 | final_xgb_model = train_xgb(optimal_params, train_x, train_y, test_x, test_y)
157 | auc_score = roc_auc_score(test_y, final_xgb_model.predict(xgb.DMatrix(test_x)))
158 | print("随机搜索调参模型AUC: ", auc_score)
159 | 
160 | # 保存模型
161 | save_model_as_pkl(final_xgb_model, "./data/xgb_model.pkl")
162 | 
163 | # SHAP计算
164 | explainer = shap.TreeExplainer(final_xgb_model)
165 | shap_values = explainer.shap_values(train_x)
166 | # SHAP可视化
167 | shap.summary_plot(shap_values, train_x, max_display=5)
168 | 


--------------------------------------------------------------------------------
/chapter3/ch3_15_gcn_order.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import sys
  4 | sys.path.append("./")
  5 | sys.path.append("../")
  6 | 
  7 | # GCN关系网络节点预测
  8 | import pickle
  9 | import os
 10 | import itertools
 11 | import numpy as np
 12 | import scipy.sparse as sp
 13 | import torch
 14 | import torch.nn as nn
 15 | import torch.nn.functional as F
 16 | import torch.nn.init as init
 17 | import torch.optim as optim
 18 | import matplotlib.pyplot as plt
 19 | from collections import namedtuple
 20 | 
 21 | os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
 22 | cpu_type = "cuda" if torch.cuda.is_available() else "cpu"
 23 | 
 24 | 
 25 | def numpy_to_tensor(x):
 26 |     return torch.from_numpy(x).to(cpu_type)
 27 | 
 28 | 
 29 | def build_adjacency(adj_dict):
 30 |     """
 31 |     根据邻接表创建邻接矩阵
 32 |     :param adj_dict: 输入的邻接表
 33 |     :return: 邻接矩阵
 34 |     """
 35 |     edge_index = []
 36 |     node_counts = len(adj_dict)
 37 |     for src, dst in adj_dict.items():
 38 |         edge_index.extend([src, v] for v in dst)
 39 |         edge_index.extend([v, src] for v in dst)
 40 |     # 去重
 41 |     edge_index = list(k for k, _ in itertools.groupby(sorted(edge_index)))
 42 |     edge_index = np.asarray(edge_index)
 43 |     # 构建邻接矩阵，相接的节点值为1
 44 |     adjacency = sp.coo_matrix((np.ones(len(edge_index)),
 45 |                                (edge_index[:, 0], edge_index[:, 1])),
 46 |                               shape=(node_counts, node_counts), dtype="double")
 47 |     return adjacency
 48 | 
 49 | 
 50 | def read_data(path_of_data):
 51 |     """
 52 |     数据读取
 53 |     :param path_of_data: 文件路径
 54 |     :return:
 55 |     """
 56 |     out = pickle.load(open(path_of_data, "rb"), encoding="latin1")
 57 |     out = out.toarray() if hasattr(out, "toarray") else out
 58 |     return out
 59 | 
 60 | 
 61 | def data_preprocess():
 62 |     print("Start data preprocess.")
 63 |     filenames = ["order.{}".format(name) for name in ['x', 'y', 'graph']]
 64 |     # 图有2000个节点，每个节点有104维特征，y值为0或1，graph用字典表示，字典key为节点编号，value为关联的节点编号list
 65 |     root_path = 'data/graph_data'
 66 |     x, y, graph = [read_data(os.path.join(root_path, name)) for name in filenames]
 67 | 
 68 |     # 划分train，validation和test节点编号
 69 |     train_index = list(range(0, 700))
 70 |     val_index = list(range(700, 1000))
 71 |     test_index = list(range(1000, 2000))
 72 | 
 73 |     num_nodes = x.shape[0]
 74 |     train_mask = np.zeros(num_nodes, dtype=bool)
 75 |     val_mask = np.zeros(num_nodes, dtype=bool)
 76 |     test_mask = np.zeros(num_nodes, dtype=bool)
 77 | 
 78 |     train_mask[train_index] = True
 79 |     val_mask[val_index] = True
 80 |     test_mask[test_index] = True
 81 | 
 82 |     adjacency = build_adjacency(graph)
 83 |     print("特征维度: ", x.shape)
 84 |     print("标签长度: ", y.shape)
 85 |     print("邻接矩阵维度: ", adjacency.shape)
 86 |     # 构建带字段名的元组
 87 |     Data = namedtuple('Data', ['x', 'y', 'adjacency',
 88 |                                'train_mask', 'val_mask', 'test_mask'])
 89 |     return Data(x=x, y=y, adjacency=adjacency,
 90 |                 train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
 91 | 
 92 | 
 93 | def adj_norm(adjacency):
 94 |     """
 95 |     正则化：公式L=D^-0.5 * (A+I) * D^-0.5
 96 |     :param torch.sparse.FloatTensor adjacency:
 97 |     :return:
 98 |     """
 99 |     adjacency += sp.eye(adjacency.shape[0])
100 |     degree = np.array(adjacency.sum(1))
101 |     d_hat = sp.diags(np.power(degree, -0.5).flatten())
102 |     return d_hat.dot(adjacency).dot(d_hat).tocoo()
103 | 
104 | 
105 | class GraphConv(nn.Module):
106 |     def __init__(self, input_dim, output_dim, use_bias=True):
107 |         """
108 |         # 图卷积层定义
109 |         :param int input_dim: 输入特征维度
110 |         :param int output_dim: 输出特征维度
111 |         :param bool use_bias: 偏置
112 |         :return:
113 |         """
114 |         super(GraphConv, self).__init__()
115 |         self.input_dim = input_dim
116 |         self.output_dim = output_dim
117 |         self.use_bias = use_bias
118 |         self.weight = nn.Parameter(torch.Tensor(input_dim, output_dim))
119 |         if self.use_bias:
120 |             self.bias = nn.Parameter(torch.Tensor(output_dim))
121 |         else:
122 |             self.register_parameter('bias', None)
123 |         self.reset_parameters()
124 | 
125 |     def reset_parameters(self):
126 |         init.kaiming_uniform_(self.weight)
127 |         if self.use_bias:
128 |             init.zeros_(self.bias)
129 | 
130 |     def forward(self, adjacency, fea_input):
131 |         """
132 |         :param torch.sparse.FloatTensor adjacency : 邻接矩阵
133 |         :param torch.Tensor fea_input: 输入特征
134 |         :return:
135 |         """
136 |         support = torch.mm(fea_input, self.weight)
137 |         output = torch.sparse.mm(adjacency, support)
138 |         if self.use_bias:
139 |             output += self.bias
140 |         return output
141 | 
142 | 
143 | class GcnNet(nn.Module):
144 |     def __init__(self, input_dim):
145 |         """
146 |         模型定义
147 |         :param int input_dim: 输入特征维度
148 |         """
149 |         super(GcnNet, self).__init__()
150 |         self.gcn1 = GraphConv(input_dim, 16)
151 |         self.gcn2 = GraphConv(16, 2)
152 | 
153 |     def forward(self, adjacency, feature):
154 |         h = F.relu(self.gcn1(adjacency, feature))
155 |         lg = self.gcn2(adjacency, h)
156 |         return lg
157 | 
158 | 
159 | def model_predict(model, tensor, tensor_adj, mask):
160 |     model.eval()
161 |     with torch.no_grad():
162 |         lg = model(tensor_adj, tensor)
163 |         lg_mask = lg[mask]
164 |         y_pred = lg_mask.max(1)[1]
165 |     return y_pred
166 | 
167 | 
168 | def cal_accuracy(y_true, y_pred):
169 |     accuracy = torch.eq(y_pred, y_true).double().mean()
170 |     return accuracy
171 | 
172 | 
173 | def model_train(tensor_x, tensor_y, tensor_adjacency, train_mask, val_mask, epochs, learning_rate,
174 |                 weight_decay):
175 |     # 模型定义：Model, Loss, Optimizer
176 |     model = GcnNet(tensor_x.shape[1]).to(cpu_type)
177 |     optimizer = optim.Adam(model.parameters(),
178 |                            lr=learning_rate,
179 |                            weight_decay=weight_decay)
180 | 
181 |     loss_list = []
182 |     test_accuracy_list = []
183 |     model.train()
184 |     train_y = tensor_y[train_mask].long()
185 | 
186 |     for epoch in range(epochs):
187 |         # 前向传播
188 |         lg = model(tensor_adjacency, tensor_x)
189 |         train_mask_logits = lg[train_mask]
190 |         loss = nn.CrossEntropyLoss().to(cpu_type)(train_mask_logits, train_y)
191 |         optimizer.zero_grad()
192 |         # 反向传播
193 |         loss.backward()
194 |         optimizer.step()
195 |         # 准确率
196 |         train_accuracy = cal_accuracy(tensor_y[train_mask],
197 |                                       model_predict(model, tensor_x, tensor_adjacency, train_mask))
198 |         test_accuracy = cal_accuracy(tensor_y[val_mask],
199 |                                      model_predict(model, tensor_x, tensor_adjacency, val_mask))
200 | 
201 |         loss_list.append(loss.item())
202 |         test_accuracy_list.append(test_accuracy.item())
203 |         if epoch % 10 == 1:
204 |             print("epoch {:04d}: loss {:.4f}, train accuracy {:.4}, test accuracy {:.4f}".format(
205 |                 epoch, loss.item(), train_accuracy.item(), test_accuracy.item()))
206 |     return model, loss_list, test_accuracy_list
207 | 
208 | 
209 | def plot_loss_with_acc(loss_history, val_acc_history):
210 |     fig = plt.figure()
211 |     # 坐标系ax1画曲线1
212 |     ax1 = fig.add_subplot(111)  # 指的是将plot界面分成1行1列，此子图占据从左到右从上到下的1位置
213 |     ax1.plot(range(len(loss_history)), loss_history,
214 |              c=np.array([255, 71, 90]) / 255.)  # c为颜色
215 |     plt.ylabel('Loss')
216 | 
217 |     # 坐标系ax2画曲线2
218 |     ax2 = fig.add_subplot(111, sharex=ax1, frameon=False)  # 其本质就是添加坐标系，设置共享ax1的x轴，ax2背景透明
219 |     ax2.plot(range(len(val_acc_history)), val_acc_history,
220 |              c=np.array([79, 179, 255]) / 255.)
221 |     ax2.yaxis.tick_right()  # 开启右边的y坐标
222 | 
223 |     ax2.yaxis.set_label_position("right")
224 |     plt.ylabel('ValAcc')
225 | 
226 |     plt.xlabel('Epoch')
227 |     plt.title('Training Loss & Validation Accuracy')
228 |     plt.show()
229 | 
230 | 
231 | if __name__ == '__main__':
232 |     # 数据预处理
233 |     dataset = data_preprocess()
234 | 
235 |     # x、y规范化
236 |     node_feature = (dataset.x - dataset.x.mean()) / dataset.x.std()
237 |     tensor_x_all = numpy_to_tensor(node_feature).to(torch.float32)
238 |     tensor_y_all = numpy_to_tensor(dataset.y)
239 | 
240 |     tensor_train_mask = numpy_to_tensor(dataset.train_mask)
241 |     tensor_val_mask = numpy_to_tensor(dataset.val_mask)
242 |     tensor_test_mask = numpy_to_tensor(dataset.test_mask)
243 | 
244 |     # 邻接矩阵规范化
245 |     normed_adj = adj_norm(dataset.adjacency)
246 | 
247 |     indices = torch.from_numpy(np.asarray([normed_adj.row,
248 |                                            normed_adj.col]).astype('int64')).long()
249 |     values = torch.from_numpy(normed_adj.data.astype(np.float32))
250 | 
251 |     tensor_adjacency_all = torch.sparse.FloatTensor(indices, values,
252 |                                                     (node_feature.shape[0], node_feature.shape[0])).to(cpu_type)
253 | 
254 |     # 训练模型并做预测
255 |     gcn_model, loss_arr, test_accuracy_arr = model_train(tensor_x_all, tensor_y_all, tensor_adjacency_all,
256 |                                                          tensor_train_mask,
257 |                                                          tensor_val_mask, epochs=300,
258 |                                                          learning_rate=0.04, weight_decay=5e-4)
259 |     y_predict = model_predict(gcn_model, tensor_x_all, tensor_adjacency_all, tensor_test_mask)
260 |     test_acc = cal_accuracy(tensor_y_all[tensor_test_mask], y_predict)
261 |     print(test_acc.item())
262 | 
263 |     plot_loss_with_acc(loss_arr, test_accuracy_arr)
264 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/data/german_score.csv:
--------------------------------------------------------------------------------
   1 | score,creditability
   2 | 557.0,0
   3 | 337.0,1
   4 | 504.0,0
   5 | 404.0,0
   6 | 495.0,1
   7 | 389.0,0
   8 | 460.0,0
   9 | 385.0,0
  10 | 510.0,0
  11 | 403.0,1
  12 | 432.0,1
  13 | 341.0,1
  14 | 441.0,0
  15 | 505.0,1
  16 | 440.0,0
  17 | 415.0,1
  18 | 483.0,0
  19 | 428.0,0
  20 | 384.0,1
  21 | 425.0,0
  22 | 534.0,0
  23 | 502.0,0
  24 | 542.0,0
  25 | 471.0,0
  26 | 472.0,0
  27 | 485.0,0
  28 | 471.0,0
  29 | 503.0,0
  30 | 456.0,0
  31 | 417.0,1
  32 | 441.0,0
  33 | 407.0,0
  34 | 446.0,0
  35 | 493.0,0
  36 | 440.0,0
  37 | 368.0,1
  38 | 364.0,0
  39 | 434.0,1
  40 | 471.0,0
  41 | 432.0,0
  42 | 393.0,0
  43 | 435.0,0
  44 | 460.0,0
  45 | 415.0,0
  46 | 426.0,1
  47 | 476.0,0
  48 | 400.0,0
  49 | 457.0,0
  50 | 481.0,0
  51 | 427.0,0
  52 | 403.0,0
  53 | 435.0,0
  54 | 433.0,0
  55 | 431.0,0
  56 | 468.0,1
  57 | 481.0,0
  58 | 473.0,1
  59 | 392.0,0
  60 | 412.0,0
  61 | 375.0,1
  62 | 451.0,0
  63 | 500.0,0
  64 | 449.0,1
  65 | 301.0,1
  66 | 392.0,0
  67 | 526.0,0
  68 | 445.0,0
  69 | 419.0,0
  70 | 398.0,1
  71 | 369.0,0
  72 | 376.0,0
  73 | 512.0,0
  74 | 542.0,0
  75 | 420.0,0
  76 | 404.0,1
  77 | 541.0,0
  78 | 368.0,1
  79 | 481.0,0
  80 | 359.0,0
  81 | 385.0,0
  82 | 472.0,1
  83 | 466.0,0
  84 | 416.0,0
  85 | 468.0,0
  86 | 490.0,0
  87 | 461.0,0
  88 | 445.0,0
  89 | 406.0,1
  90 | 433.0,0
  91 | 461.0,1
  92 | 494.0,0
  93 | 486.0,0
  94 | 459.0,1
  95 | 406.0,0
  96 | 487.0,0
  97 | 344.0,1
  98 | 498.0,0
  99 | 424.0,0
 100 | 393.0,0
 101 | 455.0,0
 102 | 433.0,0
 103 | 367.0,0
 104 | 444.0,0
 105 | 449.0,0
 106 | 437.0,0
 107 | 437.0,1
 108 | 477.0,1
 109 | 432.0,0
 110 | 403.0,0
 111 | 466.0,0
 112 | 518.0,0
 113 | 418.0,0
 114 | 402.0,0
 115 | 371.0,1
 116 | 452.0,0
 117 | 386.0,0
 118 | 344.0,1
 119 | 474.0,0
 120 | 416.0,1
 121 | 451.0,0
 122 | 440.0,1
 123 | 450.0,0
 124 | 428.0,0
 125 | 542.0,0
 126 | 413.0,1
 127 | 460.0,0
 128 | 459.0,0
 129 | 438.0,1
 130 | 470.0,0
 131 | 459.0,1
 132 | 337.0,0
 133 | 355.0,1
 134 | 461.0,0
 135 | 485.0,0
 136 | 289.0,0
 137 | 482.0,0
 138 | 407.0,0
 139 | 515.0,1
 140 | 491.0,0
 141 | 479.0,0
 142 | 460.0,0
 143 | 368.0,0
 144 | 395.0,0
 145 | 416.0,1
 146 | 396.0,0
 147 | 349.0,0
 148 | 523.0,0
 149 | 513.0,0
 150 | 398.0,0
 151 | 485.0,0
 152 | 504.0,0
 153 | 478.0,0
 154 | 350.0,0
 155 | 392.0,0
 156 | 395.0,0
 157 | 430.0,1
 158 | 532.0,0
 159 | 472.0,0
 160 | 462.0,0
 161 | 522.0,0
 162 | 439.0,0
 163 | 453.0,0
 164 | 471.0,0
 165 | 512.0,0
 166 | 400.0,0
 167 | 468.0,0
 168 | 430.0,1
 169 | 414.0,0
 170 | 403.0,0
 171 | 438.0,1
 172 | 481.0,1
 173 | 429.0,0
 174 | 423.0,1
 175 | 449.0,0
 176 | 443.0,1
 177 | 417.0,1
 178 | 471.0,0
 179 | 529.0,0
 180 | 478.0,0
 181 | 525.0,0
 182 | 394.0,1
 183 | 416.0,1
 184 | 480.0,1
 185 | 441.0,0
 186 | 452.0,1
 187 | 460.0,0
 188 | 549.0,1
 189 | 585.0,0
 190 | 416.0,1
 191 | 473.0,0
 192 | 521.0,1
 193 | 372.0,1
 194 | 419.0,1
 195 | 458.0,0
 196 | 339.0,1
 197 | 495.0,1
 198 | 497.0,0
 199 | 518.0,1
 200 | 416.0,0
 201 | 429.0,1
 202 | 520.0,0
 203 | 429.0,0
 204 | 414.0,0
 205 | 417.0,1
 206 | 495.0,0
 207 | 450.0,0
 208 | 518.0,0
 209 | 426.0,0
 210 | 381.0,0
 211 | 497.0,0
 212 | 508.0,0
 213 | 422.0,0
 214 | 473.0,1
 215 | 472.0,1
 216 | 421.0,0
 217 | 523.0,0
 218 | 424.0,0
 219 | 374.0,0
 220 | 405.0,0
 221 | 529.0,0
 222 | 430.0,0
 223 | 420.0,0
 224 | 467.0,0
 225 | 450.0,0
 226 | 420.0,0
 227 | 402.0,0
 228 | 360.0,1
 229 | 452.0,1
 230 | 423.0,1
 231 | 384.0,0
 232 | 362.0,1
 233 | 497.0,0
 234 | 450.0,0
 235 | 414.0,0
 236 | 566.0,0
 237 | 423.0,1
 238 | 391.0,1
 239 | 505.0,1
 240 | 475.0,0
 241 | 451.0,0
 242 | 410.0,1
 243 | 514.0,0
 244 | 384.0,1
 245 | 467.0,0
 246 | 450.0,0
 247 | 400.0,0
 248 | 544.0,0
 249 | 458.0,0
 250 | 415.0,0
 251 | 418.0,1
 252 | 507.0,0
 253 | 490.0,0
 254 | 381.0,1
 255 | 453.0,0
 256 | 441.0,0
 257 | 324.0,0
 258 | 443.0,0
 259 | 428.0,1
 260 | 426.0,0
 261 | 554.0,0
 262 | 443.0,0
 263 | 463.0,0
 264 | 491.0,0
 265 | 561.0,0
 266 | 496.0,0
 267 | 461.0,1
 268 | 403.0,0
 269 | 426.0,0
 270 | 450.0,1
 271 | 430.0,0
 272 | 420.0,0
 273 | 458.0,0
 274 | 319.0,0
 275 | 377.0,1
 276 | 378.0,1
 277 | 478.0,0
 278 | 454.0,0
 279 | 493.0,0
 280 | 460.0,1
 281 | 405.0,0
 282 | 441.0,0
 283 | 477.0,0
 284 | 463.0,0
 285 | 499.0,0
 286 | 412.0,0
 287 | 352.0,0
 288 | 350.0,0
 289 | 348.0,0
 290 | 506.0,0
 291 | 450.0,1
 292 | 444.0,0
 293 | 366.0,1
 294 | 476.0,0
 295 | 412.0,0
 296 | 408.0,0
 297 | 334.0,1
 298 | 421.0,0
 299 | 471.0,0
 300 | 453.0,0
 301 | 451.0,0
 302 | 527.0,0
 303 | 399.0,1
 304 | 469.0,1
 305 | 511.0,0
 306 | 364.0,1
 307 | 452.0,0
 308 | 384.0,0
 309 | 444.0,1
 310 | 439.0,1
 311 | 429.0,0
 312 | 370.0,0
 313 | 378.0,0
 314 | 406.0,0
 315 | 444.0,1
 316 | 488.0,0
 317 | 380.0,1
 318 | 487.0,0
 319 | 459.0,0
 320 | 471.0,0
 321 | 465.0,0
 322 | 408.0,1
 323 | 411.0,1
 324 | 427.0,0
 325 | 411.0,0
 326 | 466.0,0
 327 | 506.0,0
 328 | 470.0,0
 329 | 433.0,0
 330 | 372.0,0
 331 | 459.0,0
 332 | 527.0,0
 333 | 449.0,1
 334 | 293.0,1
 335 | 342.0,1
 336 | 434.0,1
 337 | 492.0,1
 338 | 430.0,0
 339 | 416.0,1
 340 | 391.0,0
 341 | 440.0,0
 342 | 421.0,0
 343 | 421.0,0
 344 | 427.0,0
 345 | 438.0,0
 346 | 481.0,0
 347 | 456.0,0
 348 | 449.0,0
 349 | 408.0,0
 350 | 508.0,0
 351 | 490.0,1
 352 | 451.0,0
 353 | 465.0,1
 354 | 447.0,0
 355 | 435.0,1
 356 | 476.0,0
 357 | 395.0,1
 358 | 496.0,0
 359 | 390.0,1
 360 | 433.0,0
 361 | 377.0,1
 362 | 490.0,0
 363 | 456.0,0
 364 | 489.0,0
 365 | 450.0,0
 366 | 406.0,1
 367 | 463.0,0
 368 | 441.0,0
 369 | 418.0,0
 370 | 416.0,1
 371 | 452.0,0
 372 | 389.0,0
 373 | 443.0,0
 374 | 474.0,0
 375 | 362.0,0
 376 | 371.0,1
 377 | 361.0,1
 378 | 472.0,0
 379 | 469.0,0
 380 | 379.0,1
 381 | 529.0,0
 382 | 433.0,0
 383 | 390.0,1
 384 | 403.0,0
 385 | 426.0,0
 386 | 420.0,0
 387 | 453.0,0
 388 | 404.0,0
 389 | 443.0,0
 390 | 434.0,0
 391 | 471.0,0
 392 | 437.0,0
 393 | 438.0,0
 394 | 427.0,0
 395 | 471.0,0
 396 | 454.0,0
 397 | 348.0,0
 398 | 491.0,0
 399 | 452.0,0
 400 | 524.0,1
 401 | 466.0,0
 402 | 449.0,0
 403 | 444.0,0
 404 | 413.0,1
 405 | 456.0,0
 406 | 458.0,0
 407 | 414.0,1
 408 | 478.0,0
 409 | 423.0,0
 410 | 400.0,0
 411 | 491.0,1
 412 | 385.0,0
 413 | 410.0,0
 414 | 485.0,1
 415 | 482.0,0
 416 | 421.0,1
 417 | 435.0,0
 418 | 450.0,1
 419 | 426.0,0
 420 | 436.0,0
 421 | 430.0,1
 422 | 414.0,0
 423 | 431.0,0
 424 | 534.0,0
 425 | 453.0,0
 426 | 442.0,1
 427 | 421.0,0
 428 | 422.0,0
 429 | 487.0,0
 430 | 444.0,0
 431 | 567.0,1
 432 | 558.0,0
 433 | 404.0,1
 434 | 516.0,0
 435 | 442.0,0
 436 | 434.0,0
 437 | 440.0,1
 438 | 452.0,0
 439 | 455.0,0
 440 | 468.0,0
 441 | 430.0,1
 442 | 451.0,0
 443 | 450.0,0
 444 | 455.0,0
 445 | 476.0,1
 446 | 358.0,1
 447 | 494.0,0
 448 | 391.0,1
 449 | 466.0,0
 450 | 496.0,0
 451 | 532.0,1
 452 | 337.0,0
 453 | 457.0,0
 454 | 482.0,0
 455 | 414.0,0
 456 | 411.0,1
 457 | 401.0,0
 458 | 498.0,0
 459 | 443.0,1
 460 | 445.0,0
 461 | 419.0,0
 462 | 429.0,0
 463 | 456.0,0
 464 | 443.0,0
 465 | 484.0,0
 466 | 457.0,0
 467 | 497.0,0
 468 | 406.0,1
 469 | 388.0,0
 470 | 422.0,0
 471 | 442.0,0
 472 | 392.0,1
 473 | 436.0,1
 474 | 440.0,1
 475 | 475.0,0
 476 | 429.0,1
 477 | 430.0,1
 478 | 360.0,0
 479 | 380.0,0
 480 | 464.0,0
 481 | 502.0,0
 482 | 433.0,0
 483 | 402.0,0
 484 | 474.0,0
 485 | 468.0,0
 486 | 466.0,0
 487 | 484.0,1
 488 | 490.0,0
 489 | 467.0,0
 490 | 457.0,0
 491 | 444.0,0
 492 | 469.0,0
 493 | 441.0,1
 494 | 495.0,0
 495 | 469.0,0
 496 | 503.0,0
 497 | 397.0,1
 498 | 345.0,1
 499 | 460.0,0
 500 | 427.0,0
 501 | 504.0,0
 502 | 395.0,1
 503 | 422.0,0
 504 | 499.0,0
 505 | 472.0,1
 506 | 398.0,1
 507 | 432.0,1
 508 | 453.0,0
 509 | 450.0,1
 510 | 405.0,0
 511 | 375.0,0
 512 | 429.0,1
 513 | 390.0,0
 514 | 430.0,0
 515 | 446.0,0
 516 | 457.0,0
 517 | 495.0,0
 518 | 500.0,0
 519 | 387.0,1
 520 | 484.0,0
 521 | 492.0,0
 522 | 457.0,0
 523 | 417.0,1
 524 | 434.0,1
 525 | 392.0,0
 526 | 430.0,0
 527 | 419.0,0
 528 | 430.0,0
 529 | 537.0,0
 530 | 383.0,1
 531 | 497.0,0
 532 | 438.0,0
 533 | 464.0,1
 534 | 458.0,0
 535 | 401.0,0
 536 | 420.0,0
 537 | 434.0,1
 538 | 544.0,0
 539 | 434.0,0
 540 | 354.0,1
 541 | 471.0,0
 542 | 444.0,1
 543 | 501.0,0
 544 | 375.0,1
 545 | 455.0,1
 546 | 531.0,0
 547 | 482.0,1
 548 | 422.0,0
 549 | 437.0,0
 550 | 425.0,1
 551 | 380.0,0
 552 | 450.0,0
 553 | 509.0,0
 554 | 395.0,1
 555 | 425.0,0
 556 | 551.0,0
 557 | 434.0,1
 558 | 451.0,1
 559 | 450.0,1
 560 | 401.0,1
 561 | 467.0,1
 562 | 458.0,0
 563 | 396.0,1
 564 | 465.0,0
 565 | 370.0,1
 566 | 438.0,0
 567 | 431.0,0
 568 | 464.0,1
 569 | 429.0,0
 570 | 416.0,0
 571 | 344.0,1
 572 | 385.0,1
 573 | 400.0,0
 574 | 397.0,0
 575 | 414.0,0
 576 | 465.0,0
 577 | 454.0,0
 578 | 437.0,0
 579 | 451.0,0
 580 | 401.0,1
 581 | 435.0,0
 582 | 461.0,1
 583 | 536.0,0
 584 | 432.0,0
 585 | 387.0,1
 586 | 478.0,0
 587 | 422.0,1
 588 | 507.0,0
 589 | 415.0,0
 590 | 460.0,1
 591 | 532.0,1
 592 | 588.0,0
 593 | 462.0,0
 594 | 428.0,0
 595 | 390.0,1
 596 | 431.0,1
 597 | 478.0,1
 598 | 424.0,1
 599 | 492.0,1
 600 | 463.0,1
 601 | 427.0,0
 602 | 497.0,0
 603 | 443.0,1
 604 | 416.0,1
 605 | 371.0,1
 606 | 424.0,0
 607 | 385.0,0
 608 | 497.0,0
 609 | 422.0,1
 610 | 425.0,0
 611 | 476.0,0
 612 | 420.0,1
 613 | 518.0,1
 614 | 404.0,0
 615 | 406.0,0
 616 | 403.0,1
 617 | 362.0,0
 618 | 307.0,0
 619 | 549.0,0
 620 | 384.0,1
 621 | 479.0,0
 622 | 444.0,0
 623 | 463.0,1
 624 | 365.0,1
 625 | 415.0,0
 626 | 521.0,1
 627 | 436.0,0
 628 | 483.0,0
 629 | 455.0,1
 630 | 402.0,0
 631 | 527.0,0
 632 | 411.0,0
 633 | 451.0,1
 634 | 413.0,0
 635 | 460.0,1
 636 | 407.0,1
 637 | 545.0,0
 638 | 405.0,0
 639 | 291.0,0
 640 | 457.0,0
 641 | 402.0,1
 642 | 419.0,1
 643 | 475.0,0
 644 | 442.0,1
 645 | 443.0,0
 646 | 453.0,0
 647 | 375.0,1
 648 | 432.0,0
 649 | 442.0,1
 650 | 444.0,1
 651 | 474.0,1
 652 | 373.0,0
 653 | 447.0,1
 654 | 438.0,1
 655 | 482.0,1
 656 | 445.0,0
 657 | 425.0,0
 658 | 476.0,1
 659 | 331.0,0
 660 | 425.0,0
 661 | 438.0,0
 662 | 428.0,0
 663 | 422.0,1
 664 | 484.0,0
 665 | 489.0,0
 666 | 508.0,0
 667 | 408.0,0
 668 | 410.0,0
 669 | 368.0,0
 670 | 483.0,1
 671 | 426.0,0
 672 | 427.0,0
 673 | 411.0,0
 674 | 333.0,0
 675 | 455.0,0
 676 | 449.0,1
 677 | 373.0,0
 678 | 397.0,0
 679 | 294.0,1
 680 | 478.0,0
 681 | 421.0,0
 682 | 527.0,0
 683 | 445.0,0
 684 | 463.0,0
 685 | 495.0,0
 686 | 414.0,0
 687 | 334.0,0
 688 | 462.0,0
 689 | 378.0,0
 690 | 453.0,0
 691 | 465.0,0
 692 | 465.0,0
 693 | 420.0,0
 694 | 417.0,0
 695 | 530.0,0
 696 | 401.0,0
 697 | 507.0,0
 698 | 463.0,0
 699 | 479.0,0
 700 | 436.0,0
 701 | 447.0,0
 702 | 433.0,1
 703 | 399.0,1
 704 | 446.0,0
 705 | 445.0,0
 706 | 417.0,0
 707 | 444.0,0
 708 | 330.0,1
 709 | 445.0,1
 710 | 431.0,0
 711 | 484.0,0
 712 | 459.0,0
 713 | 458.0,1
 714 | 445.0,0
 715 | 460.0,0
 716 | 266.0,1
 717 | 492.0,0
 718 | 463.0,0
 719 | 426.0,0
 720 | 441.0,0
 721 | 434.0,0
 722 | 479.0,1
 723 | 454.0,1
 724 | 422.0,1
 725 | 521.0,0
 726 | 386.0,1
 727 | 522.0,0
 728 | 511.0,0
 729 | 438.0,1
 730 | 398.0,1
 731 | 469.0,0
 732 | 416.0,0
 733 | 419.0,1
 734 | 476.0,0
 735 | 416.0,0
 736 | 496.0,0
 737 | 378.0,0
 738 | 398.0,1
 739 | 441.0,0
 740 | 522.0,0
 741 | 404.0,1
 742 | 426.0,0
 743 | 425.0,0
 744 | 431.0,0
 745 | 387.0,0
 746 | 344.0,0
 747 | 464.0,0
 748 | 432.0,0
 749 | 458.0,1
 750 | 413.0,0
 751 | 443.0,0
 752 | 509.0,0
 753 | 434.0,1
 754 | 439.0,0
 755 | 394.0,0
 756 | 532.0,1
 757 | 414.0,1
 758 | 641.0,0
 759 | 486.0,1
 760 | 428.0,0
 761 | 478.0,1
 762 | 498.0,0
 763 | 435.0,1
 764 | 417.0,0
 765 | 359.0,1
 766 | 427.0,0
 767 | 495.0,0
 768 | 408.0,1
 769 | 458.0,0
 770 | 502.0,0
 771 | 550.0,0
 772 | 411.0,0
 773 | 379.0,1
 774 | 444.0,0
 775 | 482.0,0
 776 | 587.0,0
 777 | 399.0,1
 778 | 419.0,0
 779 | 401.0,0
 780 | 410.0,0
 781 | 506.0,0
 782 | 396.0,1
 783 | 472.0,0
 784 | 453.0,0
 785 | 422.0,1
 786 | 482.0,0
 787 | 430.0,0
 788 | 437.0,0
 789 | 415.0,0
 790 | 379.0,1
 791 | 347.0,1
 792 | 468.0,1
 793 | 432.0,0
 794 | 495.0,0
 795 | 456.0,0
 796 | 406.0,0
 797 | 435.0,0
 798 | 477.0,1
 799 | 464.0,0
 800 | 494.0,0
 801 | 451.0,0
 802 | 505.0,0
 803 | 496.0,0
 804 | 445.0,0
 805 | 476.0,0
 806 | 416.0,0
 807 | 349.0,1
 808 | 450.0,0
 809 | 600.0,0
 810 | 412.0,0
 811 | 464.0,1
 812 | 445.0,0
 813 | 455.0,0
 814 | 360.0,1
 815 | 412.0,1
 816 | 397.0,1
 817 | 423.0,0
 818 | 541.0,0
 819 | 487.0,0
 820 | 358.0,0
 821 | 414.0,1
 822 | 467.0,0
 823 | 422.0,0
 824 | 434.0,1
 825 | 512.0,0
 826 | 459.0,0
 827 | 444.0,0
 828 | 499.0,1
 829 | 482.0,1
 830 | 395.0,1
 831 | 366.0,0
 832 | 479.0,0
 833 | 408.0,1
 834 | 358.0,1
 835 | 434.0,0
 836 | 429.0,1
 837 | 504.0,1
 838 | 417.0,0
 839 | 480.0,0
 840 | 502.0,0
 841 | 467.0,0
 842 | 364.0,1
 843 | 441.0,0
 844 | 404.0,1
 845 | 452.0,0
 846 | 522.0,0
 847 | 430.0,0
 848 | 523.0,1
 849 | 406.0,0
 850 | 512.0,0
 851 | 496.0,1
 852 | 449.0,1
 853 | 462.0,0
 854 | 468.0,0
 855 | 470.0,1
 856 | 431.0,0
 857 | 416.0,0
 858 | 463.0,0
 859 | 414.0,0
 860 | 422.0,1
 861 | 461.0,0
 862 | 410.0,0
 863 | 414.0,1
 864 | 415.0,1
 865 | 436.0,0
 866 | 440.0,1
 867 | 419.0,0
 868 | 411.0,0
 869 | 467.0,0
 870 | 408.0,0
 871 | 425.0,0
 872 | 421.0,0
 873 | 551.0,0
 874 | 430.0,0
 875 | 418.0,0
 876 | 453.0,0
 877 | 488.0,0
 878 | 441.0,0
 879 | 370.0,0
 880 | 442.0,1
 881 | 430.0,0
 882 | 403.0,0
 883 | 426.0,0
 884 | 436.0,0
 885 | 528.0,0
 886 | 433.0,1
 887 | 487.0,1
 888 | 455.0,0
 889 | 291.0,1
 890 | 397.0,0
 891 | 444.0,0
 892 | 529.0,0
 893 | 490.0,0
 894 | 477.0,0
 895 | 409.0,0
 896 | 450.0,0
 897 | 373.0,0
 898 | 405.0,0
 899 | 452.0,0
 900 | 456.0,0
 901 | 451.0,1
 902 | 465.0,1
 903 | 484.0,0
 904 | 411.0,0
 905 | 450.0,0
 906 | 423.0,0
 907 | 446.0,0
 908 | 408.0,0
 909 | 384.0,0
 910 | 506.0,0
 911 | 463.0,0
 912 | 408.0,0
 913 | 399.0,1
 914 | 395.0,0
 915 | 453.0,0
 916 | 403.0,1
 917 | 305.0,1
 918 | 475.0,0
 919 | 486.0,1
 920 | 436.0,1
 921 | 419.0,1
 922 | 445.0,0
 923 | 319.0,0
 924 | 432.0,1
 925 | 455.0,0
 926 | 453.0,1
 927 | 502.0,1
 928 | 404.0,0
 929 | 409.0,1
 930 | 455.0,0
 931 | 507.0,0
 932 | 395.0,0
 933 | 422.0,1
 934 | 479.0,0
 935 | 509.0,0
 936 | 419.0,0
 937 | 424.0,1
 938 | 448.0,1
 939 | 443.0,0
 940 | 337.0,1
 941 | 478.0,0
 942 | 488.0,0
 943 | 463.0,0
 944 | 414.0,0
 945 | 567.0,0
 946 | 461.0,0
 947 | 380.0,0
 948 | 415.0,1
 949 | 444.0,0
 950 | 464.0,1
 951 | 447.0,1
 952 | 515.0,0
 953 | 414.0,1
 954 | 400.0,1
 955 | 358.0,1
 956 | 430.0,0
 957 | 498.0,0
 958 | 456.0,0
 959 | 532.0,0
 960 | 429.0,1
 961 | 401.0,0
 962 | 491.0,0
 963 | 502.0,0
 964 | 424.0,0
 965 | 453.0,1
 966 | 442.0,0
 967 | 387.0,0
 968 | 412.0,1
 969 | 469.0,0
 970 | 358.0,0
 971 | 515.0,0
 972 | 410.0,0
 973 | 448.0,0
 974 | 463.0,1
 975 | 319.0,1
 976 | 397.0,0
 977 | 477.0,0
 978 | 540.0,0
 979 | 472.0,0
 980 | 485.0,1
 981 | 435.0,1
 982 | 420.0,1
 983 | 358.0,1
 984 | 429.0,0
 985 | 374.0,1
 986 | 452.0,0
 987 | 446.0,0
 988 | 401.0,0
 989 | 522.0,0
 990 | 382.0,0
 991 | 476.0,0
 992 | 500.0,0
 993 | 451.0,0
 994 | 450.0,0
 995 | 372.0,0
 996 | 473.0,0
 997 | 443.0,0
 998 | 407.0,0
 999 | 455.0,0
1000 | 349.0,1
1001 | 352.0,0
1002 | 


--------------------------------------------------------------------------------
/data/text_data/stopwords.txt:
--------------------------------------------------------------------------------
   1 | !
   2 | "
   3 | #
   4 | $
   5 | %
   6 | &
   7 | '
   8 | (
   9 | )
  10 | *
  11 | +
  12 | ,
  13 | -
  14 | --
  15 | .
  16 | ..
  17 | ...
  18 | ......
  19 | ...................
  20 | ./
  21 | .一
  22 | 记者
  23 | 数
  24 | 年
  25 | 月
  26 | 日
  27 | 时
  28 | 分
  29 | 秒
  30 | /
  31 | //
  32 | 0
  33 | 1
  34 | 2
  35 | 3
  36 | 4
  37 | 5
  38 | 6
  39 | 7
  40 | 8
  41 | 9
  42 | :
  43 | ://
  44 | ::
  45 | ;
  46 | <
  47 | =
  48 | >
  49 | >>
  50 | ?
  51 | @
  52 | A
  53 | Lex
  54 | [
  55 | \
  56 | ]
  57 | 【
  58 | 】
  59 | ^
  60 | _
  61 | `
  62 | exp
  63 | sub
  64 | sup
  65 | |
  66 | }
  67 | ~
  68 | ~~~~
  69 | ·
  70 | ×
  71 | ×××
  72 | Δ
  73 | Ψ
  74 | γ
  75 | μ
  76 | φ
  77 | φ．
  78 | В
  79 | —
  80 | ——
  81 | ———
  82 | ‘
  83 | ’
  84 | ’‘
  85 | “
  86 | ”
  87 | ”，
  88 | …
  89 | ……
  90 | …………………………………………………③
  91 | ′∈
  92 | ′｜
  93 | ℃
  94 | Ⅲ
  95 | ↑
  96 | →
  97 | ∈［
  98 | ∪φ∈
  99 | ≈
 100 | ①
 101 | ②
 102 | ②ｃ
 103 | ③
 104 | ③］
 105 | ④
 106 | ⑤
 107 | ⑥
 108 | ⑦
 109 | ⑧
 110 | ⑨
 111 | ⑩
 112 | ──
 113 | ■
 114 | ▲
 115 | 　
 116 | 、
 117 | 。
 118 | 〈
 119 | 〉
 120 | 《
 121 | 》
 122 | 》），
 123 | 」
 124 | 『
 125 | 』
 126 | 〔
 127 | 〕
 128 | 〕〔
 129 | ㈧
 130 | 一
 131 | 一.
 132 | 一一
 133 | 一下
 134 | 一个
 135 | 一些
 136 | 一何
 137 | 一切
 138 | 一则
 139 | 一则通过
 140 | 一天
 141 | 一定
 142 | 一方面
 143 | 一旦
 144 | 一时
 145 | 一来
 146 | 一样
 147 | 一次
 148 | 一片
 149 | 一番
 150 | 一直
 151 | 一致
 152 | 一般
 153 | 一起
 154 | 一转眼
 155 | 一边
 156 | 一面
 157 | 男子
 158 | 女子
 159 | 七
 160 | 万一
 161 | 三
 162 | 三天两头
 163 | 三番两次
 164 | 三番五次
 165 | 上
 166 | 上下
 167 | 上升
 168 | 上去
 169 | 上来
 170 | 上述
 171 | 上面
 172 | 下
 173 | 下列
 174 | 下去
 175 | 下来
 176 | 下面
 177 | 不
 178 | 不一
 179 | 不下
 180 | 不久
 181 | 不了
 182 | 不亦乐乎
 183 | 不仅
 184 | 不仅...而且
 185 | 不仅仅
 186 | 不仅仅是
 187 | 不会
 188 | 不但
 189 | 不但...而且
 190 | 不光
 191 | 不免
 192 | 不再
 193 | 不力
 194 | 不单
 195 | 不变
 196 | 不只
 197 | 不可
 198 | 不可开交
 199 | 不可抗拒
 200 | 不同
 201 | 不外
 202 | 不外乎
 203 | 不够
 204 | 不大
 205 | 不如
 206 | 不妨
 207 | 不定
 208 | 不对
 209 | 不少
 210 | 不尽
 211 | 不尽然
 212 | 不巧
 213 | 不已
 214 | 不常
 215 | 不得
 216 | 不得不
 217 | 不得了
 218 | 不得已
 219 | 不必
 220 | 不怎么
 221 | 不怕
 222 | 不惟
 223 | 不成
 224 | 不拘
 225 | 不择手段
 226 | 不敢
 227 | 不料
 228 | 不断
 229 | 不日
 230 | 不时
 231 | 不是
 232 | 不曾
 233 | 不止
 234 | 不止一次
 235 | 不比
 236 | 不消
 237 | 不满
 238 | 不然
 239 | 不然的话
 240 | 不特
 241 | 不独
 242 | 不由得
 243 | 不知不觉
 244 | 不管
 245 | 不管怎样
 246 | 不经意
 247 | 不胜
 248 | 不能
 249 | 不能不
 250 | 不至于
 251 | 不若
 252 | 不要
 253 | 不论
 254 | 不起
 255 | 不足
 256 | 不过
 257 | 不迭
 258 | 不问
 259 | 不限
 260 | 与
 261 | 与其
 262 | 与其说
 263 | 与否
 264 | 与此同时
 265 | 专门
 266 | 且
 267 | 且不说
 268 | 且说
 269 | 两者
 270 | 严格
 271 | 严重
 272 | 个
 273 | 个人
 274 | 个别
 275 | 中小
 276 | 中间
 277 | 丰富
 278 | 串行
 279 | 临
 280 | 临到
 281 | 为
 282 | 为主
 283 | 为了
 284 | 为什么
 285 | 为什麽
 286 | 为何
 287 | 为止
 288 | 为此
 289 | 为着
 290 | 主张
 291 | 主要
 292 | 举凡
 293 | 举行
 294 | 乃
 295 | 乃至
 296 | 乃至于
 297 | 么
 298 | 之
 299 | 之一
 300 | 之前
 301 | 之后
 302 | 之後
 303 | 之所以
 304 | 之类
 305 | 乌乎
 306 | 乎
 307 | 乒
 308 | 乘
 309 | 乘势
 310 | 乘机
 311 | 乘胜
 312 | 乘虚
 313 | 乘隙
 314 | 九
 315 | 也
 316 | 也好
 317 | 也就是说
 318 | 也是
 319 | 也罢
 320 | 了
 321 | 了解
 322 | 争取
 323 | 二
 324 | 二来
 325 | 二话不说
 326 | 二话没说
 327 | 于
 328 | 于是
 329 | 于是乎
 330 | 云云
 331 | 云尔
 332 | 互
 333 | 互相
 334 | 五
 335 | 些
 336 | 交口
 337 | 亦
 338 | 产生
 339 | 亲口
 340 | 亲手
 341 | 亲眼
 342 | 亲自
 343 | 亲身
 344 | 人
 345 | 人人
 346 | 人们
 347 | 人家
 348 | 人民
 349 | 什么
 350 | 什么样
 351 | 什麽
 352 | 仅
 353 | 仅仅
 354 | 今
 355 | 今后
 356 | 今天
 357 | 今年
 358 | 今後
 359 | 介于
 360 | 仍
 361 | 仍旧
 362 | 仍然
 363 | 从
 364 | 从不
 365 | 从严
 366 | 从中
 367 | 从事
 368 | 从今以后
 369 | 从优
 370 | 从古到今
 371 | 从古至今
 372 | 从头
 373 | 从宽
 374 | 从小
 375 | 从新
 376 | 从无到有
 377 | 从早到晚
 378 | 从未
 379 | 从来
 380 | 从此
 381 | 从此以后
 382 | 从而
 383 | 从轻
 384 | 从速
 385 | 从重
 386 | 他
 387 | 他人
 388 | 他们
 389 | 他是
 390 | 他的
 391 | 代替
 392 | 以
 393 | 以上
 394 | 以下
 395 | 以为
 396 | 以便
 397 | 以免
 398 | 以前
 399 | 以及
 400 | 以后
 401 | 以外
 402 | 以後
 403 | 以故
 404 | 以期
 405 | 以来
 406 | 以至
 407 | 以至于
 408 | 以致
 409 | 们
 410 | 任
 411 | 任何
 412 | 任凭
 413 | 任务
 414 | 企图
 415 | 伙同
 416 | 会
 417 | 伟大
 418 | 传
 419 | 传说
 420 | 传闻
 421 | 似乎
 422 | 似的
 423 | 但
 424 | 但凡
 425 | 但愿
 426 | 但是
 427 | 何
 428 | 何乐而不为
 429 | 何以
 430 | 何况
 431 | 何处
 432 | 何妨
 433 | 何尝
 434 | 何必
 435 | 何时
 436 | 何止
 437 | 何苦
 438 | 何须
 439 | 余外
 440 | 作为
 441 | 你
 442 | 你们
 443 | 你是
 444 | 你的
 445 | 使
 446 | 使得
 447 | 使用
 448 | 例如
 449 | 依
 450 | 依据
 451 | 依照
 452 | 依靠
 453 | 便
 454 | 便于
 455 | 促进
 456 | 保持
 457 | 保管
 458 | 保险
 459 | 俺
 460 | 俺们
 461 | 倍加
 462 | 倍感
 463 | 倒不如
 464 | 倒不如说
 465 | 倒是
 466 | 倘
 467 | 倘使
 468 | 倘或
 469 | 倘然
 470 | 倘若
 471 | 借
 472 | 借以
 473 | 借此
 474 | 假使
 475 | 假如
 476 | 假若
 477 | 偏偏
 478 | 做到
 479 | 偶尔
 480 | 偶而
 481 | 傥然
 482 | 像
 483 | 儿
 484 | 允许
 485 | 元／吨
 486 | 充其极
 487 | 充其量
 488 | 充分
 489 | 先不先
 490 | 先后
 491 | 先後
 492 | 先生
 493 | 光
 494 | 光是
 495 | 全体
 496 | 全力
 497 | 全年
 498 | 全然
 499 | 全身心
 500 | 全部
 501 | 全都
 502 | 全面
 503 | 八
 504 | 八成
 505 | 公然
 506 | 六
 507 | 兮
 508 | 共
 509 | 共同
 510 | 共总
 511 | 关于
 512 | 其
 513 | 其一
 514 | 其中
 515 | 其二
 516 | 其他
 517 | 其余
 518 | 其后
 519 | 其它
 520 | 其实
 521 | 其次
 522 | 具体
 523 | 具体地说
 524 | 具体来说
 525 | 具体说来
 526 | 具有
 527 | 兼之
 528 | 内
 529 | 再
 530 | 再其次
 531 | 再则
 532 | 再有
 533 | 再次
 534 | 再者
 535 | 再者说
 536 | 再说
 537 | 冒
 538 | 冲
 539 | 决不
 540 | 决定
 541 | 决非
 542 | 况且
 543 | 准备
 544 | 凑巧
 545 | 凝神
 546 | 几
 547 | 几乎
 548 | 几度
 549 | 几时
 550 | 几番
 551 | 几经
 552 | 凡
 553 | 凡是
 554 | 凭
 555 | 凭借
 556 | 出
 557 | 出于
 558 | 出去
 559 | 出来
 560 | 出现
 561 | 分别
 562 | 分头
 563 | 分期
 564 | 分期分批
 565 | 切
 566 | 切不可
 567 | 切切
 568 | 切勿
 569 | 切莫
 570 | 则
 571 | 则甚
 572 | 刚
 573 | 刚好
 574 | 刚巧
 575 | 刚才
 576 | 初
 577 | 别
 578 | 别人
 579 | 别处
 580 | 别是
 581 | 别的
 582 | 别管
 583 | 别说
 584 | 到
 585 | 到了儿
 586 | 到处
 587 | 到头
 588 | 到头来
 589 | 到底
 590 | 到目前为止
 591 | 前后
 592 | 前此
 593 | 前者
 594 | 前进
 595 | 前面
 596 | 加上
 597 | 加之
 598 | 加以
 599 | 加入
 600 | 加强
 601 | 动不动
 602 | 动辄
 603 | 勃然
 604 | 匆匆
 605 | 十分
 606 | 千
 607 | 千万
 608 | 千万千万
 609 | 半
 610 | 单
 611 | 单单
 612 | 单纯
 613 | 即
 614 | 即令
 615 | 即使
 616 | 即便
 617 | 即刻
 618 | 即如
 619 | 即将
 620 | 即或
 621 | 即是说
 622 | 即若
 623 | 却
 624 | 却不
 625 | 历
 626 | 原来
 627 | 去
 628 | 又
 629 | 又及
 630 | 及
 631 | 及其
 632 | 及时
 633 | 及至
 634 | 双方
 635 | 反之
 636 | 反之亦然
 637 | 反之则
 638 | 反倒
 639 | 反倒是
 640 | 反应
 641 | 反手
 642 | 反映
 643 | 反而
 644 | 反过来
 645 | 反过来说
 646 | 取得
 647 | 取道
 648 | 受到
 649 | 变成
 650 | 古来
 651 | 另
 652 | 另一个
 653 | 另一方面
 654 | 另外
 655 | 另悉
 656 | 另方面
 657 | 另行
 658 | 只
 659 | 只当
 660 | 只怕
 661 | 只是
 662 | 只有
 663 | 只消
 664 | 只要
 665 | 只限
 666 | 叫
 667 | 叫做
 668 | 召开
 669 | 叮咚
 670 | 叮当
 671 | 可
 672 | 可以
 673 | 可好
 674 | 可是
 675 | 可能
 676 | 可见
 677 | 各
 678 | 各个
 679 | 各人
 680 | 各位
 681 | 各地
 682 | 各式
 683 | 各种
 684 | 各级
 685 | 各自
 686 | 合理
 687 | 同
 688 | 同一
 689 | 同时
 690 | 同样
 691 | 后
 692 | 后来
 693 | 后者
 694 | 后面
 695 | 向
 696 | 向使
 697 | 向着
 698 | 吓
 699 | 吗
 700 | 否则
 701 | 吧
 702 | 吧哒
 703 | 吱
 704 | 呀
 705 | 呃
 706 | 呆呆地
 707 | 呐
 708 | 呕
 709 | 呗
 710 | 呜
 711 | 呜呼
 712 | 呢
 713 | 周围
 714 | 呵
 715 | 呵呵
 716 | 呸
 717 | 呼哧
 718 | 呼啦
 719 | 咋
 720 | 和
 721 | 咚
 722 | 咦
 723 | 咧
 724 | 咱
 725 | 咱们
 726 | 咳
 727 | 哇
 728 | 哈
 729 | 哈哈
 730 | 哉
 731 | 哎
 732 | 哎呀
 733 | 哎哟
 734 | 哗
 735 | 哗啦
 736 | 哟
 737 | 哦
 738 | 哩
 739 | 哪
 740 | 哪个
 741 | 哪些
 742 | 哪儿
 743 | 哪天
 744 | 哪年
 745 | 哪怕
 746 | 哪样
 747 | 哪边
 748 | 哪里
 749 | 哼
 750 | 哼唷
 751 | 唉
 752 | 唯有
 753 | 啊
 754 | 啊呀
 755 | 啊哈
 756 | 啊哟
 757 | 啐
 758 | 啥
 759 | 啦
 760 | 啪达
 761 | 啷当
 762 | 喀
 763 | 喂
 764 | 喏
 765 | 喔唷
 766 | 喽
 767 | 嗡
 768 | 嗡嗡
 769 | 嗬
 770 | 嗯
 771 | 嗳
 772 | 嘎
 773 | 嘎嘎
 774 | 嘎登
 775 | 嘘
 776 | 嘛
 777 | 嘻
 778 | 嘿
 779 | 嘿嘿
 780 | 四
 781 | 因
 782 | 因为
 783 | 因了
 784 | 因此
 785 | 因着
 786 | 因而
 787 | 固
 788 | 固然
 789 | 在
 790 | 在下
 791 | 在于
 792 | 地
 793 | 均
 794 | 坚决
 795 | 坚持
 796 | 基于
 797 | 基本
 798 | 基本上
 799 | 处在
 800 | 处处
 801 | 处理
 802 | 复杂
 803 | 多
 804 | 多么
 805 | 多亏
 806 | 多多
 807 | 多多少少
 808 | 多多益善
 809 | 多少
 810 | 多年前
 811 | 多年来
 812 | 多数
 813 | 多次
 814 | 够瞧的
 815 | 大
 816 | 大不了
 817 | 大举
 818 | 大事
 819 | 大体
 820 | 大体上
 821 | 大凡
 822 | 大力
 823 | 大多
 824 | 大多数
 825 | 大大
 826 | 大家
 827 | 大张旗鼓
 828 | 大批
 829 | 大抵
 830 | 大概
 831 | 大略
 832 | 大约
 833 | 大致
 834 | 大都
 835 | 大量
 836 | 大面儿上
 837 | 失去
 838 | 奇
 839 | 奈
 840 | 奋勇
 841 | 她
 842 | 她们
 843 | 她是
 844 | 她的
 845 | 好
 846 | 好在
 847 | 好的
 848 | 好象
 849 | 如
 850 | 如上
 851 | 如上所述
 852 | 如下
 853 | 如今
 854 | 如何
 855 | 如其
 856 | 如前所述
 857 | 如同
 858 | 如常
 859 | 如是
 860 | 如期
 861 | 如果
 862 | 如次
 863 | 如此
 864 | 如此等等
 865 | 如若
 866 | 始而
 867 | 姑且
 868 | 存在
 869 | 存心
 870 | 孰料
 871 | 孰知
 872 | 宁
 873 | 宁可
 874 | 宁愿
 875 | 宁肯
 876 | 它
 877 | 它们
 878 | 它们的
 879 | 它是
 880 | 它的
 881 | 安全
 882 | 完全
 883 | 完成
 884 | 定
 885 | 实现
 886 | 实际
 887 | 宣布
 888 | 容易
 889 | 密切
 890 | 对
 891 | 对于
 892 | 对应
 893 | 对待
 894 | 对方
 895 | 对比
 896 | 将
 897 | 将才
 898 | 将要
 899 | 将近
 900 | 小
 901 | 少数
 902 | 尔
 903 | 尔后
 904 | 尔尔
 905 | 尔等
 906 | 尚且
 907 | 尤其
 908 | 就
 909 | 就地
 910 | 就是
 911 | 就是了
 912 | 就是说
 913 | 就此
 914 | 就算
 915 | 就要
 916 | 尽
 917 | 尽可能
 918 | 尽如人意
 919 | 尽心尽力
 920 | 尽心竭力
 921 | 尽快
 922 | 尽早
 923 | 尽然
 924 | 尽管
 925 | 尽管如此
 926 | 尽量
 927 | 局外
 928 | 居然
 929 | 届时
 930 | 属于
 931 | 屡
 932 | 屡屡
 933 | 屡次
 934 | 屡次三番
 935 | 岂
 936 | 岂但
 937 | 岂止
 938 | 岂非
 939 | 川流不息
 940 | 左右
 941 | 巨大
 942 | 巩固
 943 | 差一点
 944 | 差不多
 945 | 己
 946 | 已
 947 | 已矣
 948 | 已经
 949 | 巴
 950 | 巴巴
 951 | 带
 952 | 帮助
 953 | 常
 954 | 常常
 955 | 常言说
 956 | 常言说得好
 957 | 常言道
 958 | 平素
 959 | 年复一年
 960 | 并
 961 | 并不
 962 | 并不是
 963 | 并且
 964 | 并排
 965 | 并无
 966 | 并没
 967 | 并没有
 968 | 并肩
 969 | 并非
 970 | 广大
 971 | 广泛
 972 | 应当
 973 | 应用
 974 | 应该
 975 | 庶乎
 976 | 庶几
 977 | 开外
 978 | 开始
 979 | 开展
 980 | 引起
 981 | 弗
 982 | 弹指之间
 983 | 强烈
 984 | 强调
 985 | 归
 986 | 归根到底
 987 | 归根结底
 988 | 归齐
 989 | 当
 990 | 当下
 991 | 当中
 992 | 当儿
 993 | 当前
 994 | 当即
 995 | 当口儿
 996 | 当地
 997 | 当场
 998 | 当头
 999 | 当庭
1000 | 当时
1001 | 当然
1002 | 当真
1003 | 当着
1004 | 形成
1005 | 彻夜
1006 | 彻底
1007 | 彼
1008 | 彼时
1009 | 彼此
1010 | 往
1011 | 往往
1012 | 待
1013 | 待到
1014 | 很
1015 | 很多
1016 | 很少
1017 | 後来
1018 | 後面
1019 | 得
1020 | 得了
1021 | 得出
1022 | 得到
1023 | 得天独厚
1024 | 得起
1025 | 心里
1026 | 必
1027 | 必定
1028 | 必将
1029 | 必然
1030 | 必要
1031 | 必须
1032 | 快
1033 | 快要
1034 | 忽地
1035 | 忽然
1036 | 怎
1037 | 怎么
1038 | 怎么办
1039 | 怎么样
1040 | 怎奈
1041 | 怎样
1042 | 怎麽
1043 | 怕
1044 | 急匆匆
1045 | 怪
1046 | 怪不得
1047 | 总之
1048 | 总是
1049 | 总的来看
1050 | 总的来说
1051 | 总的说来
1052 | 总结
1053 | 总而言之
1054 | 恍然
1055 | 恐怕
1056 | 恰似
1057 | 恰好
1058 | 恰如
1059 | 恰巧
1060 | 恰恰
1061 | 恰恰相反
1062 | 恰逢
1063 | 您
1064 | 您们
1065 | 您是
1066 | 惟其
1067 | 惯常
1068 | 意思
1069 | 愤然
1070 | 愿意
1071 | 慢说
1072 | 成为
1073 | 成年
1074 | 成年累月
1075 | 成心
1076 | 我
1077 | 我们
1078 | 我是
1079 | 我的
1080 | 或
1081 | 或则
1082 | 或多或少
1083 | 或是
1084 | 或曰
1085 | 或者
1086 | 或许
1087 | 战斗
1088 | 截然
1089 | 截至
1090 | 所
1091 | 所以
1092 | 所在
1093 | 所幸
1094 | 所有
1095 | 所谓
1096 | 才
1097 | 才能
1098 | 扑通
1099 | 打
1100 | 打从
1101 | 打开天窗说亮话
1102 | 扩大
1103 | 把
1104 | 抑或
1105 | 抽冷子
1106 | 拦腰
1107 | 拿
1108 | 按
1109 | 按时
1110 | 按期
1111 | 按照
1112 | 按理
1113 | 按说
1114 | 挨个
1115 | 挨家挨户
1116 | 挨次
1117 | 挨着
1118 | 挨门挨户
1119 | 挨门逐户
1120 | 换句话说
1121 | 换言之
1122 | 据
1123 | 据实
1124 | 据悉
1125 | 据我所知
1126 | 据此
1127 | 据称
1128 | 据说
1129 | 掌握
1130 | 接下来
1131 | 接着
1132 | 接著
1133 | 接连不断
1134 | 放量
1135 | 故
1136 | 故意
1137 | 故此
1138 | 故而
1139 | 敞开儿
1140 | 敢
1141 | 敢于
1142 | 敢情
1143 | 数/
1144 | 整个
1145 | 断然
1146 | 方
1147 | 方便
1148 | 方才
1149 | 方能
1150 | 方面
1151 | 旁人
1152 | 无
1153 | 无宁
1154 | 无法
1155 | 无论
1156 | 既
1157 | 既...又
1158 | 既往
1159 | 既是
1160 | 既然
1161 | 日复一日
1162 | 日渐
1163 | 日益
1164 | 日臻
1165 | 日见
1166 | 时候
1167 | 昂然
1168 | 明显
1169 | 明确
1170 | 是
1171 | 是不是
1172 | 是以
1173 | 是否
1174 | 是的
1175 | 显然
1176 | 显著
1177 | 普通
1178 | 普遍
1179 | 暗中
1180 | 暗地里
1181 | 暗自
1182 | 更
1183 | 更为
1184 | 更加
1185 | 更进一步
1186 | 曾
1187 | 曾经
1188 | 替
1189 | 替代
1190 | 最
1191 | 最后
1192 | 最大
1193 | 最好
1194 | 最後
1195 | 最近
1196 | 最高
1197 | 有
1198 | 有些
1199 | 有关
1200 | 有利
1201 | 有力
1202 | 有及
1203 | 有所
1204 | 有效
1205 | 有时
1206 | 有点
1207 | 有的
1208 | 有的是
1209 | 有着
1210 | 有著
1211 | 望
1212 | 朝
1213 | 朝着
1214 | 末##末
1215 | 本
1216 | 本人
1217 | 本地
1218 | 本着
1219 | 本身
1220 | 权时
1221 | 来
1222 | 来不及
1223 | 来得及
1224 | 来看
1225 | 来着
1226 | 来自
1227 | 来讲
1228 | 来说
1229 | 极
1230 | 极为
1231 | 极了
1232 | 极其
1233 | 极力
1234 | 极大
1235 | 极度
1236 | 极端
1237 | 构成
1238 | 果然
1239 | 果真
1240 | 某
1241 | 某个
1242 | 某些
1243 | 某某
1244 | 根据
1245 | 根本
1246 | 格外
1247 | 梆
1248 | 概
1249 | 次第
1250 | 欢迎
1251 | 欤
1252 | 正值
1253 | 正在
1254 | 正如
1255 | 正巧
1256 | 正常
1257 | 正是
1258 | 此
1259 | 此中
1260 | 此后
1261 | 此地
1262 | 此处
1263 | 此外
1264 | 此时
1265 | 此次
1266 | 此间
1267 | 殆
1268 | 毋宁
1269 | 每
1270 | 每个
1271 | 每天
1272 | 每年
1273 | 每当
1274 | 每时每刻
1275 | 每每
1276 | 每逢
1277 | 比
1278 | 比及
1279 | 比如
1280 | 比如说
1281 | 比方
1282 | 比照
1283 | 比起
1284 | 比较
1285 | 毕竟
1286 | 毫不
1287 | 毫无
1288 | 毫无例外
1289 | 毫无保留地
1290 | 汝
1291 | 沙沙
1292 | 没
1293 | 没奈何
1294 | 没有
1295 | 沿
1296 | 沿着
1297 | 注意
1298 | 活
1299 | 深入
1300 | 清楚
1301 | 满
1302 | 满足
1303 | 漫说
1304 | 焉
1305 | 然
1306 | 然则
1307 | 然后
1308 | 然後
1309 | 然而
1310 | 照
1311 | 照着
1312 | 牢牢
1313 | 特别是
1314 | 特殊
1315 | 特点
1316 | 犹且
1317 | 犹自
1318 | 独
1319 | 独自
1320 | 猛然
1321 | 猛然间
1322 | 率尔
1323 | 率然
1324 | 现代
1325 | 现在
1326 | 理应
1327 | 理当
1328 | 理该
1329 | 瑟瑟
1330 | 甚且
1331 | 甚么
1332 | 甚或
1333 | 甚而
1334 | 甚至
1335 | 甚至于
1336 | 用
1337 | 用来
1338 | 甫
1339 | 甭
1340 | 由
1341 | 由于
1342 | 由是
1343 | 由此
1344 | 由此可见
1345 | 略
1346 | 略为
1347 | 略加
1348 | 略微
1349 | 白
1350 | 白白
1351 | 的
1352 | 的确
1353 | 的话
1354 | 皆可
1355 | 目前
1356 | 直到
1357 | 直接
1358 | 相似
1359 | 相信
1360 | 相反
1361 | 相同
1362 | 相对
1363 | 相对而言
1364 | 相应
1365 | 相当
1366 | 相等
1367 | 省得
1368 | 看
1369 | 看上去
1370 | 看出
1371 | 看到
1372 | 看来
1373 | 看样子
1374 | 看看
1375 | 看见
1376 | 看起来
1377 | 真是
1378 | 真正
1379 | 眨眼
1380 | 着
1381 | 着呢
1382 | 矣
1383 | 矣乎
1384 | 矣哉
1385 | 知道
1386 | 砰
1387 | 确定
1388 | 碰巧
1389 | 社会主义
1390 | 离
1391 | 种
1392 | 积极
1393 | 移动
1394 | 究竟
1395 | 穷年累月
1396 | 突出
1397 | 突然
1398 | 窃
1399 | 立
1400 | 立刻
1401 | 立即
1402 | 立地
1403 | 立时
1404 | 立马
1405 | 竟
1406 | 竟然
1407 | 竟而
1408 | 第
1409 | 第二
1410 | 等
1411 | 等到
1412 | 等等
1413 | 策略地
1414 | 简直
1415 | 简而言之
1416 | 简言之
1417 | 管
1418 | 类如
1419 | 粗
1420 | 精光
1421 | 紧接着
1422 | 累年
1423 | 累次
1424 | 纯
1425 | 纯粹
1426 | 纵
1427 | 纵令
1428 | 纵使
1429 | 纵然
1430 | 练习
1431 | 组成
1432 | 经
1433 | 经常
1434 | 经过
1435 | 结合
1436 | 结果
1437 | 给
1438 | 绝
1439 | 绝不
1440 | 绝对
1441 | 绝非
1442 | 绝顶
1443 | 继之
1444 | 继后
1445 | 继续
1446 | 继而
1447 | 维持
1448 | 综上所述
1449 | 缕缕
1450 | 罢了
1451 | 老
1452 | 老大
1453 | 老是
1454 | 老老实实
1455 | 考虑
1456 | 者
1457 | 而
1458 | 而且
1459 | 而况
1460 | 而又
1461 | 而后
1462 | 而外
1463 | 而已
1464 | 而是
1465 | 而言
1466 | 而论
1467 | 联系
1468 | 联袂
1469 | 背地里
1470 | 背靠背
1471 | 能
1472 | 能否
1473 | 能够
1474 | 腾
1475 | 自
1476 | 自个儿
1477 | 自从
1478 | 自各儿
1479 | 自后
1480 | 自家
1481 | 自己
1482 | 自打
1483 | 自身
1484 | 臭
1485 | 至
1486 | 至于
1487 | 至今
1488 | 至若
1489 | 致
1490 | 般的
1491 | 良好
1492 | 若
1493 | 若夫
1494 | 若是
1495 | 若果
1496 | 若非
1497 | 范围
1498 | 莫
1499 | 莫不
1500 | 莫不然
1501 | 莫如
1502 | 莫若
1503 | 莫非
1504 | 获得
1505 | 藉以
1506 | 虽
1507 | 虽则
1508 | 虽然
1509 | 虽说
1510 | 蛮
1511 | 行为
1512 | 行动
1513 | 表明
1514 | 表示
1515 | 被
1516 | 要
1517 | 要不
1518 | 要不是
1519 | 要不然
1520 | 要么
1521 | 要是
1522 | 要求
1523 | 见
1524 | 规定
1525 | 觉得
1526 | 譬喻
1527 | 譬如
1528 | 认为
1529 | 认真
1530 | 认识
1531 | 让
1532 | 许多
1533 | 论
1534 | 论说
1535 | 设使
1536 | 设或
1537 | 设若
1538 | 诚如
1539 | 诚然
1540 | 话说
1541 | 该
1542 | 该当
1543 | 说明
1544 | 说来
1545 | 说说
1546 | 请勿
1547 | 诸
1548 | 诸位
1549 | 诸如
1550 | 谁
1551 | 谁人
1552 | 谁料
1553 | 谁知
1554 | 谨
1555 | 豁然
1556 | 贼死
1557 | 赖以
1558 | 赶
1559 | 赶快
1560 | 赶早不赶晚
1561 | 起
1562 | 起先
1563 | 起初
1564 | 起头
1565 | 起来
1566 | 起见
1567 | 起首
1568 | 趁
1569 | 趁便
1570 | 趁势
1571 | 趁早
1572 | 趁机
1573 | 趁热
1574 | 趁着
1575 | 越是
1576 | 距
1577 | 跟
1578 | 路经
1579 | 转动
1580 | 转变
1581 | 转贴
1582 | 轰然
1583 | 较
1584 | 较为
1585 | 较之
1586 | 较比
1587 | 边
1588 | 达到
1589 | 达旦
1590 | 迄
1591 | 迅速
1592 | 过
1593 | 过于
1594 | 过去
1595 | 过来
1596 | 运用
1597 | 近
1598 | 近几年来
1599 | 近年来
1600 | 近来
1601 | 还
1602 | 还是
1603 | 还有
1604 | 还要
1605 | 这
1606 | 这一来
1607 | 这个
1608 | 这么
1609 | 这么些
1610 | 这么样
1611 | 这么点儿
1612 | 这些
1613 | 这会儿
1614 | 这儿
1615 | 这就是说
1616 | 这时
1617 | 这样
1618 | 这次
1619 | 这点
1620 | 这种
1621 | 这般
1622 | 这边
1623 | 这里
1624 | 这麽
1625 | 进入
1626 | 进去
1627 | 进来
1628 | 进步
1629 | 进而
1630 | 进行
1631 | 连
1632 | 连同
1633 | 连声
1634 | 连日
1635 | 连日来
1636 | 连袂
1637 | 连连
1638 | 迟早
1639 | 迫于
1640 | 适应
1641 | 适当
1642 | 适用
1643 | 逐步
1644 | 逐渐
1645 | 通常
1646 | 通过
1647 | 造成
1648 | 逢
1649 | 遇到
1650 | 遭到
1651 | 遵循
1652 | 遵照
1653 | 避免
1654 | 那
1655 | 那个
1656 | 那么
1657 | 那么些
1658 | 那么样
1659 | 那些
1660 | 那会儿
1661 | 那儿
1662 | 那时
1663 | 那末
1664 | 那样
1665 | 那般
1666 | 那边
1667 | 那里
1668 | 那麽
1669 | 部分
1670 | 都
1671 | 鄙人
1672 | 采取
1673 | 里面
1674 | 重大
1675 | 重新
1676 | 重要
1677 | 鉴于
1678 | 针对
1679 | 长期以来
1680 | 长此下去
1681 | 长线
1682 | 长话短说
1683 | 问题
1684 | 间或
1685 | 防止
1686 | 阿
1687 | 附近
1688 | 陈年
1689 | 限制
1690 | 陡然
1691 | 除
1692 | 除了
1693 | 除却
1694 | 除去
1695 | 除外
1696 | 除开
1697 | 除此
1698 | 除此之外
1699 | 除此以外
1700 | 除此而外
1701 | 除非
1702 | 随
1703 | 随后
1704 | 随时
1705 | 随着
1706 | 随著
1707 | 隔夜
1708 | 隔日
1709 | 难得
1710 | 难怪
1711 | 难说
1712 | 难道
1713 | 难道说
1714 | 集中
1715 | 零
1716 | 需要
1717 | 非但
1718 | 非常
1719 | 非徒
1720 | 非得
1721 | 非特
1722 | 非独
1723 | 靠
1724 | 顶多
1725 | 顷
1726 | 顷刻
1727 | 顷刻之间
1728 | 顷刻间
1729 | 顺
1730 | 顺着
1731 | 顿时
1732 | 颇
1733 | 风雨无阻
1734 | 饱
1735 | 首先
1736 | 马上
1737 | 高低
1738 | 高兴
1739 | 默然
1740 | 默默地
1741 | 齐
1742 | ︿
1743 | ！
1744 | ＃
1745 | ＄
1746 | ％
1747 | ＆
1748 | ＇
1749 | （
1750 | ）
1751 | ）÷（１－
1752 | ）、
1753 | ＊
1754 | ＋
1755 | ＋ξ
1756 | ＋＋
1757 | ，
1758 | ，也
1759 | －
1760 | －β
1761 | －－
1762 | －［＊］－
1763 | ．
1764 | ／
1765 | ０
1766 | ０：２
1767 | １
1768 | １．
1769 | １２％
1770 | ２
1771 | ２．３％
1772 | ３
1773 | ４
1774 | ５
1775 | ５：０
1776 | ６
1777 | ７
1778 | ８
1779 | ９
1780 | ：
1781 | ；
1782 | ＜
1783 | ＜±
1784 | ＜Δ
1785 | ＜λ
1786 | ＜φ
1787 | ＜＜
1788 | ＝
1789 | ＝″
1790 | ＝☆
1791 | ＝（
1792 | ＝－
1793 | ＝［
1794 | ＝｛
1795 | ＞
1796 | ＞λ
1797 | ？
1798 | ＠
1799 | Ａ
1800 | ＬＩ
1801 | Ｒ．Ｌ．
1802 | ＺＸＦＩＴＬ
1803 | 
1804 | ［＊］
1805 | ［－
1806 | ［］
1807 | ］
1808 | ］∧′＝［
1809 | ］［
1810 | ＿
1811 | ａ］
1812 | ｂ］
1813 | ｃ］
1814 | ｅ］
1815 | ｆ］
1816 | ｎｇ昉
1817 | ｛
1818 | ｛－
1819 | ｜
1820 | ｝
1821 | ｝＞
1822 | ～
1823 | ～±
1824 | ～＋
1825 | ￥
1826 | secondly
1827 | all
1828 | whose
1829 | under
1830 | sorry
1831 | four
1832 | we'll
1833 | somewhere
1834 | likely
1835 | even
1836 | above
1837 | ever
1838 | never
1839 | ZZ
1840 | hers
1841 | i'd
1842 | howbeit
1843 | i'm
1844 | theres
1845 | changes
1846 | anyhow
1847 | would
1848 | therefore
1849 | is
1850 | hereby
1851 | must
1852 | me
1853 | my
1854 | indicated
1855 | indicates
1856 | keep
1857 | far
1858 | after
1859 | hereupon
1860 | keeps
1861 | every
1862 | over
1863 | before
1864 | better
1865 | then
1866 | them
1867 | they
1868 | reasonably
1869 | each
1870 | went
1871 | mean
1872 | we'd
1873 | rd
1874 | re
1875 | got
1876 | forth
1877 | you're
1878 | little
1879 | whereupon
1880 | uses
1881 | already
1882 | another
1883 | took
1884 | second
1885 | seen
1886 | seem
1887 | relatively
1888 | thoroughly
1889 | latter
1890 | that
1891 | thorough
1892 | nobody
1893 | definitely
1894 | came
1895 | saying
1896 | specify
1897 | do
1898 | next
1899 | despite
1900 | unfortunately
1901 | twice
1902 | best
1903 | said
1904 | away
1905 | there's
1906 | unto
1907 | hopefully
1908 | seven
1909 | we
1910 | ltd
1911 | here
1912 | against
1913 | com
1914 | ZT
1915 | aren't
1916 | been
1917 | much
1918 | concerning
1919 | wish
1920 | say
1921 | near
1922 | unlikely
1923 | cant
1924 | in
1925 | ie
1926 | if
1927 | containing
1928 | beside
1929 | several
1930 | kept
1931 | whereby
1932 | whoever
1933 | the
1934 | yours
1935 | just
1936 | yes
1937 | yet
1938 | had
1939 | has
1940 | t's
1941 | possible
1942 | apart
1943 | right
1944 | old
1945 | somehow
1946 | for
1947 | everything
1948 | asking
1949 | who
1950 | of
1951 | theirs
1952 | plus
1953 | formerly
1954 | down
1955 | c's
1956 | accordingly
1957 | way
1958 | was
1959 | becoming
1960 | tell
1961 | sometime
1962 | no
1963 | whereas
1964 | nd
1965 | welcome
1966 | let's
1967 | certainly
1968 | a's
1969 | did
1970 | it'll
1971 | says
1972 | appear
1973 | alone
1974 | wherever
1975 | example
1976 | usually
1977 | nowhere
1978 | hither
1979 | regardless
1980 | everybody
1981 | thru
1982 | everywhere
1983 | can
1984 | following
1985 | want
1986 | didn't
1987 | may
1988 | such
1989 | whenever
1990 | maybe
1991 | ones
1992 | so
1993 | seeing
1994 | indeed
1995 | course
1996 | still
1997 | thank
1998 | he's
1999 | selves
2000 | ours
2001 | outside
2002 | non
2003 | within
2004 | thereby
2005 | not
2006 | now
2007 | nor
2008 | entirely
2009 | eg
2010 | ex
2011 | et
2012 | hadn't
2013 | furthermore
2014 | looking
2015 | seriously
2016 | shouldn't
2017 | she
2018 | quite
2019 | besides
2020 | think
2021 | first
2022 | ignored
2023 | awfully
2024 | given
2025 | anyone
2026 | indicate
2027 | gives
2028 | mostly
2029 | than
2030 | here's
2031 | were
2032 | and
2033 | appreciate
2034 | himself
2035 | saw
2036 | any
2037 | downwards
2038 | take
2039 | sure
2040 | especially
2041 | later
2042 | that's
2043 | fifth
2044 | don't
2045 | aside
2046 | only
2047 | going
2048 | get
2049 | truly
2050 | cannot
2051 | nearly
2052 | regarding
2053 | us
2054 | where
2055 | up
2056 | namely
2057 | anyways
2058 | wonder
2059 | behind
2060 | between
2061 | it
2062 | across
2063 | come
2064 | many
2065 | whereafter
2066 | according
2067 | comes
2068 | afterwards
2069 | couldn't
2070 | moreover
2071 | considering
2072 | sensible
2073 | hardly
2074 | wants
2075 | former
2076 | those
2077 | these
2078 |  [
2079 | somebody
2080 | different
2081 | etc
2082 | insofar
2083 | same
2084 | without
2085 | can't
2086 | very
2087 | you've
2088 | among
2089 | being
2090 | we've
2091 | seems
2092 | around
2093 | using
2094 | specified
2095 | on
2096 | ok
2097 | oh
2098 | whence
2099 | it's
2100 | or
2101 | everyone
2102 | your
2103 | her
2104 | there
2105 | amongst
2106 | trying
2107 | with
2108 | they're
2109 | wasn't
2110 | gone
2111 | certain
2112 | am
2113 | an
2114 | as
2115 | at
2116 | again
2117 | serious
2118 | hello
2119 | since
2120 | consider
2121 | causes
2122 | to
2123 | th
2124 | myself
2125 | i'll
2126 | zero
2127 | further
2128 | what
2129 | brief
2130 | seemed
2131 | c'mon
2132 | allows
2133 | followed
2134 | ask
2135 | viz
2136 | contains
2137 | two
2138 | taken
2139 | more
2140 | knows
2141 | ain't
2142 | particular
2143 | known
2144 | none
2145 | nine
2146 | needs
2147 | rather
2148 | ［
2149 | okay
2150 | tried
2151 | tries
2152 | onto
2153 | perhaps
2154 | specifying
2155 |  ]
2156 | help
2157 | soon
2158 | through
2159 | its
2160 | seeming
2161 | inward
2162 | actually
2163 | might
2164 | haven't
2165 | someone
2166 | hereafter
2167 | always
2168 | isn't
2169 | beyond
2170 | really
2171 | they'll
2172 | enough
2173 | thereafter
2174 | done
2175 | together
2176 | least
2177 | too
2178 | immediate
2179 | believe
2180 | gotten
2181 | toward
2182 | self
2183 | also
2184 | towards
2185 | most
2186 | nothing
2187 | they'd
2188 | sometimes
2189 | lest
2190 | particularly
2191 | somewhat
2192 | his
2193 | goes
2194 | meanwhile
2195 | during
2196 | him
2197 | greetings
2198 | see
2199 | are
2200 | currently
2201 | please
2202 | various
2203 | probably
2204 | available
2205 | both
2206 | last
2207 | wouldn't
2208 | became
2209 | whole
2210 | liked
2211 | whatever
2212 | except
2213 | throughout
2214 | along
2215 | described
2216 | though
2217 | whom
2218 | beforehand
2219 | what's
2220 | new
2221 | else
2222 | look
2223 | while
2224 | herein
2225 | itself
2226 | wherein
2227 | used
2228 | anybody
2229 | obviously
2230 | thats
2231 | from
2232 | useful
2233 | merely
2234 | follows
2235 | often
2236 | some
2237 | ourselves
2238 | shall
2239 | per
2240 | tends
2241 | either
2242 | be
2243 | by
2244 | anything
2245 | consequently
2246 | into
2247 | appropriate
2248 | we're
2249 | elsewhere
2250 | hasn't
2251 | un
2252 | noone
2253 | associated
2254 | thanks
2255 | having
2256 | once
2257 | edu
2258 | go
2259 | sent
2260 | provides
2261 | yourselves
2262 | they've
2263 | try
2264 | this
2265 | you'd
2266 | yourself
2267 | zz
2268 | zt
2269 | respectively
2270 | let
2271 | others
2272 | until
2273 | weren't
2274 | use
2275 | few
2276 | themselves
2277 | becomes
2278 | anywhere
2279 | something
2280 | six
2281 | allow
2282 | won't
2283 | thence
2284 | willing
2285 | instead
2286 | whither
2287 | doing
2288 | how
2289 | cause
2290 | thereupon
2291 | que
2292 | via
2293 | could
2294 | hence
2295 | third
2296 | doesn't
2297 | their
2298 | exactly
2299 | regards
2300 | herself
2301 | have
2302 | need
2303 | clearly
2304 | i've
2305 | able
2306 | which
2307 | unless
2308 | where's
2309 | eight
2310 | why
2311 | you'll
2312 | normally
2313 | anyway
2314 | one
2315 | should
2316 | mainly
2317 | overall
2318 | qv
2319 | contain
2320 | looks
2321 | neither
2322 | however
2323 | otherwise
2324 | co
2325 | it'd
2326 | corresponding
2327 | thanx
2328 | novel
2329 | value
2330 | will
2331 | almost
2332 | thus
2333 | vs
2334 | when
2335 | gets
2336 | upon
2337 | off
2338 | nevertheless
2339 | well
2340 | less
2341 | presumably
2342 | ought
2343 | who's
2344 | five
2345 | know
2346 | you
2347 | name
2348 | necessary
2349 | like
2350 | become
2351 | therein
2352 | because
2353 | happens
2354 | does
2355 | although
2356 | about
2357 | getting
2358 | own
2359 | three
2360 | inasmuch
2361 | inner
2362 | but
2363 | hi
2364 | he
2365 | whether
2366 | placed
2367 | below
2368 | our
2369 | 上去--
2370 | inc
2371 | lately
2372 | other
2373 | latterly
2374 | out
2375 | 是什么
2376 | 什么时候
2377 | 是什么意思
2378 | 什么意思
2379 | 多少钱
2380 | 有没有
2381 | 更有趣
2382 | 更有甚者
2383 | 更有效
2384 | 更有意义
2385 | 更远的
2386 | 更重要的是
2387 | 正确
2388 | 错误
2389 | 第二把
2390 | 第二波
2391 | 第二大节
2392 | 第二单元
2393 | 第二关
2394 | 第二行
2395 | 第二集
2396 | 第二讲
2397 | 第二款
2398 | 第二类
2399 | 第二盘
2400 | 第二任
2401 | 第二声
2402 | 第二十
2403 | 第二首
2404 | 第二项
2405 | 第三遍
2406 | 第三册
2407 | 第三层
2408 | 第三产业
2409 | 第三大
2410 | 第三单元
2411 | 第三行
2412 | 第三回
2413 | 第三集
2414 | 第三件
2415 | 第三句
2416 | 第三卷
2417 | 第三课
2418 | 第三类
2419 | 第三篇
2420 | 第三期
2421 | 第三日
2422 | 第三声
2423 | 地三鲜
2424 | 第三项
2425 | 第三站
2426 | 第三张
2427 | 第十八
2428 | 第十次
2429 | 第十二
2430 | 的士高
2431 | 第十集
2432 | 第十届
2433 | 第十九
2434 | 第十六
2435 | 第十名
2436 | 第十三
2437 | 第十四
2438 | 第十天
2439 | 第十一
2440 | 第十一个
2441 | 第四版
2442 | 第四册
2443 | 第四场
2444 | 第四代
2445 | 第四单元
2446 | 第四集
2447 | 第四届
2448 | 第四年
2449 | 第四期
2450 | 第四声
2451 | 第四套
2452 | 第四位
2453 | 第四张
2454 | 第四者
2455 | 第四种
2456 | 第五部
2457 | 第五大道
2458 | 第五单元
2459 | 第五集
2460 | 第五卷
2461 | 第五课
2462 | 第五年
2463 | 第五期
2464 | 第五位
2465 | 第五元素
2466 | 第五组
2467 | 召唤
2468 | 最后一班
2469 | 最后一遍
2470 | 最后一关
2471 | 最后一集
2472 | 最后一科
2473 | 最后一颗子弹
2474 | 最后一派
2475 | 最后一题
2476 | 最后一眼
2477 | 最后一页
2478 | 10
2479 | 11
2480 | 12
2481 | 35
2482 | 25
2483 | 2016
2484 | 2015
2485 | 2014
2486 | 又为什么
2487 | 有问题吗
2488 | 有问题么
2489 | 又喜欢
2490 | 有喜欢
2491 | 又小
2492 | 又笑
2493 | 有笑
2494 | 有效地
2495 | 有一百
2496 | 又一遍
2497 | 有一部
2498 | 又一城
2499 | 又一村
2500 | 有一道
2501 | 有意的
2502 | 有一堆
2503 | 有一对
2504 | 有一方
2505 | 有一根
2506 | 有一会了
2507 | 有一批
2508 | 有一片
2509 | 有一期
2510 | 有一起
2511 | 有一群
2512 | 又又
2513 | 由由
2514 | 财新网
2515 | 上午
2516 | 下午
2517 | NULL
2518 | 新华社
2519 | 消息
2520 | 13
2521 | 14
2522 | 15
2523 | 16
2524 | 17
2525 | 18
2526 | 19
2527 | 20
2528 | 21
2529 | 22
2530 | 23
2531 | 24
2532 | 26
2533 | 27
2534 | 28
2535 | 29
2536 | 30
2537 | 31
2538 | 32
2539 | 33
2540 | 34
2541 | 36
2542 | 37
2543 | 38
2544 | 39
2545 | 40
2546 | 41
2547 | 42
2548 | 43
2549 | 44
2550 | 45
2551 | 46
2552 | 47
2553 | 48
2554 | 49
2555 | 50
2556 | 51
2557 | 52
2558 | 53
2559 | 54
2560 | 55
2561 | 56
2562 | 57
2563 | 58
2564 | 59
2565 | 60
2566 | 61
2567 | 62
2568 | 63
2569 | 64
2570 | 65
2571 | 66
2572 | 67
2573 | 68
2574 | 69
2575 | 70
2576 | 71
2577 | 72
2578 | 73
2579 | 74
2580 | 75
2581 | 76
2582 | 77
2583 | 78
2584 | 79
2585 | 80
2586 | 81
2587 | 82
2588 | 83
2589 | 84
2590 | 85
2591 | 86
2592 | 87
2593 | 88
2594 | 89
2595 | 90
2596 | 91
2597 | 92
2598 | 93
2599 | 94
2600 | 95
2601 | 96
2602 | 97
2603 | 98
2604 | 99
2605 | 100
2606 | 01
2607 | 02
2608 | 03
2609 | 04
2610 | 05
2611 | 06
2612 | 07
2613 | 08
2614 | 09
2615 | 


--------------------------------------------------------------------------------