├── 方案说明.pdf
├── README.md
├── feature_selection.py
├── LICENSE
├── construct_module.py
├── XGB1.py
└── feature_engine.py
/方案说明.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rogeroyer/AI-challenger-contest/HEAD/方案说明.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 马上AI全球挑战赛 - Default User Risk Prediction (Runner-up Solution)
2 |
3 | ***
4 |
5 | ## **Usage Instructions**
6 |
7 | ### Environment
8 |
9 | - Anaconda3 (Python 3.6)
10 |
11 | - Required packages (the scripts also import lightgbm and matplotlib)
12 | - sklearn
13 | - pandas
14 | - numpy
15 | - xgboost
16 |
17 | ### Solution 1
18 | - XGB1.py
19 |
20 | > Outputs result_xgb.csv (consumed later by the Solution 2 blend)
21 | `Note: change the data-loading paths before running`
22 |
23 |
24 | ### Solution 2
25 |
26 | - feature_selection.py
27 |
28 | > Feature-selection class plus the XGBoost and Logistic Regression training functions (usage sketch below)
29 |
30 |
31 | - feature_engine.py
32 |
33 | > Feature-extraction functions; change the data-loading paths before running
34 |
35 | - construct_module.py
36 |
37 | > Main program entry point
38 |
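How the Solution 2 pieces fit together - a minimal sketch mirroring the commented-out usage at the bottom of feature_selection.py (the feature count of 100 is illustrative, and the flat import assumes you run from the repo directory):

```python
from feature_selection import FeatureSelection, train_xgb_module, train_lr_module

# Keep only the features that all enabled selectors agree on.
selection = FeatureSelection(100)
features_name = selection.return_feature_set(
    variance_threshold=True, select_k_best=True, tree_select=True)

train_xgb_module(features_name, store_result=True)  # writes xgb_sample.csv
train_lr_module(features_name, store_result=True)   # writes lr_sample.csv
```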
39 | ### Execution Steps
40 |
41 | 1. Run XGB1.py
42 |
43 | 2. Run construct_module.py
44 |
45 | 3. The final result is sample.csv, a weighted blend of the individual model outputs (see the sketch below)
46 |
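For reference, sample.csv is produced by module_merge_triple in construct_module.py; a standalone sketch of that blend, using the weights from main() (note that xgb_sample_51.csv is a saved artifact from an earlier run, not a file the scripts regenerate):

```python
import pandas as pd

# Weighted linear blend, mirroring module_merge_triple(prob_xgb=0.4, prob_lr=0.2, prob_lgb=0.4).
xgb_sample = pd.read_csv('result_xgb.csv')
lr_sample = pd.read_csv('lr_sample.csv')
lgb_sample = pd.read_csv('xgb_sample_51.csv')
xgb_sample.columns = ['ID', 'PROB_xgb']
lr_sample.columns = ['ID', 'PROB_lr']
lgb_sample.columns = ['ID', 'PROB_lgb']

sample = xgb_sample.merge(lr_sample, on='ID', how='left').merge(lgb_sample, on='ID', how='left')
sample['PROB'] = sample['PROB_xgb'] * 0.4 + sample['PROB_lr'] * 0.2 + sample['PROB_lgb'] * 0.4
sample[['ID', 'PROB']].to_csv('sample.csv', index=None)
```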
47 | ### Dataset Download
48 | - [Baidu Netdisk](https://pan.baidu.com/s/13D0hwh_NVwVBh_ydpNzPjw)
49 | - Access code: d5h0
50 | ### LICENSE
51 | [Apache License 2.0](https://github.com/rogeroyer/AI-challenger-contest/blob/master/LICENSE)
52 |
--------------------------------------------------------------------------------
/feature_selection.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import xgboost as xgb
6 | import lightgbm as lgb
7 | from sklearn.metrics import roc_auc_score
8 | from sklearn.linear_model import LogisticRegression
9 |
10 | '''Univariate feature selection'''
11 | from sklearn.feature_selection import SelectKBest, chi2
12 | '''Remove low-variance features'''
13 | from sklearn.feature_selection import VarianceThreshold
14 | '''Recursive feature elimination (RFE)'''
15 | from sklearn.svm import SVC
16 | from sklearn.feature_selection import RFE
17 | '''Tree-based feature importance'''
18 | from sklearn.ensemble import ExtraTreesClassifier
19 |
20 |
21 | class FeatureSelection(object):
22 | def __init__(self, feature_num):
23 | self.feature_num = feature_num
24 | self.train_test, self.label, self.test = self.read_data() # features #
25 | self.feature_name = list(self.train_test.columns) # feature name #
26 |
27 | def read_data(self):
28 | test = pd.read_csv(r'test_feature.csv', encoding='utf-8')
29 | train_test = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
30 | train_test = train_test.drop(['feature_1', 'register_days', 'id_card_one', 'id_card_two', 'id_card_three', 'id_card_four', 'id_card_five', 'id_card_six', 'mobile', 'unicom', 'telecom', 'virtual'], axis=1)
31 | print('Data loaded...')
32 | label = train_test[['target']]
33 | test = test.iloc[:, 1:]
34 | train_test = train_test.iloc[:, 2:]
35 | return train_test, label, test
36 |
37 | def variance_threshold(self):
38 | sel = VarianceThreshold()
39 | sel.fit_transform(self.train_test)
40 | feature_var = list(sel.variances_) # feature variance #
41 | features = dict(zip(self.feature_name, feature_var))
42 | features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
43 | # print(features) # 100 cols #
44 | return set(features) # return set type #
45 |
46 | def select_k_best(self):
47 | ch2 = SelectKBest(chi2, k=self.feature_num)
48 | ch2.fit(self.train_test, self.label.values.ravel())
49 | feature_var = list(ch2.scores_) # feature scores #
50 | features = dict(zip(self.feature_name, feature_var))
51 | features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
52 | # print(features) # 100 cols #
53 | return set(features) # return set type #
54 |
55 | def svc_select(self):
56 | svc = SVC(kernel='linear', C=1, random_state=2018) # RFE needs coef_, so use a linear kernel #
57 | rfe = RFE(estimator=svc, n_features_to_select=self.feature_num, step=1)
58 | rfe.fit(self.train_test, self.label.values.ravel())
59 | print(rfe.ranking_)
60 | return set(name for name, keep in zip(self.feature_name, rfe.support_) if keep) # return set type #
61 |
62 | def tree_select(self):
63 | clf = ExtraTreesClassifier(n_estimators=300, max_depth=7, n_jobs=4)
64 | clf.fit(self.train_test, self.label.values.ravel())
65 | feature_var = list(clf.feature_importances_) # feature scores #
66 | features = dict(zip(self.feature_name, feature_var))
67 | features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-self.feature_num:]
68 | # print(features) # 100 cols #
69 | return set(features) # return set type #
70 |
71 | def return_feature_set(self, variance_threshold=False, select_k_best=False, svc_select=False, tree_select=False):
72 | '''Intersect the feature sets chosen by every enabled selector.'''
73 | selected = []
74 | if variance_threshold is True:
75 | selected.append(self.variance_threshold())
76 | if select_k_best is True:
77 | selected.append(self.select_k_best())
78 | if svc_select is True:
79 | selected.append(self.svc_select())
80 | if tree_select is True:
81 | selected.append(self.tree_select())
82 | names = set.intersection(*selected) if selected else set()
83 |
84 | # print(len(names))
85 | print(names)
86 | return list(names)
89 |
90 |
91 | # selection = FeatureSelection(100)
92 | # selection.return_feature_set(variance_threshold=True, select_k_best=True, svc_select=False, tree_select=True)
93 |
94 |
95 | def train_xgb_module(features_name, store_result=False):
96 | '''Train the XGBoost model'''
97 | train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8')
98 | validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8')
99 | test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8')
100 | train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
101 |
102 | print('Data loaded...')
103 |
104 | validate_label = validate_feature[['target']]
105 | train_label = train_feature[['target']]
106 | train_test_label = train_test_feature[['target']]
107 |
108 | train_feature = train_feature[features_name]
109 | test_feature = test_feature[features_name]
110 | validate_feature = validate_feature[features_name]
111 | train_test_feature = train_test_feature[features_name]
112 |
113 | print('Training the xgboost model...')
114 | '''XGBoost classifier'''
115 | num_round = 500 # number of boosting rounds #
116 | params = {
117 | 'booster': 'gbtree',
118 | 'max_depth': 4,
119 | 'colsample_bytree': 0.8,
120 | 'subsample': 0.8,
121 | 'eta': 0.03,
122 | 'silent': 1,
123 | 'objective': 'binary:logistic',
124 | 'eval_metric': 'auc',
125 | 'min_child_weight': 1,
126 | 'scale_pos_weight': 1,
127 | 'seed': 27,
128 | 'reg_alpha': 0.01
129 | }
130 | '''Training set'''
131 | dtrain = xgb.DMatrix(train_feature, label=train_label)
132 | validate_feature = xgb.DMatrix(validate_feature)
133 | module = xgb.train(params, dtrain, num_round)
134 |
135 | if store_result is True:
136 | '''Refit on the full training set and predict the test set'''
137 | dtrain_two = xgb.DMatrix(train_test_feature, label=train_test_label)
138 | test_feature = xgb.DMatrix(test_feature)
139 | module_two = xgb.train(params, dtrain_two, num_round)
140 |
141 | features = module_two.get_fscore()
142 | features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-20:]
143 | features.reverse()
144 | print(features) # print the top-20 feature importances #
145 |
146 | result = module_two.predict(test_feature)
147 | result = pd.DataFrame(result)
148 | result.columns = ['predicted_score']
149 | test_list = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_list.csv', low_memory=False)
150 | sample = test_list[['id']]
151 | sample['predicted_score'] = [index for index in result['predicted_score']]
152 | sample.columns = ['ID', 'PROB']
153 | sample.to_csv(r'xgb_sample.csv', index=None)
154 | print(sample)
155 | print('Results written...')
156 |
157 | print(" Score_offline:", roc_auc_score(validate_label, module.predict(validate_feature)))
158 | print('Feature count:', len(features_name))
159 |
160 |
161 | def train_lr_module(features_name, store_result=False):
162 | train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8')
163 | validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8')
164 | test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8')
165 | train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
166 | print('Data loaded...')
167 |
168 | validate_label = validate_feature[['target']]
169 | train_label = train_feature[['target']]
170 | train_test_label = train_test_feature[['target']]
171 |
172 | train_feature = train_feature[features_name]
173 | test_feature = test_feature[features_name]
174 | validate_feature = validate_feature[features_name]
175 | train_test_feature = train_test_feature[features_name]
176 |
177 | print('Training the LogisticRegression model...')
178 | module = LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42, n_jobs=4) # , solver='sag'
179 | # module = lgb.LGBMClassifier(
180 | # num_leaves=64, # num_leaves = 2^max_depth * 0.6 #
181 | # max_depth=6,
182 | # n_estimators=80,
183 | # learning_rate=0.1
184 | # )
185 | '''Training set'''
186 | module.fit(train_feature, train_label)
187 |
188 | if store_result is True:
189 | '''Refit on the full training set and predict the test set'''
190 | module_two = LogisticRegression(
191 | penalty='l2',
192 | solver='sag',
193 | max_iter=500,
194 | random_state=42,
195 | n_jobs=4
196 | )
197 |
198 | # module_two = lgb.LGBMClassifier(
199 | # num_leaves=64, # num_leaves = 2^max_depth * 0.6 #
200 | # max_depth=6,
201 | # n_estimators=80,
202 | # learning_rate=0.1
203 | # )
204 | module_two.fit(train_test_feature, train_test_label)
205 |
206 | result = module_two.predict_proba(test_feature)[:, 1]
207 | result = pd.DataFrame(result)
208 | result.columns = ['predicted_score']
209 | test_list = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_list.csv', low_memory=False)
210 | sample = test_list[['id']]
211 | sample['predicted_score'] = [index for index in result['predicted_score']]
212 | sample.columns = ['ID', 'PROB']
213 | sample.to_csv(r'lr_sample.csv', index=None)
214 | # sample.to_csv(r'lgb_sample.csv', index=None)
215 | print(sample)
216 | print('Results written...')
217 |
218 | print(" Score_offline:", roc_auc_score(validate_label, module.predict_proba(validate_feature)[:, 1]))
219 | print('Feature count:', len(features_name))
220 |
221 |
222 |
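# A minimal driver, mirroring the commented-out usage above; the feature
# count (100) is illustrative, and store_result=True would additionally
# write the xgb_sample.csv / lr_sample.csv submission files.
# (Requires the feature CSVs written by construct_module.py.)
if __name__ == '__main__':
    selection = FeatureSelection(100)
    features_name = selection.return_feature_set(variance_threshold=True, select_k_best=True, tree_select=True)
    train_xgb_module(features_name, store_result=False)
    train_lr_module(features_name, store_result=False)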
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/construct_module.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | import time
4 | import xgboost as xgb
5 |
6 | '''Import project modules'''
7 | from version_two.feature_engine import *
8 | # from version_two.stacking import *  # stacking.py is not included in this repo; only the commented-out Ensemble code below would need it #
9 | from version_two.feature_selection import *
10 |
11 |
12 | '''Split the dataset by application month'''
13 | train_target['date'] = [index.replace('-', '') for index in train_target['appl_sbm_tm']]
14 | train_target['date'] = [index.split(' ')[0][0:6] for index in train_target['date']]
15 | '''Validation set (2017-04)'''
16 | validate_data = train_target[(train_target['date'] == '201704')][['target', 'id']]
17 | '''Training set (2016-03 through 2017-03)'''
18 | train_data = train_target[(train_target['date'] >= '201603') & (train_target['date'] <= '201703')][['target', 'id']]
19 | '''Test set'''
20 | test_data = test_list[['id']]
21 | '''Full labelled set (train + validation), used for the final refit'''
22 | train_test_data = train_target[['target', 'id']]
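# Split rationale: the validation month (2017-04) immediately follows the
# training window (2016-03 through 2017-03), so the offline split mimics the
# online train/test time gap; train_test_data keeps every labelled row for
# the final refit before predicting the test set.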
23 |
24 |
25 | def extract_feature():
26 | '''credit_info'''
27 | train_credit_info_feature = extract_credit_info(train_credit_info)
28 | train_test_feature = train_test_data.merge(train_credit_info_feature, on='id', how='left') # full training set #
29 | train_feature = train_data.merge(train_credit_info_feature, on='id', how='left')
30 | validate_feature = validate_data.merge(train_credit_info_feature, on='id', how='left')
31 | test_feature = test_data.merge(extract_credit_info(test_credit_info), on='id', how='left')
32 |
33 | '''order_info'''
34 | train_order_info_feature = extract_order_info(train_order_info)
35 | train_feature = train_feature.merge(train_order_info_feature, on='id', how='left')
36 | train_test_feature = train_test_feature.merge(train_order_info_feature, on='id', how='left') # full training set #
37 | validate_feature = validate_feature.merge(train_order_info_feature, on='id', how='left')
38 | test_feature = test_feature.merge(extract_order_info(test_order_info), on='id', how='left')
39 |
40 | '''user_info'''
41 | train_user_info_feature = extract_user_info(train_user_info)
42 | train_feature = train_feature.merge(train_user_info_feature, on='id', how='left')
43 | train_test_feature = train_test_feature.merge(train_user_info_feature, on='id', how='left') # full training set #
44 | validate_feature = validate_feature.merge(train_user_info_feature, on='id', how='left')
45 | test_feature = test_feature.merge(extract_user_info(test_user_info), on='id', how='left')
46 |
47 | '''recieve_addr_info'''
48 | train_recieve_addr_info_feature = extract_recieve_addr_info(train_recieve_addr_info)
49 | train_feature = train_feature.merge(train_recieve_addr_info_feature, on='id', how='left')
50 | train_test_feature = train_test_feature.merge(train_recieve_addr_info_feature, on='id', how='left') # full training set #
51 | validate_feature = validate_feature.merge(train_recieve_addr_info_feature, on='id', how='left')
52 | test_feature = test_feature.merge(extract_recieve_addr_info(test_recieve_addr_info), on='id', how='left')
53 |
54 | '''bankcard_info'''
55 | train_bankcard_info_feature = extract_bankcard_info(train_bankcard_info)
56 | train_feature = train_feature.merge(train_bankcard_info_feature, on='id', how='left')
57 | train_test_feature = train_test_feature.merge(train_bankcard_info_feature, on='id', how='left') # full training set #
58 | validate_feature = validate_feature.merge(train_bankcard_info_feature, on='id', how='left')
59 | test_feature = test_feature.merge(extract_bankcard_info(test_bankcard_info), on='id', how='left')
60 |
61 | '''auth_info'''
62 | train_auth_info_feature = extract_auth_info(train_auth_info)
63 | train_feature = train_feature.merge(train_auth_info_feature, on='id', how='left').fillna(0)
64 | train_test_feature = train_test_feature.merge(train_auth_info_feature, on='id', how='left').fillna(0) # full training set #
65 | validate_feature = validate_feature.merge(train_auth_info_feature, on='id', how='left').fillna(0)
66 | test_feature = test_feature.merge(extract_auth_info(test_auth_info), on='id', how='left').fillna(0)
67 |
68 | '''time relative features one'''
69 | train_time_feature = extract_time_feature(train_auth_info, train_target)
70 | train_feature = train_feature.merge(train_time_feature, on='id', how='left').fillna(0)
71 | train_test_feature = train_test_feature.merge(train_time_feature, on='id', how='left').fillna(0) # full training set #
72 | validate_feature = validate_feature.merge(train_time_feature, on='id', how='left').fillna(0)
73 | test_feature = test_feature.merge(extract_time_feature(test_auth_info, test_list), on='id', how='left').fillna(0)
74 |
75 | '''time relative features two'''
76 | train_order_payment_time = extract_order_payment_time(train_order_info, train_target)
77 | train_feature = train_feature.merge(train_order_payment_time, on='id', how='left').fillna(0)
78 | train_test_feature = train_test_feature.merge(train_order_payment_time, on='id', how='left').fillna(0) # full training set #
79 | validate_feature = validate_feature.merge(train_order_payment_time, on='id', how='left').fillna(0)
80 | test_feature = test_feature.merge(extract_order_payment_time(test_order_info, test_list), on='id', how='left').fillna(0)
81 |
82 | print(train_feature.head(5))
83 | print(validate_feature.head(5))
84 | print(test_feature.head(5))
85 | return train_feature, validate_feature, test_feature, train_test_feature
86 |
87 |
88 | def train_module(store_result=False, store_feature=False, select_feature=False, feature_num='all', one_encode=False):
89 | '''Train the model'''
90 | if store_feature is True:
91 | train_feature, validate_feature, test_feature, train_test_feature = extract_feature()
92 | ''' Save the feature tables '''
93 | train_feature.to_csv(r'train_feature.csv', index=None, encoding='utf-8')
94 | validate_feature.to_csv(r'validate_feature.csv', index=None, encoding='utf-8')
95 | test_feature.to_csv(r'test_feature.csv', index=None, encoding='utf-8')
96 | train_test_feature.to_csv(r'train_test_feature.csv', index=None, encoding='utf-8')
97 | print('Features saved...')
98 |
99 | print('Feature extraction finished; continuing to training...')
101 | else:
102 | train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8')
103 | validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8')
104 | test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8')
105 | train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
106 | print('Data loaded...')
107 |
108 | validate_label = validate_feature[['target']]
109 | train_label = train_feature[['target']]
110 | train_test_label = train_test_feature[['target']]
111 |
112 | train_feature = train_feature.iloc[:, 2:]
113 | test_feature = test_feature.iloc[:, 1:]
114 | validate_feature = validate_feature.iloc[:, 2:]
115 | train_test_feature = train_test_feature.iloc[:, 2:]
116 |
117 | train_feature = train_feature.drop(['feature_1', 'register_days'], axis=1)
118 | test_feature = test_feature.drop(['feature_1', 'register_days'], axis=1)
119 | validate_feature = validate_feature.drop(['feature_1', 'register_days'], axis=1)
120 | train_test_feature = train_test_feature.drop(['feature_1', 'register_days'], axis=1)
121 |
122 | if one_encode is True:
123 | features = list(train_feature.columns)
124 | continuous_feature = []
125 | one_hot = []
126 | for name in features:
127 | if len(set(train_feature[name])) != 2:
128 | continuous_feature.append(name)
129 | else:
130 | one_hot.append(name)
131 |
132 | feature = continuous_feature + one_hot[:130]
133 | train_feature = train_feature[feature]
134 | validate_feature = validate_feature[feature]
135 | test_feature = test_feature[feature]
136 | train_test_feature = train_test_feature[feature]
137 |
138 | if select_feature is True:
139 | print('Selecting features...')
140 | ch2 = SelectKBest(chi2, k=feature_num)
141 | train_feature = ch2.fit_transform(train_feature, train_label)
142 | test_feature = ch2.transform(test_feature)
143 | validate_feature = ch2.transform(validate_feature)
144 | train_test_feature = ch2.transform(train_test_feature)
145 | print('Feature selection finished...')
146 | else:
147 | feature_num = train_feature.shape[1]
148 |
149 | print('Training the xgboost model...')
150 | '''XGBoost classifier'''
151 | num_round = 500 # number of boosting rounds #
152 | params = {
153 | 'booster': 'gbtree',
154 | 'max_depth': 4,
155 | 'colsample_bytree': 0.6,
156 | 'subsample': 0.7,
157 | 'eta': 0.03,
158 | 'silent': 1,
159 | 'objective': 'binary:logistic',
160 | 'eval_metric': 'auc',
161 | # 'min_child_weight': 1,
162 | 'scale_pos_weight': 1,
163 | # 'seed': 27,
164 | # 'reg_alpha': 0.01
165 | }
166 | '''Training set'''
167 | dtrain = xgb.DMatrix(train_feature, label=train_label)
168 | validate_feature = xgb.DMatrix(validate_feature)
169 | module = xgb.train(params, dtrain, num_round)
170 |
171 | if store_result is True:
172 | '''Refit on the full training set and predict the test set'''
173 | dtrain_two = xgb.DMatrix(train_test_feature, label=train_test_label)
174 | test_feature = xgb.DMatrix(test_feature)
175 | module_two = xgb.train(params, dtrain_two, num_round)
176 |
177 | result = module_two.predict(test_feature)
178 | result = pd.DataFrame(result)
179 | result.columns = ['predicted_score']
180 | sample = test_list[['id']]
181 | sample['predicted_score'] = [index for index in result['predicted_score']]
182 | sample.columns = ['ID', 'PROB']
183 | sample.to_csv(r'xgb_sample.csv', index=None)
184 | print(sample)
185 | print('Results written...')
186 |
187 | print(" Score_offline:", roc_auc_score(validate_label, module.predict(validate_feature)))
188 | print('Feature count:', feature_num)
189 |
190 |
191 | ''' Model blending '''
192 | def module_merge_triple(prob_xgb, prob_lr, prob_lgb):
193 | xgb_sample = pd.read_csv(r'result_xgb.csv', low_memory=False) # encode:159:0.790297834417
194 | lr_sample = pd.read_csv(r'lr_sample.csv', low_memory=False) # Uncode:0.792171452209
195 | lgb_sample = pd.read_csv(r'xgb_sample_51.csv', low_memory=False)
196 |
197 | xgb_sample.columns = ['ID', 'PROB_xgb']
198 | lr_sample.columns = ['ID', 'PROB_lr']
199 | lgb_sample.columns = ['ID', 'PROB_lgb']
200 | sample = xgb_sample.merge(lr_sample, on='ID', how='left')
201 | sample = sample.merge(lgb_sample, on='ID', how='left')
202 | # print(sample)
203 | sample['PROB'] = sample['PROB_xgb'] * prob_xgb + sample['PROB_lr'] * prob_lr + sample['PROB_lgb'] * prob_lgb
204 | sample = sample[['ID', 'PROB']]
205 | print(sample)
206 | sample.to_csv(r'sample.csv', index=None)
207 | print('Models blended...')
208 |
209 |
210 | def module_merge_double(prob_x, prob_l):
211 | xgb_sample = pd.read_csv(r'result0501_152.csv', low_memory=False) # encode:159:0.790297834417
212 | lr_sample = pd.read_csv(r'xgb_sample_51.csv', low_memory=False) # Uncode:0.792171452209
213 | sample = xgb_sample.merge(lr_sample, on='ID', how='left')
214 | sample['PROB'] = sample['PROB_x'] * prob_x + sample['PROB_y'] * prob_l
215 | sample = sample[['ID', 'PROB']]
216 | print(sample)
217 | sample.to_csv(r'sample.csv', index=None)
218 | print('Models blended...')
219 |
220 |
221 | def main():
222 | '''Single xgboost model'''
223 | train_module(store_result=False, store_feature=True, select_feature=False, feature_num='all', one_encode=False)
224 |
225 | '''Single LogisticRegression model'''
226 | # train_LR_module(store_result=False, select_feature=True, feature_num=140, OneEncode=False)
227 | '''Linear blend of three submissions'''
228 | # module_merge_triple(prob_xgb=0.4, prob_lr=0.2, prob_lgb=0.4)
229 | '''Linear blend of two submissions'''
230 | # module_merge_double(prob_x=0.5, prob_l=0.5)
231 | '''Stacking'''
232 | # # ensemble = Ensemble(5, xgb_module, [xgb_module, lgb_module, lr_module, rf_module, gb_module])
233 | # ensemble = Ensemble(4, lr_module, [xgb_module, xgb_module, xgb_module, xgb_module])
234 | # train_test, label, test = ensemble.read_data()
235 | # result = ensemble.fit_predict(train_test, label, test)
236 | # print('模型融合完毕。。。')
237 | # result = pd.DataFrame(result, columns=['PROB'])
238 | # sample = pd.read_csv(r'lr_sample.csv', low_memory=False)
239 | # sample['PROB'] = [index for index in result['PROB']]
240 | # sample.to_csv(r'stacking.csv', index=None)
241 | # print(sample)
242 | # print('数据整合完毕。。。')
243 |
244 | '''Feature-selection sweep for the xgboost module'''
245 | # for index in range(70, 200, 5):
246 | # print('want to select ', index, ' features')
247 | # selection = FeatureSelection(index)
248 | # features_name = selection.return_feature_set(variance_threshold=True, select_k_best=True, svc_select=False, tree_select=True)
249 | # train_xgb_module(features_name, store_result=False)
250 |
251 | features_name = ['order_all_is_null', 'feature_1', 'register_days', 'quota', 'quota_surplus', 'all_is_null_y', 'account_grade_is_null', 'all_is_zero', 'account_grade2', 'age_three', 'type_pay_len', 'null_y', '等待付款', 'income1', 'auth_time_is_null', 'record_count', 'qq_bound_is_null', 'card_record_count', 'quota_is_zero', '新疆', '云南', 'account_grade3', '广东', 'card_time_is_null', 'have_credit_card', '充值成功', '已取消', 'credit_count', '在线', '四川', 'wechat_bound_is_null', 'null', 'credit_score_rank', '未抢中', 'null_x', '完成', '天津', 'age_two', 'female', '订单取消', 'quota_rate', '山东', '重庆', 'sts_order_len', 'merriage1', '福建', 'account_grade1', 'phone_count', 'record_is_unique', '上海', 'income3', '湖北', 'phone_is_null', 'time_phone_is_null', 'province_len', 'birthday_is_zero', '混合支付', 'auth_id_card_is_null', 'credit_score', '江西', '货到付款', '吉林', 'credit_score_is_null', '江苏', 'all_not_null', 'sex_secret', '已完成', 'card_category_count', 'card_count_one', '等待收货', '湖南', 'male', 'store_card_count']
252 | train_xgb_module(features_name, store_result=True)
253 |
254 | # 0.81882083452 seed=27
255 | # original -> 0.816853963449
256 | # colsample_bytree: 0.8 -> 0.818427843445
257 | # scale_pos_weight: 16 -> 0.82029535496
258 | # reg_alpha: 0.01 -> 0.820431061402
259 | # 'quota', 'quota_surplus', -> 0.820543215061
260 |
261 | '''Feature-selection sweep for the LogisticRegression module'''
262 | # for index in range(70, 200, 5):
263 | # print('want to select ', index, ' features')
264 | # selection = FeatureSelection(index)
265 | # features_name = selection.return_feature_set(variance_threshold=True, select_k_best=True, svc_select=False, tree_select=True)
266 | # # features_name = ['id_card_one', 'id_card_two', 'id_card_three', 'id_card_four', 'id_card_five', 'id_card_six', 'mobile', 'unicom', 'telecom', 'virtual', 'order_all_is_null', 'feature_1', 'record_is_unique', '浙江', '辽宁', 'card_time_is_null', 'income1', 'account_grade2', '黑龙', '江苏', '未抢中', '山东', '内蒙', '上海', '分期付款', '货到付款', 'overdraft', '公司转账', 'null', '订单取消', 'age_two', '充值成功', '在线', '新疆', '完成', 'quota_rate', 'sex_not_male', '湖北', 'quota', 'account_grade_is_null', '安徽', 'card_category_count', 'all_not_null', 'phone_is_null', '河北', 'merriage_is_null', '混合支付', 'quota_surplus_is_null', 'birthday_is_zero', 'income3', '江西', 'store_card_count', 'time_phone_is_null', 'id_card_is_null', 'auth_id_card_is_null', '已取消', '广东', 'record_count', '云南', '等待付款', '已完成', 'card_count_one', 'type_pay_len', 'female', 'sts_order_len', '福建', 'auth_time_is_null', '在线支付', 'null_x', 'income2', 'quota_is_zero', 'credit_score_is_null', 'account_grade3', '四川', '等待审核', '重庆', '河南', 'all_is_null_y', '吉林', '抢票已取消', 'province_len', 'credit_count', 'account_grade1', 'credit_score_rank', 'sts_order_count', '湖南', '充值失败;退款成功', 'wechat_bound_is_null', 'card_record_count', 'male', '邮局汇款', 'merriage1', '山西', 'phone_count', 'sex_secret', '海南', 'merriage2', '等待收货', 'all_is_zero', '天津', 'credit_score', 'age_three', 'null_y', 'qq_bound_is_null', 'have_credit_card', '北京']
267 | # # # features_name = ['record_count', 'quota', 'account_grade_is_null', '安徽', '云南', '等待付款', 'credit_count', 'account_grade1', 'credit_score_rank', '已完成', 'record_is_unique', 'card_count_one', 'card_category_count', 'all_not_null', 'sts_order_count', '湖南', '浙江', '充值失败;退款成功', 'wechat_bound_is_null', 'card_record_count', 'phone_is_null', 'type_pay_len', 'female', 'male', '辽宁', 'card_time_is_null', '河北', 'sts_order_len', '福建', 'auth_time_is_null', 'income1', '在线支付', 'merriage1', 'null_x', 'account_grade2', 'income2', 'quota_is_zero', '江苏', 'credit_score_is_null', 'merriage_is_null', '未抢中', 'phone_count', '山东', '上海', 'sex_secret', '货到付款', '北京', 'null', 'account_grade3', '等待收货', 'all_is_zero', '天津', 'credit_score', '四川', '混合支付', 'quota_surplus_is_null', 'birthday_is_zero', '订单取消', 'age_two', 'income3', '江西', 'store_card_count', 'time_phone_is_null', '充值成功', 'id_card_is_null', '在线', '新疆', '重庆', '河南', 'all_is_null_y', '吉林', 'auth_id_card_is_null', '完成', 'age_three', 'null_y', 'quota_rate', 'province_len', 'qq_bound_is_null', 'have_credit_card', '已取消', 'sex_not_male', '湖北', '广东']
268 | # train_lr_module(features_name, store_result=False)
269 | # # 0.812781111086
270 |
271 | features_name = ['order_all_is_null', 'feature_1', 'record_is_unique', '浙江', '辽宁', 'card_time_is_null', 'income1', 'account_grade2', '黑龙', '江苏', '未抢中', '山东', '内蒙', '上海', '分期付款', '货到付款', 'overdraft', '公司转账', 'null', '订单取消', 'age_two', '充值成功', '在线', '新疆', '完成', 'quota_rate', 'sex_not_male', '湖北', 'quota', 'account_grade_is_null', '安徽', 'card_category_count', 'all_not_null', 'phone_is_null', '河北', 'merriage_is_null', '混合支付', 'quota_surplus_is_null', 'birthday_is_zero', 'income3', '江西', 'store_card_count', 'time_phone_is_null', 'id_card_is_null', 'auth_id_card_is_null', '已取消', '广东', 'record_count', '云南', '等待付款', '已完成', 'card_count_one', 'type_pay_len', 'female', 'sts_order_len', '福建', 'auth_time_is_null', '在线支付', 'null_x', 'income2', 'quota_is_zero', 'credit_score_is_null', 'account_grade3', '四川', '等待审核', '重庆', '河南', 'all_is_null_y', '吉林', '抢票已取消', 'province_len', 'credit_count', 'account_grade1', 'credit_score_rank', 'sts_order_count', '湖南', '充值失败;退款成功', 'wechat_bound_is_null', 'card_record_count', 'male', '邮局汇款', 'merriage1', '山西', 'phone_count', 'sex_secret', '海南', 'merriage2', '等待收货', 'all_is_zero', '天津', 'credit_score', 'age_three', 'null_y', 'qq_bound_is_null', 'have_credit_card', '北京']
272 | train_lr_module(features_name, store_result=True)
273 |
274 | module_merge_triple(prob_xgb=0.4, prob_lr=0.2, prob_lgb=0.4)
275 |
276 | if __name__ == '__main__':
277 | start_time = time.time() # time.clock() was removed in Python 3.8 #
278 | main()
279 | end_time = time.time()
280 | print('Elapsed time:', end_time - start_time)
281 |
282 |
--------------------------------------------------------------------------------
/XGB1.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat May 19 22:02:28 2018
4 |
5 | @author: Frank
6 | """
7 |
8 |
9 |
10 | import pandas as pd
11 | import numpy as np
12 | import datetime
13 | import xgboost as xgb
14 | from xgboost import plot_importance
15 | import operator
16 | from sklearn.metrics import roc_auc_score
17 | import matplotlib.pyplot as plt
18 | from sklearn.linear_model import LogisticRegression
19 |
20 |
21 | def setlen(group): # distinct-value count; used as a pivot_table aggfunc #
22 | return len(set(group))
23 |
24 |
25 | def return_set(group): # distinct values as a set; used as a pivot_table aggfunc #
26 | return set(group)
27 |
28 |
29 | def auth_info(data):
30 | data['id_card_isnull'] = [1 if type(i) == str else 0 for i in data.id_card] # NB: despite the *_isnull names, these flags are 1 when the field is present #
31 | data['phone_isnull'] = [1 if type(i) == str else 0 for i in data.phone]
32 | data['auth_time_isnull'] = [1 if type(i) == str else 0 for i in data.auth_time]
33 | data['first_digit'] = [i[0] if type(i) == str else '-1' for i in data['id_card']]
34 | id_card = ['2', '1', '3', '4', '6', '5']
35 | for i in id_card:
36 | data[i] = [1 if i == index else 0 for index in data['first_digit']]
37 |
38 | return data[['id_card_isnull', 'phone_isnull', 'auth_time_isnull', 'id']]
39 |
40 |
41 | def bankcard_info(data):
42 | data['card_1'] = [1 if i == '储蓄卡' else 0 for i in data['card_type']] # debit card #
43 | data['card_2'] = [1 if i == '信用卡' else 0 for i in data['card_type']] # credit card #
44 | card_1_cnt = pd.pivot_table(data, index='id', values='card_1', aggfunc='sum').reset_index().rename(columns={'card_1': 'card_1_cnt'})
45 | data = data.merge(card_1_cnt, on='id', how='left')
46 | card_2_cnt = pd.pivot_table(data, index='id', values='card_2', aggfunc='sum').reset_index().rename(columns={'card_2': 'card_2_cnt'})
47 | data = data.merge(card_2_cnt, on='id', how='left')
48 | bank_cnt = pd.pivot_table(data, index='bank_name', values='tail_num', aggfunc='count').reset_index().rename(columns={'tail_num': 'bank_cnt'})
49 | id_bank_cnt = pd.pivot_table(data, index='id', values='bank_name', aggfunc='count').reset_index().rename(columns={'bank_name': 'id_bank_cnt'})
50 | # id_card1_cnt = pd.pivot_table(data, index='id', values='card_1', aggfunc='sum').reset_index().rename(columns={'card_1': 'id_card1_cnt'})
51 | # id_card2_cnt = pd.pivot_table(data, index='id', values='card_2', aggfunc='sum').reset_index().rename(columns={'card_2': 'id_card2_cnt'})
52 | id_phone_set = pd.pivot_table(data, index='id', values='phone', aggfunc=setlen).reset_index().rename(columns={'phone': 'id_phone_set'})
53 | id_card_set = pd.pivot_table(data, index='id', values='card_type', aggfunc=setlen).reset_index().rename(columns={'card_type': 'id_card_set'})
54 | id_bank_set = pd.pivot_table(data, index='id', values='bank_name', aggfunc=setlen).reset_index().rename(columns={'bank_name': 'id_bank_set'})
55 |
56 | data = data.merge(bank_cnt, on='bank_name', how='left')
57 | data = data.merge(id_bank_cnt, on='id', how='left')
58 | data = data.merge(id_phone_set, on='id', how='left')
59 | data = data.merge(id_card_set, on='id', how='left') # ?
60 | data = data.merge(id_bank_set, on='id', how='left') # ?
61 | return data[['id', 'card_1_cnt', 'card_2_cnt', 'id_bank_cnt', 'id_phone_set', 'id_card_set', 'id_bank_set']].drop_duplicates(['id'])
62 |
63 |
64 | def credit_info(data):
65 | data['q_o'] = data['quota'] - data['overdraft']
66 | data['quota'] = data['quota'].fillna(1) # 'i is np.nan' misses float NaNs, so use fillna #
67 | data['overdraft'] = data['overdraft'].fillna(1)
68 | data['q/o'] = data[['quota', 'overdraft']].apply(lambda x: 0 if x.quota == 0 else x.overdraft/x.quota, axis=1)
69 | return data.drop_duplicates(['id'])
70 |
71 |
72 | def order_info(data):
73 | id_sample = data.drop_duplicates(['id'])[['id']]
74 |
75 | data = data.drop_duplicates()
76 | order_info_amt = data[['amt_order']]
77 | order_info_amt = order_info_amt[order_info_amt['amt_order'].notnull()]
78 | order_info_amt = order_info_amt[order_info_amt['amt_order'] != 'null']
79 | order_info_amt['amt_order'] = [float(index) for index in order_info_amt['amt_order']]
80 | mean = order_info_amt['amt_order'].mean()
81 | data['amt_order'] = data['amt_order'].fillna(mean)
82 | data['amt_order'] = [mean if index == 'null' else index for index in data['amt_order']]
83 | data['amt_order'] = [float(index) for index in data['amt_order']]
84 |
85 | data['pay_way_1'] = [1 if i == '在线支付' else 0 for i in data['type_pay']] # online payment #
86 | way1_cnt = pd.pivot_table(data, index='id', values='pay_way_1', aggfunc='sum').reset_index().rename(columns={'pay_way_1': 'way1_cnt'})
87 | id_sample = id_sample.merge(way1_cnt, on='id', how='left')
88 | data['pay_way_2'] = [1 if i == '货到付款' else 0 for i in data['type_pay']] # cash on delivery #
89 | way2_cnt = pd.pivot_table(data, index='id', values='pay_way_2', aggfunc='sum').reset_index().rename(columns={'pay_way_2': 'way2_cnt'})
90 | id_sample = id_sample.merge(way2_cnt, on='id', how='left')
91 |
92 | '''Aggregate count features'''
93 | # f1 = pd.pivot_table(data[['id', 'amt_order']], index='id', values='amt_order', aggfunc='mean').reset_index().rename(columns={'amt_order': 'id_amt_order_mean'})
94 | # id_sample = id_sample.merge(f1, on='id', how='left')
95 | # f2 = pd.pivot_table(data[['id', 'amt_order']], index='id', values='amt_order', aggfunc='max').reset_index().rename(columns={'amt_order': 'id_amt_order_max'})
96 | # id_sample = id_sample.merge(f2, on='id', how='left')
97 | # f3 = pd.pivot_table(data[['id', 'amt_order']], index='id', values='amt_order', aggfunc='min').reset_index().rename(columns={'amt_order': 'id_amt_order_min'})
98 | # id_sample = id_sample.merge(f3, on='id', how='left')
99 | # f4 = pd.pivot_table(data[['id', 'amt_order']], index='id', values='amt_order', aggfunc='var').reset_index().rename(columns={'amt_order': 'id_amt_order_var'})
100 | # id_sample = id_sample.merge(f4, on='id', how='left')
101 | # f5 = pd.pivot_table(data[['id', 'unit_price']], index='id', values='unit_price', aggfunc='mean').reset_index().rename(columns={'unit_price': 'id_unit_price_mean'})
102 | # id_sample = id_sample.merge(f5, on='id', how='left')
103 | # f6 = pd.pivot_table(data[['id', 'unit_price']], index='id', values='unit_price', aggfunc='max').reset_index().rename(columns={'unit_price': 'id_unit_price_max'})
104 | # id_sample = id_sample.merge(f6, on='id', how='left')
105 | # f7 = pd.pivot_table(data[['id', 'unit_price']], index='id', values='unit_price', aggfunc='min').reset_index().rename(columns={'unit_price': 'id_unit_price_min'})
106 | # id_sample = id_sample.merge(f7, on='id', how='left')
107 | # f8 = pd.pivot_table(data[['id', 'unit_price']], index='id', values='unit_price', aggfunc='var').reset_index().rename(columns={'unit_price': 'id_unit_price_var'})
108 | # id_sample = id_sample.merge(f8, on='id', how='left')
109 |
110 | f9 = pd.pivot_table(data[['id', 'type_pay']], index='id', values='type_pay', aggfunc=setlen).reset_index().rename(columns={'type_pay': 'id_type_pay_set'})
111 | id_sample = id_sample.merge(f9, on='id', how='left')
112 | f10 = pd.pivot_table(data[['id', 'sts_order']], index='id', values='sts_order', aggfunc=setlen).reset_index().rename(columns={'sts_order': 'id_sts_order_set'})
113 | id_sample = id_sample.merge(f10, on='id', how='left')
114 | f11 = pd.pivot_table(data[['id', 'phone']], index='id', values='phone', aggfunc=setlen).reset_index().rename(columns={'phone': 'id_phone_set'})
115 | id_sample = id_sample.merge(f11, on='id', how='left')
116 |
117 | '''Other features'''
118 | data['sts_order'] = data['sts_order'].fillna('0')
119 | data['wan_cheng'] = [1 if ('完成' in i) else 0 for i in data['sts_order']] # status contains '完成' (completed) #
120 | wan_cheng_cnt = pd.pivot_table(data, index='id', values='wan_cheng', aggfunc='sum').reset_index().rename(columns={'wan_cheng': 'wan_cheng_cnt'})
121 | id_sample = id_sample.merge(wan_cheng_cnt, on='id', how='left')
122 | data['cheng_gong'] = [1 if '成功' in i else 0 for i in data['sts_order']] # status contains '成功' (successful) #
123 | # print(data['cheng_gong']) # debug #
124 | cheng_gong_cnt = pd.pivot_table(data, index='id', values='cheng_gong', aggfunc='sum').reset_index().rename(columns={'cheng_gong': 'cheng_gong_cnt'})
125 | id_sample = id_sample.merge(cheng_gong_cnt, on='id', how='left')
126 | data['qu_xiao'] = [1 if '取消' in i else 0 for i in data['sts_order']] # status contains '取消' (cancelled) #
127 | qu_xiao_cnt = pd.pivot_table(data, index='id', values='qu_xiao', aggfunc='sum').reset_index().rename(columns={'qu_xiao': 'qu_xiao_cnt'})
128 | id_sample = id_sample.merge(qu_xiao_cnt, on='id', how='left')
129 |
130 | '''Time features (experiments, kept commented out)'''
131 | # year_month = ['1604', '1704', '1504', '1607', '1508', '1505', '1608', '1602', '1701', '1512', '1612', '1506', '1610', '1412', '1603', '00000000', '1601', '1611', '1605', '1606']
132 | # data['year_month'] = [i[2:4] + i[5: 7] if type(i) == str else '00000000' for i in data['time_order']]
133 | # for i in year_month:
134 | # data[i] = [1 if i == index else 0 for index in data['year_month']]
135 | # t_1604 = pd.pivot_table(data, index='id', values='1604', aggfunc='sum').reset_index()
136 | # id_sample = id_sample.merge(t_1604, on='id', how='left')
137 | # t_1704 = pd.pivot_table(data, index='id', values='1704', aggfunc='sum').reset_index()
138 | # id_sample = id_sample.merge(t_1704, on='id', how='left')
139 | # t_1504 = pd.pivot_table(data, index='id', values='1504', aggfunc='sum').reset_index()
140 | # id_sample = id_sample.merge(t_1504, on='id', how='left')
141 | # t_1607 = pd.pivot_table(data, index='id', values='1607', aggfunc='sum').reset_index()
142 | # id_sample = id_sample.merge(t_1607, on='id', how='left')
143 | # t_1508 = pd.pivot_table(data, index='id', values='1508', aggfunc='sum').reset_index()
144 | # id_sample = id_sample.merge(t_1508, on='id', how='left')
145 | # t_1505 = pd.pivot_table(data, index='id', values='1505', aggfunc='sum').reset_index()
146 | # id_sample = id_sample.merge(t_1505, on='id', how='left')
147 | # t_1608 = pd.pivot_table(data, index='id', values='1608', aggfunc='sum').reset_index()
148 | # id_sample = id_sample.merge(t_1608, on='id', how='left')
149 | # t_1602 = pd.pivot_table(data, index='id', values='1602', aggfunc='sum').reset_index()
150 | # id_sample = id_sample.merge(t_1602, on='id', how='left')
151 | # t_1701 = pd.pivot_table(data, index='id', values='1701', aggfunc='sum').reset_index()
152 | # id_sample = id_sample.merge(t_1701, on='id', how='left')
153 | # t_1512 = pd.pivot_table(data, index='id', values='1512', aggfunc='sum').reset_index()
154 | # id_sample = id_sample.merge(t_1512, on='id', how='left')
155 | # t_1612 = pd.pivot_table(data, index='id', values='1612', aggfunc='sum').reset_index()
156 | # id_sample = id_sample.merge(t_1612, on='id', how='left')
157 | # t_1506 = pd.pivot_table(data, index='id', values='1506', aggfunc='sum').reset_index()
158 | # id_sample = id_sample.merge(t_1506, on='id', how='left')
159 | # t_1610 = pd.pivot_table(data, index='id', values='1610', aggfunc='sum').reset_index()
160 | # id_sample = id_sample.merge(t_1610, on='id', how='left')
161 | # t_1412 = pd.pivot_table(data, index='id', values='1412', aggfunc='sum').reset_index()
162 | # id_sample = id_sample.merge(t_1412, on='id', how='left')
163 | # t_1603 = pd.pivot_table(data, index='id', values='1603', aggfunc='sum').reset_index()
164 | # id_sample = id_sample.merge(t_1603, on='id', how='left')
165 | # t_0000 = pd.pivot_table(data, index='id', values='00000000', aggfunc='sum').reset_index()
166 | # id_sample = id_sample.merge(t_0000, on='id', how='left')
167 | # t_1601 = pd.pivot_table(data, index='id', values='1601', aggfunc='sum').reset_index()
168 | # id_sample = id_sample.merge(t_1601, on='id', how='left')
169 | # t_1611 = pd.pivot_table(data, index='id', values='1611', aggfunc='sum').reset_index()
170 | # id_sample = id_sample.merge(t_1611, on='id', how='left')
171 | # t_1605 = pd.pivot_table(data, index='id', values='1605', aggfunc='sum').reset_index()
172 | # id_sample = id_sample.merge(t_1605, on='id', how='left')
173 | # t_1606 = pd.pivot_table(data, index='id', values='1606', aggfunc='sum').reset_index()
174 | # id_sample = id_sample.merge(t_1606, on='id', how='left')
175 |
176 |
177 | # sts_order = []
178 | # for outcome in data['sts_order']:
179 | # if type(outcome) == str:
180 | # if "完成" in outcome:
181 | # sts_order.append(1)
182 | # else:
183 | # sts_order.append(0)
184 | # else:
185 | # sts_order.append(0)
186 | # data['is_ok'] = sts_order
187 | # data['no_ok'] = [1 if i == 0 else 1 for i in sts_order]
188 | # wancheng_cnt = pd.pivot_table(data[['id', 'is_ok']], index='id', values='is_ok', aggfunc='sum').reset_index().rename(columns={'is_ok': 'wancheng_cnt'})
189 | # no_ok = pd.pivot_table(data[['id', 'no_ok']], index='id', values='no_ok', aggfunc='sum').reset_index().rename(columns={'is_ok': 'no_ok'})
190 | # id_sample = id_sample.merge(wancheng_cnt, on='id', how='left')
191 | # id_sample = id_sample.merge(no_ok, on='id', how='left')
192 |
193 | # type_pay = []
194 | # for outcome in data['type_pay']:
195 | # if type(outcome) == str:
196 | # if "在线支付" in outcome:
197 | # type_pay.append(1)
198 | # else:
199 | # type_pay.append(0)
200 | # else:
201 | # type_pay.append(0)
202 | # data['zai_xian'] = sts_order
203 | # data['no_zai_xian'] = [1 if i == 0 else 1 for i in sts_order]
204 | # zai_xian_cnt = pd.pivot_table(data[['id', 'zai_xian']], index='id', values='zai_xian', aggfunc='sum').reset_index().rename(columns={'is_ok': 'zai_xian_cnt'})
205 | # no_zai_xian = pd.pivot_table(data[['id', 'no_zai_xian']], index='id', values='no_zai_xian', aggfunc='sum').reset_index().rename(columns={'is_ok': 'no_zai_xian'})
206 | # id_sample = id_sample.merge(zai_xian_cnt, on='id', how='left')
207 | # id_sample = id_sample.merge(no_zai_xian, on='id', how='left')
208 |
209 | return id_sample.drop_duplicates(['id'])
210 |
211 |
212 | def recieve_addr_info(data):
213 | province = {'甘肃', '云南', '贵州', '河南', '黑龙', '香港', '北京', '湖南', '江苏', '青海', '宁夏', '内蒙', '浙江', '吉林', '海南', '福建', '重庆', '台湾', '陕西', '湖北', '江西', '辽宁', '山西', '西藏', '广东', '安徽', '四川', '河北', '山东', '上海',
214 | '广西', '新疆', '天津', 'null'}
215 | data['province'] = data[['region']].apply(lambda x: 'null' if x.region is np.nan else x.region[0:2], axis=1)
216 | city_set = pd.pivot_table(data, index='id', values='province', aggfunc=return_set).reset_index()
217 | for string in list(province):
218 | city_set[string] = [1 if string in index else 0 for index in city_set['province']]
219 | city_set['province'] = [set() if 'null' in index else index for index in city_set['province']] # sets containing the 'null' placeholder count as empty #
220 | city_set['province_len'] = [len(index) for index in city_set['province']]
221 |
222 | data['phone_isnull'] = [0 if type(i) == float else 1 for i in data.phone]
223 | data['fix_phone_isnull'] = [1 if type(i) == str else 0 for i in data.fix_phone]
224 | id_phone_set = pd.pivot_table(data[['id', 'phone']], index='id', values='phone', aggfunc=setlen).reset_index().rename(columns={'phone': 'id_phone_set'})
225 | data = data.merge(id_phone_set, on='id', how='left')
226 | data = data.merge(city_set, on='id', how='left')
227 |
228 | return data[['id', 'phone_isnull', 'fix_phone_isnull', 'id_phone_set', 'province_len']].drop_duplicates(['id'])
229 |
230 |
231 | def user_info(data):
232 | id_sample = data[['id']]
233 | degree = ['本科', '初中', '中专', '其他', '硕士', '大专', '博士', '高中'] # education levels: bachelor, junior high, technical secondary, other, master, junior college, doctorate, senior high #
234 | for index in degree:
235 | id_sample[index] = [1 if index == string else 0 for string in data['degree']]
236 |
237 | id_sample['sex_isnull'] = [0 if type(index) == float else 1 for index in data['sex']]
238 | id_sample['sex1'] = [1 if index == '保密' else 0 for index in data['sex']] # undisclosed #
239 | id_sample['sex2'] = [1 if index == '男' else 0 for index in data['sex']] # male #
240 | id_sample['sex3'] = [1 if index == '女' else 0 for index in data['sex']] # female #
241 |
242 | id_sample['0000-00-00'] = [1 if index == '0000-00-00' else 0 for index in data['birthday']]
243 |
244 | id_sample['merriage1'] = [1 if index == '未婚' else 0 for index in data['merriage']] # unmarried #
245 | id_sample['merriage2'] = [1 if index == '已婚' else 0 for index in data['merriage']] # married #
246 | id_sample['merriage3'] = [1 if index == '保密' else 0 for index in data['merriage']] # undisclosed #
247 |
248 | id_sample['income_isnull'] = [1 if type(index) == str else 0 for index in data['income']]
249 | id_sample['income1'] = [1 if index == '4000-5999元' else 0 for index in data['income']]
250 | id_sample['income2'] = [1 if index == '8000元以上' else 0 for index in data['income']]
251 | id_sample['income3'] = [1 if index == '2000-3999元' else 0 for index in data['income']]
252 | id_sample['income4'] = [1 if index == '6000-7999元' else 0 for index in data['income']]
253 | id_sample['income5'] = [1 if index == '2000元以下' else 0 for index in data['income']]
254 |
255 | id_sample['id_card_isnull'] = [1 if type(index) == str else 0 for index in data['id_card']]
256 |
257 | id_sample['qq_bound_one'] = [1 if index == '已绑定' else 0 for index in data['qq_bound']] # linked #
258 | id_sample['qq_bound_two'] = [1 if index == '未绑定' else 0 for index in data['qq_bound']] # not linked #
259 |
260 | id_sample['wechat_bound_one'] = [1 if index == '已绑定' else 0 for index in data['wechat_bound']] # linked #
261 | id_sample['wechat_bound_two'] = [1 if index == '未绑定' else 0 for index in data['wechat_bound']] # not linked #
262 |
263 | id_sample['account_grade_one'] = [1 if index == '注册会员' else 0 for index in data['account_grade']] # registered member #
264 | id_sample['account_grade_two'] = [1 if index == '铜牌会员' else 0 for index in data['account_grade']] # bronze member #
265 | id_sample['account_grade_three'] = [1 if index == '银牌会员' else 0 for index in data['account_grade']] # silver member #
266 | id_sample['account_grade_four'] = [1 if index == '金牌会员' else 0 for index in data['account_grade']] # gold member #
267 | id_sample['account_grade_five'] = [1 if index == '钻石会员' else 0 for index in data['account_grade']] # diamond member #
268 | return id_sample.drop_duplicates(['id'])
269 |
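    | # The block above is manual one-hot encoding; an equivalent sketch with
    | # pd.get_dummies (hypothetical, not what this script runs; get_dummies derives
    | # its categories from the data, so train/test columns may differ):
    | # dummies = pd.get_dummies(data[['degree', 'merriage', 'qq_bound']])
    | # id_sample = pd.concat([data[['id']], dummies], axis=1).drop_duplicates(['id'])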
270 |
271 | def days_feature(auth, order, appl):
272 | # data = auth.merge(order, on='id', how='left')
273 | data = auth.merge(appl, on='id', how='left')
274 | data['auth_time'] = data[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm[:10] if x.auth_time == '0000-00-00' else x.auth_time, axis=1)
275 | data['auth_time'] = data[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm[:10] if x.auth_time is np.nan else x.auth_time, axis=1)
276 | data['days'] = data[['auth_time', 'appl_sbm_tm']].apply(lambda x: (datetime.datetime.strptime(x.appl_sbm_tm[:10], '%Y-%m-%d') - datetime.datetime.strptime(x.auth_time[:10], '%Y-%m-%d')).days, axis=1)
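    | # 'days' = application date (appl_sbm_tm, truncated to YYYY-MM-DD) minus the auth
    | # date; a positive value means the account was authenticated before the loan
    | # application was submitted.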
277 | data['days_is_neg'] = [1 if i > 0 else 0 for i in data['days']]  # NB: despite the name, this flags positive day gaps #
278 | data['auth_year'] = data[['auth_time']].apply(lambda x: int(x.auth_time[:4]), axis=1)
279 | data['appl_year'] = data[['appl_sbm_tm']].apply(lambda x: int(x.appl_sbm_tm[:4]), axis=1)
280 | data['years'] = data['appl_year'] - data['auth_year']
281 | data['years_is_neg'] = data[['years']].apply(lambda x: 1 if x.years > 0 else 0, axis=1)  # same caveat: flags positive values #
282 |
283 |
284 | # data['auth_time'] = [i if type(i) == str else '0001-01-01' for i in auth['auth_time']]
285 | # data['auth_time'] = ['0001-01-01' if i == '0000-00-00' else i for i in auth['auth_time']]
286 | # data['auth_time'] = ['0001-01-01' if i == 0 else i for i in auth['auth_time']]
287 | # data['time_order'] = [i if type(i) == str else '0001-01-01 00:00:00' for i in appl['time_order']]
288 | # data['time_order'] = [i if len(i) > 16 else '0001-01-01 00:00:00' for i in appl['time_order']]
289 | #
290 | # data['time_days'] = data[['auth_time', 'time_order']].apply(lambda x: abs((datetime.datetime.strptime(x.time_order, '%Y-%m-%d %H:%M:%S') - datetime.datetime.strptime(x.auth_time, '%Y-%m-%d')).days), axis=1)
291 | # data['time_days'] = [i if ((i < 50000) & (i > 0)) else -1 for i in data['time_days']]
292 | # time_days_mean = pd.pivot_table(data[['id', 'time_days']], index='id', values='time_days', aggfunc='mean').reset_index().rename(columns={'time_days': 'time_days_mean'})
293 | # data = data.merge(time_days_mean, on='id', how='left')
294 | # data['time_days_mean_is_neg'] = [1 if i > 0 else 0 for i in data['time_days']]
295 |
296 | # appl['appl_sbm_tm'] = [i[:-2] for i in appl['appl_sbm_tm']]
297 | # data = data.merge(appl, on='id', how='left')
298 | # data['appl_age'] = data[['auth_time', 'appl_sbm_tm']].apply(lambda x: ((datetime.datetime.strptime(x.auth_time, '%Y-%m-%d') - datetime.datetime.strptime(x.appl_sbm_tm, '%Y-%m-%d %H:%M:%S')).days), axis=1)
299 | # data['appl_neg'] = [1 if i < 0 else 1 for i in data['appl_age']]
300 | print("OK")
301 | return data[['id', 'days', 'days_is_neg', 'years', 'years_is_neg']].drop_duplicates(['id'])
302 |
303 |
304 | def auth_order(auth, order):
305 | data = auth.merge(order, on='id', how='left')
306 | data['auth_time'] = [i if type(i) == str else '0001-01-01' for i in data['auth_time']]
307 | data['auth_time'] = ['0001-01-01' if i == '0000-00-00' else i for i in data['auth_time']]
308 | data['auth_time'] = ['0001-01-01' if i == 0 else i for i in data['auth_time']]
309 | data['time_order'] = [i if type(i) == str else '0001-01-01 00:00:00' for i in data['time_order']]
310 | data['time_order'] = [i if len(i) > 16 else '0001-01-01 00:00:00' for i in data['time_order']]
311 |
312 | data['time_days'] = data[['auth_time', 'time_order']].apply(
313 | lambda x: abs((datetime.datetime.strptime(x.time_order, '%Y-%m-%d %H:%M:%S') - datetime.datetime.strptime(x.auth_time, '%Y-%m-%d')).days), axis=1)
314 | data['time_days'] = [i if ((i < 50000) & (i > 0)) else -1 for i in data['time_days']]
315 | time_days_mean = pd.pivot_table(data[['id', 'time_days']], index='id', values='time_days', aggfunc='mean').reset_index().rename(columns={'time_days': 'time_days_mean'})
316 | auth = auth.merge(time_days_mean, on='id', how='left')
317 | auth['time_days_mean_is_neg'] = [1 if i > 0 else 0 for i in auth['time_days_mean']]
318 | return auth[['id', 'time_days_mean', 'time_days_mean_is_neg']]
319 |
320 |
321 | def submit():
322 | '''Read the training set and extract features'''
323 | train_auth_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_auth_info.csv', low_memory=False)
324 | f_train_auth_info = auth_info(train_auth_info)
325 | train_bankcard_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_bankcard_info.csv', low_memory=False)
326 | f_train_bankcard_info = bankcard_info(train_bankcard_info)
327 | train_credit_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_credit_info.csv', low_memory=False)
328 | f_train_credit_info = credit_info(train_credit_info)
329 | train_order_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_order_info.csv', low_memory=False)
330 | f_train_order_info = order_info(train_order_info)
331 | train_recieve_addr_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_recieve_addr_info.csv', low_memory=False)
332 | f_train_recieve_addr_info = recieve_addr_info(train_recieve_addr_info)
333 | train_user_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_user_info.csv', low_memory=False)
334 | f_train_user_info = user_info(train_user_info)
335 | train_target = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_target.csv', low_memory=False)
336 | feature_l = train_target[['id', 'target']]
337 | f_day_minus = days_feature(train_auth_info[['id', 'auth_time']], train_order_info[['id', 'time_order']], train_target[['id', 'appl_sbm_tm']])
338 | f_auth_or = auth_order(train_auth_info, train_order_info)
339 | # print(f_day_minus)
340 |
341 | '''f_merge'''
342 | feature_l = feature_l.merge(f_train_auth_info, on='id', how='left')
343 | feature_l = feature_l.merge(f_train_bankcard_info, on='id', how='left')
344 | feature_l = feature_l.merge(f_train_credit_info, on='id', how='left')
345 | feature_l = feature_l.merge(f_train_order_info, on='id', how='left')
346 | feature_l = feature_l.merge(f_train_recieve_addr_info, on='id', how='left')
347 | feature_l = feature_l.merge(f_train_user_info, on='id', how='left')
348 | feature_l = feature_l.merge(f_day_minus, on='id', how='left')
349 | feature_l = feature_l.merge(f_auth_or, on='id', how='left')
350 | # feature_l.to_csv(r'F:\Python_project\AL\train_data\train_feature.csv', index=False)
351 | print(feature_l.shape)
352 | print(feature_l)
353 | train_f = feature_l.drop('target', axis=1)
354 | train_l = feature_l[['target']]
355 |
356 | xgb_train = xgb.DMatrix(train_f.values, label=train_l.values)
357 | params = {
358 | 'booster': 'gbtree',
359 | 'objective': 'binary:logistic',
360 | 'gamma': 0.1, # controls post-pruning; larger is more conservative, typically 0.1 or 0.2
361 | 'max_depth': 5, # tree depth; deeper trees overfit more easily
362 | 'lambda': 2, # L2 regularization on leaf weights; larger values curb overfitting
363 | 'subsample': 0.8, # row subsampling of training instances
364 | 'colsample_bytree': 0.8, # column subsampling when building each tree
365 | 'min_child_weight': 18,
366 | 'silent': 0, # 1 suppresses run logs; keep 0 to see them
367 | 'eta': 0.03, # learning rate
368 | 'eval_metric': 'logloss'
369 | }
370 | module = xgb.train(params, xgb_train, num_boost_round=500)
371 |
372 |
373 | '''Read the test set and extract features'''
374 | test_auth_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_auth_info.csv', low_memory=False)
375 | f_test_auth_info = auth_info(test_auth_info)
376 | test_bankcard_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_bankcard_info.csv', low_memory=False)
377 | f_test_bankcard_info = bankcard_info(test_bankcard_info)
378 | test_credit_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_credit_info.csv', low_memory=False)
379 | f_test_credit_info = credit_info(test_credit_info)
380 | test_order_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_order_info.csv', low_memory=False)
381 | f_test_order_info = order_info(test_order_info)
382 | test_recieve_addr_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_recieve_addr_info.csv', low_memory=False)
383 | f_test_recieve_addr_info = recieve_addr_info(test_recieve_addr_info)
384 | test_user_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_user_info.csv', low_memory=False)
385 | f_test_user_info = user_info(test_user_info)
386 | test_target = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_list.csv', low_memory=False)
387 | test_fl = test_target[['id']]
388 | t_day_minus = days_feature(test_auth_info[['id', 'auth_time']], test_order_info[['id', 'time_order']], test_target[['id', 'appl_sbm_tm']])
389 | t_auth_or = auth_order(test_auth_info, test_order_info)
390 |
391 | '''merge'''
392 | test_fl = test_fl.merge(f_test_auth_info, on='id', how='left')
393 | test_fl = test_fl.merge(f_test_bankcard_info, on='id', how='left')
394 | test_fl = test_fl.merge(f_test_credit_info, on='id', how='left')
395 | test_fl = test_fl.merge(f_test_order_info, on='id', how='left')
396 | test_fl = test_fl.merge(f_test_recieve_addr_info, on='id', how='left')
397 | test_fl = test_fl.merge(f_test_user_info, on='id', how='left')
398 | test_fl = test_fl.merge(t_day_minus, on='id', how='left')
399 | test_fl = test_fl.merge(t_auth_or, on='id', how='left')
400 |
401 |
402 | test_f = test_fl
403 | test_l = test_fl[['id']].copy()  # copy before adding the prediction column #
404 |
405 | xgb_test = xgb.DMatrix(test_f.values)
406 | result = module.predict(xgb_test)
407 | test_l['predicted_score'] = result
408 | test_l.columns = ['ID', 'PROB']
409 | test_l.to_csv(r'result_xgb.csv', index=None)
410 |
411 |
412 | def validation():
413 | '''Read the training set and extract features'''
414 | train_auth_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_auth_info.csv', low_memory=False)
415 | f_train_auth_info = auth_info(train_auth_info)
416 | train_bankcard_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_bankcard_info.csv', low_memory=False)
417 | f_train_bankcard_info = bankcard_info(train_bankcard_info)
418 | train_credit_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_credit_info.csv', low_memory=False)
419 | f_train_credit_info = credit_info(train_credit_info)
420 | train_order_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_order_info.csv', low_memory=False)
421 | f_train_order_info = order_info(train_order_info)
422 | train_recieve_addr_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_recieve_addr_info.csv', low_memory=False)
423 | f_train_recieve_addr_info = recieve_addr_info(train_recieve_addr_info)
424 | train_user_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_user_info.csv', low_memory=False)
425 | f_train_user_info = user_info(train_user_info)
426 | train_target = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_target.csv', low_memory=False)
427 | feature_l = train_target[['id', 'target']]
428 | day_minus = days_feature(train_auth_info[['id', 'auth_time']], train_order_info[['id', 'time_order']], train_target[['id', 'appl_sbm_tm']])
429 | auth_or = auth_order(train_auth_info[['id', 'auth_time']], train_order_info[['id', 'time_order']])
430 |
431 | '''Split off a time-based validation set'''
432 | feature_l['date'] = [index.replace('-', '') for index in train_target['appl_sbm_tm']]
433 | feature_l['date'] = [index.split(' ')[0][0:6] for index in feature_l['date']]
434 | validation_train = feature_l[feature_l['date'] != '201704'][['target', 'id']]
435 | validation_test = feature_l[feature_l['date'] == '201704'][['target', 'id']]
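    | # Time-based holdout: April 2017 applications form the offline validation fold;
    | # all earlier months are used for fitting, mimicking the online train/test split.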
436 |
437 | '''validation_train'''
438 | validation_train = validation_train.merge(f_train_auth_info, on='id', how='left')
439 | validation_train = validation_train.merge(f_train_bankcard_info, on='id', how='left')
440 | validation_train = validation_train.merge(f_train_credit_info, on='id', how='left')
441 | validation_train = validation_train.merge(f_train_order_info, on='id', how='left')
442 | validation_train = validation_train.merge(f_train_recieve_addr_info, on='id', how='left')
443 | validation_train = validation_train.merge(f_train_user_info, on='id', how='left')
444 | validation_train = validation_train.merge(day_minus, on='id', how='left')
445 | validation_train = validation_train.merge(auth_or, on='id', how='left')
446 |
447 | validation_train_f = validation_train.drop(['target', 'id'], axis=1)
448 | validation_train_l = validation_train[['target']]
449 | print(validation_train_f.columns)
450 |
451 | '''validation_test'''
452 | validation_test = validation_test.merge(f_train_auth_info, on='id', how='left')
453 | validation_test = validation_test.merge(f_train_bankcard_info, on='id', how='left')
454 | validation_test = validation_test.merge(f_train_credit_info, on='id', how='left')
455 | validation_test = validation_test.merge(f_train_order_info, on='id', how='left')
456 | validation_test = validation_test.merge(f_train_recieve_addr_info, on='id', how='left')
457 | validation_test = validation_test.merge(f_train_user_info, on='id', how='left')
458 | validation_test = validation_test.merge(day_minus, on='id', how='left')
459 | validation_test = validation_test.merge(auth_or, on='id', how='left')
460 | print(validation_test.shape)
461 |
462 | validation_test_f = validation_test.drop(['target', 'id'], axis=1)
463 | validation_test_l = validation_test[['target']]
464 |
465 | xgb_train = xgb.DMatrix(validation_train_f, label=validation_train_l)
466 | xgb_test = xgb.DMatrix(validation_test_f, label=validation_test_l)
467 | watchlist = [(xgb_train, 'train'), (xgb_test, 'val')]
468 | params = {
469 | 'booster': 'gbtree',
470 | 'objective': 'binary:logistic',
471 | 'gamma': 0.1, # controls post-pruning; larger is more conservative, typically 0.1 or 0.2
472 | 'max_depth': 5, # tree depth; deeper trees overfit more easily
473 | 'lambda': 2, # L2 regularization on leaf weights; larger values curb overfitting
474 | 'subsample': 0.8, # row subsampling of training instances
475 | 'colsample_bytree': 0.8, # column subsampling when building each tree
476 | 'min_child_weight': 18,
477 | 'silent': 0, # 1 suppresses run logs; keep 0 to see them
478 | 'eta': 0.03, # learning rate
479 | 'eval_metric': 'auc',
480 | }
481 | module = xgb.train(params, xgb_train, num_boost_round=500, evals=watchlist)
482 | result = module.predict(xgb_test)
483 |
484 | features = module.get_fscore()
485 | features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-20:]
486 | features.reverse()
487 | print(features)
488 |
489 | plot_importance(module)
490 | plt.show()
491 | print("auc: ", roc_auc_score(validation_test_l.values, result))
492 |
493 |
494 | validation()
495 |
496 |
497 | # auc: 0.809462477973
498 | # submit()
499 |
500 |
501 | '''one_hot'''
502 | # testdata = pd.DataFrame({'pet': ['chinese', 'english', 'english', 'math'],
503 | # 'age': [6, 5, 2, 2],
504 | # 'salary': [7, 5, 2, 5]})
505 | # one_hot = OneHotEncoder(sparse=False).fit_transform(testdata[['age']])
506 | # print(one_hot)
--------------------------------------------------------------------------------
/feature_engine.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | import time
4 | from datetime import datetime
5 | import pandas as pd
6 | import numpy as np
7 | import lightgbm as lgb
8 | from sklearn.metrics import roc_auc_score
9 | from sklearn.feature_selection import SelectKBest # feature selection #
10 | from sklearn.feature_selection import chi2
11 | from sklearn.linear_model import LogisticRegression
12 | from sklearn.linear_model import LogisticRegressionCV
13 |
14 | '''Training set'''
15 | train_auth_info = pd.read_csv('../dataset/ai_risk_train/train_auth_info.csv', low_memory=False)
16 | train_bankcard_info = pd.read_csv('../dataset/ai_risk_train/train_bankcard_info.csv', low_memory=False)
17 | train_credit_info = pd.read_csv('../dataset/ai_risk_train/train_credit_info.csv', low_memory=False)
18 | train_order_info = pd.read_csv('../dataset/ai_risk_train/train_order_info.csv', low_memory=False)
19 | train_recieve_addr_info = pd.read_csv('../dataset/ai_risk_train/train_recieve_addr_info.csv', low_memory=False)
20 | train_user_info = pd.read_csv('../dataset/ai_risk_train/train_user_info.csv', low_memory=False)
21 | train_target = pd.read_csv('../dataset/ai_risk_train/train_target.csv', low_memory=False)
22 |
23 | # '''Test set'''
24 | # test_auth_info = pd.read_csv('../dataset/ai_risk_test/test_auth_info.csv', low_memory=False)
25 | # test_bankcard_info = pd.read_csv('../dataset/ai_risk_test/test_bankcard_info.csv', low_memory=False)
26 | # test_credit_info = pd.read_csv('../dataset/ai_risk_test/test_credit_info.csv', low_memory=False)
27 | # test_order_info = pd.read_csv('../dataset/ai_risk_test/test_order_info.csv', low_memory=False)
28 | # test_recieve_addr_info = pd.read_csv('../dataset/ai_risk_test/test_recieve_addr_info.csv', low_memory=False)
29 | # test_user_info = pd.read_csv('../dataset/ai_risk_test/test_user_info.csv', low_memory=False)
30 | # test_list = pd.read_csv('../dataset/ai_risk_test/test_list.csv', low_memory=False)
31 |
32 | test_auth_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_auth_info.csv', low_memory=False)
33 | test_bankcard_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_bankcard_info.csv', low_memory=False)
34 | test_credit_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_credit_info.csv', low_memory=False)
35 | test_order_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_order_info.csv', low_memory=False)
36 | test_recieve_addr_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_recieve_addr_info.csv', low_memory=False)
37 | test_user_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_user_info.csv', low_memory=False)
38 | test_list = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_list.csv', low_memory=False)
39 |
40 | # print(test_auth_info)
41 | # print(test_bankcard_info)
42 | # print(test_credit_info)
43 | # print(test_order_info)
44 | # print(test_recieve_addr_info)
45 | # exit(0)
46 |
47 | def cal_auc(list_one, list_two):
48 | '''Compute AUC by pairwise comparison'''
49 | positive = []
50 | negative = []
51 | for index in range(len(list_one)):
52 | if list_one[index] == 1:
53 | positive.append(index)
54 | else:
55 | negative.append(index)
56 | SUM = 0
57 | for i in positive:
58 | for j in negative:
59 | if list_two[i] > list_two[j]:
60 | SUM += 1
61 | elif list_two[i] == list_two[j]:
62 | SUM += 0.5
63 | else:
64 | pass
65 | return SUM / (len(positive)*len(negative))
66 |
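    | # cal_auc implements the pairwise (Mann-Whitney) definition of AUC: each
    | # positive/negative pair scores 1 if the positive ranks higher, 0.5 on ties.
    | # A hedged toy check against sklearn (labels/scores below are made up):
    | # y, p = [1, 0, 1, 0], [0.9, 0.1, 0.4, 0.35]
    | # assert abs(cal_auc(y, p) - roc_auc_score(y, p)) < 1e-12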
67 |
68 | def return_set(group):
69 | return set(group)
70 |
71 |
72 | def extract_credit_info(credit_info):
73 | '''Extract features from the credit_info table'''
74 | credit_info['credit_score'] = credit_info['credit_score'].fillna(credit_info['credit_score'].mean())
75 | credit_info['quota_is_zero'] = [1 if i != 0.0 else 0 for i in credit_info.quota] # has a credit quota; note this column is redefined (inverted) below #
76 | credit_info['overdraft'] = credit_info['overdraft'].fillna(0)
77 | credit_info['quota'] = credit_info['quota'].fillna(0)
78 | credit_info['quota_surplus'] = credit_info['quota'] - credit_info['overdraft']
79 | # credit_info['quota_rate'] = (credit_info['overdraft'] / credit_info['quota']).fillna(0)
80 | credit_info['quota_rate'] = credit_info[['overdraft', 'quota']].apply(lambda x: x.overdraft / x.quota if x.quota != 0 else 0, axis=1)
81 | credit_info['credit_score_rank'] = credit_info['credit_score'].rank(method='first', ascending=False)
82 |
83 | credit_info.loc[:, 'all_is_null'] = credit_info[['credit_score', 'overdraft', 'quota']].apply(lambda x: 1 if ((x.credit_score is not np.nan) and (x.overdraft is not np.nan) and (x.quota is not np.nan)) else 0, axis=1)  # 1 = all three fields present; after the fillna calls above this is always 1 #
84 | credit_info.loc[:, 'all_is_zero'] = credit_info[['credit_score', 'overdraft', 'quota']].apply(lambda x: 1 if ((x.credit_score == 0) and (x.overdraft == 0) and (x.quota == 0)) else 0, axis=1)
85 | credit_info.loc[:, 'quota_is_zero'] = credit_info[['quota']].apply(lambda x: 1 if x.quota == 0 else 0, axis=1)
86 | credit_info.loc[:, 'credit_score_is_null'] = credit_info[['credit_score']].apply(lambda x: 1 if x.credit_score == 0 else 0, axis=1)
87 | credit_info.loc[:, 'quota_surplus_is_null'] = credit_info[['quota_surplus', 'quota']].apply(lambda x: 1 if (x.quota_surplus == 0) and (x.quota != 0) else 0, axis=1)
88 |
89 | '''Min-max normalization'''
90 | credit_info[['credit_score', 'overdraft', 'quota', 'quota_surplus', 'credit_score_rank']] = credit_info[['credit_score', 'overdraft', 'quota', 'quota_surplus', 'credit_score_rank']].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
91 | return credit_info
92 |
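   | # Min-max scaling above: x' = (x - min) / (max - min), mapping each column to
   | # [0, 1]. A caveat (an assumption about the data): a constant column yields a
   | # zero denominator and NaN, which a downstream model would have to tolerate.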
93 | # print(extract_credit_info(train_credit_info))
94 | # print(extract_credit_info(test_credit_info))
95 |
96 |
97 | def extract_user_info(user_info):
98 | '''Extract features from the user_info table'''
99 | feature = user_info[['id']].copy()  # copy to avoid SettingWithCopy issues when adding columns #
100 | feature.loc[:, 'birthday_is_zero'] = user_info[['birthday']].apply(lambda x: 1 if x.birthday == '0000-00-00' else 0, axis=1)
101 | feature.loc[:, 'sex_not_male'] = user_info[['sex']].apply(lambda x: 1 if x.sex != '男' else 0, axis=1)
102 | feature.loc[:, 'female'] = user_info[['sex']].apply(lambda x: 1 if x.sex == '女' else 0, axis=1)  # '女' = female #
103 | feature.loc[:, 'male'] = user_info[['sex']].apply(lambda x: 1 if x.sex == '男' else 0, axis=1)  # '男' = male; the original conditions were swapped relative to the column names #
104 | feature.loc[:, 'sex_secret'] = user_info[['sex']].apply(lambda x: 1 if x.sex == '保密' else 0, axis=1) # 0.69504936432
105 | ##
106 | feature.loc[:, 'merriage1'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage == '未婚' else 0, axis=1)
107 | feature.loc[:, 'merriage2'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage == '已婚' else 0, axis=1)
108 | feature.loc[:, 'merriage3'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage == '保密' else 0, axis=1)
109 | feature.loc[:, 'merriage_is_null'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage is np.nan else 0, axis=1) # 0.700624700466
110 | ####
111 | feature.loc[:, 'account_grade1'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '注册会员' else 0, axis=1)
112 | feature.loc[:, 'account_grade2'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '铜牌会员' else 0, axis=1)
113 | feature.loc[:, 'account_grade3'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '银牌会员' else 0, axis=1)
114 | feature.loc[:, 'account_grade4'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '金牌会员' else 0, axis=1)
115 | feature.loc[:, 'account_grade5'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '钻石会员' else 0, axis=1)
116 | feature.loc[:, 'account_grade_is_null'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade is np.nan else 0, axis=1)
117 | ###
118 | feature.loc[:, 'qq_bound_is_null'] = user_info[['qq_bound']].apply(lambda x: 1 if x.qq_bound is np.nan else 0, axis=1)
119 | feature.loc[:, 'wechat_bound_is_null'] = user_info[['wechat_bound']].apply(lambda x: 1 if x.wechat_bound is np.nan else 0, axis=1)
120 | feature.loc[:, 'degree'] = user_info[['degree']].apply(lambda x: 1 if (x.degree == '硕士') | (x.degree == '其他') | (x.degree == '博士') else 0, axis=1)
121 | feature.loc[:, 'id_card_is_null'] = user_info[['id_card']].apply(lambda x: 1 if x.id_card is np.nan else 0, axis=1)
122 | #####
123 | feature.loc[:, 'income1'] = [1 if index == '4000-5999元' else 0 for index in user_info['income']]
124 | feature.loc[:, 'income2'] = [1 if index == '8000元以上' else 0 for index in user_info['income']]
125 | feature.loc[:, 'income3'] = [1 if index == '2000-3999元' else 0 for index in user_info['income']]
126 | feature.loc[:, 'income4'] = [1 if index == '6000-7999元' else 0 for index in user_info['income']]
127 | feature.loc[:, 'income5'] = [1 if index == '2000元以下' else 0 for index in user_info['income']] # 0.775891365882 #
128 |
129 | '''Age features'''
130 | def is_valid_date(strdate):
131 | '''Check whether the string is a valid date'''
132 | try:
133 | if ":" in strdate:
134 | time.strptime(strdate, "%Y-%m-%d %H:%M:%S")
135 | else:
136 | time.strptime(strdate, "%Y-%m-%d")
137 | return True
138 | except (ValueError, TypeError):  # strptime raises ValueError; non-string input raises TypeError #
139 | return False
140 |
141 | ####
142 | user_info['birthday_two'] = user_info[['birthday']].apply(lambda index: is_valid_date(index.birthday), axis=1)
143 | user_info['birthday'] = user_info[['birthday']].apply(lambda index: 0 if (index.birthday is np.nan) or (index.birthday == '0000-00-00') else index.birthday[0:4], axis=1)
144 | user_info['age'] = user_info[['birthday', 'birthday_two']].apply(lambda x: 2018 - int(x.birthday) if x.birthday_two is True else 0, axis=1)
145 | # print(user_info[['birthday_two', 'age']])
146 | feature.loc[:, 'age_one'] = user_info[['age']].apply(lambda x: 1 if x.age <= 18 and x.age > 0 else 0, axis=1)
147 | feature.loc[:, 'age_two'] = user_info[['age']].apply(lambda x: 1 if x.age <= 30 and x.age > 18 else 0, axis=1)
148 | feature.loc[:, 'age_three'] = user_info[['age']].apply(lambda x: 1 if x.age <= 60 and x.age > 30 else 0, axis=1)
149 | feature.loc[:, 'age_four'] = user_info[['age']].apply(lambda x: 1 if x.age <= 100 and x.age > 60 else 0, axis=1)
150 | feature.loc[:, 'age_five'] = user_info[['age']].apply(lambda x: 1 if x.age > 100 or x.age == 0 else 0, axis=1)  # catch-all for invalid ages; the original 'and' condition could never be true #
151 |
152 | return feature
153 |
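   | # The age_one..age_five indicators hand-roll an age bucketing; a pd.cut sketch
   | # of the same idea (hypothetical, not executed here):
   | # buckets = pd.cut(user_info['age'], bins=[0, 18, 30, 60, 100],
   | #                  labels=['age_one', 'age_two', 'age_three', 'age_four'])
   | # feature = pd.concat([feature, pd.get_dummies(buckets)], axis=1)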
154 | # print(extract_user_info(train_user_info))
155 | # print(extract_user_info(test_user_info))
156 |
157 |
158 | def extract_recieve_addr_info(recieve_addr_info):
159 | '''Extract features from the recieve_addr_info table'''
160 | recieve_addr_info['all_null'] = recieve_addr_info[['addr_id', 'region', 'phone', 'fix_phone', 'receiver_md5']].apply(lambda x: 1 if (x.addr_id is np.nan) and (x.region is np.nan) and (x.phone is np.nan) and (x.fix_phone is np.nan) and (x.receiver_md5 is np.nan) else 0, axis=1)  # the original mixed 'and' with '|' in this chain #
161 | feature = recieve_addr_info.drop_duplicates(['id'])[['id']]
162 | recieve_addr_info['index'] = recieve_addr_info.index
163 | all_is_null = pd.pivot_table(recieve_addr_info, index='id', values='all_null', aggfunc='min').reset_index()
164 | addr_id = pd.pivot_table(recieve_addr_info, index='id', values='index', aggfunc='count').reset_index().rename(columns={'index': 'record_count'})
165 | feature = feature.merge(all_is_null, on='id', how='left')
166 | feature = feature.merge(addr_id, on='id', how='left')
167 | province = {'甘肃', '云南', '贵州', '河南', '黑龙', '香港', '北京', '湖南', '江苏', '青海', '宁夏', '内蒙', '浙江', '吉林', '海南', '福建', '重庆', '台湾', '陕西', '湖北', '江西', '辽宁', '山西', '西藏', '广东', '安徽', '四川', '河北', '山东', '上海', '广西', '新疆', '天津', 'null'}
168 |
169 | recieve_addr_info['province'] = recieve_addr_info[['region']].apply(lambda x: 'null' if x.region is np.nan else x.region[0:2], axis=1)  # was hard-coded to train_recieve_addr_info, so the function ignored its argument #
170 | city_set = pd.pivot_table(recieve_addr_info, index='id', values='province', aggfunc=return_set).reset_index()
171 | for string in list(province):
172 | city_set[string] = [1 if string in index else 0 for index in city_set['province']]
173 | city_set['province'] = city_set[['province']].apply(lambda x: x.province.clear() if 'null' in x.province else x.province, axis=1)
174 | city_set['province_len'] = [0 if index is None else len(index) for index in city_set['province']]
175 |
176 | feature = feature.merge(city_set.drop(['province'], axis=1), on='id', how='left')
177 | # print(feature)
178 | return feature
179 |
180 | # extract_recieve_addr_info(train_recieve_addr_info)
181 | # print(extract_recieve_addr_info(train_recieve_addr_info))
182 |
183 |
184 | def extract_bankcard_info(bankcard_info):
185 | '''Extract features from the bankcard_info table'''
186 |
187 | def cal_store_card_num(group):
188 | flag = 0
189 | for index in group:
190 | if index == '储蓄卡':
191 | flag += 1
192 | return flag
193 |
194 | def if_have_credit_card(group):
195 | '''return 1 if any card in the group is a credit card; the original returned 0 on the first non-credit-card row'''
196 | for index in group:
197 | if index == '信用卡':
198 | return 1
199 | return 0
200 | 
201 |
202 | def list_set(group):
203 | return len(set(group))
204 |
205 | bankcard_info = bankcard_info.drop_duplicates()
206 | feature = bankcard_info.drop_duplicates(['id'])[['id']]
207 | card_record_count = pd.pivot_table(bankcard_info, index='id', values='phone', aggfunc='count').reset_index().rename(columns={'phone': 'card_record_count'})
208 | phone_count = pd.pivot_table(bankcard_info, index='id', values='phone', aggfunc=list_set).reset_index().rename(columns={'phone': 'phone_count'})
209 | store_card_count = pd.pivot_table(bankcard_info, index='id', values='card_type', aggfunc=cal_store_card_num).reset_index().rename(columns={'card_type': 'store_card_count'})
210 | have_credit_card = pd.pivot_table(bankcard_info, index='id', values='card_type', aggfunc=if_have_credit_card).reset_index().rename(columns={'card_type': 'have_credit_card'})
211 | card_category_count = pd.pivot_table(bankcard_info, index='id', values='card_type', aggfunc=list_set).reset_index().rename(columns={'card_type': 'card_category_count'})
212 |
213 | feature = feature.merge(phone_count, on='id', how='left')
214 | feature = feature.merge(card_record_count, on='id', how='left')
215 | feature = feature.merge(store_card_count, on='id', how='left')
216 | feature = feature.merge(have_credit_card, on='id', how='left')
217 | feature = feature.merge(card_category_count, on='id', how='left')
218 | feature['credit_count'] = feature['card_record_count'] - feature['store_card_count']
219 | feature['card_count_one'] = feature[['card_record_count']].apply(lambda x: 1 if x.card_record_count > 6 else 0, axis=1)  # more than 6 card records #
220 | feature['record_is_unique'] = feature[['card_record_count']].apply(lambda x: 1 if x.card_record_count == 1 else 0, axis=1)
221 | # print(feature)
222 |
223 | return feature
224 |
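   | # Each pivot_table with a custom aggfunc above is a groupby-agg in disguise;
   | # e.g. store_card_count could be written as (sketch, unused by the script):
   | # store = (bankcard_info.groupby('id')['card_type']
   | #          .agg(lambda s: (s == '储蓄卡').sum())
   | #          .reset_index(name='store_card_count'))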
225 | # extract_bankcard_info(train_bankcard_info)
226 | # print(extract_bankcard_info(test_bankcard_info))
227 |
228 |
229 | def extract_auth_info(auth_info):
230 | '''Extract features from the auth_info table'''
231 | feature = auth_info[['id']].copy()  # copy to avoid SettingWithCopy issues when adding columns #
232 | feature.loc[:, 'auth_id_card_is_null'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card is not np.nan else 0, axis=1)  # 1 = field present, despite the *_is_null names on these three flags #
233 | feature.loc[:, 'auth_time_is_null'] = auth_info[['auth_time']].apply(lambda x: 1 if x.auth_time is not np.nan else 0, axis=1)
234 | feature.loc[:, 'phone_is_null'] = auth_info[['phone']].apply(lambda x: 1 if x.phone is not np.nan else 0, axis=1)
235 | feature.loc[:, 'all_is_null'] = auth_info[['id_card', 'auth_time', 'phone']].apply(lambda x: 1 if ((x.id_card is np.nan) and (x.auth_time is np.nan) and (x.phone is np.nan)) else 0, axis=1)
236 | feature.loc[:, 'all_not_null'] = auth_info[['id_card', 'auth_time', 'phone']].apply(lambda x: 1 if ((x.id_card is not np.nan) and (x.auth_time is not np.nan) and (x.phone is not np.nan)) else 0, axis=1)
237 | feature.loc[:, 'card_time_is_null'] = auth_info[['id_card', 'auth_time']].apply(lambda x: 1 if ((x.id_card is np.nan) and (x.auth_time is np.nan)) else 0, axis=1)
238 | feature.loc[:, 'time_phone_is_null'] = auth_info[['auth_time', 'phone']].apply(lambda x: 1 if ((x.phone is np.nan) and (x.auth_time is np.nan)) else 0, axis=1)
239 | # '''Mobile carrier'''
240 | # auth_info['id_card'] = [int(index[0]) if index is not np.nan else -1 for index in auth_info['id_card']]
241 | # auth_info['phone'] = [int(index[:3]) if index is not np.nan else -1 for index in auth_info['phone']]
242 | # mobile = {134, 135, 136, 137, 138, 139, 150, 151, 152, 157, 158, 159, 182, 183, 184, 187, 188, 147, 178}
243 | # unicom = {130, 131, 132, 155, 156, 185, 186, 145, 176}
244 | # telecom = {180, 181, 189, 133, 153, 177}
245 | # virtual = {170}
246 | # feature.loc[:, 'mobile'] = auth_info[['phone']].apply(lambda x: 1 if x.phone in mobile else 0, axis=1)
247 | # feature.loc[:, 'unicom'] = auth_info[['phone']].apply(lambda x: 1 if x.phone in unicom else 0, axis=1)
248 | # feature.loc[:, 'telecom'] = auth_info[['phone']].apply(lambda x: 1 if x.phone in telecom else 0, axis=1)
249 | # feature.loc[:, 'virtual'] = auth_info[['phone']].apply(lambda x: 1 if x.phone in virtual else 0, axis=1)
250 | # # 'mobile', 'unicom', 'telecom', 'virtual'
251 | # # 'id_card_one', 'id_card_two', 'id_card_three', 'id_card_four', 'id_card_five', 'id_card_six'
252 | # feature.loc[:, 'id_card_one'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 1 else 0, axis=1)
253 | # feature.loc[:, 'id_card_two'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 2 else 0, axis=1)
254 | # feature.loc[:, 'id_card_three'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 3 else 0, axis=1)
255 | # feature.loc[:, 'id_card_four'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 4 else 0, axis=1)
256 | # feature.loc[:, 'id_card_five'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 5 else 0, axis=1)
257 | # feature.loc[:, 'id_card_six'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 6 else 0, axis=1)
258 | # print(feature)
259 | return feature
260 |
261 | # extract_auth_info(train_auth_info)
262 | # print(extract_auth_info(test_auth_info))
263 |
264 |
265 | def extract_order_info(order_info):
266 | '''Extract features from the order_info table'''
267 | def cal_set(group):
268 | return len(set(group))
269 |
270 | '''Standard deviation'''
271 | def cal_std(group):
272 | return np.std(group)
273 |
274 | feature = order_info.drop_duplicates(['id'])[['id']]
275 | # amt_order, type_pay, time_order, sts_order, phone, unit_price, no_order_md5, name_rec_md5, product_id_md5
276 | order_info['order_all_is_null'] = order_info.apply(lambda x: 1 if ((x.amt_order is np.nan) and (x.type_pay is np.nan) and (x.time_order is np.nan) and (x.sts_order is np.nan)) else 0, axis=1)
277 | order_all_is_null = pd.pivot_table(order_info[['id', 'order_all_is_null']], index='id', values='order_all_is_null', aggfunc='max').reset_index()
278 |
279 | '''Fill amt_order with the mean of the parseable values'''
280 | order_info_amt = order_info[['amt_order']]
281 | order_info_amt = order_info_amt[order_info_amt['amt_order'].notnull()]
282 | order_info_amt = order_info_amt[order_info_amt['amt_order'] != 'null']
283 | order_info_amt['amt_order'] = [float(index) for index in order_info_amt['amt_order']]
284 | mean = order_info_amt['amt_order'].mean()
285 | order_info['amt_order'] = order_info['amt_order'].fillna(mean)
286 | order_info['amt_order'] = [mean if index == 'null' else index for index in order_info['amt_order']]
287 | order_info['amt_order'] = [float(index) for index in order_info['amt_order']]
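   | # amt_order mixes true NaN with the literal string 'null'; both are replaced by
   | # the mean of the parseable values before the float cast, so the column is fully
   | # numeric from this point on.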
288 |
289 | order_info['unit_price'] = order_info[['amt_order', 'unit_price']].apply(lambda x: x.amt_order if np.isnan(x.unit_price) else x.unit_price, axis=1)
290 | unit_price_mean = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc='mean').reset_index().rename(columns={'unit_price': 'unit_price_mean'})
291 | unit_price_max = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc='max').reset_index().rename(columns={'unit_price': 'unit_price_max'})
292 | unit_price_min = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc='min').reset_index().rename(columns={'unit_price': 'unit_price_min'})
293 | unit_price_std = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc=cal_std).reset_index().rename(columns={'unit_price': 'unit_price_std'})
294 |
295 | amt_order_mean = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc='mean').reset_index().rename(columns={'amt_order': 'amt_order_mean'})
296 | amt_order_max = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc='max').reset_index().rename(columns={'amt_order': 'amt_order_max'})
297 | amt_order_min = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc='min').reset_index().rename(columns={'amt_order': 'amt_order_min'})
298 | amt_order_std = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc=cal_std).reset_index().rename(columns={'amt_order': 'amt_order_std'})
299 | type_pay_count = pd.pivot_table(order_info[['id', 'type_pay']], index='id', values='type_pay', aggfunc=cal_set).reset_index().rename(columns={'type_pay': 'type_pay_count'})
300 | sts_order_count = pd.pivot_table(order_info[['id', 'sts_order']], index='id', values='sts_order', aggfunc=cal_set).reset_index().rename(columns={'sts_order': 'sts_order_count'})
301 | order_phone_count = pd.pivot_table(order_info[['id', 'phone']], index='id', values='phone', aggfunc=cal_set).reset_index().rename(columns={'phone': 'order_phone_count'})
302 | name_rec_md5_count = pd.pivot_table(order_info[['id', 'name_rec_md5']], index='id', values='name_rec_md5', aggfunc=cal_set).reset_index().rename(columns={'name_rec_md5': 'name_rec_md5_count'})
303 |
304 | feature = feature.merge(unit_price_mean, on='id', how='left')
305 | feature = feature.merge(unit_price_max, on='id', how='left')
306 | feature = feature.merge(unit_price_min, on='id', how='left')
307 | feature = feature.merge(unit_price_std, on='id', how='left')
308 |
309 | feature = feature.merge(order_all_is_null, on='id', how='left')
310 | feature = feature.merge(amt_order_mean, on='id', how='left')
311 | feature = feature.merge(amt_order_max, on='id', how='left')
312 | feature = feature.merge(amt_order_min, on='id', how='left')
313 | feature = feature.merge(amt_order_std, on='id', how='left')
314 | feature = feature.merge(type_pay_count, on='id', how='left')
315 | feature = feature.merge(sts_order_count, on='id', how='left')
316 | feature = feature.merge(order_phone_count, on='id', how='left')
317 | feature = feature.merge(name_rec_md5_count, on='id', how='left')
318 | '''Min-max normalization'''
319 | feature.iloc[:, 1:] = feature.iloc[:, 1:].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))) # 0.791859501859 #
320 | '''Set-valued one-hot features'''
321 | order_info['type_pay'] = order_info[['type_pay']].apply(lambda x: 'null' if x.type_pay is np.nan else x.type_pay, axis=1)
322 | type_pay = pd.pivot_table(order_info, index='id', values='type_pay', aggfunc=return_set).reset_index()
323 | # type_pay_category = {'定向京券支付', '白条支付', '分期付款', '积分支付', '在线+限品东券', '定向东券', '东券混合支付', '余额', '京豆东券混合支付', '前台自付', '在线', '在线+东券支付', '上门自提', '公司转账', '在线支付', '在线支付 ', '在线+京豆', '邮局汇款', '货到付款',
324 | # '在线+全品东券', 'null', '京豆支付', '在线预付', '定向京券', '混合支付', '京豆', '在线+定向东券', '京豆混合支付', '在线+东券'}
325 |
326 | type_pay_category = {'定向京券支付', '白条支付', '在线+余额+限品东券', '高校代理-代理支付', '京券全额支付', '分期付款', '积分支付', '在线+限品东券', '定向东券', '东券混合支付', '余额', '京豆东券混合支付', '前台自付', '在线', '在线+东券支付', '上门自提', '公司转账', '在线支付', '在线支付 ', '在线+京豆', '邮局汇款', '在线+全品京券', '货到付款', '分期付款(招行)', '在线+全品东券', '余额+限品东券', '在线+京券支付', '在线+余额', '限品京券', 'null', '京豆支付', '在线预付', '定向京券', '混合支付', '全品京券', '京豆', '在线+定向东券', '京豆混合支付', '在线+限品京券', '高校代理-自己支付', '京券混合支付', '在线+东券'}
327 |
328 | for string in list(type_pay_category):
329 | type_pay[string] = [1 if string in index else 0 for index in type_pay['type_pay']]
330 |
331 | type_pay['type_pay'] = type_pay[['type_pay']].apply(lambda x: x.type_pay.clear() if 'null' in x.type_pay else x.type_pay, axis=1)
332 | type_pay['type_pay_len'] = [0 if index is None else len(index) for index in type_pay['type_pay']]
333 | feature = feature.merge(type_pay.drop(['type_pay'], axis=1), on='id', how='left')
334 |
335 | '''One-hot encode sts_order'''
336 | order_info['sts_order'] = order_info[['sts_order']].apply(lambda x: 'null' if x.sts_order is np.nan else x.sts_order, axis=1)
337 | # sts_order_category = set(train_order_info['sts_order'])
338 | sts_order = pd.pivot_table(order_info, index='id', values='sts_order', aggfunc=return_set).reset_index()
339 | sts_order_category = {'null', '等待审核', '等待处理', '已退款', '已收货', '购买成功', '付款成功', '失败退款', '已完成', '预订结束', '退款完成', '正在出库', '订单已取消', '充值成功', '商品出库', '下单失败', '请上门自提', '已晒单', '充值失败;退款成功',
340 | '退款成功', '未入住', '等待收货', '配送退货', '出票失败', '等待付款确认', '缴费成功', '预约完成', '未抢中', '完成', '已取消', '出票成功', '抢票已取消', '等待付款', '已取消订单', '正在处理', '等待退款', '充值失败', '订单取消'}
341 |
342 | for string in list(sts_order_category):
343 | sts_order[string] = [1 if string in index else 0 for index in sts_order['sts_order']]
344 |
345 | sts_order['sts_order'] = sts_order[['sts_order']].apply(lambda x: x.sts_order.clear() if 'null' in x.sts_order else x.sts_order, axis=1)
346 | sts_order['sts_order_len'] = [0 if index is None else len(index) for index in sts_order['sts_order']]
347 | # print(sts_order)
348 | feature = feature.merge(sts_order.drop(['sts_order'], axis=1), on='id', how='left')
349 |
350 | # print(feature)
351 | return feature
352 |
353 | # extract_order_info(train_order_info)
354 | # print(extract_order_info(test_order_info))
355 |
356 |
357 | def extract_time_feature(auth_info, target_list):
358 | '''Extract time-related features'''
359 | feature = target_list[['id']].copy()  # copy to avoid SettingWithCopy issues when adding columns #
360 | target_list = target_list[['id', 'appl_sbm_tm']].merge(auth_info[['id', 'auth_time']], on='id', how='left')
361 | target_list.loc[:, 'appl_sbm_tm'] = [index.split(' ')[0] for index in target_list['appl_sbm_tm']]
362 | target_list['auth_time'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm if x.auth_time == '0000-00-00' else x.auth_time, axis=1)
363 | target_list['auth_time'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm if x.auth_time is np.nan else x.auth_time, axis=1)
364 | feature['feature_1'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: 1 if x.appl_sbm_tm < x.auth_time else 0, axis=1)
365 | feature['register_days'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: (datetime(int(x.appl_sbm_tm.split('-')[0]), int(x.appl_sbm_tm.split('-')[1]), int(x.appl_sbm_tm.split('-')[2])) - datetime(int(x.auth_time.split('-')[0]), int(x.auth_time.split('-')[1]), int(x.auth_time.split('-')[2]))).days, axis=1)
366 | # print(target_list)
367 | # print(feature)
368 | return feature
369 |
370 | # extract_time_feature(train_auth_info, train_target)
371 | # print(extract_time_feature(test_auth_info, test_list))
372 |
373 | def extract_order_payment_time(order_info, target_list):
374 | str_len = len('2016-01-19 22:38:26')
375 | feature = target_list[['id']].copy()  # copy to avoid SettingWithCopy issues when adding columns #
376 | target_list = target_list[['id', 'appl_sbm_tm']].merge(order_info[['id', 'time_order']], on='id', how='left')
377 | target_list.loc[:, 'appl_sbm_tm'] = [index.split(' ')[0] for index in target_list['appl_sbm_tm']]
378 | target_list['time_order'] = target_list[['appl_sbm_tm', 'time_order']].apply(lambda x: x.appl_sbm_tm if x.time_order is np.nan else x.time_order, axis=1)
379 | target_list['time_order'] = target_list[['appl_sbm_tm', 'time_order']].apply(lambda x: x.appl_sbm_tm if len(x.time_order) != str_len else x.time_order, axis=1)
380 | target_list.loc[:, 'time_order'] = [index.split(' ')[0] for index in target_list['time_order']]
381 | target_list['days'] = target_list[['appl_sbm_tm', 'time_order']].apply(lambda x: (datetime(int(x.appl_sbm_tm.split('-')[0]), int(x.appl_sbm_tm.split('-')[1]), int(x.appl_sbm_tm.split('-')[2])) - datetime(int(x.time_order.split('-')[0]), int(x.time_order.split('-')[1]), int(x.time_order.split('-')[2]))).days, axis=1)
382 | # print(target_list)
383 | day_mean = pd.pivot_table(target_list, index='id', values='days', aggfunc='mean').reset_index().rename(columns={'days': 'day_mean'})
384 | day_max = pd.pivot_table(target_list, index='id', values='days', aggfunc='max').reset_index().rename(columns={'days': 'day_max'})
385 | day_min = pd.pivot_table(target_list, index='id', values='days', aggfunc='min').reset_index().rename(columns={'days': 'day_min'})
386 | order_record_count = pd.pivot_table(target_list, index='id', values='days', aggfunc='count').reset_index().rename(columns={'days': 'order_record_count'})
387 | feature = feature.merge(day_mean, on='id', how='left')
388 | feature = feature.merge(day_max, on='id', how='left')
389 | feature = feature.merge(day_min, on='id', how='left')
390 | feature = feature.merge(order_record_count, on='id', how='left') # record count #
391 | feature.loc[:, 'order_record_unique'] = [1 if index == 1 else 0 for index in feature['order_record_count']] # whether there is exactly one order record #
392 | # print(feature)
393 | return feature
394 |
395 | # extract_order_payment_time(train_order_info, train_target)  # debug call, commented out like the other extract_* checks above #
396 | # print(extract_order_payment_time(test_order_info, test_list))
397 |
398 | '''Logistic Regression'''
399 | def train_LR_module(store_result=False, store_feature=False, select_feature=False, feature_num='all', OneEncode=False):
400 | train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8')
401 | validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8')
402 | test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8')
403 | train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
404 | print('Data loaded...')
405 |
406 | validate_label = validate_feature[['target']]
407 | train_label = train_feature[['target']]
408 | train_test_label = train_test_feature[['target']]
409 |
410 | train_feature = train_feature.iloc[:, 2:]
411 | test_feature = test_feature.iloc[:, 1:]
412 | validate_feature = validate_feature.iloc[:, 2:]
413 | train_test_feature = train_test_feature.iloc[:, 2:]
414 |
415 | if OneEncode is True:
416 | features = list(train_feature.columns)
417 | one_hot = []
418 | continuous_feature = []
419 | for name in features:
420 | if len(set(train_feature[name])) == 2:
421 | one_hot.append(name)
422 | else:
423 | continuous_feature.append(name)
424 |
425 | feature = one_hot[:140] + continuous_feature
426 | train_feature = train_feature[feature]
427 | validate_feature = validate_feature[feature]
428 | test_feature = test_feature[feature]
429 | train_test_feature = train_test_feature[feature]
430 |
431 | if select_feature is True:
432 | print('Starting feature selection...')
433 | ch2 = SelectKBest(chi2, k=feature_num)
434 | train_feature = ch2.fit_transform(train_feature, train_label)
435 | test_feature = ch2.transform(test_feature)
436 | validate_feature = ch2.transform(validate_feature)
437 | train_test_feature = ch2.transform(train_test_feature)
438 | print('Feature selection done...')
439 | else:
440 | feature_num = train_feature.shape[1]
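   | # A caveat on the chi2 branch above: sklearn's chi2 scorer requires a
   | # non-negative feature matrix, so select_feature=True implicitly assumes every
   | # input column (indicators, counts, min-max scaled values) is >= 0.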
441 |
442 | print('Training the LogisticRegression model...')
443 | module = LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42, n_jobs=4) # , solver='sag'
444 | # module = lgb.LGBMClassifier(
445 | # num_leaves=64, # num_leaves = 2^max_depth * 0.6 #
446 | # max_depth=6,
447 | # n_estimators=80,
448 | # learning_rate=0.1
449 | # )
450 | '''Fit on the training split'''
451 | module.fit(train_feature, train_label)
452 |
453 | if store_result is True:
454 | '''Refit on the full training set, then predict the test set'''
455 | module_two = LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42, n_jobs=4)
456 | # module_two = lgb.LGBMClassifier(
457 | # num_leaves=64, # num_leaves = 2^max_depth * 0.6 #
458 | # max_depth=6,
459 | # n_estimators=80,
460 | # learning_rate=0.1
461 | # )
462 | module_two.fit(train_test_feature, train_test_label)
463 |
464 | result = module_two.predict_proba(test_feature)[:, 1]
465 | result = pd.DataFrame(result)
466 | result.columns = ['predicted_score']
467 | sample = test_list[['id']].copy()  # copy before adding the prediction column #
468 | sample['predicted_score'] = [index for index in result['predicted_score']]
469 | sample.columns = ['ID', 'PROB']
470 | sample.to_csv(r'lr_sample.csv', index=None)
471 | # sample.to_csv(r'lgb_sample.csv', index=None)
472 | print(sample)
473 | print('Results updated...')
474 |
475 | print(" Score_offline:", roc_auc_score(validate_label, module.predict_proba(validate_feature)[:, 1]))
476 | print('Feature dimension:', feature_num)
477 |
478 |
479 |
480 | # def module_merge(prob_x, prob_l):
481 | # xgb_sample = pd.read_csv(r'xgb_sample.csv', low_memory=False) # encode:159:0.790297834417
482 | # lr_sample = pd.read_csv(r'lr_sample.csv', low_memory=False) # Uncode:0.792171452209
483 | # sample = xgb_sample.merge(lr_sample, on='ID', how='left')
484 | # sample['PROB'] = sample['PROB_x'] * prob_x + sample['PROB_y'] * prob_l
485 | # sample = sample[['ID', 'PROB']]
486 | # print(sample)
487 | # sample.to_csv(r'sample.csv', index=None)
488 | # print('模型已融合。。。')
489 |
490 |
491 |
492 | # def module_merge(prob_xgb, prob_lr, prob_lgb):
493 | # xgb_sample = pd.read_csv(r'xgb_sample.csv', low_memory=False) # encode:159:0.790297834417
494 | # lr_sample = pd.read_csv(r'lr_sample.csv', low_memory=False) # Uncode:0.792171452209
495 | # lgb_sample = pd.read_csv(r'lgb_sample.csv', low_memory=False)
496 | #
497 | # xgb_sample.columns = ['ID', 'PROB_xgb']
498 | # lr_sample.columns = ['ID', 'PROB_lr']
499 | # lgb_sample.columns = ['ID', 'PROB_lgb']
500 | # sample = xgb_sample.merge(lr_sample, on='ID', how='left')
501 | # sample = sample.merge(lgb_sample, on='ID', how='left')
502 | # # print(sample)
503 | # sample['PROB'] = sample['PROB_xgb'] * prob_xgb + sample['PROB_lr'] * prob_lr + sample['PROB_lgb'] * prob_lgb
504 | # sample = sample[['ID', 'PROB']]
505 | # print(sample)
506 | # sample.to_csv(r'sample.csv', index=None)
507 | # print('模型已融合。。。')
508 |
--------------------------------------------------------------------------------