├── 方案说明.pdf ├── README.md ├── feature_selection.py ├── LICENSE ├── construct_module.py ├── XGB1.py └── feature_engine.py /方案说明.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rogeroyer/AI-challenger-contest/HEAD/方案说明.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

MaShang (马上) AI Global Challenge - Default User Risk Prediction: Runner-Up Solution

2 | 
3 | ***
4 | 
5 | ## **Code Usage Guide**
6 | 
7 | ### Runtime environment
8 | 
9 | - Anaconda3 (Python 3.6)
10 | 
11 | - Packages
12 |   - scikit-learn
13 |   - pandas
14 |   - numpy
15 |   - xgboost
16 |   - lightgbm (imported by feature_selection.py)
17 | 
18 | ### Solution 1
19 | - XGB1.py
20 | 
21 | > Writes the submission file result_xgb.csv
22 | `Note: edit the data-directory paths before running`
23 | 
24 | 
25 | ### Solution 2
26 | 
27 | - feature_selection.py
28 | 
29 | > Feature-selection class plus the XGBoost and Logistic Regression training functions
30 | 
31 | 
32 | - feature_engine.py
33 | 
34 | > Feature-extraction functions; edit the data-directory paths before running
35 | 
36 | - construct_module.py
37 | 
38 | > Main program entry point
39 | 
40 | ### Execution steps
41 | 
42 | 1. Run XGB1.py
43 | 
44 | 2. Run construct_module.py
45 | 
46 | 3. The final submission is sample.csv
47 | 
48 | ### Dataset download
49 | - [Baidu Cloud](https://pan.baidu.com/s/13D0hwh_NVwVBh_ydpNzPjw)
50 | - Password: d5h0
51 | ### LICENSE
52 | [Apache LICENSE](https://github.com/rogeroyer/AI-challenger-contest/blob/master/LICENSE)
53 | 
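### Blending note

The final sample.csv produced in step 2 is a linear blend of three intermediate submissions; the weights come from `module_merge_triple` in construct_module.py. A stand-alone sketch of that last step, assuming the three intermediate files already exist with columns `ID, PROB`:

```python
import pandas as pd

# File names follow module_merge_triple in construct_module.py;
# adjust them if your intermediate outputs are named differently.
xgb = pd.read_csv('result_xgb.csv').rename(columns={'PROB': 'PROB_xgb'})
lr = pd.read_csv('lr_sample.csv').rename(columns={'PROB': 'PROB_lr'})
lgb = pd.read_csv('xgb_sample_51.csv').rename(columns={'PROB': 'PROB_lgb'})

# Per-ID weighted average of the three predicted default probabilities.
sample = xgb.merge(lr, on='ID').merge(lgb, on='ID')
sample['PROB'] = 0.4 * sample['PROB_xgb'] + 0.2 * sample['PROB_lr'] + 0.4 * sample['PROB_lgb']
sample[['ID', 'PROB']].to_csv('sample.csv', index=None)
```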
-------------------------------------------------------------------------------- /feature_selection.py: --------------------------------------------------------------------------------
1 | #coding=utf-8
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | import xgboost as xgb
6 | import lightgbm as lgb
7 | from sklearn.metrics import roc_auc_score
8 | from sklearn.linear_model import LogisticRegression
9 | 
10 | '''Univariate feature selection'''
11 | from sklearn.feature_selection import SelectKBest, chi2
12 | '''Drop low-variance features'''
13 | from sklearn.feature_selection import VarianceThreshold
14 | '''Recursive feature elimination'''
15 | from sklearn.svm import SVC
16 | from sklearn.feature_selection import RFE
17 | '''RFE_CV'''
18 | from sklearn.ensemble import ExtraTreesClassifier
19 | 
20 | 
21 | class FeatureSelection(object):
22 |     def __init__(self, feature_num):
23 |         self.feature_num = feature_num
24 |         self.train_test, self.label, self.test = self.read_data()  # features #
25 |         self.feature_name = list(self.train_test.columns)  # feature names #
26 | 
27 |     def read_data(self):
28 |         test = pd.read_csv(r'test_feature.csv', encoding='utf-8')
29 |         train_test = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
30 |         train_test = train_test.drop(['feature_1', 'register_days', 'id_card_one', 'id_card_two', 'id_card_three', 'id_card_four', 'id_card_five', 'id_card_six', 'mobile', 'unicom', 'telecom', 'virtual'], axis=1)
31 |         print('Data loaded...')
32 |         label = train_test[['target']]
33 |         test = test.iloc[:, 1:]
34 |         train_test = train_test.iloc[:, 2:]
35 |         return train_test, label, test
36 | 
37 |     def variance_threshold(self):
38 |         sel = VarianceThreshold()
39 |         sel.fit_transform(self.train_test)
40 |         feature_var = list(sel.variances_)  # per-feature variance #
41 |         features = sorted(zip(self.feature_name, feature_var), key=lambda d: d[1])
42 |         features = [name for name, var in features][-self.feature_num:]  # keep the feature_num highest-variance columns #
43 |         # print(features)  # 100 cols #
44 |         return set(features)  # return set type #
45 | 
46 |     def select_k_best(self):
47 |         ch2 = SelectKBest(chi2, k=self.feature_num)  # chi2 requires non-negative feature values #
48 |         ch2.fit(self.train_test, self.label.values.ravel())
49 |         feature_var = list(ch2.scores_)  # chi2 scores #
50 |         features = sorted(zip(self.feature_name, feature_var), key=lambda d: d[1])
51 |         features = [name for name, score in features][-self.feature_num:]
52 |         # print(features)  # 100 cols #
53 |         return set(features)  # return set type #
54 | 
55 |     def svc_select(self):
56 |         svc = SVC(kernel='linear', C=1, random_state=2018)  # RFE needs coef_, so the kernel must be linear #
57 |         rfe = RFE(estimator=svc, n_features_to_select=self.feature_num, step=1)
58 |         rfe.fit(self.train_test, self.label.values.ravel())
59 |         print(rfe.ranking_)
60 |         return set(np.array(self.feature_name)[rfe.support_])  # return the selected column names so the sets can intersect below #
61 | 
62 |     def tree_select(self):
63 |         clf = ExtraTreesClassifier(n_estimators=300, max_depth=7, n_jobs=4)
64 |         clf.fit(self.train_test, self.label.values.ravel())
65 |         feature_var = list(clf.feature_importances_)  # importance scores #
66 |         features = sorted(zip(self.feature_name, feature_var), key=lambda d: d[1])
67 |         features = [name for name, imp in features][-self.feature_num:]
68 |         # print(features)  # 100 cols #
69 |         return set(features)  # return set type #
70 | 
71 |     def return_feature_set(self, variance_threshold=False, select_k_best=False, svc_select=False, tree_select=False):
72 |         selected = []
73 |         if variance_threshold is True:
74 |             selected.append(self.variance_threshold())
75 |         if select_k_best is True:
76 |             selected.append(self.select_k_best())
77 |         if svc_select is True:
78 |             selected.append(self.svc_select())
79 |         if tree_select is True:
80 |             selected.append(self.tree_select())
81 | 
82 |         # keep only the features that survive every enabled selector
83 |         # (the original intersected against an initially empty set whenever the first selector was off)
84 |         names = set.intersection(*selected) if selected else set()
85 | 
86 |         # print(len(names))
87 |         print(names)
88 |         return list(names)
89 | 
90 | 
91 | # selection = FeatureSelection(100)
92 | # selection.return_feature_set(variance_threshold=True, select_k_best=True, svc_select=False, tree_select=True)
93 | 
94 | 
95 | def train_xgb_module(features_name, store_result=False):
96 |     '''Train the model'''
97 |     train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8')
98 |     validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8')
99 |     test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8')
100 |     train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
101 | 
102 |     print('Data loaded...')
103 | 
104 |     validate_label = validate_feature[['target']]
105 |     train_label = train_feature[['target']]
106 |     train_test_label = train_test_feature[['target']]
107 | 
108 |     train_feature = train_feature[features_name]
109 |     test_feature = test_feature[features_name]
110 |     validate_feature = validate_feature[features_name]
111 |     train_test_feature = train_test_feature[features_name]
112 | 
113 |     print('Training the xgboost model...')
114 |     '''xgboost classifier'''
115 |     num_round = 500  # number of boosting rounds #
116 |     params = {
117 |         'booster': 'gbtree',
118 |         'max_depth': 4,
119 |         'colsample_bytree': 0.8,
120 |         'subsample': 0.8,
121 |         'eta': 0.03,
122 |         'silent': 1,
123 |         'objective': 'binary:logistic',
124 |         'eval_metric': 'auc',
125 |         'min_child_weight': 1,
126 |         'scale_pos_weight': 1,
127 |         'seed': 27,
128 |         'reg_alpha': 0.01
129 |     }
130 |     '''Training set'''
131 |     dtrain = xgb.DMatrix(train_feature, label=train_label)
132 |     validate_feature = xgb.DMatrix(validate_feature)
133 |     module = xgb.train(params, dtrain, num_round)
134 | 
135 |     if store_result is True:
136 |         '''Retrain on the full training set for the test-set prediction'''
137 |         dtrain_two = xgb.DMatrix(train_test_feature, label=train_test_label)
138 |         test_feature = xgb.DMatrix(test_feature)
139 |         module_two = xgb.train(params, dtrain_two, num_round)
140 | 
141 |         features = module_two.get_fscore()
142 |         features = [name for name, score in sorted(features.items(), key=lambda d: d[1])][-20:]
143 |         features.reverse()
144 |         print(features)  # top-20 features by importance #
145 | 
146 |         result = module_two.predict(test_feature)
147 |         result = pd.DataFrame(result)
148 |         result.columns = ['predicted_score']
149 |         test_list = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_list.csv', low_memory=False)
150 |         sample = test_list[['id']]
151 |         sample['predicted_score'] = [index for index in result['predicted_score']]
152 |         sample.columns = ['ID', 'PROB']
153 |         sample.to_csv(r'xgb_sample.csv', index=None)
154 |         print(sample)
155 |         print('Submission updated...')
156 | 
157 |     print(" Score_offline:", roc_auc_score(validate_label, module.predict(validate_feature)))
158 |     print('Feature count:', len(features_name))
159 | 
160 | 
161 | def train_lr_module(features_name, store_result=False):
162 |     train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8')
163 |     validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8')
164 |     test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8')
165 |     train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
166 |     print('Data loaded...')
167 | 
168 |     validate_label = validate_feature[['target']]
169 |     train_label = train_feature[['target']]
170 |     train_test_label = train_test_feature[['target']]
171 | 
172 |     train_feature = train_feature[features_name]
173 |     test_feature = test_feature[features_name]
174 |     validate_feature = validate_feature[features_name]
175 |     train_test_feature = train_test_feature[features_name]
176 | 
177 |     print('Training the LogisticRegression model...')
178 |     module = LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42, n_jobs=4)  # , solver='sag'
179 |     # module = lgb.LGBMClassifier(
180 |     #     num_leaves=64,  # num_leaves = 2^max_depth * 0.6 #
181 |     #     max_depth=6,
182 |     #     n_estimators=80,
183 |     #     learning_rate=0.1
184 |     # )
185 |     '''Training set'''
186 |     module.fit(train_feature, train_label.values.ravel())
187 | 
188 |     if store_result is True:
189 |         '''Retrain on the full training set for the test-set prediction'''
190 |         module_two = LogisticRegression(
191 |             penalty='l2',
192 |             solver='sag',
193 |             max_iter=500,
194 |             random_state=42,
195 |             n_jobs=4
196 |         )
197 | 
198 |         # module_two = lgb.LGBMClassifier(
199 |         #     num_leaves=64,  # num_leaves = 2^max_depth * 0.6 #
200 |         #     max_depth=6,
201 |         #     n_estimators=80,
202 |         #     learning_rate=0.1
203 |         # )
204 |         module_two.fit(train_test_feature, train_test_label.values.ravel())
205 | 
206 |         result = module_two.predict_proba(test_feature)[:, 1]
207 |         result = pd.DataFrame(result)
208 |         result.columns = ['predicted_score']
209 |         test_list = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_list.csv', low_memory=False)
210 |         sample = test_list[['id']]
211 |         sample['predicted_score'] = [index for index in result['predicted_score']]
212 |         sample.columns = ['ID', 'PROB']
213 |         sample.to_csv(r'lr_sample.csv', index=None)
214 |         # sample.to_csv(r'lgb_sample.csv', index=None)
215 |         print(sample)
216 |         print('Submission updated...')
217 | 
218 |     print(" Score_offline:", roc_auc_score(validate_label, module.predict_proba(validate_feature)[:, 1]))
219 |     print('Feature count:', len(features_name))
220 | 
221 | 
222 | 
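For context, the selection class and the two training functions above are driven from construct_module.py; the commented-out sweep there searches over feature-set sizes. A minimal driver along those lines (same calls as the original; the import assumes the script is run from the repository root):

```python
from feature_selection import FeatureSelection, train_xgb_module

# Sweep candidate feature-set sizes and print the offline AUC for each
# (mirrors the commented-out loop in construct_module.py's main()).
for k in range(70, 200, 5):
    print('want to select', k, 'features')
    selection = FeatureSelection(k)
    names = selection.return_feature_set(variance_threshold=True, select_k_best=True, tree_select=True)
    train_xgb_module(names, store_result=False)
```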
Score_offline:", roc_auc_score(validate_label, module.predict(validate_feature))) 158 | print('特征维数:', len(features_name)) 159 | 160 | 161 | def train_lr_module(features_name, store_result=False): 162 | train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8') 163 | validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8') 164 | test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8') 165 | train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8') 166 | print('读取数据完毕。。。') 167 | 168 | validate_label = validate_feature[['target']] 169 | train_label = train_feature[['target']] 170 | train_test_label = train_test_feature[['target']] 171 | 172 | train_feature = train_feature[features_name] 173 | test_feature = test_feature[features_name] 174 | validate_feature = validate_feature[features_name] 175 | train_test_feature = train_test_feature[features_name] 176 | 177 | print('开始训练logisticRegression模型。。。') 178 | module = LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42, n_jobs=4) # , solver='sag' 179 | # module = lgb.LGBMClassifier( 180 | # num_leaves=64, # num_leaves = 2^max_depth * 0.6 # 181 | # max_depth=6, 182 | # n_estimators=80, 183 | # learning_rate=0.1 184 | # ) 185 | '''训练集''' 186 | module.fit(train_feature, train_label) 187 | 188 | if store_result is True: 189 | '''测试训练集''' 190 | module_two = LogisticRegression( 191 | penalty='l2', 192 | solver='sag', 193 | max_iter=500, 194 | random_state=42, 195 | n_jobs=4 196 | ) 197 | 198 | # module_two = lgb.LGBMClassifier( 199 | # num_leaves=64, # num_leaves = 2^max_depth * 0.6 # 200 | # max_depth=6, 201 | # n_estimators=80, 202 | # learning_rate=0.1 203 | # ) 204 | module_two.fit(train_test_feature, train_test_label) 205 | 206 | result = module_two.predict_proba(test_feature)[:, 1] 207 | result = pd.DataFrame(result) 208 | result.columns = ['predicted_score'] 209 | test_list = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_list.csv', low_memory=False) 210 | sample = test_list[['id']] 211 | sample['predicted_score'] = [index for index in result['predicted_score']] 212 | sample.columns = ['ID', 'PROB'] 213 | sample.to_csv(r'lr_sample.csv', index=None) 214 | # sample.to_csv(r'lgb_sample.csv', index=None) 215 | print(sample) 216 | print('结果已更新。。。') 217 | 218 | print(" Score_offline:", roc_auc_score(validate_label, module.predict_proba(validate_feature)[:, 1])) 219 | print('特征维数:', len(features_name)) 220 | 221 | 222 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
-------------------------------------------------------------------------------- /construct_module.py: --------------------------------------------------------------------------------
1 | #coding=utf-8
2 | 
3 | import time
4 | import xgboost as xgb
5 | 
6 | '''Import project modules'''
7 | from version_two.feature_engine import *
8 | from version_two.stacking import *  # note: stacking.py (used only by the commented-out stacking experiment) is not included in this repository
9 | from version_two.feature_selection import *
10 | 
11 | 
12 | '''Split the dataset'''
13 | train_target['date'] = [index.replace('-', '') for index in train_target['appl_sbm_tm']]
14 | train_target['date'] = [index.split(' ')[0][0:6] for index in train_target['date']]
15 | '''Validation set'''
16 | validate_data = train_target[(train_target['date'] == '201704')][['target', 'id']]
17 | '''Training set'''
18 | train_data = train_target[(train_target['date'] >= '201603') & (train_target['date'] <= '201703')][['target', 'id']]
19 | '''Test set'''
20 | test_data = test_list[['id']]
21 | '''Full training set (train + validation)'''
22 | train_test_data = train_target[['target', 'id']]
23 | 
24 | 
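# Illustration of the split above: 'date' is the yyyymm part of appl_sbm_tm, so
#   training window   : 201603 <= date <= 201703
#   validation window : date == 201704 (the month right after training)
# i.e. offline AUC is always measured on applications submitted later than
# anything the model saw during training.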
25 | def extract_feature():
26 |     '''credit_info'''
27 |     train_credit_info_feature = extract_credit_info(train_credit_info)
28 |     train_test_feature = train_test_data.merge(train_credit_info_feature, on='id', how='left')  # full training set #
29 |     train_feature = train_data.merge(train_credit_info_feature, on='id', how='left')
30 |     validate_feature = validate_data.merge(train_credit_info_feature, on='id', how='left')
31 |     test_feature = test_data.merge(extract_credit_info(test_credit_info), on='id', how='left')
32 | 
33 |     '''order_info'''
34 |     train_order_info_feature = extract_order_info(train_order_info)
35 |     train_feature = train_feature.merge(train_order_info_feature, on='id', how='left')
36 |     train_test_feature = train_test_feature.merge(train_order_info_feature, on='id', how='left')  # full training set #
37 |     validate_feature = validate_feature.merge(train_order_info_feature, on='id', how='left')
38 |     test_feature = test_feature.merge(extract_order_info(test_order_info), on='id', how='left')
39 | 
40 |     '''user_info'''
41 |     train_user_info_feature = extract_user_info(train_user_info)
42 |     train_feature = train_feature.merge(train_user_info_feature, on='id', how='left')
43 |     train_test_feature = train_test_feature.merge(train_user_info_feature, on='id', how='left')  # full training set #
44 |     validate_feature = validate_feature.merge(train_user_info_feature, on='id', how='left')
45 |     test_feature = test_feature.merge(extract_user_info(test_user_info), on='id', how='left')
46 | 
47 |     '''recieve_addr_info'''
48 |     train_recieve_addr_info_feature = extract_recieve_addr_info(train_recieve_addr_info)
49 |     train_feature = train_feature.merge(train_recieve_addr_info_feature, on='id', how='left')
50 |     train_test_feature = train_test_feature.merge(train_recieve_addr_info_feature, on='id', how='left')  # full training set #
51 |     validate_feature = validate_feature.merge(train_recieve_addr_info_feature, on='id', how='left')
52 |     test_feature = test_feature.merge(extract_recieve_addr_info(test_recieve_addr_info), on='id', how='left')
53 | 
54 |     '''bankcard_info'''
55 |     train_bankcard_info_feature = extract_bankcard_info(train_bankcard_info)
56 |     train_feature = train_feature.merge(train_bankcard_info_feature, on='id', how='left')
57 |     train_test_feature = train_test_feature.merge(train_bankcard_info_feature, on='id', how='left')  # full training set #
58 |     validate_feature = validate_feature.merge(train_bankcard_info_feature, on='id', how='left')
59 |     test_feature = test_feature.merge(extract_bankcard_info(test_bankcard_info), on='id', how='left')
60 | 
61 |     '''auth_info'''
62 |     train_auth_info_feature = extract_auth_info(train_auth_info)
63 |     train_feature = train_feature.merge(train_auth_info_feature, on='id', how='left').fillna(0)
64 |     train_test_feature = train_test_feature.merge(train_auth_info_feature, on='id', how='left').fillna(0)  # full training set #
65 |     validate_feature = validate_feature.merge(train_auth_info_feature, on='id', how='left').fillna(0)
66 |     test_feature = test_feature.merge(extract_auth_info(test_auth_info), on='id', how='left').fillna(0)
67 | 
68 |     '''time relative features one'''
69 |     train_time_feature = extract_time_feature(train_auth_info, train_target)
70 |     train_feature = train_feature.merge(train_time_feature, on='id', how='left').fillna(0)
71 |     train_test_feature = train_test_feature.merge(train_time_feature, on='id', how='left').fillna(0)  # full training set #
72 |     validate_feature = validate_feature.merge(train_time_feature, on='id', how='left').fillna(0)
73 |     test_feature = test_feature.merge(extract_time_feature(test_auth_info, test_list), on='id', how='left').fillna(0)
74 | 
75 |     '''time relative features two'''
76 |     train_order_payment_time = extract_order_payment_time(train_order_info, train_target)
77 |     train_feature = train_feature.merge(train_order_payment_time, on='id', how='left').fillna(0)
78 |     train_test_feature = train_test_feature.merge(train_order_payment_time, on='id', how='left').fillna(0)  # full training set #
79 |     validate_feature = validate_feature.merge(train_order_payment_time, on='id', how='left').fillna(0)
80 |     test_feature = test_feature.merge(extract_order_payment_time(test_order_info, test_list), on='id', how='left').fillna(0)
81 | 
82 |     print(train_feature.head(5))
83 |     print(validate_feature.head(5))
84 |     print(test_feature.head(5))
85 |     return train_feature, validate_feature, test_feature, train_test_feature
86 | 
87 | 
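# Every feature family above follows the same recipe: build one per-id feature
# table from the raw training logs, then left-join it onto each of the four id
# frames (train / validation / test / full-train). Schematically:
#   feats = extract_user_info(train_user_info)
#   train_feature = train_feature.merge(feats, on='id', how='left')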
88 | def train_module(store_result=False, store_feature=False, select_feature=False, feature_num='all', one_encode=False):
89 |     '''Train the model'''
90 |     if store_feature is True:
91 |         train_feature, validate_feature, test_feature, train_test_feature = extract_feature()
92 |         ''' Persist the feature tables '''
93 |         train_feature.to_csv(r'train_feature.csv', index=None, encoding='utf-8')
94 |         validate_feature.to_csv(r'validate_feature.csv', index=None, encoding='utf-8')
95 |         test_feature.to_csv(r'test_feature.csv', index=None, encoding='utf-8')
96 |         train_test_feature.to_csv(r'train_test_feature.csv', index=None, encoding='utf-8')
97 |         print('Features saved...')
98 | 
99 |         print('Feature extraction finished...')
100 |         exit(0)  # stop after extraction; rerun with store_feature=False to train
101 |     else:
102 |         train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8')
103 |         validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8')
104 |         test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8')
105 |         train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
106 |         print('Data loaded...')
107 | 
108 |     validate_label = validate_feature[['target']]
109 |     train_label = train_feature[['target']]
110 |     train_test_label = train_test_feature[['target']]
111 | 
112 |     train_feature = train_feature.iloc[:, 2:]
113 |     test_feature = test_feature.iloc[:, 1:]
114 |     validate_feature = validate_feature.iloc[:, 2:]
115 |     train_test_feature = train_test_feature.iloc[:, 2:]
116 | 
117 |     train_feature = train_feature.drop(['feature_1', 'register_days'], axis=1)
118 |     test_feature = test_feature.drop(['feature_1', 'register_days'], axis=1)
119 |     validate_feature = validate_feature.drop(['feature_1', 'register_days'], axis=1)
120 |     train_test_feature = train_test_feature.drop(['feature_1', 'register_days'], axis=1)
121 | 
122 |     if one_encode is True:
123 |         features = list(train_feature.columns)
124 |         continuous_feature = []
125 |         one_hot = []
126 |         for name in features:
127 |             if len(set(train_feature[name])) != 2:  # more than two distinct values -> treat as continuous
128 |                 continuous_feature.append(name)
129 |             else:
130 |                 one_hot.append(name)
131 | 
132 |         feature = continuous_feature + one_hot[:130]  # keep all continuous columns plus the first 130 binary ones
133 |         train_feature = train_feature[feature]
134 |         validate_feature = validate_feature[feature]
135 |         test_feature = test_feature[feature]
136 |         train_test_feature = train_test_feature[feature]
137 | 
138 |     if select_feature is True:
139 |         print('Starting feature selection...')
140 |         ch2 = SelectKBest(chi2, k=feature_num)
141 |         train_feature = ch2.fit_transform(train_feature, train_label)
142 |         test_feature = ch2.transform(test_feature)
143 |         validate_feature = ch2.transform(validate_feature)
144 |         train_test_feature = ch2.transform(train_test_feature)
145 |         print('Feature selection finished...')
146 |     else:
147 |         feature_num = train_feature.shape[1]
148 | 
149 |     print('Training the xgboost model...')
150 |     '''xgboost classifier'''
151 |     num_round = 500  # number of boosting rounds #
152 |     params = {
153 |         'booster': 'gbtree',
154 |         'max_depth': 4,
155 |         'colsample_bytree': 0.6,
156 |         'subsample': 0.7,
157 |         'eta': 0.03,
158 |         'silent': 1,
159 |         'objective': 'binary:logistic',
160 |         'eval_metric': 'auc',
161 |         # 'min_child_weight': 1,
162 |         'scale_pos_weight': 1,
163 |         # 'seed': 27,
164 |         # 'reg_alpha': 0.01
165 |     }
166 |     '''Training set'''
167 |     dtrain = xgb.DMatrix(train_feature, label=train_label)
168 |     validate_feature = xgb.DMatrix(validate_feature)
169 |     module = xgb.train(params, dtrain, num_round)
170 | 
171 |     if store_result is True:
172 |         '''Retrain on the full training set for the test-set prediction'''
173 |         dtrain_two = xgb.DMatrix(train_test_feature, label=train_test_label)
174 |         test_feature = xgb.DMatrix(test_feature)
175 |         module_two = xgb.train(params, dtrain_two, num_round)
176 | 
177 |         result = module_two.predict(test_feature)
178 |         result = pd.DataFrame(result)
179 |         result.columns = ['predicted_score']
180 |         sample = test_list[['id']]
181 |         sample['predicted_score'] = [index for index in result['predicted_score']]
182 |         sample.columns = ['ID', 'PROB']
183 |         sample.to_csv(r'xgb_sample.csv', index=None)
184 |         print(sample)
185 |         print('Submission updated...')
186 | 
187 |     print(" Score_offline:", roc_auc_score(validate_label, module.predict(validate_feature)))
188 |     print('Feature count:', feature_num)
189 | 
190 | 
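# Pattern worth noting in train_module (and mirrored in feature_selection.py):
# 'module' is fit on the 201603-201703 window and scored on the held-out 201704
# month (the Score_offline AUC), while 'module_two' is refit on the full window
# and used only to produce the test-set submission.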
191 | ''' Model blending '''
192 | def module_merge_triple(prob_xgb, prob_lr, prob_lgb):
193 |     xgb_sample = pd.read_csv(r'result_xgb.csv', low_memory=False)  # encode:159:0.790297834417
194 |     lr_sample = pd.read_csv(r'lr_sample.csv', low_memory=False)  # Uncode:0.792171452209
195 |     lgb_sample = pd.read_csv(r'xgb_sample_51.csv', low_memory=False)
196 | 
197 |     xgb_sample.columns = ['ID', 'PROB_xgb']
198 |     lr_sample.columns = ['ID', 'PROB_lr']
199 |     lgb_sample.columns = ['ID', 'PROB_lgb']
200 |     sample = xgb_sample.merge(lr_sample, on='ID', how='left')
201 |     sample = sample.merge(lgb_sample, on='ID', how='left')
202 |     # print(sample)
203 |     sample['PROB'] = sample['PROB_xgb'] * prob_xgb + sample['PROB_lr'] * prob_lr + sample['PROB_lgb'] * prob_lgb
204 |     sample = sample[['ID', 'PROB']]
205 |     print(sample)
206 |     sample.to_csv(r'sample.csv', index=None)
207 |     print('Models blended...')
208 | 
209 | 
210 | def module_merge_double(prob_x, prob_l):
211 |     xgb_sample = pd.read_csv(r'result0501_152.csv', low_memory=False)  # encode:159:0.790297834417
212 |     lr_sample = pd.read_csv(r'xgb_sample_51.csv', low_memory=False)  # Uncode:0.792171452209
213 |     sample = xgb_sample.merge(lr_sample, on='ID', how='left')
214 |     sample['PROB'] = sample['PROB_x'] * prob_x + sample['PROB_y'] * prob_l  # merge suffixes the two PROB columns as PROB_x / PROB_y
215 |     sample = sample[['ID', 'PROB']]
216 |     print(sample)
217 |     sample.to_csv(r'sample.csv', index=None)
218 |     print('Models blended...')
219 | 
220 | 
221 | def main():
222 |     '''xgboost single model'''
223 |     train_module(store_result=False, store_feature=True, select_feature=False, feature_num='all', one_encode=False)
224 | 
225 |     '''LogisticRegression single model'''
226 |     # train_LR_module(store_result=False, select_feature=True, feature_num=140, OneEncode=False)
227 |     '''Linear blend of three submissions'''
228 |     # module_merge_triple(prob_xgb=0.4, prob_lr=0.2, prob_lgb=0.4)
229 |     '''Linear blend of two submissions'''
230 |     # module_merge_double(prob_x=0.5, prob_l=0.5)
231 |     '''Stacking'''
232 |     # # ensemble = Ensemble(5, xgb_module, [xgb_module, lgb_module, lr_module, rf_module, gb_module])
233 |     # ensemble = Ensemble(4, lr_module, [xgb_module, xgb_module, xgb_module, xgb_module])
234 |     # train_test, label, test = ensemble.read_data()
235 |     # result = ensemble.fit_predict(train_test, label, test)
236 |     # print('Stacking finished...')
237 |     # result = pd.DataFrame(result, columns=['PROB'])
238 |     # sample = pd.read_csv(r'lr_sample.csv', low_memory=False)
239 |     # sample['PROB'] = [index for index in result['PROB']]
240 |     # sample.to_csv(r'stacking.csv', index=None)
241 |     # print(sample)
242 |     # print('Results assembled...')
243 | 
244 |     '''multiply_feature_selection xgboost_module'''
245 |     # for index in range(70, 200, 5):
246 |     #     print('want to select ', index, ' features')
247 |     #     selection = FeatureSelection(index)
248 |     #     features_name = selection.return_feature_set(variance_threshold=True, select_k_best=True, svc_select=False, tree_select=True)
249 |     #     train_xgb_module(features_name, store_result=False)
250 | 
251 |     features_name = ['order_all_is_null', 'feature_1', 'register_days', 'quota', 'quota_surplus', 'all_is_null_y', 'account_grade_is_null', 'all_is_zero', 'account_grade2', 'age_three', 'type_pay_len', 'null_y', '等待付款', 'income1', 'auth_time_is_null', 'record_count', 'qq_bound_is_null', 'card_record_count', 'quota_is_zero', '新疆', '云南', 'account_grade3', '广东', 'card_time_is_null', 'have_credit_card', '充值成功', '已取消', 'credit_count', '在线', '四川', 'wechat_bound_is_null', 'null', 'credit_score_rank', '未抢中', 'null_x', '完成', '天津', 'age_two', 'female', '订单取消', 'quota_rate', '山东', '重庆', 'sts_order_len', 'merriage1', '福建', 'account_grade1', 'phone_count', 'record_is_unique', '上海', 'income3', '湖北', 'phone_is_null', 'time_phone_is_null', 'province_len', 'birthday_is_zero', '混合支付', 'auth_id_card_is_null', 'credit_score', '江西', '货到付款', '吉林', 'credit_score_is_null', '江苏', 'all_not_null', 'sex_secret', '已完成', 'card_category_count', 'card_count_one', '等待收货', '湖南', 'male', 'store_card_count']
252 |     train_xgb_module(features_name, store_result=True)
253 | 
254 |     # 0.81882083452 seed=27
255 |     # original -> 0.816853963449
256 |     # colsample_bytree: 0.8 -> 0.818427843445
257 |     # scale_pos_weight: 16 -> 0.82029535496
258 |     # reg_alpha: 0.01 -> 0.820431061402
259 |     # 'quota', 'quota_surplus', -> 0.820543215061
260 | 
261 |     '''multiply_feature_selection LogisticRegression_module'''
262 |     # for index in range(70, 200, 5):
263 |     # 
print('want to select ', index, ' features') 264 | # selection = FeatureSelection(index) 265 | # features_name = selection.return_feature_set(variance_threshold=True, select_k_best=True, svc_select=False, tree_select=True) 266 | # # features_name = ['id_card_one', 'id_card_two', 'id_card_three', 'id_card_four', 'id_card_five', 'id_card_six', 'mobile', 'unicom', 'telecom', 'virtual', 'order_all_is_null', 'feature_1', 'record_is_unique', '浙江', '辽宁', 'card_time_is_null', 'income1', 'account_grade2', '黑龙', '江苏', '未抢中', '山东', '内蒙', '上海', '分期付款', '货到付款', 'overdraft', '公司转账', 'null', '订单取消', 'age_two', '充值成功', '在线', '新疆', '完成', 'quota_rate', 'sex_not_male', '湖北', 'quota', 'account_grade_is_null', '安徽', 'card_category_count', 'all_not_null', 'phone_is_null', '河北', 'merriage_is_null', '混合支付', 'quota_surplus_is_null', 'birthday_is_zero', 'income3', '江西', 'store_card_count', 'time_phone_is_null', 'id_card_is_null', 'auth_id_card_is_null', '已取消', '广东', 'record_count', '云南', '等待付款', '已完成', 'card_count_one', 'type_pay_len', 'female', 'sts_order_len', '福建', 'auth_time_is_null', '在线支付', 'null_x', 'income2', 'quota_is_zero', 'credit_score_is_null', 'account_grade3', '四川', '等待审核', '重庆', '河南', 'all_is_null_y', '吉林', '抢票已取消', 'province_len', 'credit_count', 'account_grade1', 'credit_score_rank', 'sts_order_count', '湖南', '充值失败;退款成功', 'wechat_bound_is_null', 'card_record_count', 'male', '邮局汇款', 'merriage1', '山西', 'phone_count', 'sex_secret', '海南', 'merriage2', '等待收货', 'all_is_zero', '天津', 'credit_score', 'age_three', 'null_y', 'qq_bound_is_null', 'have_credit_card', '北京'] 267 | # # # features_name = ['record_count', 'quota', 'account_grade_is_null', '安徽', '云南', '等待付款', 'credit_count', 'account_grade1', 'credit_score_rank', '已完成', 'record_is_unique', 'card_count_one', 'card_category_count', 'all_not_null', 'sts_order_count', '湖南', '浙江', '充值失败;退款成功', 'wechat_bound_is_null', 'card_record_count', 'phone_is_null', 'type_pay_len', 'female', 'male', '辽宁', 'card_time_is_null', '河北', 'sts_order_len', '福建', 'auth_time_is_null', 'income1', '在线支付', 'merriage1', 'null_x', 'account_grade2', 'income2', 'quota_is_zero', '江苏', 'credit_score_is_null', 'merriage_is_null', '未抢中', 'phone_count', '山东', '上海', 'sex_secret', '货到付款', '北京', 'null', 'account_grade3', '等待收货', 'all_is_zero', '天津', 'credit_score', '四川', '混合支付', 'quota_surplus_is_null', 'birthday_is_zero', '订单取消', 'age_two', 'income3', '江西', 'store_card_count', 'time_phone_is_null', '充值成功', 'id_card_is_null', '在线', '新疆', '重庆', '河南', 'all_is_null_y', '吉林', 'auth_id_card_is_null', '完成', 'age_three', 'null_y', 'quota_rate', 'province_len', 'qq_bound_is_null', 'have_credit_card', '已取消', 'sex_not_male', '湖北', '广东'] 268 | # train_lr_module(features_name, store_result=False) 269 | # # 0.812781111086 270 | 271 | features_name = ['order_all_is_null', 'feature_1', 'record_is_unique', '浙江', '辽宁', 'card_time_is_null', 'income1', 'account_grade2', '黑龙', '江苏', '未抢中', '山东', '内蒙', '上海', '分期付款', '货到付款', 'overdraft', '公司转账', 'null', '订单取消', 'age_two', '充值成功', '在线', '新疆', '完成', 'quota_rate', 'sex_not_male', '湖北', 'quota', 'account_grade_is_null', '安徽', 'card_category_count', 'all_not_null', 'phone_is_null', '河北', 'merriage_is_null', '混合支付', 'quota_surplus_is_null', 'birthday_is_zero', 'income3', '江西', 'store_card_count', 'time_phone_is_null', 'id_card_is_null', 'auth_id_card_is_null', '已取消', '广东', 'record_count', '云南', '等待付款', '已完成', 'card_count_one', 'type_pay_len', 'female', 'sts_order_len', '福建', 'auth_time_is_null', '在线支付', 'null_x', 'income2', 'quota_is_zero', 'credit_score_is_null', 
'account_grade3', '四川', '等待审核', '重庆', '河南', 'all_is_null_y', '吉林', '抢票已取消', 'province_len', 'credit_count', 'account_grade1', 'credit_score_rank', 'sts_order_count', '湖南', '充值失败;退款成功', 'wechat_bound_is_null', 'card_record_count', 'male', '邮局汇款', 'merriage1', '山西', 'phone_count', 'sex_secret', '海南', 'merriage2', '等待收货', 'all_is_zero', '天津', 'credit_score', 'age_three', 'null_y', 'qq_bound_is_null', 'have_credit_card', '北京']
272 |     train_lr_module(features_name, store_result=True)
273 | 
274 |     module_merge_triple(prob_xgb=0.4, prob_lr=0.2, prob_lgb=0.4)
275 | 
276 | if __name__ == '__main__':
277 |     start_time = time.clock()  # note: time.clock() is deprecated after Python 3.3; fine on the pinned Python 3.6
278 |     main()
279 |     end_time = time.clock()
280 |     print('Elapsed time:', end_time - start_time)
281 | 
282 | 
-------------------------------------------------------------------------------- /XGB1.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Sat May 19 22:02:28 2018
4 | 
5 | @author: Frank
6 | """
7 | 
8 | 
9 | 
10 | import pandas as pd
11 | import numpy as np
12 | import datetime
13 | import xgboost as xgb
14 | from xgboost import plot_importance
15 | import operator
16 | from sklearn.metrics import roc_auc_score
17 | import matplotlib.pyplot as plt
18 | from sklearn.linear_model import LogisticRegression
19 | 
20 | 
21 | def setlen(group):
22 |     return len(set(group))
23 | 
24 | 
25 | def return_set(group):
26 |     return set(group)
27 | 
28 | 
29 | def auth_info(data):
30 |     data['id_card_isnull'] = [1 if type(i) == str else 0 for i in data.id_card]  # despite the name, 1 means the field is present, 0 means NaN
31 |     data['phone_isnull'] = [1 if type(i) == str else 0 for i in data.phone]
32 |     data['auth_time_isnull'] = [1 if type(i) == str else 0 for i in data.auth_time]
33 |     data['first_digit'] = [i[0] if type(i) == str else '-1' for i in data['id_card']]  # first digit of the id card
34 |     id_card = ['2', '1', '3', '4', '6', '5']
35 |     for i in id_card:
36 |         data[i] = [1 if i == index else 0 for index in data['first_digit']]
37 | 
38 |     return data[['id_card_isnull', 'phone_isnull', 'auth_time_isnull', 'id']]  # the per-digit one-hots above are built but not returned
39 | 
40 | 
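# Quick reference for the two helpers above when used as a pivot_table aggfunc:
# setlen counts *distinct* values per id, return_set keeps the set itself.
# Toy example (illustration only, not part of the pipeline):
#   df = pd.DataFrame({'id': [1, 1, 2], 'phone': ['a', 'a', 'b']})
#   pd.pivot_table(df, index='id', values='phone', aggfunc=setlen)
#   # -> id 1 has 1 distinct phone, id 2 has 1 distinct phone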
41 | def bankcard_info(data):
42 |     data['card_1'] = [1 if i == '储蓄卡' else 0 for i in data['card_type']]
43 |     data['card_2'] = [1 if i == '信用卡' else 0 for i in data['card_type']]
44 |     card_1_cnt = pd.pivot_table(data, index='id', values='card_1', aggfunc='sum').reset_index().rename(columns={'card_1': 'card_1_cnt'})
45 |     data = data.merge(card_1_cnt, on='id', how='left')
46 |     card_2_cnt = pd.pivot_table(data, index='id', values='card_2', aggfunc='sum').reset_index().rename(columns={'card_2': 'card_2_cnt'})
47 |     data = data.merge(card_2_cnt, on='id', how='left')
48 |     bank_cnt = pd.pivot_table(data, index='bank_name', values='tail_num', aggfunc='count').reset_index().rename(columns={'tail_num': 'bank_cnt'})
49 |     id_bank_cnt = pd.pivot_table(data, index='id', values='bank_name', aggfunc='count').reset_index().rename(columns={'bank_name': 'id_bank_cnt'})
50 |     # id_card1_cnt = pd.pivot_table(data, index='id', values='card_1', aggfunc='sum').reset_index().rename(columns={'card_1': 'id_card1_cnt'})
51 |     # id_card2_cnt = pd.pivot_table(data, index='id', values='card_2', aggfunc='sum').reset_index().rename(columns={'card_2': 'id_card2_cnt'})
52 |     id_phone_set = pd.pivot_table(data, index='id', values='phone', aggfunc=setlen).reset_index().rename(columns={'phone': 'id_phone_set'})
53 |     id_card_set = pd.pivot_table(data, index='id', values='card_type', aggfunc=setlen).reset_index().rename(columns={'card_type': 'id_card_set'})
54 |     id_bank_set = pd.pivot_table(data, index='id', values='bank_name', aggfunc=setlen).reset_index().rename(columns={'bank_name': 'id_bank_set'})
55 | 
56 |     data = data.merge(bank_cnt, on='bank_name', how='left')
57 |     data = data.merge(id_bank_cnt, on='id', how='left')
58 |     data = data.merge(id_phone_set, on='id', how='left')
59 |     data = data.merge(id_card_set, on='id', how='left')  # ?
60 |     data = data.merge(id_bank_set, on='id', how='left')  # ?
61 |     return data[['id', 'card_1_cnt', 'card_2_cnt', 'id_bank_cnt', 'id_phone_set', 'id_card_set', 'id_bank_set']].drop_duplicates(['id'])
62 | 
63 | 
64 | def credit_info(data):
65 |     data['q_o'] = data['quota'] - data['overdraft']  # remaining headroom (computed before the NaN fill below, so missing values stay NaN here)
66 |     data['quota'] = [1 if i is np.nan else i for i in data['quota']]  # missing quota/overdraft default to 1 so the ratio below stays defined
67 |     data['overdraft'] = [1 if i is np.nan else i for i in data['overdraft']]
68 |     data['q/o'] = data[['quota', 'overdraft']].apply(lambda x: 0 if x.quota == 0 else x.overdraft/x.quota, axis=1)  # utilisation, guarded against quota == 0
69 |     return data.drop_duplicates(['id'])
70 | 
71 | 
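# Worked example for credit_info: quota=5000, overdraft=1000 gives
#   q_o = 5000 - 1000 = 4000 (unused credit) and q/o = 1000/5000 = 0.2 (utilisation);
# accounts with quota == 0 get q/o = 0 instead of a division error.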
72 | def order_info(data):
73 |     id_sample = data.drop_duplicates(['id'])[['id']]
74 | 
75 |     data = data.drop_duplicates()
76 |     order_info_amt = data[['amt_order']]
77 |     order_info_amt = order_info_amt[order_info_amt['amt_order'].notnull()]
78 |     order_info_amt = order_info_amt[order_info_amt['amt_order'] != 'null']
79 |     order_info_amt['amt_order'] = [float(index) for index in order_info_amt['amt_order']]
80 |     mean = order_info_amt['amt_order'].mean()
81 |     data['amt_order'] = data['amt_order'].fillna(mean)  # impute NaN and the literal string 'null' with the column mean
82 |     data['amt_order'] = [mean if index == 'null' else index for index in data['amt_order']]
83 |     data['amt_order'] = [float(index) for index in data['amt_order']]
84 | 
85 |     data['pay_way_1'] = [1 if i == '在线支付' else 0 for i in data['type_pay']]
86 |     way1_cnt = pd.pivot_table(data, index='id', values='pay_way_1', aggfunc='sum').reset_index().rename(columns={'pay_way_1': 'way1_cnt'})
87 |     id_sample = id_sample.merge(way1_cnt, on='id', how='left')
88 |     data['pay_way_2'] = [1 if i == '货到付款' else 0 for i in data['type_pay']]
89 |     way2_cnt = pd.pivot_table(data, index='id', values='pay_way_2', aggfunc='sum').reset_index().rename(columns={'pay_way_2': 'way2_cnt'})
90 |     id_sample = id_sample.merge(way2_cnt, on='id', how='left')
91 | 
92 |     '''Count/aggregation features'''
93 |     # f1 = pd.pivot_table(data[['id', 'amt_order']], index='id', values='amt_order', aggfunc='mean').reset_index().rename(columns={'amt_order': 'id_amt_order_mean'})
94 |     # id_sample = id_sample.merge(f1, on='id', how='left')
95 |     # f2 = pd.pivot_table(data[['id', 'amt_order']], index='id', values='amt_order', aggfunc='max').reset_index().rename(columns={'amt_order': 'id_amt_order_max'})
96 |     # id_sample = id_sample.merge(f2, on='id', how='left')
97 |     # f3 = pd.pivot_table(data[['id', 'amt_order']], index='id', values='amt_order', aggfunc='min').reset_index().rename(columns={'amt_order': 'id_amt_order_min'})
98 |     # id_sample = id_sample.merge(f3, on='id', how='left')
99 |     # f4 = pd.pivot_table(data[['id', 'amt_order']], index='id', values='amt_order', aggfunc='var').reset_index().rename(columns={'amt_order': 'id_amt_order_var'})
100 |     # id_sample = id_sample.merge(f4, on='id', how='left')
101 |     # f5 = pd.pivot_table(data[['id', 'unit_price']], index='id', values='unit_price', aggfunc='mean').reset_index().rename(columns={'unit_price': 'id_unit_price_mean'})
102 |     # id_sample = id_sample.merge(f5, on='id', how='left')
103 |     # f6 = pd.pivot_table(data[['id', 'unit_price']], index='id', values='unit_price', aggfunc='max').reset_index().rename(columns={'unit_price': 'id_unit_price_max'})
104 |     # id_sample = id_sample.merge(f6, on='id', how='left')
105 |     # f7 = pd.pivot_table(data[['id', 'unit_price']], index='id', values='unit_price', aggfunc='min').reset_index().rename(columns={'unit_price': 'id_unit_price_min'})
106 |     # id_sample = id_sample.merge(f7, on='id', how='left')
107 |     # f8 = pd.pivot_table(data[['id', 'unit_price']], index='id', values='unit_price', aggfunc='var').reset_index().rename(columns={'unit_price': 'id_unit_price_var'})
108 |     # id_sample = id_sample.merge(f8, on='id', how='left')
109 | 
110 |     f9 = pd.pivot_table(data[['id', 'type_pay']], index='id', values='type_pay', aggfunc=setlen).reset_index().rename(columns={'type_pay': 'id_type_pay_set'})
111 |     id_sample = id_sample.merge(f9, on='id', how='left')
112 |     f10 = pd.pivot_table(data[['id', 'sts_order']], index='id', values='sts_order', aggfunc=setlen).reset_index().rename(columns={'sts_order': 'id_sts_order_set'})
113 |     id_sample = id_sample.merge(f10, on='id', how='left')
114 |     f11 = pd.pivot_table(data[['id', 'phone']], index='id', values='phone', aggfunc=setlen).reset_index().rename(columns={'phone': 'id_phone_set'})
115 |     id_sample = id_sample.merge(f11, on='id', how='left')
116 | 
117 |     '''Other features'''
118 |     data['sts_order'] = data['sts_order'].fillna('0')
119 |     data['wan_cheng'] = [1 if ('完成' in i) else 0 for i in data['sts_order']]
120 |     wan_cheng_cnt = pd.pivot_table(data, index='id', values='wan_cheng', aggfunc='sum').reset_index().rename(columns={'wan_cheng': 'wan_cheng_cnt'})
121 |     id_sample = id_sample.merge(wan_cheng_cnt, on='id', how='left')
122 |     data['cheng_gong'] = [1 if '成功' in i else 0 for i in data['sts_order']]
123 |     print(data['cheng_gong'])  # leftover debug print
124 |     cheng_gong_cnt = pd.pivot_table(data, index='id', values='cheng_gong', aggfunc='sum').reset_index().rename(columns={'cheng_gong': 'cheng_gong_cnt'})
125 |     id_sample = id_sample.merge(cheng_gong_cnt, on='id', how='left')
126 |     data['qu_xiao'] = [1 if '取消' in i else 0 for i in data['sts_order']]
127 |     qu_xiao_cnt = pd.pivot_table(data, index='id', values='qu_xiao', aggfunc='sum').reset_index().rename(columns={'qu_xiao': 'qu_xiao_cnt'})
128 |     id_sample = id_sample.merge(qu_xiao_cnt, on='id', how='left')
129 | 
130 |     '''Time features'''
131 |     # year_month = ['1604', '1704', '1504', '1607', '1508', '1505', '1608', '1602', '1701', '1512', '1612', '1506', '1610', '1412', '1603', '00000000', '1601', '1611', '1605', '1606']
132 |     # data['year_month'] = [i[2:4] + i[5: 7] if type(i) == str else '00000000' for i in data['time_order']]
133 |     # for i in year_month:
134 |     #     data[i] = [1 if i == index else 0 for index in data['year_month']]
135 |     # t_1604 = pd.pivot_table(data, index='id', values='1604', aggfunc='sum').reset_index()
136 |     # id_sample = id_sample.merge(t_1604, on='id', how='left')
137 |     # t_1704 = pd.pivot_table(data, index='id', values='1704', aggfunc='sum').reset_index()
138 |     # id_sample = id_sample.merge(t_1704, on='id', how='left')
139 |     # t_1504 = pd.pivot_table(data, index='id', values='1504', aggfunc='sum').reset_index()
140 |     # id_sample = id_sample.merge(t_1504, on='id', how='left')
141 |     # t_1607 = pd.pivot_table(data, index='id', values='1607', aggfunc='sum').reset_index()
142 |     # id_sample = id_sample.merge(t_1607, on='id', how='left')
143 |     # t_1508 = pd.pivot_table(data, index='id', values='1508', aggfunc='sum').reset_index()
144 |     # id_sample = id_sample.merge(t_1508, on='id', how='left')
145 |     # t_1505 = pd.pivot_table(data, index='id', values='1505', aggfunc='sum').reset_index()
146 |     # id_sample = id_sample.merge(t_1505, on='id', how='left')
147 |     # t_1608 = pd.pivot_table(data, index='id', 
values='1608', aggfunc='sum').reset_index() 148 | # id_sample = id_sample.merge(t_1608, on='id', how='left') 149 | # t_1602 = pd.pivot_table(data, index='id', values='1602', aggfunc='sum').reset_index() 150 | # id_sample = id_sample.merge(t_1602, on='id', how='left') 151 | # t_1701 = pd.pivot_table(data, index='id', values='1701', aggfunc='sum').reset_index() 152 | # id_sample = id_sample.merge(t_1701, on='id', how='left') 153 | # t_1512 = pd.pivot_table(data, index='id', values='1512', aggfunc='sum').reset_index() 154 | # id_sample = id_sample.merge(t_1512, on='id', how='left') 155 | # t_1612 = pd.pivot_table(data, index='id', values='1612', aggfunc='sum').reset_index() 156 | # id_sample = id_sample.merge(t_1612, on='id', how='left') 157 | # t_1506 = pd.pivot_table(data, index='id', values='1506', aggfunc='sum').reset_index() 158 | # id_sample = id_sample.merge(t_1506, on='id', how='left') 159 | # t_1610 = pd.pivot_table(data, index='id', values='1610', aggfunc='sum').reset_index() 160 | # id_sample = id_sample.merge(t_1610, on='id', how='left') 161 | # t_1412 = pd.pivot_table(data, index='id', values='1412', aggfunc='sum').reset_index() 162 | # id_sample = id_sample.merge(t_1412, on='id', how='left') 163 | # t_1603 = pd.pivot_table(data, index='id', values='1603', aggfunc='sum').reset_index() 164 | # id_sample = id_sample.merge(t_1603, on='id', how='left') 165 | # t_0000 = pd.pivot_table(data, index='id', values='00000000', aggfunc='sum').reset_index() 166 | # id_sample = id_sample.merge(t_0000, on='id', how='left') 167 | # t_1601 = pd.pivot_table(data, index='id', values='1601', aggfunc='sum').reset_index() 168 | # id_sample = id_sample.merge(t_1601, on='id', how='left') 169 | # t_1611 = pd.pivot_table(data, index='id', values='1611', aggfunc='sum').reset_index() 170 | # id_sample = id_sample.merge(t_1611, on='id', how='left') 171 | # t_1605 = pd.pivot_table(data, index='id', values='1605', aggfunc='sum').reset_index() 172 | # id_sample = id_sample.merge(t_1605, on='id', how='left') 173 | # t_1606 = pd.pivot_table(data, index='id', values='1606', aggfunc='sum').reset_index() 174 | # id_sample = id_sample.merge(t_1606, on='id', how='left') 175 | 176 | 177 | # sts_order = [] 178 | # for outcome in data['sts_order']: 179 | # if type(outcome) == str: 180 | # if "完成" in outcome: 181 | # sts_order.append(1) 182 | # else: 183 | # sts_order.append(0) 184 | # else: 185 | # sts_order.append(0) 186 | # data['is_ok'] = sts_order 187 | # data['no_ok'] = [1 if i == 0 else 1 for i in sts_order] 188 | # wancheng_cnt = pd.pivot_table(data[['id', 'is_ok']], index='id', values='is_ok', aggfunc='sum').reset_index().rename(columns={'is_ok': 'wancheng_cnt'}) 189 | # no_ok = pd.pivot_table(data[['id', 'no_ok']], index='id', values='no_ok', aggfunc='sum').reset_index().rename(columns={'is_ok': 'no_ok'}) 190 | # id_sample = id_sample.merge(wancheng_cnt, on='id', how='left') 191 | # id_sample = id_sample.merge(no_ok, on='id', how='left') 192 | 193 | # type_pay = [] 194 | # for outcome in data['type_pay']: 195 | # if type(outcome) == str: 196 | # if "在线支付" in outcome: 197 | # type_pay.append(1) 198 | # else: 199 | # type_pay.append(0) 200 | # else: 201 | # type_pay.append(0) 202 | # data['zai_xian'] = sts_order 203 | # data['no_zai_xian'] = [1 if i == 0 else 1 for i in sts_order] 204 | # zai_xian_cnt = pd.pivot_table(data[['id', 'zai_xian']], index='id', values='zai_xian', aggfunc='sum').reset_index().rename(columns={'is_ok': 'zai_xian_cnt'}) 205 | # no_zai_xian = pd.pivot_table(data[['id', 'no_zai_xian']], 
index='id', values='no_zai_xian', aggfunc='sum').reset_index().rename(columns={'is_ok': 'no_zai_xian'}) 206 | # id_sample = id_sample.merge(zai_xian_cnt, on='id', how='left') 207 | # id_sample = id_sample.merge(no_zai_xian, on='id', how='left') 208 | 209 | return id_sample.drop_duplicates(['id']) 210 | 211 | 212 | def recieve_addr_info(data): 213 | province = {'甘肃', '云南', '贵州', '河南', '黑龙', '香港', '北京', '湖南', '江苏', '青海', '宁夏', '内蒙', '浙江', '吉林', '海南', '福建', '重庆', '台湾', '陕西', '湖北', '江西', '辽宁', '山西', '西藏', '广东', '安徽', '四川', '河北', '山东', '上海', 214 | '广西', '新疆', '天津', 'null'} 215 | data['province'] = data[['region']].apply(lambda x: 'null' if x.region is np.nan else x.region[0:2], axis=1) 216 | city_set = pd.pivot_table(data, index='id', values='province', aggfunc=return_set).reset_index() 217 | for string in list(province): 218 | city_set[string] = [1 if string in index else 0 for index in city_set['province']] 219 | city_set['province_p'] = city_set[['province']].apply(lambda x: x.province.clear() if 'null' in x.province else x.province, axis=1) 220 | city_set['province_len'] = [0 if index is None else len(index) for index in city_set['province']] 221 | 222 | data['phone_isnull'] = [0 if type(i) == float else 1 for i in data.phone] 223 | data['fix_phone_isnull'] = [1 if type(i) == str else 0 for i in data.fix_phone] 224 | id_phone_set = pd.pivot_table(data[['id', 'phone']], index='id', values='phone', aggfunc=setlen).reset_index().rename(columns={'phone': 'id_phone_set'}) 225 | data = data.merge(id_phone_set, on='id', how='left') 226 | data = data.merge(city_set, on='id', how='left') 227 | 228 | return data[['id', 'phone_isnull', 'fix_phone_isnull', 'id_phone_set', 'province_len']].drop_duplicates(['id']) 229 | 230 | 231 | def user_info(data): 232 | id_sample = data[['id']] 233 | degree = ['本科', '初中', '中专', '其他', '硕士', '大专', '博士', '高中'] 234 | for index in degree: 235 | id_sample[index] = [1 if index == string else 0 for string in data['degree']] 236 | 237 | id_sample['sex_isnull'] = [0 if type(index) == float else 1 for index in data['sex']] 238 | id_sample['sex1'] = [1 if index == '保密' else 0 for index in data['sex']] 239 | id_sample['sex2'] = [1 if index == '男' else 0 for index in data['sex']] 240 | id_sample['sex3'] = [1 if index == '女' else 0 for index in data['sex']] 241 | 242 | id_sample['0000-00-00'] = [1 if index == '0000-00-00' else 0 for index in data['birthday']] 243 | 244 | id_sample['merriage1'] = [1 if index == '未婚' else 0 for index in data['merriage']] 245 | id_sample['merriage2'] = [1 if index == '已婚' else 0 for index in data['merriage']] 246 | id_sample['merriage3'] = [1 if index == '保密' else 0 for index in data['merriage']] 247 | 248 | id_sample['income_isnull'] = [1 if type(index) == str else 0 for index in data['income']] 249 | id_sample['income1'] = [1 if index == '4000-5999元' else 0 for index in data['income']] 250 | id_sample['income2'] = [1 if index == '8000元以上' else 0 for index in data['income']] 251 | id_sample['income3'] = [1 if index == '2000-3999元' else 0 for index in data['income']] 252 | id_sample['income4'] = [1 if index == '6000-7999元' else 0 for index in data['income']] 253 | id_sample['income5'] = [1 if index == '2000元以下' else 0 for index in data['income']] 254 | 255 | id_sample['id_card_isnull'] = [1 if type(index) == str else 0 for index in data['id_card']] 256 | 257 | id_sample['qq_bound_one'] = [1 if index == '已绑定' else 0 for index in data['qq_bound']] 258 | id_sample['qq_bound_two'] = [1 if index == '未绑定' else 0 for index in data['qq_bound']] 259 | 260 | 
id_sample['wechat_bound_one'] = [1 if index == '已绑定' else 0 for index in data['wechat_bound']] 261 | id_sample['wechat_bound_two'] = [1 if index == '未绑定' else 0 for index in data['wechat_bound']] 262 | 263 | id_sample['account_grade_one'] = [1 if index == '注册会员' else 0 for index in data['account_grade']] 264 | id_sample['account_grade_two'] = [1 if index == '铜牌会员' else 0 for index in data['account_grade']] 265 | id_sample['account_grade_three'] = [1 if index == '银牌会员' else 0 for index in data['account_grade']] 266 | id_sample['account_grade_four'] = [1 if index == '金牌会员' else 0 for index in data['account_grade']] 267 | id_sample['account_grade_five'] = [1 if index == '钻石会员' else 0 for index in data['account_grade']] 268 | return id_sample.drop_duplicates(['id']) 269 | 270 | 271 | def days_feature(auth, order, appl): 272 | # data = auth.merge(order, on='id', how='left') 273 | data = auth.merge(appl, on='id', how='left') 274 | data['auth_time'] = data[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm[:10] if x.auth_time == '0000-00-00' else x.auth_time, axis=1) 275 | data['auth_time'] = data[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm[:10] if x.auth_time is np.nan else x.auth_time, axis=1) 276 | data['days'] = data[['auth_time', 'appl_sbm_tm']].apply(lambda x: (datetime.datetime.strptime(x.appl_sbm_tm[:10], '%Y-%m-%d') - datetime.datetime.strptime(x.auth_time[:10], '%Y-%m-%d')).days, axis=1) 277 | data['days_is_neg'] = [1 if i > 0 else 0 for i in data['days']] 278 | data['auth_year'] = data[['auth_time']].apply(lambda x: int(x.auth_time[:4]), axis=1) 279 | data['appl_year'] = data[['appl_sbm_tm']].apply(lambda x: int(x.appl_sbm_tm[:4]), axis=1) 280 | data['years'] = data['appl_year'] - data['auth_year'] 281 | data['years_is_neg'] =data[['years']].apply(lambda x: 1 if x.years > 0 else 0, axis=1) 282 | 283 | 284 | # data['auth_time'] = [i if type(i) == str else '0001-01-01' for i in auth['auth_time']] 285 | # data['auth_time'] = ['0001-01-01' if i == '0000-00-00' else i for i in auth['auth_time']] 286 | # data['auth_time'] = ['0001-01-01' if i == 0 else i for i in auth['auth_time']] 287 | # data['time_order'] = [i if type(i) == str else '0001-01-01 00:00:00' for i in appl['time_order']] 288 | # data['time_order'] = [i if len(i) > 16 else '0001-01-01 00:00:00' for i in appl['time_order']] 289 | # 290 | # data['time_days'] = data[['auth_time', 'time_order']].apply(lambda x: abs((datetime.datetime.strptime(x.time_order, '%Y-%m-%d %H:%M:%S') - datetime.datetime.strptime(x.auth_time, '%Y-%m-%d')).days), axis=1) 291 | # data['time_days'] = [i if ((i < 50000) & (i > 0)) else -1 for i in data['time_days']] 292 | # time_days_mean = pd.pivot_table(data[['id', 'time_days']], index='id', values='time_days', aggfunc='mean').reset_index().rename(columns={'time_days': 'time_days_mean'}) 293 | # data = data.merge(time_days_mean, on='id', how='left') 294 | # data['time_days_mean_is_neg'] = [1 if i > 0 else 0 for i in data['time_days']] 295 | 296 | # appl['appl_sbm_tm'] = [i[:-2] for i in appl['appl_sbm_tm']] 297 | # data = data.merge(appl, on='id', how='left') 298 | # data['appl_age'] = data[['auth_time', 'appl_sbm_tm']].apply(lambda x: ((datetime.datetime.strptime(x.auth_time, '%Y-%m-%d') - datetime.datetime.strptime(x.appl_sbm_tm, '%Y-%m-%d %H:%M:%S')).days), axis=1) 299 | # data['appl_neg'] = [1 if i < 0 else 1 for i in data['appl_age']] 300 | print("OK") 301 | return data[['id', 'days', 'days_is_neg', 'years', 'years_is_neg']].drop_duplicates(['id']) 302 | 303 | 304 | def 


def days_feature(auth, order, appl):
    # `order` is unused here; the commented experiments below kept it in the signature #
    # data = auth.merge(order, on='id', how='left')
    data = auth.merge(appl, on='id', how='left')
    # fall back to the application date when auth_time is the '0000-00-00' placeholder or missing #
    data['auth_time'] = data[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm[:10] if x.auth_time == '0000-00-00' else x.auth_time, axis=1)
    data['auth_time'] = data[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm[:10] if x.auth_time is np.nan else x.auth_time, axis=1)
    data['days'] = data[['auth_time', 'appl_sbm_tm']].apply(lambda x: (datetime.datetime.strptime(x.appl_sbm_tm[:10], '%Y-%m-%d') - datetime.datetime.strptime(x.auth_time[:10], '%Y-%m-%d')).days, axis=1)
    data['days_is_neg'] = [1 if i > 0 else 0 for i in data['days']]    # despite the name, flags a positive gap (auth before application) #
    data['auth_year'] = data[['auth_time']].apply(lambda x: int(x.auth_time[:4]), axis=1)
    data['appl_year'] = data[['appl_sbm_tm']].apply(lambda x: int(x.appl_sbm_tm[:4]), axis=1)
    data['years'] = data['appl_year'] - data['auth_year']
    data['years_is_neg'] = data[['years']].apply(lambda x: 1 if x.years > 0 else 0, axis=1)


    # data['auth_time'] = [i if type(i) == str else '0001-01-01' for i in auth['auth_time']]
    # data['auth_time'] = ['0001-01-01' if i == '0000-00-00' else i for i in auth['auth_time']]
    # data['auth_time'] = ['0001-01-01' if i == 0 else i for i in auth['auth_time']]
    # data['time_order'] = [i if type(i) == str else '0001-01-01 00:00:00' for i in appl['time_order']]
    # data['time_order'] = [i if len(i) > 16 else '0001-01-01 00:00:00' for i in appl['time_order']]
    #
    # data['time_days'] = data[['auth_time', 'time_order']].apply(lambda x: abs((datetime.datetime.strptime(x.time_order, '%Y-%m-%d %H:%M:%S') - datetime.datetime.strptime(x.auth_time, '%Y-%m-%d')).days), axis=1)
    # data['time_days'] = [i if ((i < 50000) & (i > 0)) else -1 for i in data['time_days']]
    # time_days_mean = pd.pivot_table(data[['id', 'time_days']], index='id', values='time_days', aggfunc='mean').reset_index().rename(columns={'time_days': 'time_days_mean'})
    # data = data.merge(time_days_mean, on='id', how='left')
    # data['time_days_mean_is_neg'] = [1 if i > 0 else 0 for i in data['time_days']]

    # appl['appl_sbm_tm'] = [i[:-2] for i in appl['appl_sbm_tm']]
    # data = data.merge(appl, on='id', how='left')
    # data['appl_age'] = data[['auth_time', 'appl_sbm_tm']].apply(lambda x: ((datetime.datetime.strptime(x.auth_time, '%Y-%m-%d') - datetime.datetime.strptime(x.appl_sbm_tm, '%Y-%m-%d %H:%M:%S')).days), axis=1)
    # data['appl_neg'] = [1 if i < 0 else 1 for i in data['appl_age']]
    print("OK")
    return data[['id', 'days', 'days_is_neg', 'years', 'years_is_neg']].drop_duplicates(['id'])


def auth_order(auth, order):
    data = auth.merge(order, on='id', how='left')
    # replace missing or malformed timestamps with a far-past sentinel #
    data['auth_time'] = [i if type(i) == str else '0001-01-01' for i in data['auth_time']]
    data['auth_time'] = ['0001-01-01' if i == '0000-00-00' else i for i in data['auth_time']]
    data['auth_time'] = ['0001-01-01' if i == 0 else i for i in data['auth_time']]
    data['time_order'] = [i if type(i) == str else '0001-01-01 00:00:00' for i in data['time_order']]
    data['time_order'] = [i if len(i) > 16 else '0001-01-01 00:00:00' for i in data['time_order']]

    data['time_days'] = data[['auth_time', 'time_order']].apply(
        lambda x: abs((datetime.datetime.strptime(x.time_order, '%Y-%m-%d %H:%M:%S') - datetime.datetime.strptime(x.auth_time, '%Y-%m-%d')).days), axis=1)
    data['time_days'] = [i if ((i < 50000) & (i > 0)) else -1 for i in data['time_days']]    # gaps against the sentinel are huge; collapse them to -1 #
    time_days_mean = pd.pivot_table(data[['id', 'time_days']], index='id', values='time_days', aggfunc='mean').reset_index().rename(columns={'time_days': 'time_days_mean'})
    auth = auth.merge(time_days_mean, on='id', how='left')
    auth['time_days_mean_is_neg'] = [1 if i > 0 else 0 for i in auth['time_days_mean']]
    return auth[['id', 'time_days_mean', 'time_days_mean_is_neg']]
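
# Both functions above parse dates row by row with strptime inside apply. A vectorized
# sketch of the same day difference (hypothetical; errors='coerce' maps unparseable
# strings to NaT, so the placeholder handling collapses into one fillna):
#
#   appl = pd.to_datetime(data['appl_sbm_tm'], errors='coerce')
#   auth_t = pd.to_datetime(data['auth_time'], errors='coerce').fillna(appl)
#   data['days'] = (appl - auth_t).dt.days
#
# pd.to_datetime is typically far faster than per-row strptime on frames this size.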


def submit():
    '''Read the training set and extract its features'''
    train_auth_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_auth_info.csv', low_memory=False)
    f_train_auth_info = auth_info(train_auth_info)
    train_bankcard_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_bankcard_info.csv', low_memory=False)
    f_train_bankcard_info = bankcard_info(train_bankcard_info)
    train_credit_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_credit_info.csv', low_memory=False)
    f_train_credit_info = credit_info(train_credit_info)
    train_order_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_order_info.csv', low_memory=False)
    f_train_order_info = order_info(train_order_info)
    train_recieve_addr_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_recieve_addr_info.csv', low_memory=False)
    f_train_recieve_addr_info = recieve_addr_info(train_recieve_addr_info)
    train_user_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_user_info.csv', low_memory=False)
    f_train_user_info = user_info(train_user_info)
    train_target = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_target.csv', low_memory=False)
    feature_l = train_target[['id', 'target']]
    f_day_minus = days_feature(train_auth_info[['id', 'auth_time']], train_order_info[['id', 'time_order']], train_target[['id', 'appl_sbm_tm']])
    f_auth_or = auth_order(train_auth_info, train_order_info)
    # print(f_day_minus)

    '''f_merge'''
    feature_l = feature_l.merge(f_train_auth_info, on='id', how='left')
    feature_l = feature_l.merge(f_train_bankcard_info, on='id', how='left')
    feature_l = feature_l.merge(f_train_credit_info, on='id', how='left')
    feature_l = feature_l.merge(f_train_order_info, on='id', how='left')
    feature_l = feature_l.merge(f_train_recieve_addr_info, on='id', how='left')
    feature_l = feature_l.merge(f_train_user_info, on='id', how='left')
    feature_l = feature_l.merge(f_day_minus, on='id', how='left')
    feature_l = feature_l.merge(f_auth_or, on='id', how='left')
    # feature_l.to_csv(r'F:\Python_project\AL\train_data\train_feature.csv', index=False)
    print(feature_l.shape)
    print(feature_l)
    train_f = feature_l.drop('target', axis=1)    # note: 'id' stays in as a feature, in both train and test #
    train_l = feature_l[['target']]

    xgb_train = xgb.DMatrix(train_f.values, label=train_l.values)
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'gamma': 0.1,  # minimum loss reduction required to split further; larger is more conservative, 0.1-0.2 is typical
        'max_depth': 5,  # tree depth; deeper trees overfit more easily
        'lambda': 2,  # L2 regularization on leaf weights; larger values make the model less prone to overfitting
        'subsample': 0.8,  # row subsampling per tree
        'colsample_bytree': 0.8,  # column subsampling per tree
        'min_child_weight': 18,
        'silent': 0,  # 1 suppresses run-time output; better left at 0
        'eta': 0.03,  # shrinkage, acts like a learning rate
        'eval_metric': 'logloss'
    }
    module = xgb.train(params, xgb_train, num_boost_round=500)


    '''Read the test set and extract its features'''
    test_auth_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_auth_info.csv', low_memory=False)
    f_test_auth_info = auth_info(test_auth_info)
    test_bankcard_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_bankcard_info.csv', low_memory=False)
    f_test_bankcard_info = bankcard_info(test_bankcard_info)
    test_credit_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_credit_info.csv', low_memory=False)
    f_test_credit_info = credit_info(test_credit_info)
    test_order_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_order_info.csv', low_memory=False)
    f_test_order_info = order_info(test_order_info)
    test_recieve_addr_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_recieve_addr_info.csv', low_memory=False)
    f_test_recieve_addr_info = recieve_addr_info(test_recieve_addr_info)
    test_user_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_user_info.csv', low_memory=False)
    f_test_user_info = user_info(test_user_info)
    test_target = pd.read_csv(r'F:\Python_project\AL\AI_Risk_data_Btest_V2.0\Btest_list.csv', low_memory=False)
    test_fl = test_target[['id']]
    t_day_minus = days_feature(test_auth_info[['id', 'auth_time']], test_order_info[['id', 'time_order']], test_target[['id', 'appl_sbm_tm']])
    t_auth_or = auth_order(test_auth_info, test_order_info)

    '''merge'''
    test_fl = test_fl.merge(f_test_auth_info, on='id', how='left')
    test_fl = test_fl.merge(f_test_bankcard_info, on='id', how='left')
    test_fl = test_fl.merge(f_test_credit_info, on='id', how='left')
    test_fl = test_fl.merge(f_test_order_info, on='id', how='left')
    test_fl = test_fl.merge(f_test_recieve_addr_info, on='id', how='left')
    test_fl = test_fl.merge(f_test_user_info, on='id', how='left')
    test_fl = test_fl.merge(t_day_minus, on='id', how='left')
    test_fl = test_fl.merge(t_auth_or, on='id', how='left')


    test_f = test_fl
    test_l = test_fl[['id']]

    xgb_test = xgb.DMatrix(test_f.values)
    result = module.predict(xgb_test)
    test_l['predicted_score'] = result
    test_l.columns = ['ID', 'PROB']
    test_l.to_csv(r'result_xgb.csv', index=None)
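
# num_boost_round=500 is fixed by hand. A hedged sketch of choosing the round count with
# xgboost's built-in cross validation instead (not part of the original run):
#
#   cv = xgb.cv(params, xgb_train, num_boost_round=1000, nfold=5,
#               metrics='logloss', early_stopping_rounds=50, seed=2018)
#   best_rounds = len(cv)    # cv stops early, so its length is the best iteration count
#   module = xgb.train(params, xgb_train, num_boost_round=best_rounds)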


def validation():
    '''Read the training set and extract its features'''
    train_auth_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_auth_info.csv', low_memory=False)
    f_train_auth_info = auth_info(train_auth_info)
    train_bankcard_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_bankcard_info.csv', low_memory=False)
    f_train_bankcard_info = bankcard_info(train_bankcard_info)
    train_credit_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_credit_info.csv', low_memory=False)
    f_train_credit_info = credit_info(train_credit_info)
    train_order_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_order_info.csv', low_memory=False)
    f_train_order_info = order_info(train_order_info)
    train_recieve_addr_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_recieve_addr_info.csv', low_memory=False)
    f_train_recieve_addr_info = recieve_addr_info(train_recieve_addr_info)
    train_user_info = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_user_info.csv', low_memory=False)
    f_train_user_info = user_info(train_user_info)
    train_target = pd.read_csv(r'F:\Python_project\AL\AI_Risk_Train_V3.0/train_target.csv', low_memory=False)
    feature_l = train_target[['id', 'target']]
    day_minus = days_feature(train_auth_info[['id', 'auth_time']], train_order_info[['id', 'time_order']], train_target[['id', 'appl_sbm_tm']])
    auth_or = auth_order(train_auth_info[['id', 'auth_time']], train_order_info[['id', 'time_order']])

    '''Carve out the validation split by month'''
    feature_l['date'] = [index.replace('-', '') for index in train_target['appl_sbm_tm']]
    feature_l['date'] = [index.split(' ')[0][0:6] for index in feature_l['date']]    # keep YYYYMM #
    validation_train = feature_l[feature_l['date'] != '201704'][['target', 'id']]
    validation_test = feature_l[feature_l['date'] == '201704'][['target', 'id']]
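    # The hold-out is temporal rather than random: every April-2017 application goes to the
    # validation set, mirroring the contest setup where the scored period follows the
    # training period. A random split would leak future behaviour into training and
    # overstate the offline AUC.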

    '''validation_train'''
    validation_train = validation_train.merge(f_train_auth_info, on='id', how='left')
    validation_train = validation_train.merge(f_train_bankcard_info, on='id', how='left')
    validation_train = validation_train.merge(f_train_credit_info, on='id', how='left')
    validation_train = validation_train.merge(f_train_order_info, on='id', how='left')
    validation_train = validation_train.merge(f_train_recieve_addr_info, on='id', how='left')
    validation_train = validation_train.merge(f_train_user_info, on='id', how='left')
    validation_train = validation_train.merge(day_minus, on='id', how='left')
    validation_train = validation_train.merge(auth_or, on='id', how='left')

    validation_train_f = validation_train.drop(['target', 'id'], axis=1)
    validation_train_l = validation_train[['target']]
    print(validation_train_f.columns)

    '''validation_test'''
    validation_test = validation_test.merge(f_train_auth_info, on='id', how='left')
    validation_test = validation_test.merge(f_train_bankcard_info, on='id', how='left')
    validation_test = validation_test.merge(f_train_credit_info, on='id', how='left')
    validation_test = validation_test.merge(f_train_order_info, on='id', how='left')
    validation_test = validation_test.merge(f_train_recieve_addr_info, on='id', how='left')
    validation_test = validation_test.merge(f_train_user_info, on='id', how='left')
    validation_test = validation_test.merge(day_minus, on='id', how='left')
    validation_test = validation_test.merge(auth_or, on='id', how='left')
    print(validation_test.shape)

    validation_test_f = validation_test.drop(['target', 'id'], axis=1)
    validation_test_l = validation_test[['target']]

    xgb_train = xgb.DMatrix(validation_train_f, label=validation_train_l)
    xgb_test = xgb.DMatrix(validation_test_f, label=validation_test_l)
    watchlist = [(xgb_train, 'train'), (xgb_test, 'val')]
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'gamma': 0.1,  # minimum loss reduction required to split further; larger is more conservative, 0.1-0.2 is typical
        'max_depth': 5,  # tree depth; deeper trees overfit more easily
        'lambda': 2,  # L2 regularization on leaf weights; larger values make the model less prone to overfitting
        'subsample': 0.8,  # row subsampling per tree
        'colsample_bytree': 0.8,  # column subsampling per tree
        'min_child_weight': 18,
        'silent': 0,  # 1 suppresses run-time output; better left at 0
        'eta': 0.03,  # shrinkage, acts like a learning rate
        'eval_metric': 'auc',
    }
    module = xgb.train(params, xgb_train, num_boost_round=500, evals=watchlist)
    result = module.predict(xgb_test)

    features = module.get_fscore()
    features = list(dict(sorted(features.items(), key=lambda d: d[1])).keys())[-20:]    # the 20 most important features #
    features.reverse()
    print(features)

    plot_importance(module)
    plt.show()
    print("auc: ", roc_auc_score(validation_test_l.values, result))


validation()


# auc: 0.809462477973
# submit()


'''one_hot'''
# testdata = pd.DataFrame({'pet': ['chinese', 'english', 'english', 'math'],
#                          'age': [6, 5, 2, 2],
#                          'salary': [7, 5, 2, 5]})
# one_hot = OneHotEncoder(sparse=False).fit_transform(testdata[['age']])
# print(one_hot)
--------------------------------------------------------------------------------
/feature_engine.py:
--------------------------------------------------------------------------------
#coding=utf-8

import time
from datetime import datetime
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest  # feature selection #
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

'''Training set'''
train_auth_info = pd.read_csv('../dataset/ai_risk_train/train_auth_info.csv', low_memory=False)
train_bankcard_info = pd.read_csv('../dataset/ai_risk_train/train_bankcard_info.csv', low_memory=False)
train_credit_info = pd.read_csv('../dataset/ai_risk_train/train_credit_info.csv', low_memory=False)
train_order_info = pd.read_csv('../dataset/ai_risk_train/train_order_info.csv', low_memory=False)
train_recieve_addr_info = pd.read_csv('../dataset/ai_risk_train/train_recieve_addr_info.csv', low_memory=False)
train_user_info = pd.read_csv('../dataset/ai_risk_train/train_user_info.csv', low_memory=False)
train_target = pd.read_csv('../dataset/ai_risk_train/train_target.csv', low_memory=False)

# '''Test set'''
# test_auth_info = pd.read_csv('../dataset/ai_risk_test/test_auth_info.csv', low_memory=False)
# test_bankcard_info = pd.read_csv('../dataset/ai_risk_test/test_bankcard_info.csv', low_memory=False)
# test_credit_info = pd.read_csv('../dataset/ai_risk_test/test_credit_info.csv', low_memory=False)
# test_order_info = pd.read_csv('../dataset/ai_risk_test/test_order_info.csv', low_memory=False)
# test_recieve_addr_info = pd.read_csv('../dataset/ai_risk_test/test_recieve_addr_info.csv', low_memory=False)
# test_user_info = pd.read_csv('../dataset/ai_risk_test/test_user_info.csv', low_memory=False)
# test_list = pd.read_csv('../dataset/ai_risk_test/test_list.csv', low_memory=False)

test_auth_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_auth_info.csv', low_memory=False)
test_bankcard_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_bankcard_info.csv', low_memory=False)
test_credit_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_credit_info.csv', low_memory=False)
test_order_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_order_info.csv', low_memory=False)
test_recieve_addr_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_recieve_addr_info.csv', low_memory=False)
test_user_info = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_user_info.csv', low_memory=False)
test_list = pd.read_csv('../dataset/AI_Risk_BtestData_V1.0/Btest_list.csv', low_memory=False)

# print(test_auth_info)
# print(test_bankcard_info)
# print(test_credit_info)
# print(test_order_info)
# print(test_recieve_addr_info)
# exit(0)

def cal_auc(list_one, list_two):
    '''Compute AUC directly from its pairwise definition'''
    positive = []
    negative = []
    for index in range(len(list_one)):
        if list_one[index] == 1:
            positive.append(index)
        else:
            negative.append(index)
    SUM = 0
    for i in positive:
        for j in negative:
            if list_two[i] > list_two[j]:
                SUM += 1
            elif list_two[i] == list_two[j]:
                SUM += 0.5
            else:
                pass
    return SUM / (len(positive)*len(negative))
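
# cal_auc compares every (positive, negative) pair, which is O(P*N). An equivalent
# rank-based form runs in O(n log n); a hedged sketch using scipy (an extra dependency
# the project does not otherwise use):
#
#   from scipy.stats import rankdata
#   def fast_auc(labels, scores):
#       ranks = rankdata(scores)    # average ranks handle ties, like the +0.5 case above
#       pos = [r for r, y in zip(ranks, labels) if y == 1]
#       n_pos, n_neg = len(pos), len(labels) - len(pos)
#       return (sum(pos) - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)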


def return_set(group):
    return set(group)


def extract_credit_info(credit_info):
    '''Features from the credit_info table'''
    credit_info['credit_score'] = credit_info['credit_score'].fillna(credit_info['credit_score'].mean())
    credit_info['quota_is_zero'] = [1 if i != 0.0 else 0 for i in credit_info.quota]  # has a credit quota; overwritten by the stricter definition below #
    credit_info['overdraft'] = credit_info['overdraft'].fillna(0)
    credit_info['quota'] = credit_info['quota'].fillna(0)
    credit_info['quota_surplus'] = credit_info['quota'] - credit_info['overdraft']
    # credit_info['quota_rate'] = (credit_info['overdraft'] / credit_info['quota']).fillna(0)
    credit_info['quota_rate'] = credit_info[['overdraft', 'quota']].apply(lambda x: x.overdraft / x.quota if x.quota != 0 else 0, axis=1)
    credit_info['credit_score_rank'] = credit_info['credit_score'].rank(method='first', ascending=False)

    # note: credit_score/overdraft/quota were already imputed above, so this flag ends up 1 everywhere #
    credit_info.loc[:, 'all_is_null'] = credit_info[['credit_score', 'overdraft', 'quota']].apply(lambda x: 1 if ((x.credit_score is not np.nan) and (x.overdraft is not np.nan) and (x.quota is not np.nan)) else 0, axis=1)
    credit_info.loc[:, 'all_is_zero'] = credit_info[['credit_score', 'overdraft', 'quota']].apply(lambda x: 1 if ((x.credit_score == 0) and (x.overdraft == 0) and (x.quota == 0)) else 0, axis=1)
    credit_info.loc[:, 'quota_is_zero'] = credit_info[['quota']].apply(lambda x: 1 if x.quota == 0 else 0, axis=1)
    credit_info.loc[:, 'credit_score_is_null'] = credit_info[['credit_score']].apply(lambda x: 1 if x.credit_score == 0 else 0, axis=1)
    credit_info.loc[:, 'quota_surplus_is_null'] = credit_info[['quota_surplus', 'quota']].apply(lambda x: 1 if (x.quota_surplus == 0) and (x.quota != 0) else 0, axis=1)

    '''Min-max normalization'''
    credit_info[['credit_score', 'overdraft', 'quota', 'quota_surplus', 'credit_score_rank']] = credit_info[['credit_score', 'overdraft', 'quota', 'quota_surplus', 'credit_score_rank']].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
    return credit_info

# print(extract_credit_info(train_credit_info))
# print(extract_credit_info(test_credit_info))
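
# The manual (x - min) / (max - min) above is exactly what sklearn's MinMaxScaler does.
# A hedged equivalent sketch (hypothetical usage, not in the original pipeline):
#
#   from sklearn.preprocessing import MinMaxScaler
#   cols = ['credit_score', 'overdraft', 'quota', 'quota_surplus', 'credit_score_rank']
#   credit_info[cols] = MinMaxScaler().fit_transform(credit_info[cols])
#
# Either way, note that normalizing the train and test frames separately scales them
# against different ranges; fitting the scaler on train and reusing it on test avoids that.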


def extract_user_info(user_info):
    '''Features from the user_info table'''
    feature = user_info[['id']].copy()
    feature.loc[:, 'birthday_is_zero'] = user_info[['birthday']].apply(lambda x: 1 if x.birthday == '0000-00-00' else 0, axis=1)
    feature.loc[:, 'sex_not_female'] = user_info[['sex']].apply(lambda x: 1 if x.sex != '女' else 0, axis=1)  # originally named sex_not_male, but it flags every value except female #
    feature.loc[:, 'male'] = user_info[['sex']].apply(lambda x: 1 if x.sex == '男' else 0, axis=1)  # the original had the male/female labels swapped #
    feature.loc[:, 'female'] = user_info[['sex']].apply(lambda x: 1 if x.sex == '女' else 0, axis=1)
    feature.loc[:, 'sex_secret'] = user_info[['sex']].apply(lambda x: 1 if x.sex == '保密' else 0, axis=1)  # 0.69504936432
    ##
    feature.loc[:, 'merriage1'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage == '未婚' else 0, axis=1)
    feature.loc[:, 'merriage2'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage == '已婚' else 0, axis=1)
    feature.loc[:, 'merriage3'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage == '保密' else 0, axis=1)
    feature.loc[:, 'merriage_is_null'] = user_info[['merriage']].apply(lambda x: 1 if x.merriage is np.nan else 0, axis=1)  # 0.700624700466
    ####
    feature.loc[:, 'account_grade1'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '注册会员' else 0, axis=1)
    feature.loc[:, 'account_grade2'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '铜牌会员' else 0, axis=1)
    feature.loc[:, 'account_grade3'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '银牌会员' else 0, axis=1)
    feature.loc[:, 'account_grade4'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '金牌会员' else 0, axis=1)
    feature.loc[:, 'account_grade5'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade == '钻石会员' else 0, axis=1)
    feature.loc[:, 'account_grade_is_null'] = user_info[['account_grade']].apply(lambda x: 1 if x.account_grade is np.nan else 0, axis=1)
    ###
    feature.loc[:, 'qq_bound_is_null'] = user_info[['qq_bound']].apply(lambda x: 1 if x.qq_bound is np.nan else 0, axis=1)
    feature.loc[:, 'wechat_bound_is_null'] = user_info[['wechat_bound']].apply(lambda x: 1 if x.wechat_bound is np.nan else 0, axis=1)
    feature.loc[:, 'degree'] = user_info[['degree']].apply(lambda x: 1 if (x.degree == '硕士') | (x.degree == '其他') | (x.degree == '博士') else 0, axis=1)
    feature.loc[:, 'id_card_is_null'] = user_info[['id_card']].apply(lambda x: 1 if x.id_card is np.nan else 0, axis=1)
    #####
    feature.loc[:, 'income1'] = [1 if index == '4000-5999元' else 0 for index in user_info['income']]
    feature.loc[:, 'income2'] = [1 if index == '8000元以上' else 0 for index in user_info['income']]
    feature.loc[:, 'income3'] = [1 if index == '2000-3999元' else 0 for index in user_info['income']]
    feature.loc[:, 'income4'] = [1 if index == '6000-7999元' else 0 for index in user_info['income']]
    feature.loc[:, 'income5'] = [1 if index == '2000元以下' else 0 for index in user_info['income']]  # 0.775891365882 #

    '''Age features'''
    def is_valid_date(strdate):
        '''Return True if strdate parses as a date, with or without a time part'''
        try:
            if ":" in strdate:
                time.strptime(strdate, "%Y-%m-%d %H:%M:%S")
            else:
                time.strptime(strdate, "%Y-%m-%d")
            return True
        except (ValueError, TypeError):
            return False

    ####
    user_info['birthday_two'] = user_info[['birthday']].apply(lambda index: is_valid_date(index.birthday), axis=1)
    user_info['birthday'] = user_info[['birthday']].apply(lambda index: 0 if (index.birthday is np.nan) or (index.birthday == '0000-00-00') else index.birthday[0:4], axis=1)
    user_info['age'] = user_info[['birthday', 'birthday_two']].apply(lambda x: 2018 - int(x.birthday) if x.birthday_two is True else 0, axis=1)
    # print(user_info[['birthday_two', 'age']])
    feature.loc[:, 'age_one'] = user_info[['age']].apply(lambda x: 1 if x.age <= 18 and x.age > 0 else 0, axis=1)
    feature.loc[:, 'age_two'] = user_info[['age']].apply(lambda x: 1 if x.age <= 30 and x.age > 18 else 0, axis=1)
    feature.loc[:, 'age_three'] = user_info[['age']].apply(lambda x: 1 if x.age <= 60 and x.age > 30 else 0, axis=1)
    feature.loc[:, 'age_four'] = user_info[['age']].apply(lambda x: 1 if x.age <= 100 and x.age > 60 else 0, axis=1)
    feature.loc[:, 'age_five'] = user_info[['age']].apply(lambda x: 1 if x.age > 100 or x.age == 0 else 0, axis=1)  # the original used 'and', which can never be true; 'or' flags implausible or unknown ages #

    return feature

# print(extract_user_info(train_user_info))
# print(extract_user_info(test_user_info))
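
# The four age brackets map naturally onto pd.cut; a hedged sketch (hypothetical, yields
# one categorical column instead of four flags):
#
#   bins = [0, 18, 30, 60, 100]
#   labels = ['age_one', 'age_two', 'age_three', 'age_four']
#   user_info['age_bucket'] = pd.cut(user_info['age'], bins=bins, labels=labels)
#
# pd.get_dummies(user_info['age_bucket']) would then recover the 0/1 columns.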


def extract_recieve_addr_info(recieve_addr_info):
    '''Features from the recieve_addr_info table'''
    # the original mixed a bitwise `|` into the chain of `and`s; the intent is that all five fields are missing #
    recieve_addr_info['all_null'] = recieve_addr_info[['addr_id', 'region', 'phone', 'fix_phone', 'receiver_md5']].apply(lambda x: 1 if (x.addr_id is np.nan) and (x.region is np.nan) and (x.phone is np.nan) and (x.fix_phone is np.nan) and (x.receiver_md5 is np.nan) else 0, axis=1)
    feature = recieve_addr_info.drop_duplicates(['id'])[['id']]
    recieve_addr_info['index'] = recieve_addr_info.index
    all_is_null = pd.pivot_table(recieve_addr_info, index='id', values='all_null', aggfunc='min').reset_index()
    addr_id = pd.pivot_table(recieve_addr_info, index='id', values='index', aggfunc='count').reset_index().rename(columns={'index': 'record_count'})
    feature = feature.merge(all_is_null, on='id', how='left')
    feature = feature.merge(addr_id, on='id', how='left')
    province = {'甘肃', '云南', '贵州', '河南', '黑龙', '香港', '北京', '湖南', '江苏', '青海', '宁夏', '内蒙', '浙江', '吉林', '海南', '福建', '重庆', '台湾', '陕西', '湖北', '江西', '辽宁', '山西', '西藏', '广东', '安徽', '四川', '河北', '山东', '上海', '广西', '新疆', '天津', 'null'}

    # the original referenced the global train_recieve_addr_info here, silently reusing the
    # training frame when called on test data; use the function argument instead #
    recieve_addr_info['province'] = recieve_addr_info[['region']].apply(lambda x: 'null' if x.region is np.nan else x.region[0:2], axis=1)
    city_set = pd.pivot_table(recieve_addr_info, index='id', values='province', aggfunc=return_set).reset_index()
    for string in list(province):
        city_set[string] = [1 if string in index else 0 for index in city_set['province']]
    city_set['province'] = city_set[['province']].apply(lambda x: set() if 'null' in x.province else x.province, axis=1)
    city_set['province_len'] = [len(index) for index in city_set['province']]

    feature = feature.merge(city_set.drop(['province'], axis=1), on='id', how='left')
    # print(feature)
    return feature

# extract_recieve_addr_info(train_recieve_addr_info)
# print(extract_recieve_addr_info(train_recieve_addr_info))
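
# The row-wise apply for all_null can be replaced with pandas' vectorized null checks;
# a hedged one-liner sketch over the same columns:
#
#   cols = ['addr_id', 'region', 'phone', 'fix_phone', 'receiver_md5']
#   recieve_addr_info['all_null'] = recieve_addr_info[cols].isnull().all(axis=1).astype(int)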


def extract_bankcard_info(bankcard_info):
    '''Features from the bankcard_info table'''

    def cal_store_card_num(group):
        flag = 0
        for index in group:
            if index == '储蓄卡':
                flag += 1
        return flag

    def if_have_credit_card(group):
        # the original returned 0 as soon as the first card was not a credit card, so only
        # the first record per id was ever inspected; scan the whole group instead #
        for index in group:
            if index == '信用卡':
                return 1
        return 0

    def list_set(group):
        return len(set(group))

    bankcard_info = bankcard_info.drop_duplicates()
    feature = bankcard_info.drop_duplicates(['id'])[['id']]
    card_record_count = pd.pivot_table(bankcard_info, index='id', values='phone', aggfunc='count').reset_index().rename(columns={'phone': 'card_record_count'})
    phone_count = pd.pivot_table(bankcard_info, index='id', values='phone', aggfunc=list_set).reset_index().rename(columns={'phone': 'phone_count'})
    store_card_count = pd.pivot_table(bankcard_info, index='id', values='card_type', aggfunc=cal_store_card_num).reset_index().rename(columns={'card_type': 'store_card_count'})
    have_credit_card = pd.pivot_table(bankcard_info, index='id', values='card_type', aggfunc=if_have_credit_card).reset_index().rename(columns={'card_type': 'have_credit_card'})
    card_category_count = pd.pivot_table(bankcard_info, index='id', values='card_type', aggfunc=list_set).reset_index().rename(columns={'card_type': 'card_category_count'})

    feature = feature.merge(phone_count, on='id', how='left')
    feature = feature.merge(card_record_count, on='id', how='left')
    feature = feature.merge(store_card_count, on='id', how='left')
    feature = feature.merge(have_credit_card, on='id', how='left')
    feature = feature.merge(card_category_count, on='id', how='left')
    feature['credit_count'] = feature['card_record_count'] - feature['store_card_count']
    feature['card_count_one'] = feature[['card_record_count']].apply(lambda x: 1 if x.card_record_count > 6 else 0, axis=1)
    feature['record_is_unique'] = feature[['card_record_count']].apply(lambda x: 1 if x.card_record_count == 1 else 0, axis=1)
    # print(feature)

    return feature

# extract_bankcard_info(train_bankcard_info)
# print(extract_bankcard_info(test_bankcard_info))
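
# pivot_table with one values column and a custom aggfunc is equivalent to groupby/agg,
# which reads more directly; a hedged sketch for two of the features above:
#
#   g = bankcard_info.groupby('id')
#   phone_count = g['phone'].nunique().rename('phone_count').reset_index()
#   have_credit_card = g['card_type'].apply(lambda s: int((s == '信用卡').any())).rename('have_credit_card').reset_index()
#
# (nunique ignores NaN, so it can differ slightly from len(set(group)) on missing phones.)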


def extract_auth_info(auth_info):
    '''Features from the auth_info table'''
    feature = auth_info[['id']].copy()
    # despite the *_is_null names, the next three flags are 1 when the field is present #
    feature.loc[:, 'auth_id_card_is_null'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card is not np.nan else 0, axis=1)
    feature.loc[:, 'auth_time_is_null'] = auth_info[['auth_time']].apply(lambda x: 1 if x.auth_time is not np.nan else 0, axis=1)
    feature.loc[:, 'phone_is_null'] = auth_info[['phone']].apply(lambda x: 1 if x.phone is not np.nan else 0, axis=1)
    feature.loc[:, 'all_is_null'] = auth_info[['id_card', 'auth_time', 'phone']].apply(lambda x: 1 if ((x.id_card is np.nan) and (x.auth_time is np.nan) and (x.phone is np.nan)) else 0, axis=1)
    feature.loc[:, 'all_not_null'] = auth_info[['id_card', 'auth_time', 'phone']].apply(lambda x: 1 if ((x.id_card is not np.nan) and (x.auth_time is not np.nan) and (x.phone is not np.nan)) else 0, axis=1)
    feature.loc[:, 'card_time_is_null'] = auth_info[['id_card', 'auth_time']].apply(lambda x: 1 if ((x.id_card is np.nan) and (x.auth_time is np.nan)) else 0, axis=1)
    feature.loc[:, 'time_phone_is_null'] = auth_info[['auth_time', 'phone']].apply(lambda x: 1 if ((x.phone is np.nan) and (x.auth_time is np.nan)) else 0, axis=1)
    # '''Carrier features'''
    # auth_info['id_card'] = [int(index[0]) if index is not np.nan else -1 for index in auth_info['id_card']]
    # auth_info['phone'] = [int(index[:3]) if index is not np.nan else -1 for index in auth_info['phone']]
    # mobile = {134, 135, 136, 137, 138, 139, 150, 151, 152, 157, 158, 159, 182, 183, 184, 187, 188, 147, 178}
    # unicom = {130, 131, 132, 155, 156, 185, 186, 145, 176}
    # telecom = {180, 181, 189, 133, 153, 177}
    # virtual = {170}
    # feature.loc[:, 'mobile'] = auth_info[['phone']].apply(lambda x: 1 if x.phone in mobile else 0, axis=1)
    # feature.loc[:, 'unicom'] = auth_info[['phone']].apply(lambda x: 1 if x.phone in unicom else 0, axis=1)
    # feature.loc[:, 'telecom'] = auth_info[['phone']].apply(lambda x: 1 if x.phone in telecom else 0, axis=1)
    # feature.loc[:, 'virtual'] = auth_info[['phone']].apply(lambda x: 1 if x.phone in virtual else 0, axis=1)
    # # 'mobile', 'unicom', 'telecom', 'virtual'
    # # 'id_card_one', 'id_card_two', 'id_card_three', 'id_card_four', 'id_card_five', 'id_card_six'
    # feature.loc[:, 'id_card_one'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 1 else 0, axis=1)
    # feature.loc[:, 'id_card_two'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 2 else 0, axis=1)
    # feature.loc[:, 'id_card_three'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 3 else 0, axis=1)
    # feature.loc[:, 'id_card_four'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 4 else 0, axis=1)
    # feature.loc[:, 'id_card_five'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 5 else 0, axis=1)
    # feature.loc[:, 'id_card_six'] = auth_info[['id_card']].apply(lambda x: 1 if x.id_card == 6 else 0, axis=1)
    # print(feature)
    return feature

# extract_auth_info(train_auth_info)
# print(extract_auth_info(test_auth_info))


def extract_order_info(order_info):
    '''Features from the order_info table'''
    def cal_set(group):
        return len(set(group))

    '''Standard deviation'''
    def cal_std(group):
        return np.std(group)

    feature = order_info.drop_duplicates(['id'])[['id']]
    # amt_order, type_pay, time_order, sts_order, phone, unit_price, no_order_md5, name_rec_md5, product_id_md5
    order_info['order_all_is_null'] = order_info.apply(lambda x: 1 if ((x.amt_order is np.nan) and (x.type_pay is np.nan) and (x.time_order is np.nan) and (x.sts_order is np.nan)) else 0, axis=1)
    order_all_is_null = pd.pivot_table(order_info[['id', 'order_all_is_null']], index='id', values='order_all_is_null', aggfunc='max').reset_index()

    '''Fill amt_order with the column mean'''
    order_info_amt = order_info[['amt_order']]
    order_info_amt = order_info_amt[order_info_amt['amt_order'].notnull()]
    order_info_amt = order_info_amt[order_info_amt['amt_order'] != 'null']
    order_info_amt['amt_order'] = [float(index) for index in order_info_amt['amt_order']]
    mean = order_info_amt['amt_order'].mean()
    order_info['amt_order'] = order_info['amt_order'].fillna(mean)
    order_info['amt_order'] = [mean if index == 'null' else index for index in order_info['amt_order']]
    order_info['amt_order'] = [float(index) for index in order_info['amt_order']]
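    # A hedged alternative for the three-step fill above: pd.to_numeric with errors='coerce'
    # maps both NaN and the literal string 'null' to NaN in a single pass:
    #
    #   amt = pd.to_numeric(order_info['amt_order'], errors='coerce')
    #   order_info['amt_order'] = amt.fillna(amt.mean())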
    order_info['unit_price'] = order_info[['amt_order', 'unit_price']].apply(lambda x: x.amt_order if np.isnan(x.unit_price) else x.unit_price, axis=1)
    unit_price_mean = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc='mean').reset_index().rename(columns={'unit_price': 'unit_price_mean'})
    unit_price_max = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc='max').reset_index().rename(columns={'unit_price': 'unit_price_max'})
    unit_price_min = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc='min').reset_index().rename(columns={'unit_price': 'unit_price_min'})
    unit_price_std = pd.pivot_table(order_info[['id', 'unit_price']], index='id', values='unit_price', aggfunc=cal_std).reset_index().rename(columns={'unit_price': 'unit_price_std'})

    amt_order_mean = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc='mean').reset_index().rename(columns={'amt_order': 'amt_order_mean'})
    amt_order_max = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc='max').reset_index().rename(columns={'amt_order': 'amt_order_max'})
    amt_order_min = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc='min').reset_index().rename(columns={'amt_order': 'amt_order_min'})
    amt_order_std = pd.pivot_table(order_info[['id', 'amt_order']], index='id', values='amt_order', aggfunc=cal_std).reset_index().rename(columns={'amt_order': 'amt_order_std'})
    type_pay_count = pd.pivot_table(order_info[['id', 'type_pay']], index='id', values='type_pay', aggfunc=cal_set).reset_index().rename(columns={'type_pay': 'type_pay_count'})
    sts_order_count = pd.pivot_table(order_info[['id', 'sts_order']], index='id', values='sts_order', aggfunc=cal_set).reset_index().rename(columns={'sts_order': 'sts_order_count'})
    order_phone_count = pd.pivot_table(order_info[['id', 'phone']], index='id', values='phone', aggfunc=cal_set).reset_index().rename(columns={'phone': 'order_phone_count'})
    name_rec_md5_count = pd.pivot_table(order_info[['id', 'name_rec_md5']], index='id', values='name_rec_md5', aggfunc=cal_set).reset_index().rename(columns={'name_rec_md5': 'name_rec_md5_count'})

    feature = feature.merge(unit_price_mean, on='id', how='left')
    feature = feature.merge(unit_price_max, on='id', how='left')
    feature = feature.merge(unit_price_min, on='id', how='left')
    feature = feature.merge(unit_price_std, on='id', how='left')

    feature = feature.merge(order_all_is_null, on='id', how='left')
    feature = feature.merge(amt_order_mean, on='id', how='left')
    feature = feature.merge(amt_order_max, on='id', how='left')
    feature = feature.merge(amt_order_min, on='id', how='left')
    feature = feature.merge(amt_order_std, on='id', how='left')
    feature = feature.merge(type_pay_count, on='id', how='left')
    feature = feature.merge(sts_order_count, on='id', how='left')
    feature = feature.merge(order_phone_count, on='id', how='left')
    feature = feature.merge(name_rec_md5_count, on='id', how='left')
    '''Min-max normalization'''
    feature.iloc[:, 1:] = feature.iloc[:, 1:].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))  # 0.791859501859 #
    '''Discretized set-membership features'''
    order_info['type_pay'] = order_info[['type_pay']].apply(lambda x: 'null' if x.type_pay is np.nan else x.type_pay, axis=1)
    type_pay = pd.pivot_table(order_info, index='id', values='type_pay', aggfunc=return_set).reset_index()
    # type_pay_category = {'定向京券支付', '白条支付', '分期付款', '积分支付', '在线+限品东券', '定向东券', '东券混合支付', '余额', '京豆东券混合支付', '前台自付', '在线', '在线+东券支付', '上门自提', '公司转账', '在线支付', '在线支付 ', '在线+京豆', '邮局汇款', '货到付款',
    #                      '在线+全品东券', 'null', '京豆支付', '在线预付', '定向京券', '混合支付', '京豆', '在线+定向东券', '京豆混合支付', '在线+东券'}

    type_pay_category = {'定向京券支付', '白条支付', '在线+余额+限品东券', '高校代理-代理支付', '京券全额支付', '分期付款', '积分支付', '在线+限品东券', '定向东券', '东券混合支付', '余额', '京豆东券混合支付', '前台自付', '在线', '在线+东券支付', '上门自提', '公司转账', '在线支付', '在线支付 ', '在线+京豆', '邮局汇款', '在线+全品京券', '货到付款', '分期付款(招行)', '在线+全品东券', '余额+限品东券', '在线+京券支付', '在线+余额', '限品京券', 'null', '京豆支付', '在线预付', '定向京券', '混合支付', '全品京券', '京豆', '在线+定向东券', '京豆混合支付', '在线+限品京券', '高校代理-自己支付', '京券混合支付', '在线+东券'}

    for string in list(type_pay_category):
        type_pay[string] = [1 if string in index else 0 for index in type_pay['type_pay']]

    # as in the address features, an id whose set contains 'null' counts as empty #
    type_pay['type_pay'] = type_pay[['type_pay']].apply(lambda x: set() if 'null' in x.type_pay else x.type_pay, axis=1)
    type_pay['type_pay_len'] = [len(index) for index in type_pay['type_pay']]
    feature = feature.merge(type_pay.drop(['type_pay'], axis=1), on='id', how='left')

    '''Discretize sts_order'''
    order_info['sts_order'] = order_info[['sts_order']].apply(lambda x: 'null' if x.sts_order is np.nan else x.sts_order, axis=1)
    # sts_order_category = set(train_order_info['sts_order'])
    sts_order = pd.pivot_table(order_info, index='id', values='sts_order', aggfunc=return_set).reset_index()
    sts_order_category = {'null', '等待审核', '等待处理', '已退款', '已收货', '购买成功', '付款成功', '失败退款', '已完成', '预订结束', '退款完成', '正在出库', '订单已取消', '充值成功', '商品出库', '下单失败', '请上门自提', '已晒单', '充值失败;退款成功',
                          '退款成功', '未入住', '等待收货', '配送退货', '出票失败', '等待付款确认', '缴费成功', '预约完成', '未抢中', '完成', '已取消', '出票成功', '抢票已取消', '等待付款', '已取消订单', '正在处理', '等待退款', '充值失败', '订单取消'}

    for string in list(sts_order_category):
        sts_order[string] = [1 if string in index else 0 for index in sts_order['sts_order']]

    sts_order['sts_order'] = sts_order[['sts_order']].apply(lambda x: set() if 'null' in x.sts_order else x.sts_order, axis=1)
    sts_order['sts_order_len'] = [len(index) for index in sts_order['sts_order']]
    # print(sts_order)
    feature = feature.merge(sts_order.drop(['sts_order'], axis=1), on='id', how='left')

    # print(feature)
    return feature

# extract_order_info(train_order_info)
# print(extract_order_info(test_order_info))


def extract_time_feature(auth_info, target_list):
    '''Time-related features'''
    feature = target_list[['id']]
    target_list = target_list[['id', 'appl_sbm_tm']].merge(auth_info[['id', 'auth_time']], on='id', how='left')
    target_list.loc[:, 'appl_sbm_tm'] = [index.split(' ')[0] for index in target_list['appl_sbm_tm']]
    target_list['auth_time'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm if x.auth_time == '0000-00-00' else x.auth_time, axis=1)
    target_list['auth_time'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: x.appl_sbm_tm if x.auth_time is np.nan else x.auth_time, axis=1)
    feature['feature_1'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: 1 if x.appl_sbm_tm < x.auth_time else 0, axis=1)
    feature['register_days'] = target_list[['appl_sbm_tm', 'auth_time']].apply(lambda x: (datetime(int(x.appl_sbm_tm.split('-')[0]), int(x.appl_sbm_tm.split('-')[1]), int(x.appl_sbm_tm.split('-')[2])) - datetime(int(x.auth_time.split('-')[0]), int(x.auth_time.split('-')[1]), int(x.auth_time.split('-')[2]))).days, axis=1)
    # print(target_list)
    # print(feature)
    return feature

# extract_time_feature(train_auth_info, train_target)
# print(extract_time_feature(test_auth_info, test_list))

def extract_order_payment_time(order_info, target_list):
    str_len = len('2016-01-19 22:38:26')    # only full 'YYYY-MM-DD HH:MM:SS' stamps are trusted #
    feature = target_list[['id']]
    target_list = target_list[['id', 'appl_sbm_tm']].merge(order_info[['id', 'time_order']], on='id', how='left')
    target_list.loc[:, 'appl_sbm_tm'] = [index.split(' ')[0] for index in target_list['appl_sbm_tm']]
    target_list['time_order'] = target_list[['appl_sbm_tm', 'time_order']].apply(lambda x: x.appl_sbm_tm if x.time_order is np.nan else x.time_order, axis=1)
    target_list['time_order'] = target_list[['appl_sbm_tm', 'time_order']].apply(lambda x: x.appl_sbm_tm if len(x.time_order) != str_len else x.time_order, axis=1)
    target_list.loc[:, 'time_order'] = [index.split(' ')[0] for index in target_list['time_order']]
    target_list['days'] = target_list[['appl_sbm_tm', 'time_order']].apply(lambda x: (datetime(int(x.appl_sbm_tm.split('-')[0]), int(x.appl_sbm_tm.split('-')[1]), int(x.appl_sbm_tm.split('-')[2])) - datetime(int(x.time_order.split('-')[0]), int(x.time_order.split('-')[1]), int(x.time_order.split('-')[2]))).days, axis=1)
    print(target_list)
    day_mean = pd.pivot_table(target_list, index='id', values='days', aggfunc='mean').reset_index().rename(columns={'days': 'day_mean'})
    day_max = pd.pivot_table(target_list, index='id', values='days', aggfunc='max').reset_index().rename(columns={'days': 'day_max'})
    day_min = pd.pivot_table(target_list, index='id', values='days', aggfunc='min').reset_index().rename(columns={'days': 'day_min'})
    order_record_count = pd.pivot_table(target_list, index='id', values='days', aggfunc='count').reset_index().rename(columns={'days': 'order_record_count'})
    feature = feature.merge(day_mean, on='id', how='left')
    feature = feature.merge(day_max, on='id', how='left')
    feature = feature.merge(day_min, on='id', how='left')
    feature = feature.merge(order_record_count, on='id', how='left')  # record count #
    feature.loc[:, 'order_record_unique'] = [1 if index == 1 else 0 for index in feature['order_record_count']]  # whether the id has a single record #
    print(feature)
    return feature

extract_order_payment_time(train_order_info, train_target)
# print(extract_order_payment_time(test_order_info, test_list))

'''Logistic Regression'''
def train_LR_module(store_result=False, store_feature=False, select_feature=False, feature_num='all', OneEncode=False):
    train_feature = pd.read_csv(r'train_feature.csv', encoding='utf-8')
    validate_feature = pd.read_csv(r'validate_feature.csv', encoding='utf-8')
    test_feature = pd.read_csv(r'test_feature.csv', encoding='utf-8')
    train_test_feature = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
    print('Data loaded...')

    validate_label = validate_feature[['target']]
    train_label = train_feature[['target']]
    train_test_label = train_test_feature[['target']]

    train_feature = train_feature.iloc[:, 2:]
    test_feature = test_feature.iloc[:, 1:]
    validate_feature = validate_feature.iloc[:, 2:]
    train_test_feature = train_test_feature.iloc[:, 2:]

    if OneEncode is True:
        # keep at most 140 binary columns plus all continuous ones #
        features = list(train_feature.columns)
        one_hot = []
        continuous_feature = []
        for name in features:
            if len(set(train_feature[name])) == 2:
                one_hot.append(name)
            else:
                continuous_feature.append(name)

        feature = one_hot[:140] + continuous_feature
        train_feature = train_feature[feature]
        validate_feature = validate_feature[feature]
        test_feature = test_feature[feature]
        train_test_feature = train_test_feature[feature]

    if select_feature is True:
        print('Starting feature selection...')
        ch2 = SelectKBest(chi2, k=feature_num)
        train_feature = ch2.fit_transform(train_feature, train_label)
        test_feature = ch2.transform(test_feature)
        validate_feature = ch2.transform(validate_feature)
        train_test_feature = ch2.transform(train_test_feature)
        print('Feature selection finished...')
    else:
        feature_num = train_feature.shape[1]

    print('Training the LogisticRegression model...')
    module = LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42, n_jobs=4)  # , solver='sag'
    # module = lgb.LGBMClassifier(
    #     num_leaves=64,  # num_leaves = 2^max_depth * 0.6 #
    #     max_depth=6,
    #     n_estimators=80,
    #     learning_rate=0.1
    # )
    '''Fit on the training split'''
    module.fit(train_feature, train_label.values.ravel())    # ravel() gives sklearn the 1-d target it expects #

    if store_result is True:
        '''Refit on the full train_test features and score the test set'''
        module_two = LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42, n_jobs=4)
        # module_two = lgb.LGBMClassifier(
        #     num_leaves=64,  # num_leaves = 2^max_depth * 0.6 #
        #     max_depth=6,
        #     n_estimators=80,
        #     learning_rate=0.1
        # )
        module_two.fit(train_test_feature, train_test_label.values.ravel())

        result = module_two.predict_proba(test_feature)[:, 1]
        result = pd.DataFrame(result)
        result.columns = ['predicted_score']
        sample = test_list[['id']]
        sample['predicted_score'] = [index for index in result['predicted_score']]
        sample.columns = ['ID', 'PROB']
        sample.to_csv(r'lr_sample.csv', index=None)
        # sample.to_csv(r'lgb_sample.csv', index=None)
        print(sample)
        print('Results written...')

    print(" Score_offline:", roc_auc_score(validate_label, module.predict_proba(validate_feature)[:, 1]))
    print('Feature dimension:', feature_num)
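

# The 'sag' solver converges reliably only on roughly standardized features, and the manual
# min-max scaling in the extractors covers some columns but not all. A hedged sketch of a
# pipeline that scales everything before the fit (not part of the original run):
#
#   from sklearn.pipeline import make_pipeline
#   from sklearn.preprocessing import StandardScaler
#   module = make_pipeline(StandardScaler(),
#                          LogisticRegression(penalty='l2', solver='sag', max_iter=500, random_state=42))
#
# Note that chi2-based SelectKBest requires non-negative inputs, so standardization has to
# happen after, not before, the feature-selection step.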


# def module_merge(prob_x, prob_l):
#     xgb_sample = pd.read_csv(r'xgb_sample.csv', low_memory=False)  # encode:159:0.790297834417
#     lr_sample = pd.read_csv(r'lr_sample.csv', low_memory=False)  # Uncode:0.792171452209
#     sample = xgb_sample.merge(lr_sample, on='ID', how='left')
#     sample['PROB'] = sample['PROB_x'] * prob_x + sample['PROB_y'] * prob_l
#     sample = sample[['ID', 'PROB']]
#     print(sample)
#     sample.to_csv(r'sample.csv', index=None)
#     print('Models blended...')



# def module_merge(prob_xgb, prob_lr, prob_lgb):
#     xgb_sample = pd.read_csv(r'xgb_sample.csv', low_memory=False)  # encode:159:0.790297834417
#     lr_sample = pd.read_csv(r'lr_sample.csv', low_memory=False)  # Uncode:0.792171452209
#     lgb_sample = pd.read_csv(r'lgb_sample.csv', low_memory=False)
#
#     xgb_sample.columns = ['ID', 'PROB_xgb']
#     lr_sample.columns = ['ID', 'PROB_lr']
#     lgb_sample.columns = ['ID', 'PROB_lgb']
#     sample = xgb_sample.merge(lr_sample, on='ID', how='left')
#     sample = sample.merge(lgb_sample, on='ID', how='left')
#     # print(sample)
#     sample['PROB'] = sample['PROB_xgb'] * prob_xgb + sample['PROB_lr'] * prob_lr + sample['PROB_lgb'] * prob_lgb
#     sample = sample[['ID', 'PROB']]
#     print(sample)
#     sample.to_csv(r'sample.csv', index=None)
#     print('Models blended...')
--------------------------------------------------------------------------------