├── 企业风险预测.png
├── stack
│   ├── README.md
│   ├── 0_add_features.py
│   ├── 2_stacking_rf.py
│   ├── 3_stacking_lgb.py
│   ├── 1_stacking_xgb.py
│   ├── 1_stacking_xgb_onehot.py
│   ├── 2_stacking_rf_prov.py
│   ├── 3_stacking_lgb_prov.py
│   ├── 1_stacking_xgb_prov.py
│   ├── remove_onefeatures.py
│   └── 4_onehot_features.py
├── feature
│   ├── README.md
│   ├── run_features.py
│   ├── 0_analyse_data.py
│   ├── review_features.py
│   ├── 8_feature.py
│   ├── 2_feature.py
│   ├── 1_feature.py
│   ├── 6_feature.py
│   ├── 7_feature.py
│   ├── 10_feature.py
│   ├── 5_feature.py
│   └── 9_feature_1.py
├── model
│   ├── README.md
│   ├── .ipynb_checkpoints
│   │   ├── 4_model_rgyear-checkpoint.ipynb
│   │   ├── Untitled-checkpoint.ipynb
│   │   └── model_lr-checkpoint.ipynb
│   ├── result_alpha.py
│   ├── keras.py
│   ├── model_help.py
│   ├── avg_result.py
│   ├── 1_model_xgb.py
│   ├── model_origin.py
│   ├── 3_model_prov.py
│   └── mode_avg_prov.py
├── LICENSE
├── .gitignore
└── README.md

/企业风险预测.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaorancs/business-exit-risk-forecast/HEAD/企业风险预测.png
--------------------------------------------------------------------------------
/stack/README.md:
--------------------------------------------------------------------------------
1 | ### Author: xiaoran
2 | Post-process the data with one-hot encoding and stacking to obtain new datasets.
3 | Every .py file here generates a complete dataset of its own and can be run directly.
--------------------------------------------------------------------------------
/feature/README.md:
--------------------------------------------------------------------------------
1 | ## feature
2 | 
3 | Each .py file processes one data file and extracts that file's features. Because the files depend on each other, run them in order,
4 | or simply run run_features.py.
5 | 
6 | Note: make sure the file paths are correct when running.
--------------------------------------------------------------------------------
/feature/run_features.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | author: xiaoran
5 | """
6 | 
7 | import os
8 | import utils
9 | utils.start(__file__)
10 | 
11 | 
12 | os.system('python -u 1_feature.py')
13 | os.system('python -u 2_feature.py')
14 | os.system('python -u 3_feature.py')
15 | 
16 | os.system('python -u 4_feature.py')
17 | os.system('python -u 5_feature.py')
18 | os.system('python -u 6_feature.py')
19 | os.system('python -u 7_feature.py')
20 | 
21 | os.system('python -u 8_feature.py')
22 | os.system('python -u 9_feature.py')
23 | os.system('python -u 10_feature.py')
24 | 
25 | utils.end(__file__)
--------------------------------------------------------------------------------
/model/README.md:
--------------------------------------------------------------------------------
1 | # model
2 | File descriptions
3 | 
4 | 1. [1_model_xgb.py](./1_model_xgb.py)
5 | 
6 |     + The model prototype; predicts with a single random seed.
7 | 
8 | 2. [3_model_prov.py](./3_model_prov.py)
9 | 
10 |     + The model prototype with a single random seed, but trained and predicted separately for each province (PROV).
11 | 
12 | 3. [model_origin.py](./model_origin.py)
13 | 
14 |     + The model prototype; predicts with three different random seeds and averages the results.
15 | 
16 | 4. [model_avg_prov.py](./model_avg_prov.py)
17 | 
18 |     + Predicts with 3 different random seeds, trains and predicts separately per province (PROV), and averages the three seeds' results.
19 | 
20 | 5. [keras.py](./keras.py)
21 | 
22 |     + A neural network built with Keras for prediction (its results were not as good as xgb).
23 | 6. [avg_result.py](./avg_result.py)
24 | 
25 |     + Averages result files.
26 | 
27 | 7. [result_alpha.py](./result_alpha.py)
28 | 
29 |     + Weighted average of result files.
30 | 
31 | 8. [model_help.py](./model_help.py)
32 | 
33 |     + Model testing code.
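For reference, a minimal sketch of the seed-averaging idea behind model_origin.py and model_avg_prov.py (illustrative only; the round count and seed values are placeholders, not the exact values used):

```python
import numpy as np
import xgboost as xgb

def seed_averaged_probs(param, dtrain, dtest, nround=250, seeds=(71, 91, 101)):
    """Train one booster per random seed and average the predicted PROBs."""
    probs = []
    for seed in seeds:
        # copy the shared params, overriding only the seed
        model = xgb.train(dict(param, seed=seed), dtrain, nround)
        probs.append(model.predict(dtest))
    return np.mean(probs, axis=0)
```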
--------------------------------------------------------------------------------
/model/.ipynb_checkpoints/4_model_rgyear-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {
7 |     "collapsed": true
8 |    },
9 |    "outputs": [],
10 |    "source": []
11 |   }
12 |  ],
13 |  "metadata": {
14 |   "anaconda-cloud": {},
15 |   "kernelspec": {
16 |    "display_name": "Python [conda root]",
17 |    "language": "python",
18 |    "name": "conda-root-py"
19 |   },
20 |   "language_info": {
21 |    "codemirror_mode": {
22 |     "name": "ipython",
23 |     "version": 3
24 |    },
25 |    "file_extension": ".py",
26 |    "mimetype": "text/x-python",
27 |    "name": "python",
28 |    "nbconvert_exporter": "python",
29 |    "pygments_lexer": "ipython3",
30 |    "version": "3.5.2"
31 |   }
32 |  },
33 |  "nbformat": 4,
34 |  "nbformat_minor": 1
35 | }
--------------------------------------------------------------------------------
/model/result_alpha.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | 
4 | import matplotlib.pyplot as plt
5 | 
6 | 
7 | def avgWithAlpha(files, w, alpha = 0.21):
8 |     '''
9 |     files: result file names
10 |     w: per-file weights; alpha: probability threshold
11 |     '''
12 |     df_result = None
13 |     for i in range(len(files)):
14 |         df_tmp = pd.read_csv(files[i])
15 |         if i == 0:
16 |             df_result = df_tmp
17 |             df_result['PROB'] = df_result['PROB'] * w[i]
18 |         else:
19 |             df_result['PROB'] = df_result['PROB'] + df_tmp['PROB'] * w[i]
20 | 
21 |     df_result.loc[df_result['PROB']>=alpha,'FORTARGET'] = 1
22 |     df_result.loc[df_result['PROB']<alpha,'FORTARGET'] = 0
--------------------------------------------------------------------------------
/stack/0_add_features.py:
--------------------------------------------------------------------------------
1 | 
2 | # coding: utf-8
3 | 
4 | # In[1]:
5 | 
6 | import pandas as pd
7 | import numpy as np
8 | import matplotlib.pyplot as plt
9 | import seaborn as sns
10 | 
11 | 
12 | # ### Analyse the data features and add more time-difference features
13 | # 1. 2alter.csv and 5right.csv hold the most records, so the fusion of these two files deserves extra effort
14 | # 
15 | # + (TZ,BRANCH,RECRUIT) ---> (CHANGE,RIGHT)
16 | # + (LAWSUIT,PROJECT,QUALIFICATION,BREAKFAITH) ---> (RIGHT)
17 | # + the left and right sides are combined pairwise
18 | # + 
19 | 
20 | # In[ ]:
21 | 
22 | timeType = ['RGYEAR','FIRST_CHANGE_TIME','END_CHANGE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR',
23 |     'BRANCH_FIRST_CLOSE_YEAR','TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME',
24 |     'RIGHT_FIRST_ASK_TIME', 'RIGHT_FIRST_FB_TIME','RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME',
25 |     'PROJECT_FIRST_TIME', 'PROJECT_END_TIME', 'LAWSUIT_FIRST_TIME', 'LAWSUIT_END_TIME',
26 |     'BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME','BREAKFAITH_END_FIRST_TIME',
27 |     'RECRUIT_FIRST_TIME','RECRUIT_END_TIME','QUALIFICATION_FIRST_FIRST_TIME',
28 |     'QUALIFICATION_FIRST_END_TIME','QUALIFICATION_END_FIRST_TIME']
29 | 
30 | 
31 | # In[12]:
32 | 
33 | TBR = ['TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR',
34 |     'BRANCH_FIRST_CLOSE_YEAR','RECRUIT_FIRST_TIME','RECRUIT_END_TIME']
35 | 
36 | LPQ = ['PROJECT_FIRST_TIME', 'PROJECT_END_TIME','LAWSUIT_FIRST_TIME','LAWSUIT_END_TIME',
37 |     'QUALIFICATION_FIRST_FIRST_TIME','QUALIFICATION_FIRST_END_TIME','QUALIFICATION_END_FIRST_TIME',
38 |     'PROJECT_FIRST_TIME', 'PROJECT_END_TIME']
39 | 
40 | CR = ['FIRST_CHANGE_TIME','END_CHANGE_TIME','RIGHT_FIRST_ASK_TIME','RIGHT_END_ASK_TIME',]
41 | 
42 | R = ['RIGHT_FIRST_ASK_TIME','RIGHT_END_ASK_TIME']
43 | 
44 | 
45 | 
46 | # In[13]:
47 | 
48 | df_all = pd.read_csv('../data/alldata/df_data1234567890.csv')
49 | 
50 | 
51 | # In[16]:
52 | 
53 | def timeDiff(x):
54 |     a = x[:x.find(':')]
55 |     b = x[x.find(':')+1:]
56 |     y = int(a[:a.find('-')]) - int(b[:b.find('-')])
57 |     m = int(a[a.find('-')+1:]) - int(b[b.find('-')+1:])
58 |     return y * 12 + m
59 | 
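# A worked example of timeDiff (added note, not in the original code):
# the input is two 'YYYY-MM' stamps joined by ':', and the return value is
# the signed difference in months, e.g.
#   timeDiff('2015-09:2013-06')  ->  (2015-2013)*12 + (9-6) = 27
# Unlike the copy in feature/0_analyse_data.py, this version does not wrap
# the result in np.abs, so the DIFF features below can be negative.
60 | for f1 in TBR:
61 |     for f2 in LPQ:
62 |         df_all[f1+"_"+f2+"_DIFF"] = (df_all[f1] + ':' + 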
df_all[f2]).apply(timeDiff) 63 | 64 | for f1 in CR: 65 | for f2 in R: 66 | df_all[f1+"_"+f2+"_DIFF"] = (df_all[f1] + ':' + df_all[f2]).apply(timeDiff) 67 | 68 | df_all.to_csv('../data/alldata/df_data1234567890_plus.csv',index=False,index_label=False) 69 | 70 | 71 | # In[3]: 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /stack/2_stacking_rf.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from sklearn.ensemble import RandomForestClassifier 9 | 10 | def xtrain_and_test(df_all): 11 | ''' 12 | 得到训练数据和测试数据 13 | ''' 14 | df_label = pd.read_csv('../data/public/train.csv') 15 | df_test_label = pd.read_csv('../data/public/evaluation_public.csv') 16 | # print(len(df_label)) 17 | # print(len(df_test_label)) 18 | df_label.drop('ENDDATE',axis=1,inplace=True) 19 | 20 | df_train = df_all[df_all['EID'].isin(df_label['EID'])] 21 | df_test = df_all[df_all['EID'].isin(df_test_label['EID'])] 22 | 23 | df_train = pd.merge(df_train,df_label,how='left',on=['EID']) 24 | 25 | return df_train,df_test 26 | 27 | 28 | def runRfStack(inputfile,outputfile): 29 | ''' 30 | 输入输出文件 31 | ''' 32 | df_all = pd.read_csv(inputfile) 33 | df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:])) 34 | 35 | # 默认填充的0,显示使用一个负数尝试一下 36 | df_all.replace([np.inf, -np.inf], np.nan,inplace=True) 37 | df_all = df_all.fillna(0) 38 | 39 | # 默认填充的0,显示使用一个负数尝试一下 40 | features = df_all.columns[0:] 41 | features = list(features) 42 | features.remove('EID') 43 | label = 'TARGET' 44 | 45 | df_train,df_test = xtrain_and_test(df_all) 46 | 47 | 48 | clf = RandomForestClassifier( 49 | n_estimators=50,#50棵树 50 | max_depth=7, 51 | n_jobs=4, 52 | random_state=101) 53 | 54 | X_train = df_train[features] 55 | Y_label = df_train[label] 56 | 57 | X_test = df_test[features] 58 | 59 | clf.fit(X_train,Y_label) 60 | column = ['STACKFEATURE'+str(i) for i in range(50)] 61 | df_new_feature = pd.DataFrame(clf.apply(df_all[features]),columns=column) 62 | df_all[column] = df_new_feature 63 | 64 | df_all.to_csv(outputfile,index=False,index_label=False) 65 | del df_train,df_test,df_all 66 | return outputfile 67 | # In[ ]: 68 | 69 | # run 70 | inputfile = ['../data/alldata/df_data_all.csv','../data/alldata/df_data_onehot.csv', 71 | '../data/alldata/df_data_plus_all.csv','../data/alldata/df_data_plus_onehot.csv'] 72 | 73 | outputfile = ['../data/alldata/df_data_all_rfstack.csv','../data/alldata/df_data_onehot_rfstack.csv', 74 | '../data/alldata/df_data_plus_all_rfstack.csv','../data/alldata/df_data_plus_onehot_rfstack.csv'] 75 | 76 | for i in range(0,4): 77 | print(i," start ",inputfile[i]) 78 | runRfStack(inputfile[i],outputfile[i]) 79 | print(i," end ",inputfile[i]) 80 | 81 | 82 | -------------------------------------------------------------------------------- /feature/0_analyse_data.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | import seaborn as sns 10 | 11 | 12 | # ### 分析数据特征的目的,增加时间的差值的个数 13 | # 1. 
2alter.csv 和 5right.csv的数据较多,希望能高在这两个文件的融合出下下功夫 14 | # 15 | # + (TZ,BRANCH,RECRUIT) ---> (CHANGE,RIGHT) 16 | # + (LAWSUIT,PROJECT,QUALIFICATION,BREAKFAITH) ---> (RIGHT) 17 | # + 左右两边就行了两两组合 18 | # + 19 | 20 | # In[ ]: 21 | 22 | timeType = ['RGYEAR','FIRST_CHANGE_TIME','END_CHANGE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR', 23 | 'BRANCH_FIRST_CLOSE_YEAR','TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME', 24 | 'RIGHT_FIRST_ASK_TIME', 'RIGHT_FIRST_FB_TIME','RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME', 25 | 'PROJECT_FIRST_TIME', 'PROJECT_END_TIME', 'LAWSUIT_FIRST_TIME', 'LAWSUIT_END_TIME', 26 | 'BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME','BREAKFAITH_END_FIRST_TIME', 27 | 'RECRUIT_FIRST_TIME','RECRUIT_END_TIME','QUALIFICATION_FIRST_FIRST_TIME', 28 | 'QUALIFICATION_FIRST_END_TIME','QUALIFICATION_END_FIRST_TIME'] 29 | 30 | 31 | # In[12]: 32 | 33 | TBR = ['TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR', 34 | 'BRANCH_FIRST_CLOSE_YEAR','RECRUIT_FIRST_TIME','RECRUIT_END_TIME'] 35 | 36 | LPQ = ['PROJECT_FIRST_TIME', 'PROJECT_END_TIME','LAWSUIT_FIRST_TIME','LAWSUIT_END_TIME', 37 | 'QUALIFICATION_FIRST_FIRST_TIME','QUALIFICATION_FIRST_END_TIME','QUALIFICATION_END_FIRST_TIME', 38 | 'PROJECT_FIRST_TIME', 'PROJECT_END_TIME'] 39 | 40 | CR = ['FIRST_CHANGE_TIME','END_CHANGE_TIME','RIGHT_FIRST_ASK_TIME','RIGHT_END_ASK_TIME',] 41 | 42 | R = ['RIGHT_FIRST_ASK_TIME','RIGHT_END_ASK_TIME'] 43 | 44 | 45 | 46 | # In[13]: 47 | 48 | df_all = pd.read_csv('../data/alldata/df_data1234567890.csv') 49 | 50 | 51 | # In[16]: 52 | 53 | def timeDiff(x): 54 | a = x[:x.find(':')] 55 | b = x[x.find(':')+1:] 56 | y = int(a[:a.find('-')]) - int(b[:b.find('-')]) 57 | m = int(a[a.find('-')+1:]) - int(b[b.find('-')+1:]) 58 | return y * 12 + m 59 | 60 | for f1 in TBR: 61 | for f2 in LPQ: 62 | df_all[f1+"_"+f2+"_DIFF"] = np.abs((df_all[f1] + ':' + df_all[f2]).apply(timeDiff)) 63 | 64 | for f1 in CR: 65 | for f2 in R: 66 | df_all[f1+"_"+f2+"_DIFF"] = np.abs((df_all[f1] + ':' + df_all[f2]).apply(timeDiff)) 67 | 68 | 69 | 70 | # In[17]: 71 | 72 | # df_all.info() 73 | 74 | 75 | # In[18]: 76 | 77 | df_all.to_csv('../data/alldata/df_all1234567890_plus.csv',index=False,index_label=False) 78 | 79 | 80 | # In[3]: 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /stack/3_stacking_lgb.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import lightgbm as lgb 9 | 10 | 11 | def xtrain_and_test(df_all): 12 | ''' 13 | 得到训练数据和测试数据 14 | ''' 15 | df_label = pd.read_csv('../data/public/train.csv') 16 | df_test_label = pd.read_csv('../data/public/evaluation_public.csv') 17 | # print(len(df_label)) 18 | # print(len(df_test_label)) 19 | df_label.drop('ENDDATE',axis=1,inplace=True) 20 | 21 | df_train = df_all[df_all['EID'].isin(df_label['EID'])] 22 | df_test = df_all[df_all['EID'].isin(df_test_label['EID'])] 23 | 24 | df_train = pd.merge(df_train,df_label,how='left',on=['EID']) 25 | 26 | return df_train,df_test 27 | 28 | def runLgbStack(inputfile,outputfile): 29 | ''' 30 | 输入输出文件 31 | ''' 32 | df_all = pd.read_csv(inputfile) 33 | df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:])) 34 | 35 | # 默认填充的0,显示使用一个负数尝试一下 36 | df_all.replace([np.inf, -np.inf], np.nan,inplace=True) 37 | df_all = df_all.fillna(0) 38 | 39 | # 默认填充的0,显示使用一个负数尝试一下 40 | features = df_all.columns[0:] 41 | features = list(features) 42 | 
features.remove('EID')
43 |     label = 'TARGET'
44 | 
45 |     df_train,df_test = xtrain_and_test(df_all)
46 | 
47 |     gbm = lgb.LGBMClassifier(
48 |         learning_rate=0.1,
49 |         n_estimators=50,
50 |         max_depth=7,
51 |         objective= 'binary',
52 |         num_leaves=25,
53 |     )
54 | 
55 | 
56 |     X_train = df_train[features]
57 |     Y_label = df_train[label]
58 | 
59 |     X_test = df_test[features]
60 | 
61 |     gbm.fit(X_train,Y_label,eval_metric='auc',verbose=5)
62 |     column = ['STACKFEATURE'+str(i) for i in range(50)]
63 | 
64 |     df_new_feature = pd.DataFrame(gbm.apply(df_all[features]),columns=column)
65 | 
66 |     df_all[column] = df_new_feature
67 | 
68 | 
69 |     df_all.to_csv(outputfile,index=False,index_label=False)
70 |     del df_train,df_test,df_all
71 | 
72 |     return outputfile
73 | 
74 | # run
75 | inputfile = ['../data/alldata/df_data_all.csv','../data/alldata/df_data_onehot.csv',
76 |     '../data/alldata/df_data_plus_all.csv','../data/alldata/df_data_plus_onehot.csv']
77 | 
78 | outputfile = ['../data/alldata/df_data_all_lgbstack.csv','../data/alldata/df_data_onehot_lgbstack.csv',
79 |     '../data/alldata/df_data_plus_all_lgbstack.csv','../data/alldata/df_data_plus_onehot_lgbstack.csv']
80 | 
81 | for i in range(0,4):
82 |     print(i," start ",inputfile[i])
83 |     runLgbStack(inputfile[i],outputfile[i])
84 |     print(i," end ",outputfile[i])
85 | 
86 | 
87 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # [Enterprise Exit Risk Forecast](http://www.datafountain.cn/#/competitions/271/intro)
2 | 
3 | + Approach
4 | ![Approach diagram](./企业风险预测.png)
5 | + model: Xgboost
6 | + Data processing: pandas
7 | 
8 | + Directory structure
9 | 
10 |     data/alldata/: all generated data files
11 |     data/public/: the raw data provided by the competition
12 |     (https://pan.baidu.com/s/1nuJNz9B)
13 |     extraction code: t2ek
14 |     Before running, create the corresponding folders and import the data.
15 |     model/: the model files to run
16 |     feature/: feature-extraction .py files
17 |     saveModel/: saved models (optional)
18 |     stack/: stacking-feature .py files
19 |     xresult/: output result files
20 | 
21 | + Ranking
22 | 
23 |     Preliminary round: 43 / 1192 (571 teams)
24 |     Final round: 25 / 1192 (569 teams)
25 | 
26 |     Data size:
27 |     Final round: 210K training rows, 210K test rows, from two provinces
28 | 
29 |     The approach: start with the small files; extract features first, then optimise the model.
30 | 
31 | #### -----------------------------------------------------------------------------------------------------------------------
32 | Post-competition summary:
33 | Background:
34 | Traditional enterprise evaluation judges a company's operating condition and default risk mainly from financial statements and lending records. For large and medium companies with sound finances and a record with traditional bank lending, this is fairly objective and reasonable. For the far larger number of small and micro enterprises, however, neither their true financial information nor any public credit information is available; with the strong variables missing, how to use weak variables to evaluate a company's operating condition objectively and fairly is the main problem this competition addresses.
35 | 
36 | The competition sampled companies (anonymised) from more than 20 million nationwide and provides the behavioural footprints those companies left in many areas. Teams use data mining techniques and machine learning algorithms to build a model predicting whether a company will run into trouble in the future, outputting a risk probability.
37 | 
38 | Data size:
39 | Preliminary: train: 100K, test: 100K
40 | Final: train: 210K, test: 210K
41 | 
42 | 
43 | Data cleaning:
44 | 1. Convert, or remove, the Chinese characters present in the data
45 | 2. Fill missing values purposefully, using the mean, max and min within the same business category; take the company's registration time into account when filling,
46 |    using data from same-type companies registered close in time, e.g. within 3 years either way
47 | 3. Remove duplicated records and the data's own noise; drop fully identical rows, e.g. investment records appearing twice
48 | 4. Handle outliers by grouping features for which prior knowledge exists
49 | 
50 | Feature selection:
51 | Basic features: select and group features while adding them; train multiple models on multiple feature sets
52 | 1. Use the raw data columns directly as features, with single-column and grouped statistics
53 | 2. Statistics computed over time windows
54 | 3. Count statistics, e.g. number of invested companies, number of defaults, split into in-province and out-of-province counts
55 | 4. Statistics computed between different groups
56 | 5. Number of companies in the same major and minor category as each record (rank features)
57 | 6. Rank the year-normalised registered capital within the same (major/minor) industry to get ordering features
58 | 7. Log-transform some real-valued features
59 | Deviation features:
60 | 1. For each individual, compute the group mean and the individual's deviation distance from its group
61 | 2. Clustering features: the deviation of each individual from the mean of all same-major-type and same-minor-type companies in the current year (all years, or a 3-year window)
62 | 
63 | 
64 | Cross features:
65 | 1. Cross-table time features: use time intervals between different tables, e.g. the gap between the first investment or default time and the registration time
66 | 2. Add/subtract/multiply/divide features, polynomial crosses, brute-force features (brute force can work wonders);
67 |    filter features before crossing, and combine positive and negative features separately,
68 |    subjectively considering positive vs negative correlation: e.g. branch count, invested-company count and patent count are positively correlated,
69 |    while counts of broken-faith records and enforced cases are negatively correlated
70 | 3. one-hot features; multiply a one-hot feature by some other feature
71 | 4. stacking features: obtain stacking features from tree models
72 | 5. Enterprise relation features: the number of entities related to a company; a directed graph can be built to compute in-degree and out-degree (see the sketch after this list)
73 | 6. Mind time-decay features
74 | 
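Item 5 above is not implemented in this repo; a minimal sketch of the in/out-degree idea, assuming a hypothetical investment edge table `7tz.csv` with investor column `EID` and invested-company column `TZ_EID` (both names are illustrative, not the competition schema):

```python
import pandas as pd

# Hypothetical edge list: one row per "investor EID -> invested TZ_EID" relation
edges = pd.read_csv('../data/public/7tz.csv')

out_degree = edges.groupby('EID').size().rename('OUT_DEGREE')     # investments made
in_degree = edges.groupby('TZ_EID').size().rename('IN_DEGREE')    # investments received

df_all = pd.read_csv('../data/alldata/df_data_all.csv')
df_all = (df_all
          .merge(out_degree, how='left', left_on='EID', right_index=True)
          .merge(in_degree, how='left', left_on='EID', right_index=True))
df_all[['OUT_DEGREE', 'IN_DEGREE']] = df_all[['OUT_DEGREE', 'IN_DEGREE']].fillna(0)
```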
75 | Check feature similarity and group the features accordingly
76 | 
77 | Model training:
78 | Tune single models and ensemble multiple models on different feature sets
79 | In the final round, training and predicting per province improved the result
80 | 
81 | Evaluation metric:
82 | AUC + F1
83 | 
84 | 
85 | 
86 | Note:
87 | If you get errors running under python2, add this at the top of the Python file:
88 | ```
89 | #!/usr/bin/env python3
90 | # -*- coding: utf-8 -*-
91 | ```
--------------------------------------------------------------------------------
/stack/1_stacking_xgb.py:
--------------------------------------------------------------------------------
1 | 
2 | # coding: utf-8
3 | 
4 | # In[1]:
5 | 
6 | import pandas as pd
7 | import numpy as np
8 | from xgboost.sklearn import XGBClassifier
9 | 
10 | 
11 | def xtrain_and_test(df_all):
12 |     '''
13 |     Get the training and test data
14 |     '''
15 |     df_label = pd.read_csv('../data/public/train.csv')
16 |     df_test_label = pd.read_csv('../data/public/evaluation_public.csv')
17 |     # print(len(df_label))
18 |     # print(len(df_test_label))
19 |     df_label.drop('ENDDATE',axis=1,inplace=True)
20 |     df_train = df_all[df_all['EID'].isin(df_label['EID'])]
21 |     df_test = df_all[df_all['EID'].isin(df_test_label['EID'])]
22 | 
23 |     df_train = pd.merge(df_train,df_label,how='left',on=['EID'])
24 | 
25 |     return df_train,df_test
26 | 
27 | def runXgbStack(inputfile, outputfile):
28 |     '''
29 |     Input and output file paths, inputfile and outputfile
30 |     '''
31 |     # In[2]:
32 | 
33 |     df_all = pd.read_csv(inputfile)
34 |     df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:]))
35 | 
36 |     # 0 is filled in by default; a negative value could be tried instead
37 |     df_all.replace([np.inf, -np.inf], np.nan,inplace=True)
38 |     df_all = df_all.fillna(0)
39 | 
40 | 
41 |     # 0 is filled in by default; a negative value could be tried instead
42 |     features = df_all.columns[0:]
43 |     features = list(features)
44 |     features.remove('EID')
45 |     label = 'TARGET'
46 | 
47 | 
48 |     df_train,df_test = xtrain_and_test(df_all)
49 |     # In[7]:
50 | 
51 |     clf = XGBClassifier(
52 |         n_estimators=50,# 50 trees
53 |         learning_rate =0.05,
54 |         max_depth=7,
55 |         min_child_weight=1,
56 |         gamma=0.1,
57 |         subsample=0.8,
58 |         colsample_bytree=0.8,
59 |         objective= 'binary:logistic',
60 |         seed=91)
61 | 
62 |     X_train = df_train[features]
63 |     Y_label = df_train[label]
64 |     X_test = df_test[features]
65 | 
66 |     clf.fit(X_train,Y_label,eval_metric='auc',verbose=5)
67 |     column = ['STACKFEATURE'+str(i) for i in range(50)]
68 |     df_new_feature = pd.DataFrame(clf.apply(df_all[features]),columns=column)
69 |     df_all[column] = df_new_feature
70 |     df_all.to_csv(outputfile,index=False,index_label=False)
71 |     del df_train,df_test,df_all
72 |     return outputfile
73 | 
74 | # run
75 | inputfile = ['../data/alldata/df_data_all.csv','../data/alldata/df_data_onehot.csv',
76 |     '../data/alldata/df_data_plus_all.csv','../data/alldata/df_data_plus_onehot.csv']
77 | 
78 | outputfile = ['../data/alldata/df_data_all_xgbstack.csv','../data/alldata/df_data_onehot_xgbstack.csv',
79 |     '../data/alldata/df_data_plus_all_xgbstack.csv','../data/alldata/df_data_plus_onehot_xgbstack.csv']
80 | 
81 | for i in range(0,4):
82 |     print(i," start ",inputfile[i])
83 |     runXgbStack(inputfile[i],outputfile[i])
84 |     print(i," end ",inputfile[i])
85 | 
86 | 
--------------------------------------------------------------------------------
/stack/1_stacking_xgb_onehot.py:
--------------------------------------------------------------------------------
1 | 
2 | # coding: utf-8
3 | 
4 | # In[1]:
5 | 
6 | import pandas as pd
7 | import numpy as np
8 | from xgboost.sklearn import XGBClassifier
9 | 
10 | 
11 | def xtrain_and_test(df_all):
12 |     '''
13 |     Get the training and test data
14 |     '''
15 |     df_label = pd.read_csv('../data/public/train.csv')
16 |     df_test_label = 
pd.read_csv('../data/public/evaluation_public.csv') 17 | # print(len(df_label)) 18 | # print(len(df_test_label)) 19 | df_label.drop('ENDDATE',axis=1,inplace=True) 20 | df_train = df_all[df_all['EID'].isin(df_label['EID'])] 21 | df_test = df_all[df_all['EID'].isin(df_test_label['EID'])] 22 | 23 | df_train = pd.merge(df_train,df_label,how='left',on=['EID']) 24 | 25 | return df_train,df_test 26 | 27 | def runXgbStack(inputfile, outputfile): 28 | ''' 29 | 输入输出文件,inputfile和outputfile 30 | ''' 31 | # In[2]: 32 | 33 | df_all = pd.read_csv(inputfile) 34 | df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:])) 35 | 36 | # 默认填充的0,显示使用一个负数尝试一下 37 | df_all.replace([np.inf, -np.inf], np.nan,inplace=True) 38 | df_all = df_all.fillna(0) 39 | 40 | 41 | # 默认填充的0,显示使用一个负数尝试一下 42 | features = df_all.columns[0:] 43 | features = list(features) 44 | features.remove('EID') 45 | label = 'TARGET' 46 | 47 | 48 | df_train,df_test = xtrain_and_test(df_all) 49 | # In[7]: 50 | 51 | clf = XGBClassifier( 52 | n_estimators=50,#50棵树 53 | learning_rate =0.05, 54 | max_depth=7, 55 | min_child_weight=1, 56 | gamma=0.1, 57 | subsample=0.8, 58 | colsample_bytree=0.8, 59 | objective= 'binary:logistic', 60 | seed=91) 61 | 62 | X_train = df_train[features] 63 | Y_label = df_train[label] 64 | X_test = df_test[features] 65 | 66 | clf.fit(X_train,Y_label,eval_metric='auc',verbose=5) 67 | column = ['STACKFEATURE'+str(i) for i in range(50)] 68 | df_new_feature = pd.DataFrame(clf.apply(df_all[features]),columns=column) 69 | df_all[column] = df_new_feature 70 | 71 | # 融合特征转化为onehot 72 | for feature in column: 73 | df_all[feature] = df_all[feature].astype(np.int32) 74 | df_tmp = pd.get_dummies(df_all[feature], prefix=feature) 75 | df_all[df_tmp.columns] = df_tmp 76 | df_all.drop(feature,axis=1,inplace=True) 77 | 78 | 79 | df_all.to_csv(outputfile,index=False,index_label=False) 80 | del df_train,df_test,df_all 81 | return outputfile 82 | 83 | # run 84 | inputfile = ['../data/alldata/df_data_all.csv','../data/alldata/df_data_onehot.csv', 85 | '../data/alldata/df_data_plus_all.csv','../data/alldata/df_data_plus_onehot.csv'] 86 | 87 | outputfile = ['../data/alldata/df_data_all_xgbstack_onehot.csv','../data/alldata/df_data_onehot_xgbstack_onehot.csv', 88 | '../data/alldata/df_data_plus_all_xgbstack_onehot.csv','../data/alldata/df_data_plus_onehot_xgbstack_onehot.csv'] 89 | 90 | for i in range(0,4): 91 | print(i," start ",inputfile[i]) 92 | runXgbStack(inputfile[i],outputfile[i]) 93 | print(i," end ",outputfile[i]) 94 | 95 | 96 | -------------------------------------------------------------------------------- /model/keras.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import numpy as np 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Dropout 9 | 10 | from sklearn import preprocessing 11 | 12 | from keras.optimizers import SGD 13 | import pandas as pd 14 | 15 | 16 | # In[4]: 17 | 18 | def xtrain_and_test(df_all): 19 | ''' 20 | 得到训练数据和测试数据 21 | ''' 22 | df_label = pd.read_csv('../data/public/train.csv') 23 | df_test_label = pd.read_csv('../data/public/evaluation_public.csv') 24 | # print(len(df_label)) 25 | # print(len(df_test_label)) 26 | df_label.drop('ENDDATE',axis=1,inplace=True) 27 | 28 | df_train = df_all[df_all['EID'].isin(df_label['EID'])] 29 | df_test = df_all[df_all['EID'].isin(df_test_label['EID'])] 30 | 31 | df_train = pd.merge(df_train,df_label,how='left',on=['EID']) 32 | 33 | return df_train,df_test 34 | 35 | 36 | 37 
| # In[ ]: 38 | 39 | 40 | 41 | 42 | # In[9]: 43 | 44 | df_all = pd.read_csv('../data/alldata/df_all_xgbstack.csv') 45 | df_train,df_test = xtrain_and_test(df_all) 46 | 47 | features = df_all.columns[0:] 48 | features = list(features) 49 | features.remove('EID') 50 | label = 'TARGET' 51 | 52 | len(features) 53 | 54 | 55 | # In[10]: 56 | 57 | x_train = preprocessing.scale(df_train[features]) 58 | x_test = preprocessing.scale(df_test[features]) 59 | 60 | 61 | 62 | # In[2]: 63 | 64 | model = Sequential() 65 | 66 | 67 | # In[11]: 68 | 69 | model = Sequential() 70 | model.add(Dense(1000, input_dim=605, activation='relu')) 71 | model.add(Dropout(0.4)) 72 | 73 | 74 | model.add(Dense(600, activation='relu')) 75 | model.add(Dropout(0.4)) 76 | 77 | model.add(Dense(400, activation='relu')) 78 | model.add(Dropout(0.4)) 79 | 80 | model.add(Dense(200, activation='relu')) 81 | model.add(Dropout(0.4)) 82 | 83 | model.add(Dense(100, activation='relu')) 84 | model.add(Dropout(0.4)) 85 | 86 | model.add(Dense(40, activation='relu')) 87 | model.add(Dropout(0.4)) 88 | 89 | model.add(Dense(10, activation='relu')) 90 | model.add(Dropout(0.4)) 91 | 92 | 93 | model.add(Dense(1, activation='sigmoid')) 94 | 95 | sgd = SGD(lr=0.01, decay=1e-5, momentum=0.8, nesterov=True) 96 | model.compile(loss='binary_crossentropy', 97 | optimizer='adam', 98 | metrics=['accuracy']) 99 | # epochs = 40 100 | 101 | model.fit(x_train, df_train[label], 102 | epochs=100, 103 | batch_size=128,) 104 | 105 | 106 | # In[ ]: 107 | 108 | proba = model.predict_proba(x_test) 109 | 110 | 111 | 112 | # In[ ]: 113 | 114 | proba_test = pd.DataFrame() 115 | proba_test['EID'] = df_test['EID'] 116 | proba_test['FORTARGET'] = [0 for i in range(len(df_test))] 117 | proba_test['PROB'] = proba 118 | 119 | 120 | # In[ ]: 121 | 122 | proba_test.loc[proba_test['PROB']>=0.23,'FORTARGET'] = 1 123 | proba_test.to_csv('../xresult/xsubmussion_xgbstack.csv',index=False,index_label=False) 124 | 125 | print(len(proba_test[proba_test['FORTARGET']==1])) 126 | print(len(proba_test[proba_test['FORTARGET']==0])) 127 | 128 | 129 | -------------------------------------------------------------------------------- /stack/2_stacking_rf_prov.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from sklearn.ensemble import RandomForestClassifier 9 | 10 | def xtrain_and_test(df_all): 11 | ''' 12 | 得到训练数据和测试数据 13 | ''' 14 | df_label = pd.read_csv('../data/public/train.csv') 15 | df_test_label = pd.read_csv('../data/public/evaluation_public.csv') 16 | # print(len(df_label)) 17 | # print(len(df_test_label)) 18 | df_label.drop('ENDDATE',axis=1,inplace=True) 19 | 20 | df_train = df_all[df_all['EID'].isin(df_label['EID'])] 21 | df_test = df_all[df_all['EID'].isin(df_test_label['EID'])] 22 | 23 | df_train = pd.merge(df_train,df_label,how='left',on=['EID']) 24 | 25 | return df_train,df_test 26 | 27 | 28 | def split_data_with_prov(df_data): 29 | ''' 30 | 根据特征PROV,分割数据,进行单独预测之后合并数据结果 31 | ''' 32 | df_train_prov11 = df_data[df_data['PROV'] == 11] 33 | df_train_prov12 = df_data[df_data['PROV'] == 12] 34 | 35 | return df_train_prov11,df_train_prov12 36 | 37 | 38 | def runRfStack(inputfile,outputfile): 39 | ''' 40 | 输入输出文件 41 | ''' 42 | df_all = pd.read_csv(inputfile) 43 | df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:])) 44 | 45 | # 默认填充的0,显示使用一个负数尝试一下 46 | df_all.replace([np.inf, -np.inf], np.nan,inplace=True) 47 | df_all = df_all.fillna(0) 48 | 49 | # 
默认填充的0,显示使用一个负数尝试一下 50 | features = df_all.columns[0:] 51 | features = list(features) 52 | features.remove('EID') 53 | label = 'TARGET' 54 | 55 | clf = RandomForestClassifier( 56 | n_estimators=50,#50棵树 57 | max_depth=7, 58 | n_jobs=4, 59 | random_state=101) 60 | 61 | df_all_prov11,df_all_prov12 = split_data_with_prov(df_all) 62 | 63 | ###################### prov == 11 64 | df_train11,df_test11 = xtrain_and_test(df_all_prov11) 65 | 66 | X_train11 = df_train11[features] 67 | Y_label11 = df_train11[label] 68 | 69 | X_test11 = df_test11[features] 70 | 71 | clf.fit(X_train11,Y_label11) 72 | column = ['STACKFEATURE'+str(i) for i in range(50)] 73 | df_new_feature11 = pd.DataFrame(clf.apply(df_all_prov11[features]),columns=column) 74 | df_all_prov11[column] = df_new_feature11 75 | 76 | ###################### prov == 12 77 | df_train12,df_test12 = xtrain_and_test(df_all_prov12) 78 | 79 | X_train12 = df_train12[features] 80 | Y_label12 = df_train12[label] 81 | 82 | X_test12 = df_test12[features] 83 | 84 | clf.fit(X_train12,Y_label12) 85 | column = ['STACKFEATURE'+str(i) for i in range(50)] 86 | df_new_feature12 = pd.DataFrame(clf.apply(df_all_prov12[features]),columns=column) 87 | df_all_prov12[column] = df_new_feature12 88 | 89 | # 合并 90 | df_all = df_all_prov11.append(df_all_prov12) 91 | 92 | df_all.to_csv(outputfile,index=False,index_label=False) 93 | del df_all_prov11,df_all_prov12,df_all 94 | return outputfile 95 | # In[ ]: 96 | 97 | # run 98 | inputfile = ['../data/alldata/df_data_all.csv','../data/alldata/df_data_onehot.csv', 99 | '../data/alldata/df_data_plus_all.csv','../data/alldata/df_data_plus_onehot.csv'] 100 | 101 | outputfile = ['../data/alldata/df_data_all_prov_rfstack.csv','../data/alldata/df_data_onehot_prov_rfstack.csv', 102 | '../data/alldata/df_data_plus_all_prov_rfstack.csv','../data/alldata/df_data_plus_onehot_prov_rfstack.csv'] 103 | 104 | for i in range(0,4): 105 | print(i," start ",inputfile[i]) 106 | runRfStack(inputfile[i],outputfile[i]) 107 | print(i," end ",outputfile[i]) 108 | 109 | 110 | -------------------------------------------------------------------------------- /stack/3_stacking_lgb_prov.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import lightgbm as lgb 9 | 10 | 11 | def xtrain_and_test(df_all): 12 | ''' 13 | 得到训练数据和测试数据 14 | ''' 15 | df_label = pd.read_csv('../data/public/train.csv') 16 | df_test_label = pd.read_csv('../data/public/evaluation_public.csv') 17 | # print(len(df_label)) 18 | # print(len(df_test_label)) 19 | df_label.drop('ENDDATE',axis=1,inplace=True) 20 | 21 | df_train = df_all[df_all['EID'].isin(df_label['EID'])] 22 | df_test = df_all[df_all['EID'].isin(df_test_label['EID'])] 23 | 24 | df_train = pd.merge(df_train,df_label,how='left',on=['EID']) 25 | 26 | return df_train,df_test 27 | 28 | 29 | def split_data_with_prov(df_data): 30 | ''' 31 | 根据特征PROV,分割数据,进行单独预测之后合并数据结果 32 | ''' 33 | df_train_prov11 = df_data[df_data['PROV'] == 11] 34 | df_train_prov12 = df_data[df_data['PROV'] == 12] 35 | 36 | return df_train_prov11,df_train_prov12 37 | 38 | 39 | def runLgbStack(inputfile,outputfile): 40 | ''' 41 | 输入输出文件 42 | ''' 43 | df_all = pd.read_csv(inputfile) 44 | df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:])) 45 | 46 | # 默认填充的0,显示使用一个负数尝试一下 47 | df_all.replace([np.inf, -np.inf], np.nan,inplace=True) 48 | df_all = df_all.fillna(0) 49 | 50 | # 默认填充的0,显示使用一个负数尝试一下 51 | features = df_all.columns[0:] 52 | 
features = list(features) 53 | features.remove('EID') 54 | label = 'TARGET' 55 | 56 | gbm = lgb.LGBMClassifier( 57 | learning_rate=0.1, 58 | n_estimators=50, 59 | max_depth=7, 60 | objective= 'binary', 61 | num_leaves=25, 62 | ) 63 | 64 | df_all_prov11,df_all_prov12 = split_data_with_prov(df_all) 65 | 66 | #################### prov == 11 67 | df_train11,df_test11 = xtrain_and_test(df_all_prov11) 68 | 69 | X_train11 = df_train11[features] 70 | Y_label11 = df_train11[label] 71 | 72 | X_test11 = df_test11[features] 73 | 74 | gbm.fit(X_train11,Y_label11,eval_metric='auc',verbose=5) 75 | column = ['STACKFEATURE'+str(i) for i in range(50)] 76 | 77 | df_new_feature11 = pd.DataFrame(gbm.apply(df_all_prov11[features]),columns=column) 78 | 79 | df_all_prov11[column] = df_new_feature11 80 | 81 | #################### prov == 12 82 | df_train12,df_test12 = xtrain_and_test(df_all_prov12) 83 | 84 | X_train12 = df_train12[features] 85 | Y_label12 = df_train12[label] 86 | 87 | X_test12 = df_test12[features] 88 | 89 | gbm.fit(X_train12,Y_label12,eval_metric='auc',verbose=5) 90 | column = ['STACKFEATURE'+str(i) for i in range(50)] 91 | 92 | df_new_feature12 = pd.DataFrame(gbm.apply(df_all_prov12[features]),columns=column) 93 | 94 | df_all_prov12[column] = df_new_feature12 95 | 96 | # 合并 97 | df_all = df_all_prov11.append(df_all_prov12) 98 | 99 | df_all.to_csv(outputfile,index=False,index_label=False) 100 | 101 | del df_all_prov11,df_all_prov12,df_all 102 | 103 | return outputfile 104 | 105 | # run 106 | inputfile = ['../data/alldata/df_data_all.csv','../data/alldata/df_data_onehot.csv', 107 | '../data/alldata/df_data_plus_all.csv','../data/alldata/df_data_plus_onehot.csv'] 108 | 109 | outputfile = ['../data/alldata/df_data_all_prov_lgbstack.csv','../data/alldata/df_data_onehot_prov_lgbstack.csv', 110 | '../data/alldata/df_data_plus_all_prov_lgbstack.csv','../data/alldata/df_data_plus_onehot_prov_lgbstack.csv'] 111 | 112 | for i in range(0,4): 113 | print(i," start ",inputfile[i]) 114 | runLgbStack(inputfile[i],outputfile[i]) 115 | print(i," end ",outputfile[i]) 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /stack/1_stacking_xgb_prov.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from xgboost.sklearn import XGBClassifier 9 | 10 | 11 | def xtrain_and_test(df_all): 12 | ''' 13 | 得到训练数据和测试数据 14 | ''' 15 | df_label = pd.read_csv('../data/public/train.csv') 16 | df_test_label = pd.read_csv('../data/public/evaluation_public.csv') 17 | # print(len(df_label)) 18 | # print(len(df_test_label)) 19 | df_label.drop('ENDDATE',axis=1,inplace=True) 20 | df_train = df_all[df_all['EID'].isin(df_label['EID'])] 21 | df_test = df_all[df_all['EID'].isin(df_test_label['EID'])] 22 | 23 | df_train = pd.merge(df_train,df_label,how='left',on=['EID']) 24 | 25 | return df_train,df_test 26 | 27 | def split_data_with_prov(df_data): 28 | ''' 29 | 根据特征PROV,分割数据,进行单独预测之后合并数据结果 30 | ''' 31 | df_train_prov11 = df_data[df_data['PROV'] == 11] 32 | df_train_prov12 = df_data[df_data['PROV'] == 12] 33 | 34 | return df_train_prov11,df_train_prov12 35 | 36 | 37 | def runXgbStack(inputfile, outputfile): 38 | ''' 39 | 输入输出文件,inputfile和outputfile 40 | ''' 41 | # In[2]: 42 | 43 | df_all = pd.read_csv(inputfile) 44 | df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:])) 45 | 46 | # 默认填充的0,显示使用一个负数尝试一下 47 | df_all.replace([np.inf, -np.inf], np.nan,inplace=True) 48 
| df_all = df_all.fillna(0) 49 | 50 | 51 | # 默认填充的0,显示使用一个负数尝试一下 52 | features = df_all.columns[0:] 53 | features = list(features) 54 | features.remove('EID') 55 | label = 'TARGET' 56 | 57 | clf = XGBClassifier( 58 | n_estimators=50,#50棵树 59 | learning_rate =0.05, 60 | max_depth=7, 61 | min_child_weight=1, 62 | gamma=0.1, 63 | subsample=0.8, 64 | colsample_bytree=0.8, 65 | objective= 'binary:logistic', 66 | seed=91) 67 | 68 | df_all_prov11,df_all_prov12 = split_data_with_prov(df_all) 69 | 70 | ###################### prov == 11 71 | df_train11,df_test11 = xtrain_and_test(df_all_prov11) 72 | # In[7]: 73 | 74 | 75 | X_train11 = df_train11[features] 76 | Y_label11 = df_train11[label] 77 | X_test11 = df_test11[features] 78 | 79 | clf.fit(X_train11,Y_label11,eval_metric='auc',verbose=5) 80 | column = ['STACKFEATURE'+str(i) for i in range(50)] 81 | df_new_feature11 = pd.DataFrame(clf.apply(df_all_prov11[features]),columns=column) 82 | df_all_prov11[column] = df_new_feature11 83 | 84 | ####################### prov == 12 85 | df_train12,df_test12 = xtrain_and_test(df_all_prov12) 86 | # In[7]: 87 | X_train12 = df_train12[features] 88 | Y_label12 = df_train12[label] 89 | X_test12 = df_test12[features] 90 | 91 | clf.fit(X_train12,Y_label12,eval_metric='auc',verbose=5) 92 | column = ['STACKFEATURE'+str(i) for i in range(50)] 93 | df_new_feature12 = pd.DataFrame(clf.apply(df_all_prov12[features]),columns=column) 94 | df_all_prov12[column] = df_new_feature12 95 | 96 | df_all = df_all_prov11.append(df_all_prov12) 97 | 98 | df_all.to_csv(outputfile,index=False,index_label=False) 99 | del df_all_prov11,df_all_prov12,df_all 100 | return outputfile 101 | 102 | # run 103 | inputfile = ['../data/alldata/df_data_all.csv','../data/alldata/df_data_onehot.csv', 104 | '../data/alldata/df_data_plus_all.csv','../data/alldata/df_data_plus_onehot.csv'] 105 | 106 | outputfile = ['../data/alldata/df_data_all_prov_xgbstack.csv','../data/alldata/df_data_onehot_prov_xgbstack.csv', 107 | '../data/alldata/df_data_plus_all_prov_xgbstack.csv','../data/alldata/df_data_plus_onehot_prov_xgbstack.csv'] 108 | 109 | # 本地测试 110 | for i in range(0,1): 111 | print(i," start ",inputfile[i]) 112 | runXgbStack(inputfile[i],outputfile[i]) 113 | print(i," end ",outputfile[i]) 114 | 115 | 116 | -------------------------------------------------------------------------------- /model/model_help.py: -------------------------------------------------------------------------------- 1 | 2 | # 根据xgboost的进行特征融合,进行训练和预测,得到训练的特征的初始代码(元代码) 3 | 4 | # coding: utf-8 5 | 6 | # In[4]: 7 | 8 | import xgboost as xgb 9 | import pandas as pd 10 | import numpy as np 11 | from sklearn.model_selection import KFold 12 | from sklearn.model_selection import train_test_split 13 | 14 | from xgboost.sklearn import XGBClassifier 15 | 16 | import seaborn as sns 17 | import matplotlib.pyplot as plt 18 | 19 | 20 | # In[33]: 21 | 22 | #seed = np.random.randint(99999) 23 | seed = 71 24 | 25 | np.random.seed(seed) 26 | valid_size = 0.2 27 | LOOP = 1 28 | ESR = 100 29 | # XGB param 30 | nround = 1500 31 | #nround = 10 32 | 33 | param = {'max_depth':7, # 基准是5 34 | 'learning_rate':0.05, 35 | 'gamma ':0.1, 36 | 'colsample_bytree':0.8, # old 0.8 37 | 'subsample':0.8, 38 | 'silent':1, 39 | 'objective':'binary:logistic', 40 | # 'scale_pos_weight':6 41 | } 42 | 43 | 44 | # In[6]: 45 | 46 | df_all = pd.read_csv('../data/alldata/df_data1234567890.csv') 47 | 48 | 49 | # In[ ]: 50 | 51 | # df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:])) 52 | 53 | 54 | 55 | # In[9]: 56 | 57 | timeType 
= ['RGYEAR','FIRST_CHANGE_TIME','END_CHANGE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR', 58 | 'BRANCH_FIRST_CLOSE_YEAR','TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME', 59 | 'RIGHT_FIRST_ASK_TIME', 'RIGHT_FIRST_FB_TIME','RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME', 60 | 'PROJECT_FIRST_TIME', 'PROJECT_END_TIME','LAWSUIT_FIRST_TIME', 'LAWSUIT_END_TIME', 61 | 'BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME','BREAKFAITH_END_FIRST_TIME', 62 | 'RECRUIT_FIRST_TIME','RECRUIT_END_TIME','QUALIFICATION_FIRST_FIRST_TIME', 63 | 'QUALIFICATION_FIRST_END_TIME','QUALIFICATION_END_FIRST_TIME'] 64 | 65 | 66 | def time2int(x): 67 | ''' 68 | 将时间转化为整数,加入训练 69 | 2018-01 - x 70 | ''' 71 | y = int(x[:x.find('-')]) 72 | m = int(x[x.find('-')+1:]) 73 | s = 12*(2018-y) + (1-m) 74 | 75 | return s 76 | 77 | # 尝试一下去电时间 78 | for t in timeType: 79 | df_all[t] = df_all[t].apply(time2int) 80 | 81 | 82 | 83 | # In[10]: 84 | 85 | df_all.replace([np.inf, -np.inf], np.nan,inplace=True) 86 | df_all = df_all.fillna(0) 87 | 88 | # 默认填充的0,显示使用一个负数尝试一下 89 | 90 | 91 | # In[20]: 92 | 93 | features = df_all.columns[0:] 94 | features = list(features) 95 | features.remove('EID') 96 | label = 'TARGET' 97 | 98 | len(features) 99 | 100 | 101 | # In[25]: 102 | 103 | def xtrain_and_test(df_all): 104 | ''' 105 | 得到训练数据和测试数据 106 | ''' 107 | df_label = pd.read_csv('../data/public/train.csv') 108 | df_test_label = pd.read_csv('../data/public/evaluation_public.csv') 109 | # print(len(df_label)) 110 | # print(len(df_test_label)) 111 | df_label.drop('ENDDATE',axis=1,inplace=True) 112 | 113 | df_train = df_all[df_all['EID'].isin(df_label['EID'])] 114 | df_test = df_all[df_all['EID'].isin(df_test_label['EID'])] 115 | 116 | df_train = pd.merge(df_train,df_label,how='left',on=['EID']) 117 | 118 | return df_train,df_test 119 | 120 | df_train,df_test = xtrain_and_test(df_all) 121 | 122 | 123 | # In[36]: 124 | 125 | clf = XGBClassifier(max_depth=7,learning_rate=0.05,gamma=0.1, 126 | colsample_bytree=0.8,subsample=0.8, 127 | silent=1,objective='binary:logistic') 128 | 129 | # clf = XGBClassifier(param) 130 | 131 | 132 | # In[38]: 133 | 134 | clf.fit(df_train[features],df_train[label],eval_metric='auc',verbose=5) 135 | 136 | 137 | 138 | # In[44]: 139 | 140 | prob = clf.predict_proba(df_test[features]) 141 | 142 | 143 | # In[ ]: 144 | 145 | 146 | 147 | 148 | # In[45]: 149 | 150 | proba_test = pd.DataFrame() 151 | proba_test['EID'] = df_test['EID'] 152 | proba_test['FORTARGET'] = [0 for i in range(len(df_test))] 153 | proba_test['PROB'] = prob[:,1] 154 | 155 | 156 | # In[50]: 157 | 158 | proba_test.loc[proba_test['PROB']>=0.23,'FORTARGET'] = 1 159 | 160 | 161 | # In[51]: 162 | 163 | proba_test.to_csv('../xresult/xsubmussion_1234567890_7_fit.csv',index=False,index_label=False) 164 | 165 | 166 | # In[52]: 167 | 168 | print(len(proba_test[proba_test['FORTARGET']==1])) 169 | print(len(proba_test[proba_test['FORTARGET']==0])) 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /stack/remove_onefeatures.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | 10 | # 重新审视数据,对年和月进行分割,对不听的类别的特征添加更过的one-hot编码 11 | # ont-hot 12 | # 1. HY 13 | # 2. ETYPE 14 | # 3. ALEREO 15 | # 4. WXCODE 16 | # 5. 
年和月(日期) 17 | # 18 | # 19 | # > 可以确定的one-hot编码: 20 | # + HY 21 | # + ETYPE 22 | # + CHANGE_FIRST_ALTERNO 23 | # + CHANGE_END_ALTERNO 24 | # + CHANGE_ALTERNO_MUCNID 25 | # + RIGHT_FIRST_TYPECODE 26 | # + RIGHT_END_TYPECODE 27 | # + RIGHT_TYPECODE_MUCHID 28 | # 29 | # 删除只有一个特征的脏特征。 30 | # 31 | # 32 | # 对所有的有关时间还能的特征,分成年和月两个维度, 33 | # 34 | # timeType = ['RGYEAR','FIRST_CHANGE_TIME','END_CHANGE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR', 35 | # 'BRANCH_FIRST_CLOSE_YEAR','TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME', 36 | # 'RIGHT_FIRST_ASK_TIME', 'RIGHT_FIRST_FB_TIME','RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME', 37 | # 'PROJECT_FIRST_TIME', 'PROJECT_END_TIME', 'LAWSUIT_FIRST_TIME', 'LAWSUIT_END_TIME', 38 | # 'BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME','BREAKFAITH_END_FIRST_TIME', 39 | # 'RECRUIT_FIRST_TIME', 'RECRUIT_END_TIME'] 40 | # 41 | # 继续添加特征, 42 | # 增加注册资本 / 2018.01 - RGYEAR。 43 | # 增加注册资本注册RATE / 2018.01 - RGYEAR。 44 | 45 | 46 | # In[2]: 47 | 48 | # 所有时间类型的特征 49 | timeType = ['RGYEAR','FIRST_CHANGE_TIME','END_CHANGE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR', 50 | 'BRANCH_FIRST_CLOSE_YEAR','TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME', 51 | 'RIGHT_FIRST_ASK_TIME', 'RIGHT_FIRST_FB_TIME','RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME', 52 | 'PROJECT_FIRST_TIME', 'PROJECT_END_TIME', 'LAWSUIT_FIRST_TIME', 'LAWSUIT_END_TIME', 53 | 'BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME','BREAKFAITH_END_FIRST_TIME', 54 | 'RECRUIT_FIRST_TIME', 'RECRUIT_END_TIME','QUALIFICATION_FIRST_FIRST_TIME', 55 | 'QUALIFICATION_FIRST_END_TIME','QUALIFICATION_END_FIRST_TIME' ] 56 | 57 | 58 | # 所有进行one-hot的编码的数据,对注册时间和变成时间得到的年和月进行one-hot编码 59 | # one-hot的特征。 60 | onehot = ['HY','ETYPE','CHANGE_FIRST_ALTERNO','CHANGE_END_ALTERNO','CHANGE_ALTERNO_MUCNID', 61 | 'RIGHT_FIRST_TYPECODE','RIGHT_END_TYPECODE','RIGHT_TYPECODE_MUCHID','BREAKFAITH_FIRST_END_TIME_MONTH', 62 | 'BREAKFAITH_FIRST_FIRST_TIME_MONTH','END_CHANGE_TIME_MONTH','FIRST_CHANGE_TIME_MONTH', 63 | 'LAWSUIT_END_TIME_MONTH','LAWSUIT_END_TIME_YEAR','LAWSUIT_FIRST_TIME_MONTH','LAWSUIT_FIRST_TIME_YEAR', 64 | 'PROJECT_END_TIME_MONTH','PROJECT_END_TIME_YEAR','PROJECT_FIRST_TIME_MONTH','PROJECT_FIRST_TIME_YEAR', 65 | 'QUALIFICATION_END_FIRST_TIME_MONTH','QUALIFICATION_FIRST_END_TIME_MONTH','QUALIFICATION_FIRST_FIRST_TIME_MONTH', 66 | 'RECRUIT_END_TIME_MONTH','RECRUIT_END_TIME_YEAR','RECRUIT_FIRST_TIME_MONTH','RECRUIT_FIRST_TIME_YEAR', 67 | 'RIGHT_END_ASK_TIME_MONTH','RIGHT_END_FB_TIME_MONTH','RIGHT_FIRST_ASK_TIME_MONTH','RIGHT_FIRST_FB_TIME_MONTH'] 68 | 69 | def time2int(x): 70 | ''' 71 | 将时间转化为整数,加入训练 72 | 2018-01 - x 73 | ''' 74 | y = int(x[:x.find('-')]) 75 | m = int(x[x.find('-')+1:]) 76 | s = 12*(2018-y) + (1-m) 77 | return s 78 | 79 | 80 | def removeOneValueFeature(df_all): 81 | ''' 82 | 删除只有一个值的特征,没有用处,而且占用空间, 83 | 设置直接读取的特征文件。 84 | ''' 85 | features = list(df_all.columns) 86 | for f in features: 87 | if len(set(df_all[f])) == 1: 88 | df_all.drop(f,axis=1,inplace=True) 89 | # 设置时间日期 90 | for t in timeType: 91 | df_all[t] = df_all[t].apply(time2int) 92 | 93 | df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:])) 94 | return df_all 95 | 96 | def runRemove(inputfile,outputfile): 97 | ''' 98 | 输入输出文件,文件名主义其格式的不同。 99 | ''' 100 | df_all = pd.read_csv(inputfile) 101 | 102 | df_all = removeOneValueFeature(df_all) 103 | df_all.to_csv(outputfile,index=False,index_label=False) 104 | 105 | del df_all 106 | return outputfile 107 | 108 | # 运行 109 | inputfile = 
['../data/alldata/df_data1234567890.csv','../data/alldata/df_data1234567890_plus.csv'] 110 | outputfile = ['../data/alldata/df_data_all.csv','../data/alldata/df_data_plus_all.csv'] 111 | for i in range(2): 112 | print(i," start ",inputfile[i]) 113 | runRemove(inputfile[i],outputfile[i]) 114 | print(i," end ",inputfile[i]) 115 | 116 | -------------------------------------------------------------------------------- /feature/review_features.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | 10 | # 重新审视数据,对年和月进行分割,对不听的类别的特征添加更过的one-hot编码 11 | # ont-hot 12 | # 1. HY 13 | # 2. ETYPE 14 | # 3. ALEREO 15 | # 4. WXCODE 16 | # 5. 年和月(日期) 17 | # 18 | # 19 | # > 可以确定的one-hot编码: 20 | # + HY 21 | # + ETYPE 22 | # + CHANGE_FIRST_ALTERNO 23 | # + CHANGE_END_ALTERNO 24 | # + CHANGE_ALTERNO_MUCNID 25 | # + RIGHT_FIRST_TYPECODE 26 | # + RIGHT_END_TYPECODE 27 | # + RIGHT_TYPECODE_MUCHID 28 | # 29 | # 删除只有一个特征的脏特征。 30 | # 31 | # 32 | # 对所有的有关时间还能的特征,分成年和月两个维度, 33 | # 34 | # timeType = ['RGYEAR','FIRST_CHANGE_TIME','END_CHANGE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR', 35 | # 'BRANCH_FIRST_CLOSE_YEAR','TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME', 36 | # 'RIGHT_FIRST_ASK_TIME', 'RIGHT_FIRST_FB_TIME','RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME', 37 | # 'PROJECT_FIRST_TIME', 'PROJECT_END_TIME', 'LAWSUIT_FIRST_TIME', 'LAWSUIT_END_TIME', 38 | # 'BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME','BREAKFAITH_END_FIRST_TIME', 39 | # 'RECRUIT_FIRST_TIME', 'RECRUIT_END_TIME'] 40 | # 41 | # 继续添加特征, 42 | # 增加注册资本 / 2018.01 - RGYEAR。 43 | # 增加注册资本注册RATE / 2018.01 - RGYEAR。 44 | 45 | 46 | # In[2]: 47 | 48 | # 所有时间类型的特征 49 | timeType = ['RGYEAR','FIRST_CHANGE_TIME','END_CHANGE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR', 50 | 'BRANCH_FIRST_CLOSE_YEAR','TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME', 51 | 'RIGHT_FIRST_ASK_TIME', 'RIGHT_FIRST_FB_TIME','RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME', 52 | 'PROJECT_FIRST_TIME', 'PROJECT_END_TIME', 'LAWSUIT_FIRST_TIME', 'LAWSUIT_END_TIME', 53 | 'BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME','BREAKFAITH_END_FIRST_TIME', 54 | 'RECRUIT_FIRST_TIME', 'RECRUIT_END_TIME','QUALIFICATION_FIRST_FIRST_TIME', 55 | 'QUALIFICATION_FIRST_END_TIME','QUALIFICATION_END_FIRST_TIME' ] 56 | 57 | 58 | # 所有进行one-hot的编码的数据,对注册时间和变成时间得到的年和月进行one-hot编码 59 | onehot = ['PROV','HY','ETYPE','CHANGE_FIRST_ALTERNO','CHANGE_END_ALTERNO','CHANGE_ALTERNO_MUCNID', 60 | 'RIGHT_FIRST_TYPECODE','RIGHT_END_TYPECODE','RIGHT_TYPECODE_MUCHID', 61 | 'BRANCH_ETYPE_OPEN_ALL_MAX_RATE','BRANCH_OPEN_ETYPE_MAX','BREAKFAITH_2_OPEN_CNT', 62 | 'BREAKFAITH_ETYPE_CNT_ALL_RATE_MAX','BREAKFAITH_ETYPE_CNT_MAX','CHANGE_ETYPE_CNT_ALL_MAX', 63 | 'CHANGE_ETYPE_CNT_MAX','QUALIFICATION_1_CLOSE_CNT','QUALIFICATION_2_CLOSE_CNT', 64 | 'QUALIFICATION_2_OPEN_CNT','QUALIFICATION_3_CLOSE_CNT','QUALIFICATION_3_OPEN_CNT', 65 | 'QUALIFICATION_5_OPEN_CNT','QUALIFICATION_ETYPE_CNT_ALL_RATE_MAX','QUALIFICATION_ETYPE_CNT_MAX', 66 | 'TZ_CLOSE_ETYPE_CNT_MAX','TZ_ETYPE_CLOSE_CNT_ALL_MAX_RATE','TZ_ETYPE_CNT_ALL_MAX_RATE', 67 | 'TZ_ETYPE_CNT_MAX'] 68 | 69 | # 这些特征只有一个值,直接删除 70 | oneValueFeature = ['BRANCH_1_OPEN_CNT','BREAKFAITH_1_OPEN_CNT','CHANGE_1_CNT', 71 | 'LAWSUIT_1_OPEN_CNT','PROJECT_1_OPEN_CNT','QUALIFICATION_1_OPEN_CNT', 72 | 'RECRUIT_1_OPEN_CNT','RIGHT_1_OPEN_CNT','TZ_1_OPEN_CNT', 73 | 'TZ_ETYPE_BTBL_ALL_MAX_RATE','TZ_ETYPE_BTBL_MAX'] 74 | 75 | 76 | # In[3]: 77 | 78 | df_all = 
pd.read_csv('../data/alldata/df_data1234567890_plus.csv') 79 | 80 | 81 | # In[4]: 82 | 83 | 84 | # In[15]: 85 | 86 | # df_all['RGYEAR_DIFF'].value_counts() 87 | 88 | 89 | # In[5]: 90 | 91 | x = list(df_all.columns) 92 | x.sort() 93 | 94 | # In[25]: 95 | 96 | oneValueFeature = [] 97 | 98 | for f in x: 99 | if(len(set(df_all[f]))<=10 and len(set(df_all[f]))>2): 100 | print(f) 101 | oneValueFeature.append(f) 102 | 103 | 104 | # In[ ]: 105 | 106 | 107 | 108 | 109 | # In[30]: 110 | 111 | # 会多出来50个特征, 112 | k = 0 113 | for feature in timeType: 114 | # print(feature) 115 | df_all[feature+'_YEAR'] = df_all[feature].map(lambda x: int(x[:x.find('-')])) 116 | df_all[feature+'_MONTH'] = df_all[feature].map(lambda x: int(x[x.find('-')+1:])) 117 | print (k) 118 | k+=2 119 | 120 | 121 | # In[32]: 122 | 123 | # ont-hot编码,多出150个特征 124 | for feature in onehot: 125 | df_all[feature] = df_all[feature].astype(np.int32) 126 | df_tmp = pd.get_dummies(df_all[feature], prefix=feature) 127 | df_all[df_tmp.columns] = df_tmp 128 | df_all.drop(feature,axis=1,inplace=True) 129 | print(feature) 130 | 131 | 132 | 133 | # In[5]: 134 | 135 | # 删除只有一个值的特征,这是脏特征 136 | for f in oneValueFeature: 137 | df_all.drop(f,axis=1,inplace=True) 138 | 139 | 140 | # In[6]: 141 | 142 | # df_all.info() 143 | # df_all.head() 144 | 145 | 146 | # In[7]: 147 | 148 | # 没有对时间进行one-hot编码 149 | df_all.to_csv('../data/alldata/df_all_reviewed.csv',index=False,index_label=False) 150 | 151 | # df_all.to_csv('../data/alldata/df_all1234567890_removeOneFeature.csv',index=False,index_label=False) 152 | 153 | 154 | # In[ ]: 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /stack/4_onehot_features.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | 9 | 10 | # 重新审视数据,对年和月进行分割,对不听的类别的特征添加更过的one-hot编码 11 | # ont-hot 12 | # 1. HY 13 | # 2. ETYPE 14 | # 3. ALEREO 15 | # 4. WXCODE 16 | # 5. 
年和月(日期) 17 | # 18 | # 19 | # > 可以确定的one-hot编码: 20 | # + HY 21 | # + ETYPE 22 | # + CHANGE_FIRST_ALTERNO 23 | # + CHANGE_END_ALTERNO 24 | # + CHANGE_ALTERNO_MUCNID 25 | # + RIGHT_FIRST_TYPECODE 26 | # + RIGHT_END_TYPECODE 27 | # + RIGHT_TYPECODE_MUCHID 28 | # 29 | # 删除只有一个特征的脏特征。 30 | # 31 | # 32 | # 对所有的有关时间还能的特征,分成年和月两个维度, 33 | # 34 | # timeType = ['RGYEAR','FIRST_CHANGE_TIME','END_CHANGE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR', 35 | # 'BRANCH_FIRST_CLOSE_YEAR','TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME', 36 | # 'RIGHT_FIRST_ASK_TIME', 'RIGHT_FIRST_FB_TIME','RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME', 37 | # 'PROJECT_FIRST_TIME', 'PROJECT_END_TIME', 'LAWSUIT_FIRST_TIME', 'LAWSUIT_END_TIME', 38 | # 'BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME','BREAKFAITH_END_FIRST_TIME', 39 | # 'RECRUIT_FIRST_TIME', 'RECRUIT_END_TIME'] 40 | # 41 | # 继续添加特征, 42 | # 增加注册资本 / 2018.01 - RGYEAR。 43 | # 增加注册资本注册RATE / 2018.01 - RGYEAR。 44 | 45 | 46 | # In[2]: 47 | 48 | # 所有时间类型的特征 49 | timeType = ['RGYEAR','FIRST_CHANGE_TIME','END_CHANGE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR', 50 | 'BRANCH_FIRST_CLOSE_YEAR','TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME', 51 | 'RIGHT_FIRST_ASK_TIME', 'RIGHT_FIRST_FB_TIME','RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME', 52 | 'PROJECT_FIRST_TIME', 'PROJECT_END_TIME', 'LAWSUIT_FIRST_TIME', 'LAWSUIT_END_TIME', 53 | 'BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME','BREAKFAITH_END_FIRST_TIME', 54 | 'RECRUIT_FIRST_TIME', 'RECRUIT_END_TIME','QUALIFICATION_FIRST_FIRST_TIME', 55 | 'QUALIFICATION_FIRST_END_TIME','QUALIFICATION_END_FIRST_TIME' ] 56 | 57 | 58 | # 所有进行one-hot的编码的数据,对注册时间和变成时间得到的年和月进行one-hot编码 59 | # one-hot的特征。 60 | onehot = ['HY','ETYPE','CHANGE_FIRST_ALTERNO','CHANGE_END_ALTERNO','CHANGE_ALTERNO_MUCNID', 61 | 'RIGHT_FIRST_TYPECODE','RIGHT_END_TYPECODE','RIGHT_TYPECODE_MUCHID','BREAKFAITH_FIRST_END_TIME_MONTH', 62 | 'BREAKFAITH_FIRST_FIRST_TIME_MONTH','END_CHANGE_TIME_MONTH','FIRST_CHANGE_TIME_MONTH', 63 | 'LAWSUIT_END_TIME_MONTH','LAWSUIT_END_TIME_YEAR','LAWSUIT_FIRST_TIME_MONTH','LAWSUIT_FIRST_TIME_YEAR', 64 | 'PROJECT_END_TIME_MONTH','PROJECT_END_TIME_YEAR','PROJECT_FIRST_TIME_MONTH','PROJECT_FIRST_TIME_YEAR', 65 | 'QUALIFICATION_END_FIRST_TIME_MONTH','QUALIFICATION_FIRST_END_TIME_MONTH','QUALIFICATION_FIRST_FIRST_TIME_MONTH', 66 | 'RECRUIT_END_TIME_MONTH','RECRUIT_END_TIME_YEAR','RECRUIT_FIRST_TIME_MONTH','RECRUIT_FIRST_TIME_YEAR', 67 | 'RIGHT_END_ASK_TIME_MONTH','RIGHT_END_FB_TIME_MONTH','RIGHT_FIRST_ASK_TIME_MONTH','RIGHT_FIRST_FB_TIME_MONTH'] 68 | 69 | def time2int(x): 70 | ''' 71 | 将时间转化为整数,加入训练 72 | 2018-01 - x 73 | ''' 74 | y = int(x[:x.find('-')]) 75 | m = int(x[x.find('-')+1:]) 76 | s = 12*(2018-y) + (1-m) 77 | return s 78 | 79 | 80 | def removeOneValueFeature(df_all): 81 | ''' 82 | 删除只有一个值的特征,没有用处,而且占用空间, 83 | 设置直接读取的特征文件。 84 | ''' 85 | features = list(df_all.columns) 86 | for f in features: 87 | if len(set(df_all[f])) == 1: 88 | df_all.drop(f,axis=1,inplace=True) 89 | 90 | df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:])) 91 | return df_all 92 | 93 | 94 | def onehotFeature(df_all): 95 | ''' 96 | 进行onehot编码, 97 | ''' 98 | # 会多出来50个特征, 99 | for feature in timeType: 100 | # print(feature) 101 | df_all[feature+'_YEAR'] = df_all[feature].map(lambda x: int(x[:x.find('-')])) 102 | df_all[feature+'_MONTH'] = df_all[feature].map(lambda x: int(x[x.find('-')+1:])) 103 | 104 | # In[32]: 105 | 106 | # ont-hot编码,多出350+个特征 107 | for feature in onehot: 108 | df_all[feature] = df_all[feature].astype(np.int32) 109 | df_tmp 
= pd.get_dummies(df_all[feature], prefix=feature) 110 | df_all[df_tmp.columns] = df_tmp 111 | df_all.drop(feature,axis=1,inplace=True) 112 | 113 | # 设置时间日期 114 | for t in timeType: 115 | df_all[t] = df_all[t].apply(time2int) 116 | 117 | df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:])) 118 | return df_all 119 | 120 | 121 | def runOneHot(inputfile,outputfile): 122 | ''' 123 | 输入输出文件,文件名主义其格式的不同。 124 | ''' 125 | df_all = pd.read_csv(inputfile) 126 | 127 | df_all = onehotFeature(df_all) 128 | 129 | df_all = removeOneValueFeature(df_all) 130 | 131 | df_all.to_csv(outputfile,index=False,index_label=False) 132 | del df_all 133 | return outputfile 134 | 135 | inputfile = ['../data/alldata/df_data1234567890.csv','../data/alldata/df_data1234567890_plus.csv'] 136 | outputfile = ['../data/alldata/df_data_onehot.csv','../data/alldata/df_data_plus_onehot.csv'] 137 | for i in range(2): 138 | print(i," start ",inputfile[i]) 139 | runOneHot(inputfile[i],outputfile[i]) 140 | print(i," end ",inputfile[i]) 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /model/avg_result.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 合并给定的结果文件,进行平均 5 | # 6 | 7 | # In[1]: 8 | 9 | import pandas as pd 10 | import numpy as np 11 | 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | # In[38]: 16 | 17 | def avg_result(results,alpha=0.215): 18 | """ 19 | 参数: 20 | results:结果列表(可以是Dataframe或者是file类型) 21 | alpha:概率阈值,默认是0.23,大于这个概率的是1 22 | 23 | return: 24 | 平均之后的结果 25 | """ 26 | if isinstance(results[0],str): 27 | for i in range(len(results)): 28 | print(i) 29 | results[i] = pd.read_csv(results[i]) 30 | 31 | 32 | df_result = results[0] 33 | for i in range(1,len(results)): 34 | df_result['PROB'] = df_result['PROB'] + results[i]['PROB'] 35 | 36 | df_result['PROB'] = df_result['PROB'] / len(results) 37 | 38 | df_result.loc[df_result['PROB']>=alpha,'FORTARGET'] = 1 39 | df_result.loc[df_result['PROB'] ",inputfile[i]) 221 | runModel(inputfile[i],outputfile[i]) 222 | print(i," end --> ",inputfile[i]) 223 | 224 | -------------------------------------------------------------------------------- /model/3_model_prov.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## 根据省份数据,将所有的数据分成单个省进行训练和预测 5 | # 6 | # + prov = 11 7 | # + prov = 12 8 | # 分成两个特征,这是目前最有的结果, 2017-11-27 9 | # 10 | # In[1]: 11 | 12 | import xgboost as xgb 13 | import pandas as pd 14 | import numpy as np 15 | from sklearn.model_selection import KFold 16 | from sklearn.model_selection import train_test_split 17 | 18 | 19 | import seaborn as sns 20 | import matplotlib.pyplot as plt 21 | 22 | 23 | # In[2]: 24 | 25 | #seed = np.random.randint(99999) 26 | seed = 71 27 | 28 | np.random.seed(seed) 29 | valid_size = 0.2 30 | LOOP = 1 31 | ESR = 100 32 | # XGB param 33 | nround = 1500 34 | #nround = 10 35 | 36 | param = {'max_depth':7, # 基准是5 37 | 'eta':0.05, 38 | 'gamma ':0.1, 39 | 'colsample_bytree':0.8, # old 0.8 40 | 'subsample':0.8, 41 | 'silent':1, 42 | 'eval_metric':'auc', 43 | 'objective':'binary:logistic', 44 | # 'scale_pos_weight':6 45 | } 46 | 47 | 48 | # In[3]: 49 | 50 | df_all = pd.read_csv('../data/alldata/df_all1234567890_plus.csv') 51 | df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:])) 52 | 53 | 54 | 55 | # In[4]: 56 | 57 | # 融合模型不需要运行 58 | 59 | timeType = ['RGYEAR','FIRST_CHANGE_TIME','END_CHANGE_TIME','BRANCH_FIRST_YEAR','BRANCH_END_YEAR', 60 | 
'BRANCH_FIRST_CLOSE_YEAR','TZ_QY_FIRST_TIME','TZ_QY_END_TIME','TZ_FIRST_CLOSE_TIME', 61 | 'RIGHT_FIRST_ASK_TIME', 'RIGHT_FIRST_FB_TIME','RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME', 62 | 'PROJECT_FIRST_TIME', 'PROJECT_END_TIME','LAWSUIT_FIRST_TIME', 'LAWSUIT_END_TIME', 63 | 'BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME','BREAKFAITH_END_FIRST_TIME', 64 | 'RECRUIT_FIRST_TIME','RECRUIT_END_TIME','QUALIFICATION_FIRST_FIRST_TIME', 65 | 'QUALIFICATION_FIRST_END_TIME','QUALIFICATION_END_FIRST_TIME'] 66 | 67 | 68 | def time2int(x): 69 | ''' 70 | 将时间转化为整数,加入训练 71 | 2018-01 - x 72 | ''' 73 | y = int(x[:x.find('-')]) 74 | m = int(x[x.find('-')+1:]) 75 | s = 12*(2018-y) + (1-m) 76 | 77 | return s 78 | 79 | # 尝试一下去电时间 80 | for t in timeType: 81 | df_all[t] = df_all[t].apply(time2int) 82 | 83 | 84 | 85 | # In[5]: 86 | 87 | # df_all['PROV'].value_counts() 88 | 89 | 90 | # In[6]: 91 | 92 | features = df_all.columns[0:] 93 | features = list(features) 94 | features.remove('EID') 95 | features.remove('PROV') 96 | 97 | label = 'TARGET' 98 | 99 | 100 | # In[14]: 101 | 102 | def split_build_valid(df_train,k=10): 103 | ''' 104 | k-fold交叉验证,默认k=10 105 | df_train:训练数据 106 | ''' 107 | 108 | #added some parameters 109 | kf = KFold(n_splits = k, shuffle = True, random_state = 400000) 110 | result1 = next(kf.split(df_train[df_train[label]==1]), None) 111 | result0 = next(kf.split(df_train[df_train[label]==0]), None) 112 | 113 | train_list = [] 114 | train_list.extend(result1[0]) 115 | train_list.extend(result0[0]) 116 | np.random.shuffle(train_list) 117 | 118 | vali_list = [] 119 | vali_list.extend(result1[1]) 120 | vali_list.extend(result0[1]) 121 | np.random.shuffle(vali_list) 122 | 123 | dbuild = xgb.DMatrix(df_train.iloc[train_list][features],label=df_train.iloc[train_list][label]) 124 | dvalid = xgb.DMatrix(df_train.iloc[vali_list][features],label=df_train.iloc[vali_list][label]) 125 | watchlist = [(dbuild, 'build'),(dvalid, 'valid')] 126 | 127 | return dbuild, dvalid, watchlist 128 | 129 | def split_train_valid(df_train,test_size=0.2): 130 | ''' 131 | k-fold交叉验证,默认k=10 132 | df_train:训练数据 133 | ''' 134 | X_train, X_vali, y_train, y_vali = train_test_split(df_train[features], df_train[label], test_size=test_size, random_state=40000) 135 | #added some parameters 136 | 137 | # dtrain = df_train.iloc[train_list] 138 | # dvali = df_train.iloc[vali_list] 139 | 140 | dtrain = xgb.DMatrix(X_train,label=y_train) 141 | dvalid = xgb.DMatrix(X_vali,label=y_vali) 142 | watchlist = [(dtrain, 'train'),(dvalid, 'valid')] 143 | 144 | return dtrain, dvalid, watchlist 145 | 146 | def xtrain_and_test(df_all): 147 | ''' 148 | 得到训练数据和测试数据 149 | ''' 150 | df_label = pd.read_csv('../data/public/train.csv') 151 | df_test_label = pd.read_csv('../data/public/evaluation_public.csv') 152 | df_label.drop('ENDDATE',axis=1,inplace=True) 153 | 154 | df_train = df_all[df_all['EID'].isin(df_label['EID'])] 155 | df_test = df_all[df_all['EID'].isin(df_test_label['EID'])] 156 | 157 | df_train = pd.merge(df_train,df_label,how='left',on=['EID']) 158 | 159 | return df_train,df_test 160 | 161 | def split_data_with_prov(df_data): 162 | ''' 163 | 根据特征PROV,分割数据,进行单独预测之后合并数据结果 164 | ''' 165 | df_train_prov11 = df_data[df_data['PROV'] == 11] 166 | df_train_prov12 = df_data[df_data['PROV'] == 12] 167 | 168 | return df_train_prov11,df_train_prov12 169 | 170 | 171 | 172 | # In[8]: 173 | 174 | df_train,df_test = xtrain_and_test(df_all) 175 | 176 | # 根据省份11和12分成两份数据,进行预测。 177 | df_train_prov11,df_train_prov12 = split_data_with_prov(df_train) 178 | 
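# From here on, the script trains and predicts separately for PROV == 11 and
# PROV == 12, then stitches the two prediction frames back together by EID.
# As a generic illustration of that pattern (a hypothetical helper, not an
# interface used elsewhere in this repo; fit_predict stands for any callable
# returning probabilities for one group's test rows):
import pandas as pd

def per_group_predict(df_train, df_test, group_col, fit_predict):
    parts = []
    for g, tr in df_train.groupby(group_col):
        te = df_test[df_test[group_col] == g]
        part = te[['EID']].copy()
        part['PROB'] = fit_predict(tr, te)  # probabilities for this group's test rows
        parts.append(part)
    return pd.concat(parts, ignore_index=True)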
df_test_prov11,df_test_prov12 = split_data_with_prov(df_test) 179 | 180 | 181 | # In[12]: 182 | 183 | # print(len(df_train_prov11[df_train_prov11['TARGET']==1])) 184 | 185 | # print(len(df_train_prov11[df_train_prov11['TARGET']==0])) 186 | 187 | 188 | # In[15]: 189 | 190 | print('PROV == 11') 191 | 192 | models = [] 193 | for i in range(LOOP): 194 | print('11 LOOP',i) 195 | # dbuild, dvalid, watchlist = split_build_valid(df_train) 196 | dbuild, dvalid, watchlist = split_train_valid(df_train_prov11,test_size=0.2) 197 | 198 | 199 | model = xgb.train(param, dbuild, nround, watchlist,early_stopping_rounds=ESR,verbose_eval=5) 200 | models.append(model) 201 | model.save_model('../saveModel/model1'+ str(i) + '.model') 202 | # VALID 203 | valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration) 204 | print('11 Valid Mean:---------------------->', np.mean(valid_yhat)) 205 | del dbuild, dvalid, watchlist 206 | 207 | 208 | 209 | # In[16]: 210 | 211 | print('PROV == 12') 212 | models = [] 213 | for i in range(LOOP): 214 | print('12 LOOP',i) 215 | # dbuild, dvalid, watchlist = split_build_valid(df_train) 216 | dbuild, dvalid, watchlist = split_train_valid(df_train_prov12,test_size=0.2) 217 | 218 | 219 | model = xgb.train(param, dbuild, nround, watchlist,early_stopping_rounds=ESR,verbose_eval=5) 220 | models.append(model) 221 | model.save_model('../saveModel/model1'+ str(i) + '.model') 222 | # VALID 223 | valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration) 224 | print('12 Valid Mean:---------------------->', np.mean(valid_yhat)) 225 | del dbuild, dvalid, watchlist 226 | 227 | 228 | 229 | # ------------------------------------------------------------------------ 230 | 231 | # In[19]: 232 | 233 | print('PROV == 11') 234 | 235 | # 得到最优参数,使用全部数据进行训练数据 236 | models = [] 237 | for i in range(LOOP): 238 | print('PROV 11 LOOP',i) 239 | dbuild, dvalid, watchlist = split_train_valid(df_train_prov11,test_size=0.001) 240 | nround = 250 # 在验证的基础上加上20轮左右 241 | model = xgb.train(param, dbuild,nround,watchlist,verbose_eval=5) 242 | models.append(model) 243 | model.save_model('../saveModel/model1234567'+ str(i) + '.model') 244 | # VALID 245 | valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration) 246 | print('PROV 11 Valid Mean:---------------------->', np.mean(valid_yhat)) 247 | del dbuild, dvalid, watchlist 248 | 249 | #============================================================================== 250 | print('PROV test 11') 251 | #============================================================================== 252 | 253 | dtest11 = xgb.DMatrix(df_test_prov11[features]) 254 | proba_test11 = pd.DataFrame() 255 | proba_test11['EID'] = df_test_prov11['EID'] 256 | proba_test11['FORTARGET'] = [0 for i in range(len(df_test_prov11))] 257 | proba_test11['PROB'] = [0 for i in range(len(df_test_prov11))] 258 | for model in models: 259 | proba_test11['PROB'] += model.predict(dtest11) 260 | proba_test11['PROB'] /= LOOP 261 | 262 | 263 | 264 | # In[20]: 265 | 266 | print('PROV == 12') 267 | 268 | models = [] 269 | for i in range(LOOP): 270 | print('PROV 12 LOOP',i) 271 | dbuild, dvalid, watchlist = split_train_valid(df_train_prov12,test_size=0.001) 272 | nround = 250 # 在验证的基础上加上20轮左右 273 | model = xgb.train(param, dbuild,nround,watchlist,verbose_eval=5) 274 | models.append(model) 275 | model.save_model('../saveModel/model1234567'+ str(i) + '.model') 276 | # VALID 277 | valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration) 278 | print('12 Valid Mean:---------------------->', 
np.mean(valid_yhat)) 279 | del dbuild, dvalid, watchlist 280 | 281 | #============================================================================== 282 | print('PROV test 12') 283 | #============================================================================== 284 | 285 | dtest12 = xgb.DMatrix(df_test_prov12[features]) 286 | proba_test12 = pd.DataFrame() 287 | proba_test12['EID'] = df_test_prov12['EID'] 288 | proba_test12['FORTARGET'] = [0 for i in range(len(df_test_prov12))] 289 | proba_test12['PROB'] = [0 for i in range(len(df_test_prov12))] 290 | for model in models: 291 | proba_test12['PROB'] += model.predict(dtest12) 292 | proba_test12['PROB'] /= LOOP 293 | 294 | 295 | # In[33]: 296 | 297 | print(len(proba_test11[proba_test11['PROB']>=0.23])) 298 | print(len(proba_test11[proba_test11['PROB']<=0.23])) 299 | 300 | 301 | print(len(proba_test12[proba_test12['PROB']>=0.22])) 302 | print(len(proba_test12[proba_test12['PROB']<=0.22])) 303 | 304 | 305 | # In[34]: 306 | 307 | proba_test11.loc[proba_test11['PROB']>=0.23,'FORTARGET'] = 1 308 | proba_test12.loc[proba_test12['PROB']>=0.22,'FORTARGET'] = 1 309 | 310 | 311 | # In[35]: 312 | 313 | proba_tmp = proba_test11.append(proba_test12) 314 | 315 | 316 | 317 | proba_test = pd.DataFrame() 318 | proba_test['EID'] = df_test['EID'] 319 | proba_test['FORTARGET_TMP'] = [0 for i in range(len(df_test))] 320 | 321 | 322 | # In[37]: 323 | 324 | proba_test = pd.merge(proba_test,proba_tmp,how='left',on='EID') 325 | 326 | 327 | # In[41]: 328 | 329 | proba_test.drop('FORTARGET_TMP',axis=1,inplace=True) 330 | 331 | proba_test.info() 332 | 333 | 334 | # In[42]: 335 | 336 | proba_test.to_csv('../xresult/xsubmussion_plus_prov.csv',index=False,index_label=False) 337 | 338 | 339 | # In[43]: 340 | 341 | print(len(proba_test[proba_test['FORTARGET']==1])) 342 | print(len(proba_test[proba_test['FORTARGET']==0])) 343 | 344 | 345 | -------------------------------------------------------------------------------- /feature/8_feature.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | # # 2.3.8 失信数据8breakfaith.csv提取特征 13 | # 1. 企业失信的个数,BREAKFAITH_CNT 14 | # 2. 企业第一个失信数据的日期,BREAKFAITH_FIRST_FIRST_TIME 15 | # 3. 企业最后一个失信数据的日期,BREAKFAITH_FIRST_END_TIME 16 | # 4. 企业第一个失信数据的到期日期,BREAKFAITH_END_FIRST_TIME 17 | # 5. 企业第一个失信数据的日期与其结束日期的差值,BREAKFAITH_FIRST_END_FIRST_DIFF 18 | # 19 | # 5. 企业第一个失信数据的日期与企业注册日期的差值,BREAKFAITH_FIRST_RGYEAR_DIFF 20 | # 6. 企业最后一个失信数据的日期与企业注册日期的差值,BREAKFAITH_END_RGYEAR_DIFF 21 | # 22 | # 7. 企业第一个失信数据的到期日期与企业注册日期的差值,BREAKFAITH_END_RGYEAR_DIFF 23 | # 24 | # 8. 企业第一个失信数据的日期与企业第一次变更的差值,BREAKFAITH_FIRST_CHANGE_FIRST_DIFF 25 | # 9. 企业最后一个失信数据的日期与企业第一次变更的差值,BREAKFAITH_END_CHANGE_FIRST_DIFF 26 | # 27 | # 10. 企业第一个失信数据的日期与企业最后一次变更的差值,BREAKFAITH_FIRST_CHANGE_END_DIFF 28 | # 11. 企业最后一个失信数据的日期与企业最后一次变更的差值,BREAKFAITH_END_CHANGE_END_DIFF 29 | # 30 | # --------------------------------------------------------------------------------------- 31 | # 10. 企业平均每几个月的失信一次, BREAKFAITH_PRE_MONTH_CNT = BREAKFAITH_END_RGYEAR_DIFF / BREAKFAITH_CNT 32 | # 33 | # 11. 企业失信的个数占所有失信个数个平均值的比例,BREAKFAITH_CNT_ALL_RATE 34 | # 35 | # 12. 企业对应的大类HY的平均失信数据的个数,BREAKFAITH_HY_CNT_AVG 36 | # 13. 企业对应大类HY的平均失信个数占所有失信平均个数的比例,BREAKFAITH_HY_CNT_ALL_RATE 37 | # 14. 
企业失信的个数占其对应的大类HY的失信平均值的比例,BREAKFAITH_CNT_HY_RATE 38 | # 39 | # --------------------------------------------------------------------------------------- 40 | # 41 | # 12. 企业对应的大类HY的平均失信数据的个数,BREAKFAITH_ETYPE_CNT_AVG 42 | # 13. 企业对应大类HY的平均失信个数占所有失信平均个数的比例,BREAKFAITH_ETYPE_CNT_ALL_RATE 43 | # 14. 企业失信的个数占其对应的大类HY的失信平均值的比例,BREAKFAITH_CNT_ETYPE_RATE 44 | # 45 | # 对应的MAX数据特征 46 | # 47 | # 复赛新特征,滑动窗口系列的特征,时间段是1年,2年,3年,5年,计算在最近k[1,2,3,5]年之后的数据,主要是个数和次数。 48 | # 时间是开始是2017-08之前的k年 49 | # 1. 之前k年的变更时间的个数。 50 | # BREAKFAITH_K_OPEN_CNT, 51 | # 52 | # 53 | # 54 | 55 | # In[2]: 56 | 57 | df_all = pd.read_csv("../data/alldata/df_data1234567.csv") 58 | df_breakfaith = pd.read_csv("../data/public/8breakfaith.csv") 59 | 60 | 61 | # In[3]: 62 | 63 | # df_all.info() 64 | # df_all.head() 65 | 66 | 67 | # In[4]: 68 | 69 | 70 | # df_breakfaith.info() 71 | # df_breakfaith.head() 72 | 73 | 74 | # In[5]: 75 | 76 | # df_breakfaith['SXENDDATE'].value_counts() 77 | 78 | 79 | # In[6]: 80 | 81 | df_breakfaith['FBDATE'] = df_breakfaith['FBDATE'].map(lambda x:x.replace('年','-').replace('月','')) 82 | 83 | df_breakfaith.loc[df_breakfaith['SXENDDATE'].isnull(),['SXENDDATE']] = '2018/1/1' 84 | df_breakfaith.loc[df_breakfaith['SXENDDATE']=='2018/1/1',['SXENDDATE']] = '2018-01' 85 | 86 | 87 | def time(x): 88 | y = x[:x.find('/')] 89 | m = int(x[x.find('/')+1:x.rfind('/')]) 90 | if m < 10: m = '0'+str(m) 91 | else: m = str(m) 92 | return y + '-' + m 93 | 94 | # df_breakfaith['FBDATE'] = df_breakfaith['FBDATE'].apply(time) 95 | # df_breakfaith['SXENDDATE'] = df_breakfaith['SXENDDATE'].apply(time) 96 | 97 | 98 | # In[7]: 99 | 100 | df_breakfaith = df_breakfaith.sort_values(['FBDATE','SXENDDATE']) 101 | 102 | # df_breakfaith.info() 103 | # df_breakfaith.head() 104 | 105 | 106 | # In[8]: 107 | 108 | EIDS = set(df_breakfaith['EID']) 109 | 110 | # print(len(EIDS)) 111 | 112 | columns = df_breakfaith.columns 113 | df_xbreakfaith = pd.DataFrame(columns=columns) 114 | 115 | # print(columns) 116 | 117 | 118 | # In[9]: 119 | 120 | k = 0 121 | for EID in EIDS: 122 | if k%3000 == 0: 123 | print('第%d次处理--------->',k) 124 | k+=1 125 | tmp = df_breakfaith[df_breakfaith['EID'] == EID] 126 | row = [EID,tmp['TYPECODE'].values,tmp['FBDATE'].values,tmp['SXENDDATE'].values] 127 | 128 | df_xbreakfaith = df_xbreakfaith.append(pd.Series(row,columns),ignore_index=True) 129 | 130 | 131 | 132 | # In[10]: 133 | 134 | # 2017-08 BRANCH_K_OPEN_CNT, BRANCH_K_CLOSE_CNT 135 | df_xbreakfaith['BREAKFAITH_1_OPEN_CNT'] = df_xbreakfaith['FBDATE'].map(lambda x: np.sum(np.array(x) >= '2016-08')) 136 | df_xbreakfaith['BREAKFAITH_2_OPEN_CNT'] = df_xbreakfaith['FBDATE'].map(lambda x: np.sum(np.array(x) >= '2015-08')) 137 | df_xbreakfaith['BREAKFAITH_3_OPEN_CNT'] = df_xbreakfaith['FBDATE'].map(lambda x: np.sum(np.array(x) >= '2014-08')) 138 | df_xbreakfaith['BREAKFAITH_5_OPEN_CNT'] = df_xbreakfaith['FBDATE'].map(lambda x: np.sum(np.array(x) >= '2012-08')) 139 | 140 | 141 | # 文件8的数据量特别小,对特征几乎没有意思,这里添加一个时候存在 失信数据的特征 IS_BREAKFAITH 142 | df_xbreakfaith['IS_BREAKFAITH'] = 1 143 | 144 | 145 | 146 | # In[ ]: 147 | 148 | 149 | 150 | 151 | # In[11]: 152 | 153 | # df_xbreakfaith.info() 154 | # df_xbreakfaith.head() 155 | 156 | 157 | # In[12]: 158 | 159 | df_xbreakfaith['BREAKFAITH_CNT'] = df_xbreakfaith['TYPECODE'].apply(lambda x: len(x)) 160 | 161 | df_xbreakfaith['BREAKFAITH_FIRST_FIRST_TIME'] = df_xbreakfaith['FBDATE'].apply(lambda x: x[0]) 162 | df_xbreakfaith['BREAKFAITH_FIRST_END_TIME'] = df_xbreakfaith['FBDATE'].apply(lambda x: x[-1]) 163 | 
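# Performance note: the per-EID loop above grows df_xbreakfaith with
# DataFrame.append, which copies the whole frame on every iteration (O(n^2))
# and was removed in pandas 2.x. A one-pass sketch of that aggregation step
# (it would replace the loop itself; everything after it stays the same):
#
# df_xbreakfaith = (df_breakfaith.groupby('EID')
#                   .agg({'TYPECODE': list, 'FBDATE': list, 'SXENDDATE': list})
#                   .reset_index())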
df_xbreakfaith['BREAKFAITH_END_FIRST_TIME'] = df_xbreakfaith['SXENDDATE'].apply(lambda x: x[0]) 164 | 165 | 166 | 167 | # In[13]: 168 | 169 | df_xbreakfaith.to_csv('../data/public/8breakfaith_1.csv',index=False,index_label=False) 170 | df_xbreakfaith.columns 171 | 172 | 173 | 174 | # In[14]: 175 | 176 | df_all = pd.merge(df_all,df_xbreakfaith[['EID','BREAKFAITH_CNT','BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME', 177 | 'BREAKFAITH_END_FIRST_TIME','BREAKFAITH_1_OPEN_CNT','BREAKFAITH_2_OPEN_CNT', 178 | 'BREAKFAITH_3_OPEN_CNT','BREAKFAITH_5_OPEN_CNT','IS_BREAKFAITH']],how='left',on=['EID']) 179 | 180 | 181 | # In[15]: 182 | 183 | # df_all.info() 184 | # df_all.head() 185 | 186 | 187 | # In[16]: 188 | 189 | # 所有有管时间的特征,用注册日期填充,其他的用0填充 190 | df_all['BREAKFAITH_CNT'] = df_all['BREAKFAITH_CNT'].fillna(0) 191 | 192 | # 不存在失信数据的填写0 193 | df_all['IS_BREAKFAITH'] = df_all['IS_BREAKFAITH'].fillna(0) 194 | 195 | modelist = [ 'BREAKFAITH_FIRST_FIRST_TIME', 'BREAKFAITH_FIRST_END_TIME','BREAKFAITH_END_FIRST_TIME'] 196 | 197 | for d in modelist: 198 | df_all.loc[df_all[d].isnull(),d] = df_all.loc[df_all[d].isnull(),'RGYEAR'] 199 | 200 | 201 | 202 | # In[ ]: 203 | 204 | 205 | 206 | 207 | # In[17]: 208 | 209 | df_all[['BREAKFAITH_1_OPEN_CNT','BREAKFAITH_2_OPEN_CNT','BREAKFAITH_3_OPEN_CNT','BREAKFAITH_5_OPEN_CNT']] = df_all[['BREAKFAITH_1_OPEN_CNT','BREAKFAITH_2_OPEN_CNT','BREAKFAITH_3_OPEN_CNT','BREAKFAITH_5_OPEN_CNT']].fillna(0) 210 | 211 | 212 | # In[18]: 213 | 214 | def timeDiff(x): 215 | a = x[:x.find(':')] 216 | b = x[x.find(':')+1:] 217 | y = int(a[:a.find('-')]) - int(b[:b.find('-')]) 218 | m = int(a[a.find('-')+1:]) - int(b[b.find('-')+1:]) 219 | return y * 12 + m 220 | 221 | 222 | df_all['BREAKFAITH_FIRST_END_FIRST_DIFF'] = (df_all['BREAKFAITH_END_FIRST_TIME'] + ':' + df_all['BREAKFAITH_FIRST_FIRST_TIME']).apply(timeDiff) 223 | df_all['BREAKFAITH_FIRST_RGYEAR_DIFF'] = (df_all['BREAKFAITH_FIRST_FIRST_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 224 | df_all['BREAKFAITH_END_RGYEAR_DIFF'] = (df_all['BREAKFAITH_FIRST_END_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 225 | df_all['BREAKFAITH_END_RGYEAR_DIFF'] = (df_all['BREAKFAITH_END_FIRST_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 226 | 227 | 228 | df_all['BREAKFAITH_FIRST_CHANGE_FIRST_DIFF'] = (df_all['BREAKFAITH_FIRST_FIRST_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff) 229 | df_all['BREAKFAITH_END_CHANGE_FIRST_DIFF'] = (df_all['BREAKFAITH_FIRST_END_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff) 230 | df_all['BREAKFAITH_FIRST_CHANGE_END_DIFF'] = (df_all['BREAKFAITH_FIRST_FIRST_TIME'] + ':' + df_all['END_CHANGE_TIME']).apply(timeDiff) 231 | df_all['BREAKFAITH_END_CHANGE_END_DIFF'] = (df_all['BREAKFAITH_FIRST_END_TIME'] + ':' + df_all['END_CHANGE_TIME']).apply(timeDiff) 232 | 233 | 234 | # --------------------------------------------------------------------------------------- 235 | # 10. 企业平均每几个月的失信一次, BREAKFAITH_PRE_MONTH_CNT = BREAKFAITH_END_RGYEAR_DIFF / BREAKFAITH_CNT 236 | 237 | # 11. 企业失信的个数占所有失信个数个平均值的比例,BREAKFAITH_CNT_ALL_RATE 238 | 239 | # 12. 企业对应的大类HY的平均失信数据的个数,BREAKFAITH_HY_CNT_AVG 240 | # 13. 企业对应大类HY的平均失信个数占所有失信平均个数的比例,BREAKFAITH_HY_CNT_ALL_RATE 241 | # 14. 
企业失信的个数占其对应的大类HY的失信平均值的比例,BREAKFAITH_CNT_HY_RATE 242 | 243 | # In[19]: 244 | 245 | df_all['BREAKFAITH_PRE_MONTH_CNT'] = df_all['BREAKFAITH_END_RGYEAR_DIFF'] / df_all['BREAKFAITH_CNT'] 246 | 247 | df_all['BREAKFAITH_CNT_ALL_RATE'] = df_all['BREAKFAITH_CNT'] / df_all['BREAKFAITH_CNT'].mean() 248 | 249 | df_all['BREAKFAITH_CNT_ALL_RATE_MAX'] = df_all['BREAKFAITH_CNT'] / df_all['BREAKFAITH_CNT'].max() 250 | 251 | 252 | 253 | 254 | # In[20]: 255 | 256 | tmp = pd.DataFrame() 257 | 258 | tmp['BREAKFAITH_HY_CNT_AVG'] = df_all.groupby(['HY'],as_index=True,axis=0)['BREAKFAITH_CNT'].mean() 259 | tmp['BREAKFAITH_HY_CNT_ALL_RATE'] = tmp['BREAKFAITH_HY_CNT_AVG'] / df_all['BREAKFAITH_CNT'].mean() 260 | 261 | tmp['BREAKFAITH_HY_CNT_MAX'] = df_all.groupby(['HY'],as_index=True,axis=0)['BREAKFAITH_CNT'].max() 262 | tmp['BREAKFAITH_HY_CNT_ALL_RATE_MAX'] = tmp['BREAKFAITH_HY_CNT_MAX'] / df_all['BREAKFAITH_CNT'].max() 263 | 264 | 265 | tmp['HY'] = tmp.index 266 | 267 | 268 | 269 | # In[21]: 270 | 271 | tmp1 = pd.DataFrame() 272 | 273 | tmp1['BREAKFAITH_ETYPE_CNT_AVG'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['BREAKFAITH_CNT'].mean() 274 | tmp1['BREAKFAITH_ETYPE_CNT_ALL_RATE'] = tmp1['BREAKFAITH_ETYPE_CNT_AVG'] / df_all['BREAKFAITH_CNT'].mean() 275 | 276 | tmp1['BREAKFAITH_ETYPE_CNT_MAX'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['BREAKFAITH_CNT'].max() 277 | tmp1['BREAKFAITH_ETYPE_CNT_ALL_RATE_MAX'] = tmp1['BREAKFAITH_ETYPE_CNT_MAX'] / df_all['BREAKFAITH_CNT'].max() 278 | 279 | 280 | tmp1['ETYPE'] = tmp1.index 281 | 282 | 283 | 284 | # In[22]: 285 | 286 | df_all = pd.merge(df_all,tmp,how='left',on=['HY']) 287 | df_all = pd.merge(df_all,tmp1,how='left',on=['ETYPE']) 288 | 289 | 290 | # In[23]: 291 | 292 | df_all['BREAKFAITH_CNT_HY_RATE'] = df_all['BREAKFAITH_CNT'] / df_all['BREAKFAITH_HY_CNT_AVG'] 293 | df_all['BREAKFAITH_CNT_HY_RATE_MAX'] = df_all['BREAKFAITH_CNT'] / df_all['BREAKFAITH_HY_CNT_MAX'] 294 | 295 | df_all['BREAKFAITH_CNT_ETYPE_RATE'] = df_all['BREAKFAITH_CNT'] / df_all['BREAKFAITH_ETYPE_CNT_AVG'] 296 | df_all['BREAKFAITH_CNT_ETYPE_RATE_MAX'] = df_all['BREAKFAITH_CNT'] / df_all['BREAKFAITH_ETYPE_CNT_MAX'] 297 | 298 | 299 | 300 | # In[24]: 301 | 302 | # df_all.info() 303 | # df_all.head() 304 | 305 | 306 | # In[25]: 307 | 308 | # 得到在df_xchange的所有数据, 309 | df_all = df_all.fillna(0) 310 | df_all.to_csv('../data/alldata/df_data12345678.csv',index=False,index_label=False) 311 | 312 | 313 | 314 | # In[26]: 315 | 316 | # df_all[['EID','IS_BREAKFAITH']].to_csv('../data/df_breakfaith_insert.csv',index=False,index_label=False) 317 | 318 | 319 | # In[ ]: 320 | 321 | 322 | 323 | -------------------------------------------------------------------------------- /feature/2_feature.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[2]: 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | import matplotlib.pyplot as plt 10 | import re 11 | 12 | # # 企业变更数据的对应特征(2alter.csv) 13 | # 14 | # 1. 变更的事件的总个数, ALTERNO_SUM 15 | # 2. 变更的事件的类型的种类数, ALTERNO_CNT 16 | # 3. 变更的事件的类型的种类数 / 所有事件的总总的类别数, ALTERNO_CNT_RATE 17 | # 4. 第一次变更时间, FIRST_CHANGE_TIME 18 | # 5. 第一次变更距离企业注册时间的差(单位:月),不存在变更事件的用0填充, FIRST_CHANGE_TIME_DIFF 19 | # 6. 最后一次变更时间, END_CHANGE_TIME 20 | # 7. 最后一次变更时间距离企业注册时间的差值,END_CHANGE_TIME_DIFF 21 | # 8. 最后一次变更距离第一次事件变更时间的差(单位:月),不存在变更事件的用0填充, END_FIRST_CHANGE_TIME_DIFF 22 | # 23 | # 9. 变更的事件是否发生了前后的值得变化(1/0), IS_CHANGE 24 | # 10. 第一次变更事件的类型,CHANGE_FIRST_ALTERNO 25 | # 11. 
最后一次变更事件的类型,CHANGE_END_ALTERNO 26 | # 12. 最多的变更事件的类型,CHANGE_ALTERNO_MUCNID 27 | # 28 | # 新加特征 29 | # ------------------------------------------------------------------------------------------------------- 30 | # 13. 平均每几个月变更一次事件,CHANGE_PRE_CNT = ALTERNO_SUM / END_CHANGE_TIME_DIFF 31 | # 14. 每个企业变更事件的个数占所有企业变更事件平均值的比例, CHANGE_CNT_RATE = ALTERNO_SUM / AVG 32 | # 15. 企业对应大类HY的平均变更次数,CHANGE_HY_CNT_AVG 33 | # 16. 企业对应大类HY的平均变更次数占所有变更事件的个数的平均值的比例,CHANGE_HY_CNT_ALL_AVG 34 | # 35 | # 17. 企业变更次数占其对应大类的平均次数的比例,CHANGE_CNT_HY_RATE = ALTERNO_SUM / CHANGE_HY_CNT_AVG 36 | # 37 | # 新加特征1 38 | # ------------------------------------------------------------------------------------------------------- 39 | # 15. 企业对应大类ETYPE的平均变更次数,CHANGE_ETYPE_CNT_AVG 40 | # 16. 企业对应大类ETYPE的平均变更次数占所有变更事件的个数的平均值的比例,CHANGE_ETYPE_CNT_ALL_AVG 41 | # 17. 企业变更次数占其对应大类ETYPE的平均次数的比例,CHANGE_CNT_ETYPE_RATE = ALTERNO_SUM / CHANGE_ETYPE_CNT_AVG 42 | # 43 | # 44 | # 添加对应的最大值的特征 45 | # 46 | # 复赛新特征,滑动窗口系列的特征,时间段是1年,2年,3年,5年,计算在最近k[1,2,3,5]年之内的数据,主要是个数和次数。 47 | # 时间起点是2017-08之前的k年 48 | # 1. 之前k年的变更事件的个数。 49 | # CHANGE_K_CNT 50 | # 51 | # 52 | 53 | # In[32]: 54 | 55 | df_all = pd.read_csv("../data/alldata/df_data1.csv") 56 | df_change = pd.read_csv("../data/public/2alter.csv") 57 | 58 | # In[12]: 59 | 60 | # df_all.info() 61 | # df_all.head() 62 | 63 | 64 | # In[13]: 65 | 66 | # df_change.info() 67 | # df_change.head() 68 | 69 | 70 | # In[ ]: 71 | 72 | 73 | # In[14]: 74 | 75 | df_change = df_change.sort_values(['EID','ALTDATE']) 76 | 77 | 78 | # In[15]: 79 | 80 | df_change = df_change.fillna('0') 81 | columns = ['EID', 'ALTERNO', 'ALTDATE', 'ALTBE', 'ALTAF'] 82 | df_xchange = pd.DataFrame(columns=columns) 83 | 84 | # In[16]: 85 | 86 | EIDS = set(df_change['EID']) 87 | 88 | # len(EIDS) 89 | 90 | # In[17]: 91 | 92 | k = 0 93 | for EID in EIDS: 94 | if k%3000 == 0: 95 | print('第%d次处理--------->' % k) 96 | k+=1 97 | tmp = df_change[df_change['EID'] == EID] 98 | row = [EID,tmp['ALTERNO'].values,tmp['ALTDATE'].values,tmp['ALTBE'].values,tmp['ALTAF'].values] 99 | 100 | df_xchange = df_xchange.append(pd.Series(row,columns),ignore_index=True) 101 | 102 | 103 | 104 | # In[18]: 105 | 106 | # df_xchange.info() 107 | # df_xchange.head() 108 | 109 | def getFAlterno(x): 110 | m = x[0] 111 | if m == 'A_015': 112 | m = m[3:] 113 | return m 114 | 115 | def getEAlterno(x): 116 | m = x[-1] 117 | if m == 'A_015': 118 | m = m[3:] 119 | return m 120 | 121 | def getMAlterno(x): 122 | x = list(x) 123 | m = x[0] 124 | k = 0 125 | for i in x: 126 | if k < x.count(i): 127 | k = x.count(i) 128 | m = i 129 | if m == 'A_015': 130 | m = m[3:] 131 | return m 132 | 133 | 134 | # In[19]: 135 | 136 | df_xchange['CHANGE_FIRST_ALTERNO'] = df_xchange['ALTERNO'].apply(getFAlterno) 137 | df_xchange['CHANGE_END_ALTERNO'] = df_xchange['ALTERNO'].apply(getEAlterno) 138 | df_xchange['CHANGE_ALTERNO_MUCNID'] = df_xchange['ALTERNO'].apply(getMAlterno) 139 | 140 | # In[21]: 141 | 142 | df_xchange['CHANGE_1_CNT'] = df_xchange['ALTDATE'].map(lambda x: np.sum(np.array(x) >= '2016-08')) 143 | df_xchange['CHANGE_2_CNT'] = df_xchange['ALTDATE'].map(lambda x: np.sum(np.array(x) >= '2015-08')) 144 | df_xchange['CHANGE_3_CNT'] = df_xchange['ALTDATE'].map(lambda x: np.sum(np.array(x) >= '2014-08')) 145 | df_xchange['CHANGE_5_CNT'] = df_xchange['ALTDATE'].map(lambda x: np.sum(np.array(x) >= '2012-08')) 146 | 147 | 148 | 149 | # In[22]: 150 | 151 | # df_xchange.head() 152 | 153 | 154 | # In[23]: 155 | 156 | k = len(set(df_change['ALTERNO'])) 157 | 158 | 159 | # In[24]: 160 | 161 | def totalcnt(x): 162 | return len(x) 163 | 164 | df_xchange['ALTERNO_SUM'] = df_xchange['ALTERNO'].apply(totalcnt) 165 | 166 | def cnt(x): 167 | return len(set(x)) 168 | 169 | df_xchange['ALTERNO_CNT'] = df_xchange['ALTERNO'].apply(cnt) 170 | 171 | df_xchange['ALTERNO_CNT_RATE'] = df_xchange['ALTERNO_CNT'] / k 172 | 173 | def getFirst(x): 174 | x.sort() 175 | return x[0] 176 | 177 | df_xchange['FIRST_CHANGE_TIME'] = df_xchange['ALTDATE'].apply(getFirst) 178 | 179 | def getEnd(x): 180 | 
x.sort() 181 | return x[-1] 182 | 183 | df_xchange['END_CHANGE_TIME'] = df_xchange['ALTDATE'].apply(getEnd) 184 | 185 | def equal(x): 186 | ok = 0 187 | for d in x: 188 | if d[:d.find(":")] != d[d.find(":")+1:]: 189 | ok = 1 190 | return ok 191 | df_xchange['IS_CHANGE'] = (df_xchange['ALTBE'] +':'+ df_xchange['ALTAF']).apply(equal) 192 | 193 | 194 | # In[25]: 195 | 196 | df_xchange.to_csv('../data/public/2alter_1.csv',index=False,index_label=False) 197 | 198 | # df_xchange = pd.read_csv('../data/public/2alter_1.csv') 199 | 200 | 201 | # In[33]: 202 | 203 | df_all = pd.merge(df_all,df_xchange[['EID', 'ALTERNO_SUM','ALTERNO_CNT', 'ALTERNO_CNT_RATE', 'FIRST_CHANGE_TIME', 204 | 'END_CHANGE_TIME', 'IS_CHANGE','CHANGE_FIRST_ALTERNO','CHANGE_END_ALTERNO', 205 | 'CHANGE_ALTERNO_MUCNID','CHANGE_1_CNT','CHANGE_2_CNT','CHANGE_3_CNT','CHANGE_5_CNT']],how='left',on=['EID']) 206 | 207 | 208 | # In[34]: 209 | 210 | # df_all.info() 211 | # df_all.head() 212 | 213 | 214 | # > 空值填充,根据HY的类别的平均值或者众数进行填充 215 | # + ['ALTERNO_SUM','ALTERNO_CNT', 'ALTERNO_CNT_RATE',]使用同一个大类别的均值进行填充 216 | # + ['FIRST_CHANGE_TIME','END_CHANGE_TIME', 'IS_CHANGE' ]使用同一个大类别的众数进行填充 217 | # 218 | # 219 | 220 | # In[35]: 221 | 222 | HYLIST = set(df_all['HY']) 223 | 224 | # print(HYLIST) 225 | 226 | meanlist = ['ALTERNO_SUM','ALTERNO_CNT', 'ALTERNO_CNT_RATE'] 227 | modelist = ['FIRST_CHANGE_TIME','END_CHANGE_TIME', 'IS_CHANGE','CHANGE_FIRST_ALTERNO','CHANGE_END_ALTERNO','CHANGE_ALTERNO_MUCNID' ] 228 | 229 | 230 | # In[36]: 231 | 232 | 233 | # int(df_all[df_all['HY']==75]['ALTERNO_SUM'].mean()) 234 | 235 | # df_all[df_all['HY']==7]['CHANGE_FIRST_ALTERNO'].value_counts().index[0] 236 | 237 | 238 | 239 | # In[37]: 240 | 241 | # for HY in HYLIST: 242 | # # print(df_train['HY'].value_counts()) 243 | # for d in meanlist: 244 | # df_all.loc[df_all[df_all[d].isnull()][df_all['HY']==HY].index,d] = df_all[df_all['HY']==HY][d].mean() 245 | 246 | # for c in modelist: 247 | # if(len(df_all[df_all['HY']==HY][c].value_counts().index)==0): continue 248 | # df_all.loc[df_all[df_all[c].isnull()][df_all['HY']==HY].index,c] = df_all[df_all['HY']==HY][c].value_counts().index[0] 249 | 250 | 251 | # In[38]: 252 | 253 | # new 254 | df_all[['CHANGE_1_CNT','CHANGE_2_CNT','CHANGE_3_CNT','CHANGE_5_CNT']] = df_all[['CHANGE_1_CNT','CHANGE_2_CNT','CHANGE_3_CNT','CHANGE_5_CNT']].fillna(0) 255 | 256 | 257 | # In[39]: 258 | 259 | df_all[meanlist] = df_all[meanlist].fillna(0) 260 | 261 | 262 | df_all[df_all['FIRST_CHANGE_TIME'].isnull()]['FIRST_CHANGE_TIME'] = df_all[df_all['FIRST_CHANGE_TIME'].isnull()]['RGYEAR'] 263 | 264 | df_all.loc[df_all['FIRST_CHANGE_TIME'].isnull(),'FIRST_CHANGE_TIME'] = df_all.loc[df_all['FIRST_CHANGE_TIME'].isnull(),'RGYEAR'] 265 | df_all.loc[df_all['END_CHANGE_TIME'].isnull(),'END_CHANGE_TIME'] = df_all.loc[df_all['END_CHANGE_TIME'].isnull(),'RGYEAR'] 266 | 267 | 268 | # In[ ]: 269 | 270 | 271 | 272 | 273 | # In[40]: 274 | 275 | def timeDiff(x): 276 | a = x[:x.find(':')] 277 | b = x[x.find(':')+1:] 278 | y = int(a[:a.find('-')]) - int(b[:b.find('-')]) 279 | m = int(a[a.find('-')+1:]) - int(b[b.find('-')+1:]) 280 | return y * 12 + m 281 | 282 | df_all['FIRST_CHANGE_TIME_DIFF'] = (df_all['FIRST_CHANGE_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 283 | df_all['END_CHANGE_TIME_DIFF'] = (df_all['END_CHANGE_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 284 | 285 | df_all['END_FIRST_CHANGE_TIME_DIFF'] = (df_all['END_CHANGE_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff) 286 | 287 | 288 | # In[41]: 289 | 290 | # df_all.info() 291 | 
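# Illustrative sanity checks for timeDiff above: it takes a 'YYYY-MM:YYYY-MM'
# string and returns the signed difference in whole months.
assert timeDiff('2017-03:2015-01') == 26   # 2 years + 2 months
assert timeDiff('2015-01:2015-06') == -5   # negative when the left stamp is earlier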
292 | # 13. 平均每几个月变更一次事件,CHANGE_PRE_CNT = ALTERNO_SUM / END_CHANGE_TIME_DIFF 293 | # 14. 每个企业变更事件的个数占所有企业变更事件平均值的比例, CHANGE_CNT_RATE = ALTERNO_SUM / AVG 294 | # 15. 企业对应大类HY的平均变更次数,CHANGE_HY_CNT_AVG 295 | # 16. 企业对应大类HY的平均变更次数占所有变更时间的个数的平均值的比例,CHANGE_HY_CNT_ALL_AVG 296 | 297 | # In[42]: 298 | 299 | df_all['CHANGE_PRE_CNT'] = df_all['ALTERNO_SUM'] / df_all['END_CHANGE_TIME_DIFF'] 300 | df_all['CHANGE_CNT_RATE'] = df_all['ALTERNO_SUM'] / df_all['ALTERNO_SUM'].mean() 301 | 302 | 303 | # In[43]: 304 | 305 | tmp = pd.DataFrame() 306 | 307 | tmp['CHANGE_HY_CNT_AVG'] = df_all.groupby(['HY'],as_index=True,axis=0)['ALTERNO_SUM'].mean() 308 | tmp['CHANGE_HY_CNT_ALL_AVG'] = tmp['CHANGE_HY_CNT_AVG'] / df_all['ALTERNO_SUM'].mean() 309 | 310 | tmp['CHANGE_HY_CNT_MAX'] = df_all.groupby(['HY'],as_index=True,axis=0)['ALTERNO_SUM'].max() 311 | tmp['CHANGE_HY_CNT_ALL_MAX'] = tmp['CHANGE_HY_CNT_MAX'] / df_all['ALTERNO_SUM'].max() 312 | 313 | tmp['HY'] = tmp.index 314 | 315 | 316 | 317 | # In[44]: 318 | 319 | tmp1 = pd.DataFrame() 320 | 321 | tmp1['CHANGE_ETYPE_CNT_AVG'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['ALTERNO_SUM'].mean() 322 | tmp1['CHANGE_ETYPE_CNT_ALL_AVG'] = tmp1['CHANGE_ETYPE_CNT_AVG'] / df_all['ALTERNO_SUM'].mean() 323 | 324 | tmp1['CHANGE_ETYPE_CNT_MAX'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['ALTERNO_SUM'].max() 325 | tmp1['CHANGE_ETYPE_CNT_ALL_MAX'] = tmp1['CHANGE_ETYPE_CNT_MAX'] / df_all['ALTERNO_SUM'].max() 326 | 327 | tmp1['ETYPE'] = tmp1.index 328 | 329 | 330 | # In[ ]: 331 | 332 | 333 | 334 | 335 | # In[45]: 336 | 337 | df_all = pd.merge(df_all,tmp,how='left',on=['HY']) 338 | df_all = pd.merge(df_all,tmp1,how='left',on=['ETYPE']) 339 | 340 | 341 | 342 | # In[46]: 343 | 344 | df_all['CHANGE_CNT_HY_RATE'] = df_all['ALTERNO_SUM'] / df_all['CHANGE_HY_CNT_AVG'] 345 | df_all['CHANGE_CNT_ETYPE_RATE'] = df_all['ALTERNO_SUM'] / df_all['CHANGE_ETYPE_CNT_AVG'] 346 | 347 | df_all['CHANGE_CNT_HY_RATE_MAX'] = df_all['ALTERNO_SUM'] / df_all['CHANGE_HY_CNT_MAX'] 348 | df_all['CHANGE_CNT_ETYPE_RATE_MAX'] = df_all['ALTERNO_SUM'] / df_all['CHANGE_ETYPE_CNT_MAX'] 349 | 350 | 351 | 352 | # In[47]: 353 | 354 | # df_all.info() 355 | # df_all.head() 356 | 357 | 358 | # In[ ]: 359 | 360 | 361 | 362 | 363 | # In[48]: 364 | 365 | df_all = df_all.fillna(0) 366 | df_all.to_csv('../data/alldata/df_data12.csv',index=False,index_label=False) 367 | 368 | 369 | # In[ ]: 370 | 371 | 372 | 373 | -------------------------------------------------------------------------------- /feature/1_feature.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | # # 得到所有的企业本身的特征 13 | # 0. 企业注册时间差,RGYEAR 14 | # 1. 2018-企业注册时间差,RGYEAR_DIFF 15 | # 2. 注册资本,ZCZB 16 | # 3. 企业的行业大类,HY 17 | # 4. 企业的类型,ETYPE 18 | # 5. 注册资本占整个行业大类的注册资本的平均值的比例,ZCZB_RATE 19 | # 6. 注册资本占这个企业类型的注册资本的平均值的比例,ETYPE_RATE 20 | # 21 | # 7. 已经完成的企业的计算指标,['MPNUM', 'INUM', 'FINZB','FSTINUM', 'TZINUM'],用0填充 22 | # ----------------------------------------------------------------------------------------- 23 | # 8. 每个企业对应行业大类的平均资金,HY_ZCZB_AVG 24 | # 9. 每个企业对应行业大类的平均资金占所有企业的平均注册资金的比例, HY_ZCZB_AVG_RATE 25 | # 26 | # 8. 每个企业对应行业大类的平均资金,ETYPE_ZCZB_AVG 27 | # 9. 每个企业对应行业大类的平均资金占所有企业的平均注册资金的比例, ETYPE_ZCZB_AVG_RATE 28 | # 29 | # 30 | # 10. 
每个完成的企业的计算指标其对应行业大类的计算指标的平均值,['MPNUM_HY_AVG', 'INUM_HY_AVG', 'FINZB_HY_AVG','FSTINUM_HY_AVG', 'TZINUM_HY_AVG'] 31 | # 32 | # 33 | # ['MPNUM_HY_AVG', 'INUM_HY_AVG', 'FINZB_HY_AVG','FSTINUM_HY_AVG', 'TZINUM_HY_AVG'] / MPNUM_AVG ... 34 | # 11. 每个完成的企业的计算指标其对应行业大类的计算指标的平均值占整个计算指标的平均值的比例,['MPNUM_HY_AVG_RATE', 'INUM_HY_AVG_RATE', 'FINZB_HY_AVG_RATE','FSTINUM_HY_AVG_RATE', 'TZINUM_HY_AVG_RATE'] 35 | # 36 | # 37 | # ['MPNUM', 'INUM', 'FINZB','FSTINUM', 'TZINUM'] / ['MPNUM_HY_AVG', 'INUM_HY_AVG', 'FINZB_HY_AVG','FSTINUM_HY_AVG', 'TZINUM_HY_AVG'] 38 | # 12. 每个完成的企业的计算指标占其对应行业大类对应的计算指标的平均值的比例,['MPNUM_HY_RATE', 'INUM_HY_RATE', 'FINZB_HY_RATE','FSTINUM_HY_RATE', 'TZINUM_HY_RATE'] 39 | # 40 | # --------------------------------------------------------------------------------------- 41 | # 13. 每个完成的企业的计算指标其对应行业大类的计算指标的平均值,['MPNUM_ETYPE_AVG', 'INUM_ETYPE_AVG', 'FINZB_ETYPE_AVG','FSTINUM_ETYPE_AVG', 'TZINUM_ETYPE_AVG'] 42 | # 43 | # 44 | # ['MPNUM_ETYPE_AVG', 'INUM_ETYPE_AVG', 'FINZB_ETYPE_AVG','FSTINUM_ETYPE_AVG', 'TZINUM_ETYPE_AVG'] / MPNUM_AVG ... 45 | # 14. 每个完成的企业的计算指标其对应行业大类的计算指标的平均值占整个计算指标的平均值的比例,['MPNUM_ETYPE_AVG_RATE', 'INUM_ETYPE_AVG_RATE', 'FINZB_ETYPE_AVG_RATE','FSTINUM_ETYPE_AVG_RATE', 'TZINUM_ETYPE_AVG_RATE'] 46 | # 47 | # 48 | # ['MPNUM', 'INUM', 'FINZB','FSTINUM', 'TZINUM'] / ['MPNUM_ETYPE_AVG', 'INUM_ETYPE_AVG', 'FINZB_ETYPE_AVG','FSTINUM_ETYPE_AVG', 'TZINUM_ETYPE_AVG'] 49 | # 15. 每个完成的企业的计算指标占其对应行业大类对应的计算指标的平均值的比例,['MPNUM_ETYPE_RATE', 'INUM_ETYPE_RATE', 'FINZB_ETYPE_RATE','FSTINUM_ETYPE_RATE', 'TZINUM_ETYPE_RATE'] 50 | # 51 | # 52 | # ------------------------------------------------------------------------------------- 53 | # 16. 添加对应HY和ETYPE最大值的比例 54 | # 55 | # 10. 每个完成的企业的计算指标其对应行业大类的计算指标的最大值,['MPNUM_HY_MAX', 'INUM_HY_MAX', 'FINZB_HY_MAX','FSTINUM_HY_MAX', 'TZINUM_HY_MAX'] 56 | # 57 | # 58 | # ['MPNUM_HY_MAX', 'INUM_HY_MAX', 'FINZB_HY_MAX','FSTINUM_HY_MAX', 'TZINUM_HY_MAX'] / MPNUM_MAX ... 59 | # 11. 每个完成的企业的计算指标其对应行业大类的计算指标的最大值值占整个计算指标的最大值值的比例,['MPNUM_HY_MAX_RATE', 'INUM_HY_MAX_RATE', 'FINZB_HY_MAX_RATE','FSTINUM_HY_MAX_RATE', 'TZINUM_HY_MAX_RATE'] 60 | # 61 | # 62 | # ['MPNUM', 'INUM', 'FINZB','FSTINUM', 'TZINUM'] / ['MPNUM_HY_MAX', 'INUM_HY_MAX', 'FINZB_HY_MAX','FSTINUM_HY_MAX', 'TZINUM_HY_MAX'] 63 | # 12. 
每个完成的企业的计算指标占其对应行业大类对应的计算指标的最大值的比例,['MPNUM_HY_MAX_RATE', 'INUM_HY_MAX_RATE', 'FINZB_HY_MAX_RATE','FSTINUM_HY_MAX_RATE', 'TZINUM_HY_MAX_RATE'] 64 | # 65 | # 66 | # 67 | # 68 | # 69 | 70 | # In[2]: 71 | 72 | company = pd.read_csv('../data/public/1entbase.csv') 73 | 74 | 75 | # company['ETYPE'].value_counts() 76 | 77 | company = company.fillna(0) 78 | # company.info() 79 | # company.head() 80 | 81 | 82 | # In[6]: 83 | 84 | # company['PROV'].value_counts() 85 | 86 | # In[7]: 87 | 88 | tmp = pd.DataFrame() 89 | tmp['HY_ZCZB_AVG'] = company.groupby(['HY'],as_index=True,axis=0)['ZCZB'].mean() 90 | tmp['HY_ZCZB_AVG_RATE'] = tmp['HY_ZCZB_AVG'] / company['ZCZB'].mean() 91 | 92 | tmp['MPNUM_HY_AVG'] = company.groupby(['HY'],as_index=True,axis=0)['MPNUM'].mean() 93 | tmp['INUM_HY_AVG'] = company.groupby(['HY'],as_index=True,axis=0)['INUM'].mean() 94 | tmp['FINZB_HY_AVG'] = company.groupby(['HY'],as_index=True,axis=0)['FINZB'].mean() 95 | tmp['FSTINUM_HY_AVG'] = company.groupby(['HY'],as_index=True,axis=0)['FSTINUM'].mean() 96 | tmp['TZINUM_HY_AVG'] = company.groupby(['HY'],as_index=True,axis=0)['TZINUM'].mean() 97 | 98 | 99 | tmp['HY_ZCZB_MAX'] = company.groupby(['HY'],as_index=True,axis=0)['ZCZB'].max() 100 | tmp['HY_ZCZB_MAX_RATE'] = tmp['HY_ZCZB_MAX'] / company['ZCZB'].max() 101 | 102 | tmp['MPNUM_HY_MAX'] = company.groupby(['HY'],as_index=True,axis=0)['MPNUM'].max() 103 | tmp['INUM_HY_MAX'] = company.groupby(['HY'],as_index=True,axis=0)['INUM'].max() 104 | tmp['FINZB_HY_MAX'] = company.groupby(['HY'],as_index=True,axis=0)['FINZB'].max() 105 | tmp['FSTINUM_HY_MAX'] = company.groupby(['HY'],as_index=True,axis=0)['FSTINUM'].max() 106 | tmp['TZINUM_HY_MAX'] = company.groupby(['HY'],as_index=True,axis=0)['TZINUM'].max() 107 | 108 | 109 | tmp['HY'] = tmp.index 110 | 111 | tmp1 = pd.DataFrame() 112 | tmp1['ETYPE_ZCZB_AVG'] = company.groupby(['ETYPE'],as_index=True,axis=0)['ZCZB'].mean() 113 | tmp1['ETYPE_ZCZB_AVG_RATE'] = tmp1['ETYPE_ZCZB_AVG'] / company['ZCZB'].mean() 114 | 115 | tmp1['MPNUM_ETYPE_AVG'] = company.groupby(['ETYPE'],as_index=True,axis=0)['MPNUM'].mean() 116 | tmp1['INUM_ETYPE_AVG'] = company.groupby(['ETYPE'],as_index=True,axis=0)['INUM'].mean() 117 | tmp1['FINZB_ETYPE_AVG'] = company.groupby(['ETYPE'],as_index=True,axis=0)['FINZB'].mean() 118 | tmp1['FSTINUM_ETYPE_AVG'] = company.groupby(['ETYPE'],as_index=True,axis=0)['FSTINUM'].mean() 119 | tmp1['TZINUM_ETYPE_AVG'] = company.groupby(['ETYPE'],as_index=True,axis=0)['TZINUM'].mean() 120 | 121 | tmp1['ETYPE_ZCZB_MAX'] = company.groupby(['ETYPE'],as_index=True,axis=0)['ZCZB'].max() 122 | tmp1['ETYPE_ZCZB_MAX_RATE'] = tmp1['ETYPE_ZCZB_MAX'] / company['ZCZB'].max() 123 | 124 | tmp1['MPNUM_ETYPE_MAX'] = company.groupby(['ETYPE'],as_index=True,axis=0)['MPNUM'].max() 125 | tmp1['INUM_ETYPE_MAX'] = company.groupby(['ETYPE'],as_index=True,axis=0)['INUM'].max() 126 | tmp1['FINZB_ETYPE_MAX'] = company.groupby(['ETYPE'],as_index=True,axis=0)['FINZB'].max() 127 | tmp1['FSTINUM_ETYPE_MAX'] = company.groupby(['ETYPE'],as_index=True,axis=0)['FSTINUM'].max() 128 | tmp1['TZINUM_ETYPE_MAX'] = company.groupby(['ETYPE'],as_index=True,axis=0)['TZINUM'].max() 129 | 130 | 131 | tmp1['ETYPE'] = tmp1.index 132 | 133 | 134 | 135 | company = pd.merge(company,tmp,how='left',on=['HY']) 136 | company = pd.merge(company,tmp1,how='left',on=['ETYPE']) 137 | 138 | 139 | # company.info() 140 | # company.head() 141 | 142 | 143 | # In[9]: 144 | 145 | company['RGYEAR_DIFF'] = 2018 - company['RGYEAR'] 146 | company['ZCZB_RATE'] = company['ZCZB'] /
company['HY_ZCZB_AVG'] 147 | company['ETYPE_RATE'] = company['ZCZB'] / company['ETYPE_ZCZB_AVG'] 148 | 149 | company['ZCZB_RATE_MAXN'] = company['ZCZB'] / company['HY_ZCZB_MAX'] 150 | company['ETYPE_RATE_MAXN'] = company['ZCZB'] / company['ETYPE_ZCZB_MAX'] 151 | 152 | 153 | # In[10]: 154 | 155 | company['MPNUM_HY_AVG_RATE'] = company['MPNUM_HY_AVG'] / company['MPNUM'].mean() 156 | company['INUM_HY_AVG_RATE'] = company['INUM_HY_AVG'] / company['INUM'].mean() 157 | company['FINZB_HY_AVG_RATE'] = company['FINZB_HY_AVG'] / company['FINZB'].mean() 158 | company['FSTINUM_HY_AVG_RATE'] = company['FSTINUM_HY_AVG'] / company['FSTINUM'].mean() 159 | company['TZINUM_HY_AVG_RATE'] = company['TZINUM_HY_AVG'] / company['TZINUM'].mean() 160 | 161 | company['MPNUM_HY_RATE'] = company['MPNUM'] / company['MPNUM_HY_AVG'] 162 | company['INUM_HY_RATE'] = company['INUM'] / company['INUM_HY_AVG'] 163 | company['FINZB_HY_RATE'] = company['FINZB'] / company['FINZB_HY_AVG'] 164 | company['FSTINUM_HY_RATE'] = company['FSTINUM'] / company['FSTINUM_HY_AVG'] 165 | company['TZINUM_HY_RATE'] = company['TZINUM'] / company['TZINUM_HY_AVG'] 166 | 167 | 168 | 169 | # In[11]: 170 | 171 | company['MPNUM_HY_MAX_RATE'] = company['MPNUM_HY_MAX'] / company['MPNUM'].mean() 172 | company['INUM_HY_MAX_RATE'] = company['INUM_HY_MAX'] / company['INUM'].mean() 173 | company['FINZB_HY_MAX_RATE'] = company['FINZB_HY_MAX'] / company['FINZB'].mean() 174 | company['FSTINUM_HY_MAX_RATE'] = company['FSTINUM_HY_MAX'] / company['FSTINUM'].mean() 175 | company['TZINUM_HY_MAX_RATE'] = company['TZINUM_HY_MAX'] / company['TZINUM'].mean() 176 | 177 | company['MPNUM_HY_MAX_RATE'] = company['MPNUM'] / company['MPNUM_HY_MAX'] 178 | company['INUM_HY_MAX_RATE'] = company['INUM'] / company['INUM_HY_MAX'] 179 | company['FINZB_HY_MAX_RATE'] = company['FINZB'] / company['FINZB_HY_MAX'] 180 | company['FSTINUM_HY_MAX_RATE'] = company['FSTINUM'] / company['FSTINUM_HY_MAX'] 181 | company['TZINUM_HY_MAX_RATE'] = company['TZINUM'] / company['TZINUM_HY_MAX'] 182 | 183 | 184 | # In[12]: 185 | 186 | company['MPNUM_ETYPE_AVG_RATE'] = company['MPNUM_ETYPE_AVG'] / company['MPNUM'].mean() 187 | company['INUM_ETYPE_AVG_RATE'] = company['INUM_ETYPE_AVG'] / company['INUM'].mean() 188 | company['FINZB_ETYPE_AVG_RATE'] = company['FINZB_ETYPE_AVG'] / company['FINZB'].mean() 189 | company['FSTINUM_ETYPE_AVG_RATE'] = company['FSTINUM_ETYPE_AVG'] / company['FSTINUM'].mean() 190 | company['TZINUM_ETYPE_AVG_RATE'] = company['TZINUM_ETYPE_AVG'] / company['TZINUM'].mean() 191 | 192 | company['MPNUM_ETYPE_RATE'] = company['MPNUM'] / company['MPNUM_ETYPE_AVG'] 193 | company['INUM_ETYPE_RATE'] = company['INUM'] / company['INUM_ETYPE_AVG'] 194 | company['FINZB_ETYPE_RATE'] = company['FINZB'] / company['FINZB_ETYPE_AVG'] 195 | company['FSTINUM_ETYPE_RATE'] = company['FSTINUM'] / company['FSTINUM_ETYPE_AVG'] 196 | company['TZINUM_ETYPE_RATE'] = company['TZINUM'] / company['TZINUM_ETYPE_AVG'] 197 | 198 | 199 | 200 | # In[13]: 201 | 202 | company['MPNUM_ETYPE_MAX_RATE'] = company['MPNUM_ETYPE_MAX'] / company['MPNUM'].mean() 203 | company['INUM_ETYPE_MAX_RATE'] = company['INUM_ETYPE_MAX'] / company['INUM'].mean() 204 | company['FINZB_ETYPE_MAX_RATE'] = company['FINZB_ETYPE_MAX'] / company['FINZB'].mean() 205 | company['FSTINUM_ETYPE_MAX_RATE'] = company['FSTINUM_ETYPE_MAX'] / company['FSTINUM'].mean() 206 | company['TZINUM_ETYPE_MAX_RATE'] = company['TZINUM_ETYPE_MAX'] / company['TZINUM'].mean() 207 | 208 | company['MPNUM_ETYPE_MAX_RATE'] = company['MPNUM'] / company['MPNUM_ETYPE_MAX'] 209 | 
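# Note on the *_MAX_RATE columns in this cell and in the HY cell above: each
# name is assigned twice -- first as group-max / overall mean, then overwritten
# as company value / group-max -- so only the second definition survives.
# Keeping both ratios would need distinct names (e.g. a hypothetical
# MPNUM_ETYPE_MAX_AVG_RATE for the first form).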
company['INUM_ETYPE_MAX_RATE'] = company['INUM'] / company['INUM_ETYPE_MAX'] 210 | company['FINZB_ETYPE_MAX_RATE'] = company['FINZB'] / company['FINZB_ETYPE_MAX'] 211 | company['FSTINUM_ETYPE_MAX_RATE'] = company['FSTINUM'] / company['FSTINUM_ETYPE_MAX'] 212 | company['TZINUM_ETYPE_MAX_RATE'] = company['TZINUM'] / company['TZINUM_ETYPE_MAX'] 213 | 214 | 215 | 216 | # In[ ]: 217 | 218 | 219 | # In[14]: 220 | 221 | company = company.fillna(0) 222 | 223 | 224 | # In[15]: 225 | 226 | def setTime(x): 227 | return str(x)+'-01' 228 | 229 | company['RGYEAR'] = company['RGYEAR'].apply(setTime) 230 | 231 | # In[16]: 232 | 233 | # company.info() 234 | # company.head() 235 | 236 | 237 | # HY的类别更多 238 | # FSTINUM、INUM、MPNUM、FINZB:用同一个企业类型(HY)的平均值填充空值(取整) 239 | # TZINUM:用这个HY中出现最多的值进行填充 240 | # 241 | 242 | # In[17]: 243 | 244 | company.to_csv('../data/alldata/df_data1.csv',index=False,index_label=False) 245 | 246 | 247 | # In[ ]: 248 | 249 | 250 | 251 | # In[ ]: 252 | 253 | 254 | 255 | -------------------------------------------------------------------------------- /feature/6_feature.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[10]: 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | # + 第一个修改特征 13 | # # 项目数据6project.csv提取的特征 14 | # 1. 项目的数量个数, PROJECT_CNT, 15 | # 2. 省内项目的个数, PROJECT_INHOME_CNT 16 | # 3. 省外项目的个数, PROJECT_OUTHOME_CNT 17 | # 4. 省内项目的比例, PROJECT_INHOME_RATE 18 | # 5. 省外项目的比例, PROJECT_OUTHOME_RATE 19 | # 20 | # 6. 项目的类型数目,PROJECT_TYPECODE_CNT 21 | # 22 | # 7. 第一个项目的项目ID,PROJECT_FIRST_TYPECODE 23 | # 8. 最后一个项目的项目ID,PROJECT_END_TYPECODE 24 | # 25 | # 9. 第一个项目中标的时间,PROJECT_FIRST_TIME 26 | # 10. 最后一个项目中标的时间,PROJECT_END_TIME 27 | # 28 | # 11. 最后一个项目中标时间和第一个中标时间的差值,PROJECT_END_FIRST_DIFF 29 | # 12. 第一个项目中标时间距离企业注册时间的差值,PROJECT_FIRST_RGYEAR_DIFF 30 | # 13. 最后一个项目中标时间距离企业注册时间的差值,PROJECT_END_RGYEAR_DIFF 31 | # 32 | # 33 | # 14. 第一个项目中标时间距离企业第一个变更时间的差值,PROJECT_FIRST_CHANGE_FIRST_DIFF 34 | # 15. 最后一个项目中标时间距离企第一个变更时间的差值,PROJECT_END_CHANGE_FIRST_DIFF 35 | # 16. 第一个项目中标时间距离企业最后一个变更时间的差值,PROJECT_FIRST_CHANGE_END_DIFF 36 | # 17. 最后一个项目中标时间距离企最后一个变更时间的差值,PROJECT_END_CHANGE_END_DIFF 37 | # 38 | # --------------------------------------------------------------------------------- 39 | # 18. 平均没几个月中标一个项目其余用0表示,PROJECT_PRE_MONTH_CNT = PROJECT_END_RGYEAR_DIFF / PROJECT_CNT 40 | # 41 | # 19. 企业权利的个数占所有权利个数平均值的比例。PROJECT_CNT_ALL_RATE 42 | # 43 | # 20. 企业对应的大类HY的平均权利的个数,PROJECT_HY_CNT_AVG 44 | # 21. 企业对应大类HY的平均权利个数占所有权利平均个数的比例,PROJECT_HY_CNT_ALL_RATE 45 | # 22. 企业权利的个数占其对应的大类HY的平均值的比例,PROJECT_CNT_HY_RATE 46 | # 47 | # --------------------------------------------------------------------------------- 48 | # 49 | # 19. 企业权利的个数占所有权利个数最大值的比例。PROJECT_CNT_ALL_RATE_MAX 50 | # 51 | # 20. 企业对应的大类ETYPE的平均权利的个数,PROJECT_ETYPE_CNT_AVG 52 | # 21. 企业对应大类ETYPE的平均权利个数占所有权利平均个数的比例,PROJECT_ETYPE_CNT_ALL_RATE 53 | # 22. 企业权利的个数占其对应的大类ETYPE的平均值的比例,PROJECT_CNT_ETYPE_RATE 54 | # 55 | # 添加对应的MAX特征数据 56 | # 复赛新特征,滑动窗口系列的特征,时间段是1年,2年,3年,5年,计算在最近k[1,2,3,5]年之后的数据,主要是个数和次数。 57 | # 时间是开始是2017-08之前的k年 58 | # 1. 
之前k年的变更时间的个数。 59 | # RIGHT_K_OPEN_CNT, RIGHT_K_CLOSE_CNT 60 | # 61 | # 62 | 63 | # In[11]: 64 | 65 | df_all = pd.read_csv("../data/alldata/df_data12345.csv") 66 | df_project = pd.read_csv("../data/public/6project.csv") 67 | 68 | 69 | # In[12]: 70 | 71 | # df_all.info() 72 | # df_all.head() 73 | 74 | 75 | # In[13]: 76 | 77 | # df_project.info() 78 | # df_project.head() 79 | 80 | 81 | 82 | # In[ ]: 83 | 84 | 85 | 86 | 87 | # In[15]: 88 | 89 | EIDS = set(df_project['EID']) 90 | 91 | # print(len(EIDS)) 92 | 93 | columns = df_project.columns 94 | df_xproject = pd.DataFrame(columns=columns) 95 | 96 | # print(columns) 97 | 98 | 99 | # In[16]: 100 | 101 | k = 0 102 | for EID in EIDS: 103 | if k%3000 == 0: 104 | print('第%d次处理--------->',k) 105 | k+=1 106 | tmp = df_project[df_project['EID'] == EID] 107 | row = [EID,tmp['TYPECODE'].values,tmp['DJDATE'].values,tmp['IFHOME'].values] 108 | 109 | df_xproject = df_xproject.append(pd.Series(row,columns),ignore_index=True) 110 | 111 | 112 | 113 | # In[17]: 114 | 115 | # df_xproject.info() 116 | # df_xproject.head() 117 | 118 | 119 | # In[18]: 120 | 121 | df_xproject['PROJECT_CNT'] = df_xproject['TYPECODE'].apply(lambda x: len(x)) 122 | df_xproject['PROJECT_INHOME_CNT'] = df_xproject['IFHOME'].apply(lambda x: list(x).count(1)) 123 | df_xproject['PROJECT_OUTHOME_CNT'] = df_xproject['IFHOME'].apply(lambda x: list(x).count(0)) 124 | 125 | 126 | # In[19]: 127 | 128 | df_xproject['PROJECT_INHOME_RATE'] = df_xproject['PROJECT_INHOME_CNT'] / df_xproject['PROJECT_CNT'] 129 | df_xproject['PROJECT_OUTHOME_RATE'] = df_xproject['PROJECT_OUTHOME_CNT'] / df_xproject['PROJECT_CNT'] 130 | 131 | 132 | 133 | # In[20]: 134 | 135 | df_xproject['PROJECT_TYPECODE_CNT'] = df_xproject['TYPECODE'].apply(lambda x: len(set(x)) ) 136 | 137 | df_xproject['PROJECT_FIRST_TYPECODE'] = df_xproject['TYPECODE'].apply(lambda x: x[0]) 138 | df_xproject['PROJECT_END_TYPECODE'] = df_xproject['TYPECODE'].apply(lambda x: x[-1]) 139 | 140 | df_xproject['PROJECT_FIRST_TIME'] = df_xproject['DJDATE'].apply(lambda x: x[0]) 141 | df_xproject['PROJECT_END_TIME'] = df_xproject['DJDATE'].apply(lambda x: x[-1]) 142 | 143 | 144 | # In[21]: 145 | 146 | # 2017-08 BRANCH_K_OPEN_CNT, BRANCH_K_CLOSE_CNT 147 | df_xproject['PROJECT_1_OPEN_CNT'] = df_xproject['DJDATE'].map(lambda x: np.sum(np.array(x) >= '2016-08')) 148 | df_xproject['PROJECT_2_OPEN_CNT'] = df_xproject['DJDATE'].map(lambda x: np.sum(np.array(x) >= '2015-08')) 149 | df_xproject['PROJECT_3_OPEN_CNT'] = df_xproject['DJDATE'].map(lambda x: np.sum(np.array(x) >= '2014-08')) 150 | df_xproject['PROJECT_5_OPEN_CNT'] = df_xproject['DJDATE'].map(lambda x: np.sum(np.array(x) >= '2012-08')) 151 | 152 | 153 | 154 | # In[22]: 155 | 156 | df_xproject.to_csv('../data/public/6project_1.csv',index=False,index_label=False) 157 | # df_xproject.columns 158 | 159 | 160 | 161 | # In[23]: 162 | 163 | df_all = pd.merge(df_all,df_xproject[['EID', 'PROJECT_CNT','PROJECT_INHOME_CNT', 'PROJECT_OUTHOME_CNT', 'PROJECT_INHOME_RATE', 164 | 'PROJECT_OUTHOME_RATE', 'PROJECT_TYPECODE_CNT', 'PROJECT_FIRST_TYPECODE', 'PROJECT_END_TYPECODE', 165 | 'PROJECT_FIRST_TIME','PROJECT_END_TIME','PROJECT_1_OPEN_CNT','PROJECT_2_OPEN_CNT', 166 | 'PROJECT_3_OPEN_CNT','PROJECT_5_OPEN_CNT']],how='left',on=['EID']) 167 | 168 | 169 | # In[24]: 170 | 171 | # df_all.info() 172 | # df_all.head() 173 | 174 | 175 | # In[ ]: 176 | 177 | 178 | 179 | 180 | # In[25]: 181 | 182 | # 空值填充,根据HY的类别的平均值或者众数进行填充 183 | # ['RIGHT_CNT','RIGHT_TYPE_CNT', 'RIGHT_TYPE_RATE']使用同一个大类别的均值进行填充 184 | # 
['RIGHT_FIRST_TYPECODE','RIGHT_END_TYPECODE', 'RIGHT_TYPECODE_MUCHID', 'RIGHT_FIRST_ASK_TIME', 185 | # 'RIGHT_FIRST_FB_TIME', 'RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME']使用同一个大类别的众数进行填充 186 | 187 | HYLIST = set(df_all['HY']) 188 | # print(HYLIST) 189 | 190 | meanlist = ['PROJECT_FIRST_TYPECODE', 'PROJECT_END_TYPECODE','PROJECT_CNT','PROJECT_INHOME_CNT', 'PROJECT_OUTHOME_CNT','PROJECT_OUTHOME_CNT','PROJECT_OUTHOME_RATE','PROJECT_TYPECODE_CNT'] 191 | modelist = ['PROJECT_FIRST_TIME', 'PROJECT_END_TIME'] 192 | 193 | 194 | # In[26]: 195 | 196 | df_all[['PROJECT_1_OPEN_CNT','PROJECT_2_OPEN_CNT','PROJECT_3_OPEN_CNT','PROJECT_5_OPEN_CNT']] = df_all[['PROJECT_1_OPEN_CNT','PROJECT_2_OPEN_CNT','PROJECT_3_OPEN_CNT','PROJECT_5_OPEN_CNT']].fillna(0) 197 | 198 | for HY in HYLIST: 199 | # print(df_train['HY'].value_counts()) 200 | for d in meanlist: 201 | df_all.loc[df_all[df_all[d].isnull()][df_all['HY']==HY].index,d] = df_all[df_all['HY']==HY][d].mean() 202 | 203 | for c in modelist: 204 | if(len(df_all[df_all['HY']==HY][c].value_counts().index)==0): continue 205 | df_all.loc[df_all[df_all[c].isnull()][df_all['HY']==HY].index,c] = df_all[df_all['HY']==HY][c].value_counts().index[0] 206 | for d in meanlist: 207 | df_all.loc[df_all[d].isnull(),d] = df_all[d].mean() 208 | 209 | for c in modelist: 210 | df_all.loc[df_all[c].isnull(),c] = df_all[c].value_counts().index[0] 211 | 212 | df_all = df_all.fillna(0) 213 | # In[27]: 214 | 215 | # 所有的关于时间的空值用2018-01填充,假设这些企业被执行的数据发生在下一年,其他用0填充便是没有被执行 216 | 217 | for d in meanlist: 218 | df_all.loc[df_all[d].isnull(),d] = 0 219 | 220 | for c in modelist: 221 | df_all.loc[df_all[c].isnull(),c] = '2018-01' 222 | 223 | 224 | # In[28]: 225 | 226 | def timeDiff(x): 227 | a = x[:x.find(':')] 228 | b = x[x.find(':')+1:] 229 | y = int(a[:a.find('-')]) - int(b[:b.find('-')]) 230 | m = int(a[a.find('-')+1:]) - int(b[b.find('-')+1:]) 231 | return y * 12 + m 232 | 233 | 234 | # In[29]: 235 | 236 | df_all['PROJECT_END_FIRST_DIFF'] = (df_all['PROJECT_END_TIME'] + ':' + df_all['PROJECT_FIRST_TIME']).apply(timeDiff) 237 | 238 | df_all['PROJECT_FIRST_RGYEAR_DIFF'] = (df_all['PROJECT_FIRST_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 239 | df_all['PROJECT_END_RGYEAR_DIFF'] = (df_all['PROJECT_END_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 240 | 241 | df_all['PROJECT_FIRST_CHANGE_FIRST_DIFF'] = (df_all['PROJECT_FIRST_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff) 242 | df_all['PROJECT_END_CHANGE_FIRST_DIFF'] = (df_all['PROJECT_END_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff) 243 | 244 | df_all['PROJECT_FIRST_CHANGE_END_DIFF'] = (df_all['PROJECT_FIRST_TIME'] + ':' + df_all['END_CHANGE_TIME']).apply(timeDiff) 245 | df_all['PROJECT_END_CHANGE_END_DIFF'] = (df_all['PROJECT_END_TIME'] + ':' + df_all['END_CHANGE_TIME']).apply(timeDiff) 246 | 247 | # --------------------------------------------------------------------------------- 248 | # 18. 平均没几个月中标一个项目其余用0表示,PROJECT_PRE_MONTH_CNT = PROJECT_END_RGYEAR_DIFF / PROJECT_CNT 249 | 250 | # 19. 企业权利的个数占所有权利个数平均值的比例。PROJECT_CNT_ALL_RATE 251 | 252 | # 20. 企业对应的大类HY的平均权利的个数,PROJECT_HY_CNT_AVG 253 | # 21. 企业对应大类HY的平均权利个数占所有权利平均个数的比例,PROJECT_HY_CNT_ALL_RATE 254 | # 22. 
企业权利的个数占其对应的大类HY的平均值的比例,PROJECT_CNT_HY_RATE 255 | 256 | # In[30]: 257 | 258 | df_all['PROJECT_PRE_MONTH_CNT'] = df_all['PROJECT_END_RGYEAR_DIFF'] / df_all['PROJECT_CNT'] 259 | df_all['PROJECT_CNT_ALL_RATE'] = df_all['PROJECT_CNT'] / df_all['PROJECT_CNT'].mean() 260 | 261 | df_all['PROJECT_CNT_ALL_RATE_MAX'] = df_all['PROJECT_CNT'] / df_all['PROJECT_CNT'].max() 262 | 263 | 264 | # In[31]: 265 | 266 | tmp = pd.DataFrame() 267 | 268 | tmp['PROJECT_HY_CNT_AVG'] = df_all.groupby(['HY'],as_index=True,axis=0)['PROJECT_CNT'].mean() 269 | tmp['PROJECT_HY_CNT_ALL_RATE'] = tmp['PROJECT_HY_CNT_AVG'] / df_all['PROJECT_CNT'].mean() 270 | 271 | tmp['PROJECT_HY_CNT_MAX'] = df_all.groupby(['HY'],as_index=True,axis=0)['PROJECT_CNT'].max() 272 | tmp['PROJECT_HY_CNT_ALL_RATE_MAX'] = tmp['PROJECT_HY_CNT_MAX'] / df_all['PROJECT_CNT'].max() 273 | 274 | tmp['HY'] = tmp.index 275 | 276 | 277 | # In[32]: 278 | 279 | tmp1 = pd.DataFrame() 280 | 281 | tmp1['PROJECT_ETYPE_CNT_AVG'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['PROJECT_CNT'].mean() 282 | tmp1['PROJECT_ETYPE_CNT_ALL_RATE'] = tmp1['PROJECT_ETYPE_CNT_AVG'] / df_all['PROJECT_CNT'].mean() 283 | 284 | tmp1['PROJECT_ETYPE_CNT_MAX'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['PROJECT_CNT'].max() 285 | tmp1['PROJECT_ETYPE_CNT_ALL_RATE_MAX'] = tmp1['PROJECT_ETYPE_CNT_MAX'] / df_all['PROJECT_CNT'].max() 286 | 287 | tmp1['ETYPE'] = tmp1.index 288 | 289 | 290 | 291 | 292 | # In[33]: 293 | 294 | df_all = pd.merge(df_all,tmp,how='left',on=['HY']) 295 | df_all = pd.merge(df_all,tmp1,how='left',on=['ETYPE']) 296 | 297 | 298 | 299 | # In[34]: 300 | 301 | df_all['PROJECT_CNT_HY_RATE'] = df_all['PROJECT_CNT'] / df_all['PROJECT_HY_CNT_AVG'] 302 | df_all['PROJECT_CNT_HY_RATE_MAX'] = df_all['PROJECT_CNT'] / df_all['PROJECT_HY_CNT_MAX'] 303 | 304 | df_all['PROJECT_CNT_ETYPE_RATE'] = df_all['PROJECT_CNT'] / df_all['PROJECT_ETYPE_CNT_AVG'] 305 | df_all['PROJECT_CNT_ETYPE_RATE_MAX'] = df_all['PROJECT_CNT'] / df_all['PROJECT_ETYPE_CNT_MAX'] 306 | 307 | 308 | # In[ ]: 309 | 310 | 311 | 312 | 313 | # In[35]: 314 | 315 | 316 | # df_all.info() 317 | # df_all.head() 318 | 319 | 320 | # In[36]: 321 | 322 | # 得到在df_xchange的所有数据, 323 | df_all = df_all.fillna(0) 324 | df_all.to_csv('../data/alldata/df_data123456.csv',index=False,index_label=False) 325 | 326 | 327 | 328 | # In[21]: 329 | 330 | 331 | 332 | 333 | # In[ ]: 334 | 335 | 336 | 337 | -------------------------------------------------------------------------------- /feature/7_feature.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | # # 被执行数据7lawsuit.csv提取特征 13 | # 1. 企业被执行的案件个数,LAWSUIT_CNT 14 | # 2. 企业被执行的案件得金额的均值,LAWSUIT_LAWAMOUNT_MEAN 15 | # 3. 企业被执行的案件得金额的和,LAWSUIT_LAWAMOUNT_SUM 16 | # 4. 企业第一被执行的时间, LAWSUIT_FIRST_TIME 17 | # 5. 企业最后一被执行的时间, LAWSUIT_END_TIME 18 | # 6. 企业第一个被执行的时间与企业注册的时间的差值, LAWSUIT_FIRST_RGYEAR_TIME_DIFF 19 | # 7. 企业最后一个被执行的时间与企业注册的时间的差值, LAWSUIT_END_RGYEAR_TIME_DIFF 20 | # 8. 企业第一个被执行的时间与企业第一次变更时间的差值, LAWSUIT_FIRST_CHANGE_FIRST_TIME_DIFF 21 | # 9. 企业第一个被执行的时间与企业最后一次变更时间的差值, LAWSUIT_FIRST_CHANGE_END_TIME_DIFF 22 | # 23 | # --------------------------------------------------------------------------------------- 24 | # 10. 企业平均没几个月被执行的一次, LAWSUIT_PRE_MONTH_CNT = LAWSUIT_END_RGYEAR_TIME_DIFF / LAWSUIT_CNT 25 | # 26 | # 11. 企业被执行的个数占所有执行个数个平均值的比例,LAWSUIT_CNT_ALL_RATE 27 | # 28 | # 12. 
企业对应的大类HY的平均执行数据的个数,LAWSUIT_HY_CNT_AVG 29 | # 13. 企业对应大类HY的平均执行个数占所有执行平均个数的比例,LAWSUIT_HY_CNT_ALL_RATE 30 | # 14. 企业执行的个数占其对应的大类HY的执行平均值的比例,LAWSUIT_CNT_HY_RATE 31 | # 32 | # 33 | # 15. 企业被执行的金额占所有执行金额个平均值的比例,LAWSUIT_LAWAMOUNT_MEAN_ALL_RATE 34 | # 35 | # 16. 企业对应的大类HY的平均金额数据的数值,LAWSUIT_LAWAMOUNT_MEAN_HY_AVG 36 | # 17. 企业对应大类HY的平均执行金额占所有执行平均金额的比例,LAWSUIT_LAWAMOUNT_MEAN_HY_ALL_RATE 37 | # 18. 企业执行的金额占其对应的大类HY的执行金额平均值的比例,LAWSUIT_LAWAMOUNT_MEAN_HY_RATE 38 | # 39 | # --------------------------------------------------------------------------------------- 40 | # 41 | # 12. 企业对应的大类HY的平均执行数据的个数,LAWSUIT_ETYPE_CNT_AVG 42 | # 13. 企业对应大类HY的平均执行个数占所有执行平均个数的比例,LAWSUIT_ETYPE_CNT_ALL_RATE 43 | # 14. 企业执行的个数占其对应的大类HY的执行平均值的比例,LAWSUIT_CNT_ETYPE_RATE 44 | # 45 | # 46 | # 16. 企业对应的大类HY的平均金额数据的数值,LAWSUIT_LAWAMOUNT_MEAN_ETYPE_AVG 47 | # 17. 企业对应大类HY的平均执行金额占所有执行平均金额的比例,LAWSUIT_LAWAMOUNT_MEAN_ETYPE_ALL_RATE 48 | # 18. 企业执行的金额占其对应的大类HY的执行金额平均值的比例,LAWSUIT_LAWAMOUNT_MEAN_ETYPE_RATE 49 | # 50 | # 对应的max数据的特征 51 | # 52 | # 复赛新特征,滑动窗口系列的特征,时间段是1年,2年,3年,5年,计算在最近k[1,2,3,5]年之后的数据,主要是个数和次数。 53 | # 时间是开始是2017-08之前的k年 54 | # 1. 之前k年的变更时间的个数。 55 | # LAWSUIT_K_OPEN_CNT, 56 | # 57 | 58 | # In[2]: 59 | 60 | df_all = pd.read_csv("../data/alldata/df_data123456.csv") 61 | df_lawsuit = pd.read_csv("../data/public/7lawsuit.csv") 62 | 63 | 64 | # In[3]: 65 | 66 | # df_all.info() 67 | # df_all.head() 68 | 69 | 70 | # In[4]: 71 | 72 | # df_lawsuit.info() 73 | # df_lawsuit.head() 74 | 75 | 76 | # In[5]: 77 | 78 | df_lawsuit['LAWDATE'] = df_lawsuit['LAWDATE'].map(lambda x:x.replace('年','-').replace('月','')) 79 | 80 | 81 | # In[6]: 82 | 83 | df_lawsuit = df_lawsuit.sort_values(['LAWDATE']) 84 | 85 | EIDS = set(df_lawsuit['EID']) 86 | 87 | # print(len(EIDS)) 88 | 89 | columns = df_lawsuit.columns 90 | df_xlawsuit = pd.DataFrame(columns=columns) 91 | 92 | # print(columns) 93 | 94 | 95 | # In[7]: 96 | 97 | k = 0 98 | for EID in EIDS: 99 | if k%3000 == 0: 100 | print('第%d次处理--------->',k) 101 | k+=1 102 | tmp = df_lawsuit[df_lawsuit['EID'] == EID] 103 | row = [EID,tmp['TYPECODE'].values,tmp['LAWDATE'].values,tmp['LAWAMOUNT'].values] 104 | 105 | df_xlawsuit = df_xlawsuit.append(pd.Series(row,columns),ignore_index=True) 106 | 107 | 108 | 109 | # In[8]: 110 | 111 | # df_xlawsuit.info() 112 | # df_xlawsuit.head() 113 | 114 | 115 | # In[9]: 116 | 117 | df_xlawsuit['LAWSUIT_CNT'] = df_xlawsuit['TYPECODE'].apply(lambda x: len(x)) 118 | df_xlawsuit['LAWSUIT_LAWAMOUNT_MEAN'] = df_xlawsuit['LAWAMOUNT'].apply(lambda x: np.mean(x)) 119 | df_xlawsuit['LAWSUIT_LAWAMOUNT_SUM'] = df_xlawsuit['LAWAMOUNT'].apply(lambda x: np.sum(x)) 120 | 121 | 122 | # In[10]: 123 | 124 | df_xlawsuit['LAWSUIT_FIRST_TIME'] = df_xlawsuit['LAWDATE'].apply(lambda x: x[0][:7]) 125 | df_xlawsuit['LAWSUIT_END_TIME'] = df_xlawsuit['LAWDATE'].apply(lambda x: x[-1][:7]) 126 | 127 | 128 | # In[11]: 129 | 130 | # 2017-08 BRANCH_K_OPEN_CNT, BRANCH_K_CLOSE_CNT 131 | df_xlawsuit['LAWSUIT_1_OPEN_CNT'] = df_xlawsuit['LAWDATE'].map(lambda x: np.sum(np.array(x) >= '2016-08')) 132 | df_xlawsuit['LAWSUIT_2_OPEN_CNT'] = df_xlawsuit['LAWDATE'].map(lambda x: np.sum(np.array(x) >= '2015-08')) 133 | df_xlawsuit['LAWSUIT_3_OPEN_CNT'] = df_xlawsuit['LAWDATE'].map(lambda x: np.sum(np.array(x) >= '2014-08')) 134 | df_xlawsuit['LAWSUIT_5_OPEN_CNT'] = df_xlawsuit['LAWDATE'].map(lambda x: np.sum(np.array(x) >= '2012-08')) 135 | 136 | 137 | 138 | # In[ ]: 139 | 140 | 141 | 142 | 143 | # In[12]: 144 | 145 | df_xlawsuit.to_csv('../data/public/7lawsuit_1.csv',index=False,index_label=False) 146 | df_xlawsuit.columns 
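# Why the window counts above can compare dates as plain strings: zero-padded
# 'YYYY-MM(-DD)' stamps order lexicographically the same way they order in
# time, so `>= '2016-08'` selects the year before the 2017-08 cutoff (this
# relies on months being zero-padded). Quick illustrative check:
import numpy as np
assert np.sum(np.array(['2015-09', '2016-07', '2016-08', '2017-01']) >= '2016-08') == 2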
147 | 148 | 149 | 150 | # In[13]: 151 | 152 | df_all = pd.merge(df_all,df_xlawsuit[['EID', 'LAWSUIT_CNT','LAWSUIT_LAWAMOUNT_MEAN', 'LAWSUIT_LAWAMOUNT_SUM', 'LAWSUIT_FIRST_TIME', 153 | 'LAWSUIT_END_TIME','LAWSUIT_1_OPEN_CNT','LAWSUIT_2_OPEN_CNT', 154 | 'LAWSUIT_3_OPEN_CNT','LAWSUIT_5_OPEN_CNT']],how='left',on=['EID']) 155 | 156 | 157 | # In[14]: 158 | 159 | # df_all.info() 160 | # df_all.head() 161 | 162 | 163 | # In[15]: 164 | 165 | # 所有的关于时间的空值用2018-01填充,假设这些企业被执行的数据发生在下一年,其他用0填充便是没有被执行 166 | meanlist = ['LAWSUIT_FIRST_TIME','LAWSUIT_END_TIME'] 167 | modelist = ['LAWSUIT_CNT','LAWSUIT_LAWAMOUNT_MEAN', 'LAWSUIT_LAWAMOUNT_SUM'] 168 | 169 | for d in meanlist: 170 | df_all.loc[df_all[d].isnull(),d] = '2018-01' 171 | 172 | for c in modelist: 173 | df_all.loc[df_all[c].isnull(),c] = 0 174 | 175 | 176 | # In[16]: 177 | 178 | df_all[['LAWSUIT_1_OPEN_CNT','LAWSUIT_2_OPEN_CNT','LAWSUIT_3_OPEN_CNT','LAWSUIT_5_OPEN_CNT']] = df_all[['LAWSUIT_1_OPEN_CNT','LAWSUIT_2_OPEN_CNT','LAWSUIT_3_OPEN_CNT','LAWSUIT_5_OPEN_CNT']].fillna(0) 179 | 180 | 181 | # In[17]: 182 | 183 | def timeDiff(x): 184 | x = x.replace('年','-') 185 | x = x.replace('月','') 186 | 187 | a = x[:x.find(':')] 188 | b = x[x.find(':')+1:] 189 | 190 | 191 | 192 | y = int(a[:a.find('-')]) - int(b[:b.find('-')]) 193 | m = int(a[a.find('-')+1:]) - int(b[b.find('-')+1:]) 194 | return y * 12 + m 195 | 196 | 197 | df_all['LAWSUIT_FIRST_RGYEAR_TIME_DIFF'] = (df_all['LAWSUIT_FIRST_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 198 | df_all['LAWSUIT_END_RGYEAR_TIME_DIFF'] = (df_all['LAWSUIT_END_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 199 | 200 | df_all['LAWSUIT_FIRST_CHANGE_FIRST_TIME_DIFF'] = (df_all['LAWSUIT_FIRST_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff) 201 | df_all['LAWSUIT_FIRST_CHANGE_END_TIME_DIFF'] = (df_all['LAWSUIT_FIRST_TIME'] + ':' + df_all['END_CHANGE_TIME']).apply(timeDiff) 202 | 203 | # --------------------------------------------------------------------------------------- 204 | # 10. 企业平均没几个月被执行的一次, LAWSUIT_PRE_MONTH_CNT = LAWSUIT_END_RGYEAR_TIME_DIFF / LAWSUIT_CNT 205 | 206 | # 11. 企业被执行的个数占所有执行个数个平均值的比例,LAWSUIT_CNT_ALL_RATE 207 | 208 | # 12. 企业对应的大类HY的平均执行数据的个数,LAWSUIT_HY_CNT_AVG 209 | # 13. 企业对应大类HY的平均执行个数占所有执行平均个数的比例,LAWSUIT_HY_CNT_ALL_RATE 210 | # 14. 企业执行的个数占其对应的大类HY的执行平均值的比例,LAWSUIT_CNT_HY_RATE 211 | 212 | 213 | # 15. 企业被执行的金额占所有执行金额个平均值的比例,LAWSUIT_LAWAMOUNT_MEAN_ALL_RATE 214 | 215 | # 16. 企业对应的大类HY的平均金额数据的数值,LAWSUIT_LAWAMOUNT_MEAN_HY_AVG 216 | # 17. 企业对应大类HY的平均执行金额占所有执行平均金额的比例,LAWSUIT_LAWAMOUNT_MEAN_ALL_RATE 217 | # 18. 
企业执行的金额占其对应的大类HY的执行金额平均值的比例,LAWSUIT_LAWAMOUNT_MEAN_HY_RATE 218 | 219 | # In[18]: 220 | 221 | df_all['LAWSUIT_PRE_MONTH_CNT'] = df_all['LAWSUIT_END_RGYEAR_TIME_DIFF'] / df_all['LAWSUIT_CNT'] 222 | df_all['LAWSUIT_CNT_ALL_RATE'] = df_all['LAWSUIT_CNT'] / df_all['LAWSUIT_CNT'].mean() 223 | 224 | df_all['LAWSUIT_LAWAMOUNT_MEAN_ALL_RATE'] = df_all['LAWSUIT_LAWAMOUNT_MEAN'] / df_all['LAWSUIT_LAWAMOUNT_MEAN'].mean() 225 | 226 | 227 | df_all['LAWSUIT_CNT_ALL_RATE_MAX'] = df_all['LAWSUIT_CNT'] / df_all['LAWSUIT_CNT'].max() 228 | 229 | df_all['LAWSUIT_LAWAMOUNT_MEAN_ALL_RATE_MAX'] = df_all['LAWSUIT_LAWAMOUNT_MEAN'] / df_all['LAWSUIT_LAWAMOUNT_MEAN'].max() 230 | 231 | 232 | 233 | # In[19]: 234 | 235 | tmp = pd.DataFrame() 236 | 237 | tmp['LAWSUIT_HY_CNT_AVG'] = df_all.groupby(['HY'],as_index=True,axis=0)['LAWSUIT_CNT'].mean() 238 | tmp['LAWSUIT_HY_CNT_ALL_RATE'] = tmp['LAWSUIT_HY_CNT_AVG'] / df_all['LAWSUIT_CNT'].mean() 239 | 240 | tmp['LAWSUIT_LAWAMOUNT_MEAN_HY_AVG'] = df_all.groupby(['HY'],as_index=True,axis=0)['LAWSUIT_LAWAMOUNT_MEAN'].mean() 241 | tmp['LAWSUIT_LAWAMOUNT_MEAN_HY_ALL_RATE'] = tmp['LAWSUIT_LAWAMOUNT_MEAN_HY_AVG'] / df_all['LAWSUIT_LAWAMOUNT_MEAN'].mean() 242 | 243 | 244 | tmp['LAWSUIT_HY_CNT_MAX'] = df_all.groupby(['HY'],as_index=True,axis=0)['LAWSUIT_CNT'].max() 245 | tmp['LAWSUIT_HY_CNT_ALL_RATE_MAX'] = tmp['LAWSUIT_HY_CNT_MAX'] / df_all['LAWSUIT_CNT'].max() 246 | 247 | tmp['LAWSUIT_LAWAMOUNT_MEAN_HY_MAX'] = df_all.groupby(['HY'],as_index=True,axis=0)['LAWSUIT_LAWAMOUNT_MEAN'].max() 248 | tmp['LAWSUIT_LAWAMOUNT_MEAN_HY_ALL_RATE_MAX'] = tmp['LAWSUIT_LAWAMOUNT_MEAN_HY_MAX'] / df_all['LAWSUIT_LAWAMOUNT_MEAN'].max() 249 | 250 | 251 | tmp['HY'] = tmp.index 252 | 253 | 254 | 255 | # In[20]: 256 | 257 | tmp1 = pd.DataFrame() 258 | 259 | tmp1['LAWSUIT_ETYPE_CNT_AVG'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['LAWSUIT_CNT'].mean() 260 | tmp1['LAWSUIT_ETYPE_CNT_ALL_RATE'] = tmp1['LAWSUIT_ETYPE_CNT_AVG'] / df_all['LAWSUIT_CNT'].mean() 261 | 262 | tmp1['LAWSUIT_LAWAMOUNT_MEAN_ETYPE_AVG'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['LAWSUIT_LAWAMOUNT_MEAN'].mean() 263 | tmp1['LAWSUIT_LAWAMOUNT_MEAN_ETYPE_ALL_RATE'] = tmp1['LAWSUIT_LAWAMOUNT_MEAN_ETYPE_AVG'] / df_all['LAWSUIT_LAWAMOUNT_MEAN'].mean() 264 | 265 | 266 | tmp1['LAWSUIT_ETYPE_CNT_MAX'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['LAWSUIT_CNT'].max() 267 | tmp1['LAWSUIT_ETYPE_CNT_ALL_RATE_MAX'] = tmp1['LAWSUIT_ETYPE_CNT_MAX'] / df_all['LAWSUIT_CNT'].max() 268 | 269 | tmp1['LAWSUIT_LAWAMOUNT_MEAN_ETYPE_MAX'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['LAWSUIT_LAWAMOUNT_MEAN'].max() 270 | tmp1['LAWSUIT_LAWAMOUNT_MEAN_ETYPE_ALL_RATE_MAX'] = tmp1['LAWSUIT_LAWAMOUNT_MEAN_ETYPE_MAX'] / df_all['LAWSUIT_LAWAMOUNT_MEAN'].max() 271 | 272 | 273 | tmp1['ETYPE'] = tmp1.index 274 | 275 | 276 | 277 | 278 | # In[21]: 279 | 280 | df_all = pd.merge(df_all,tmp,how='left',on=['HY']) 281 | df_all = pd.merge(df_all,tmp1,how='left',on=['ETYPE']) 282 | 283 | 284 | 285 | # In[22]: 286 | 287 | df_all['LAWSUIT_CNT_HY_RATE'] = df_all['LAWSUIT_CNT'] / df_all['LAWSUIT_HY_CNT_AVG'] 288 | df_all['LAWSUIT_LAWAMOUNT_MEAN_HY_RATE'] = df_all['LAWSUIT_LAWAMOUNT_MEAN'] / df_all['LAWSUIT_LAWAMOUNT_MEAN_HY_AVG'] 289 | 290 | df_all['LAWSUIT_CNT_HY_RATE_MAX'] = df_all['LAWSUIT_CNT'] / df_all['LAWSUIT_HY_CNT_MAX'] 291 | df_all['LAWSUIT_LAWAMOUNT_MEAN_HY_RATE_MAX'] = df_all['LAWSUIT_LAWAMOUNT_MEAN'] / df_all['LAWSUIT_LAWAMOUNT_MEAN_HY_MAX'] 292 | 293 | 294 | 295 | # In[23]: 296 | 297 | df_all['LAWSUIT_CNT_ETYPE_RATE'] = df_all['LAWSUIT_CNT'] / 
df_all['LAWSUIT_ETYPE_CNT_AVG']
298 | df_all['LAWSUIT_LAWAMOUNT_MEAN_ETYPE_RATE'] = df_all['LAWSUIT_LAWAMOUNT_MEAN'] / df_all['LAWSUIT_LAWAMOUNT_MEAN_ETYPE_AVG']
299 |
300 | df_all['LAWSUIT_CNT_ETYPE_RATE_MAX'] = df_all['LAWSUIT_CNT'] / df_all['LAWSUIT_ETYPE_CNT_MAX']
301 | df_all['LAWSUIT_LAWAMOUNT_MEAN_ETYPE_RATE_MAX'] = df_all['LAWSUIT_LAWAMOUNT_MEAN'] / df_all['LAWSUIT_LAWAMOUNT_MEAN_ETYPE_MAX']
302 |
303 |
304 |
305 | # In[24]:
306 |
307 | # df_all.info()
308 | # df_all.head()
309 |
310 |
311 | # In[25]:
312 |
313 | # 填补缺失后,保存到目前为止合并的全部特征数据
314 | df_all = df_all.fillna(0)
315 | df_all.to_csv('../data/alldata/df_data1234567.csv',index=False,index_label=False)
316 |
317 |
318 |
319 | # In[ ]:
320 |
321 |
322 |
--------------------------------------------------------------------------------
/model/mode_avg_prov.py:
--------------------------------------------------------------------------------
1 | import xgboost as xgb
2 | import pandas as pd
3 | import numpy as np
4 | from sklearn.model_selection import KFold
5 | from sklearn.model_selection import train_test_split
6 | import seaborn as sns
7 | import matplotlib.pyplot as plt
8 |
9 | #seed = np.random.randint(99999)
10 | valid_size = 0.2
11 | LOOP = 3
12 | ESR = 100
13 | # XGB param
14 | NROUND = 1500
15 | nround = 300
16 | features = None
17 | label = 'TARGET'
18 |
19 |
20 | # 设置三组参数,seed 分别取默认、27、9999,对同一份测试数据的三个模型结果做平均
21 | param =[
22 |     {'max_depth':7, # 基准是5
23 |      'eta':0.05,
24 |      'gamma':0.1,
25 |      'colsample_bytree':0.8, # old 0.8
26 |      'subsample':0.8,
27 |      'silent':1,
28 |      'eval_metric':'auc',
29 |      'objective':'binary:logistic',
30 |     },
31 |     {'max_depth':7, # 基准是5
32 |      'eta':0.05,
33 |      'gamma':0.1,
34 |      'colsample_bytree':0.8, # old 0.8
35 |      'subsample':0.8,
36 |      'silent':1,
37 |      'eval_metric':'auc',
38 |      'objective':'binary:logistic',
39 |      'seed':27
40 |     },
41 |     {'max_depth':7, # 基准是5
42 |      'eta':0.05,
43 |      'gamma':0.1,
44 |      'colsample_bytree':0.8, # old 0.8
45 |      'subsample':0.8,
46 |      'silent':1,
47 |      'eval_metric':'auc',
48 |      'objective':'binary:logistic',
49 |      'seed':9999
50 |     }
51 | ]
52 |
53 | def setFeaturesAndLable(df_columns):
54 |     '''
55 |     参数:读入数据的df_data的所有列名
56 |     得到数据的特征features和label,
57 |     特征为去除EID和PROV两列之后的所有列
58 |     '''
59 |     xfeatures = df_columns
60 |     xfeatures = list(xfeatures)
61 |     xfeatures.remove('EID')
62 |     xfeatures.remove('PROV')
63 |     xlabel = 'TARGET'
64 |
65 |     global features, label
66 |     features, label = xfeatures, xlabel
67 |
68 |     return xfeatures, xlabel
69 |
70 |
71 | def split_train_valid(df_train,test_size=0.2):
72 |     '''
73 |     按 test_size 的比例切分训练集和验证集(简单留出法,并非k折交叉验证)
74 |     df_train:训练数据
75 |     '''
76 |     X_train, X_vali, y_train, y_vali = train_test_split(df_train[features], df_train[label], test_size=test_size, random_state=40000)
77 |
78 |     dtrain = xgb.DMatrix(X_train,label=y_train)
79 |     dvalid = xgb.DMatrix(X_vali,label=y_vali)
80 |     watchlist = [(dtrain, 'train'),(dvalid, 'valid')]
81 |
82 |     return dtrain, dvalid, watchlist
83 |
84 |
85 | def split_data_with_prov(df_data):
86 |     '''
87 |     根据特征PROV,分割数据,进行单独预测之后合并数据结果
88 |     '''
89 |     df_train_prov11 = df_data[df_data['PROV'] == 11]
90 |     df_train_prov12 = df_data[df_data['PROV'] == 12]
91 |
92 |     return df_train_prov11,df_train_prov12
93 |
94 |
95 |
96 | def xtrain_and_test(df_all):
97 |     '''
98 |     得到训练数据和测试数据
99 |     '''
100 |     df_label = pd.read_csv('../data/public/train.csv')
101 |     df_test_label = pd.read_csv('../data/public/evaluation_public.csv')
102 |
103 |     df_label.drop('ENDDATE',axis=1,inplace=True)
104 |
105 |     df_train = df_all[df_all['EID'].isin(df_label['EID'])]
106 |     df_test = df_all[df_all['EID'].isin(df_test_label['EID'])]
107 |
108 |     df_train = pd.merge(df_train,df_label,how='left',on=['EID'])
109 |
110 |     return df_train,df_test
111 |
112 | def getBestIteration(df_train):
113 |     '''
114 |     参数:该省份的全部训练数据(features、label 已由 setFeaturesAndLable 设置)
115 |     用第一组参数在 0.2 的留出验证集上做 early stopping,得到最好的迭代次数,
116 |     返回 best_iteration + 50,供调用方用全部训练数据按该迭代次数重新训练
117 |     '''
118 |     print("根据留出验证集得到最优迭代次数")
119 |
120 |     dbuild, dvalid, watchlist = split_train_valid(df_train,test_size=0.2)
121 |     model = xgb.train(param[0], dbuild, NROUND, watchlist,early_stopping_rounds=ESR,verbose_eval=5)
122 |     valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration)
123 |     print('Valid Mean:---------------------->', np.mean(valid_yhat))
124 |     del dbuild, dvalid, watchlist
125 |
126 |     # 注意:函数内的这个赋值只是局部变量,并不会修改模块级的 nround,真正生效的是返回值
127 |     nround = model.best_iteration + 50
128 |
129 |     return model.best_iteration + 50
130 |
131 |
132 | def XgbModel(df_train,df_test,nround,alpha = 0.23):
133 |     '''
134 |     参数:训练数据、测试数据和迭代次数nround
135 |     alpha:判为1的概率阈值,默认0.23(本函数内未使用,阈值由调用方设置)
136 |     return 测试集的预测结果proba_test,
137 |     其PROB为三个不同seed模型预测概率的平均值
138 |     '''
139 |     # 得到特征和label
140 |     models = []
141 |     for i in range(LOOP):
142 |         print('LOOP',i)
143 |         dbuild, dvalid, watchlist = split_train_valid(df_train,test_size=0.1)  # dbuild/dvalid 仅用于 watchlist 监控
144 |
145 |         dtrain = xgb.DMatrix(df_train[features],label=df_train[label])  # 实际训练使用全部训练数据
146 |
147 |         model = xgb.train(param[i], dtrain,nround,watchlist,verbose_eval=5)
148 |         models.append(model)
149 |         # VALID
150 |         valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration)
151 |         print('Valid Mean:---------------------->', np.mean(valid_yhat))
152 |         del dbuild, dvalid, watchlist
153 |
154 |     #==============================================================================
155 |     print('test')
156 |     #==============================================================================
157 |
158 |     dtest = xgb.DMatrix(df_test[features])
159 |     proba_test = pd.DataFrame()
160 |     proba_test['EID'] = df_test['EID']
161 |     proba_test['FORTARGET'] = [0 for i in range(len(df_test))]
162 |     proba_test['PROB'] = [0 for i in range(len(df_test))]
163 |     for model in models:
164 |         proba_test['PROB'] += model.predict(dtest)
165 |     proba_test['PROB'] /= LOOP
166 |
167 |     # 根据阈值设置标签FORTARGET的值
168 |     # proba_test.loc[proba_test['PROB']>=alpha,'FORTARGET'] = 1
169 |     # 写入结果文件
170 |     # proba_test.to_csv(output_file,index=False,index_label=False)
171 |
172 |     return proba_test
173 |
174 |
175 | def runModelWithPROV(inputfile,outputfile,alpha = 0.23):
176 |     """
177 |     inputfile:输入文件,
178 |     outputfile:输出的结果文件
179 |     """
180 |     df_all = pd.read_csv(inputfile)
181 |     # 设置特征和label
182 |     setFeaturesAndLable(df_all.columns)
183 |
184 |     # 训练集和测试集
185 |     df_train,df_test = xtrain_and_test(df_all)
186 |
187 |     # 根据省份11和12分成两份数据,进行预测。
188 |     df_train_prov11,df_train_prov12 = split_data_with_prov(df_train)
189 |     df_test_prov11,df_test_prov12 = split_data_with_prov(df_test)
190 |
191 |     # 训练和预测PROV = 11
192 |     print("PROV == 11 start train")
193 |     nround11 = getBestIteration(df_train_prov11)
194 |     print("开始运行模型,训练之后预测PROV==11的值")
195 |     proba_test11 = XgbModel(df_train_prov11,df_test_prov11,nround11)
196 |
197 |
198 |     # 训练和预测PROV = 12
199 |     print("PROV == 12 start train")
200 |     nround12 = getBestIteration(df_train_prov12)
201 |     print("开始运行模型,训练之后预测PROV==12的值")
202 |     proba_test12 = XgbModel(df_train_prov12,df_test_prov12,nround12)
203 |
204 |     # 设置,并合并结果
205 |     proba_test11.loc[proba_test11['PROB']>=alpha,'FORTARGET'] = 1
206 |     proba_test12.loc[proba_test12['PROB']>=alpha,'FORTARGET'] = 1
207 |
208 |     proba_tmp = proba_test11.append(proba_test12)
209 |     proba_test = pd.DataFrame()
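    # 说明:下面先按 df_test 的 EID 建一个占位表再做 left merge,
    # 目的是把按 PROV 拆开预测、append 拼回来的结果恢复成 df_test 原有的行顺序;
    # FORTARGET_TMP 只是占位列,merge 之后立即删除。
    # 一个等价写法(仅为示意,假设 EID 无重复):
    # proba_test = proba_tmp.set_index('EID').reindex(df_test['EID']).reset_index()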
210 | proba_test['EID'] = df_test['EID'] 211 | proba_test['FORTARGET_TMP'] = [0 for i in range(len(df_test))] 212 | 213 | proba_test = pd.merge(proba_test,proba_tmp,how='left',on='EID') 214 | proba_test.drop('FORTARGET_TMP',axis=1,inplace=True) 215 | 216 | proba_test.to_csv(outputfile,index=False,index_label=False) 217 | 218 | # 释放空间 219 | del df_train,df_test, df_all 220 | 221 | return proba_test 222 | 223 | # run 224 | # inputfile = "../data/alldata/df_data_plus_all.csv" 225 | # outputfile = '../xresult/sub_plus_all_prov.csv' 226 | 227 | # inputfile = "../data/alldata/df_data_all.csv" 228 | # outputfile = '../xresult/sub_all_prov.csv' 229 | 230 | # inputfile = ["../data/alldata/df_data_all.csv","../data/alldata/df_data_plus_all.csv", 231 | # "../data/alldata/df_data_onehot.csv","../data/alldata/df_data_plus_onehot.csv", 232 | # '../data/alldata/df_data_all_xgbstack.csv','../data/alldata/df_data_plus_all_xgbstack.csv', 233 | # '../data/alldata/df_data_onehot_xgbstack.csv','../data/alldata/df_data_plus_onehot_xgbstack.csv'] 234 | 235 | inputfile = ["../data/alldata/df_data_all.csv","../data/alldata/df_data_plus_all.csv", 236 | "../data/alldata/df_data_onehot.csv","../data/alldata/df_data_plus_onehot.csv", 237 | '../data/alldata/df_data_all_xgbstack.csv','../data/alldata/df_data_plus_all_xgbstack.csv', 238 | '../data/alldata/df_data_onehot_xgbstack.csv','../data/alldata/df_data_plus_onehot_xgbstack.csv', 239 | '../data/alldata/df_data_all_rfstack.csv','../data/alldata/df_data_plus_all_rfstack.csv', 240 | '../data/alldata/df_data_onehot_rfstack.csv','../data/alldata/df_data_plus_onehot_rfstack.csv', 241 | '../data/alldata/df_data_all_lgbstack.csv','../data/alldata/df_data_plus_all_lgbstack.csv', 242 | '../data/alldata/df_data_onehot_lgbstack.csv','../data/alldata/df_data_plus_onehot_lgbstack.csv', 243 | '../data/alldata/df_data_all_prov_xgbstack.csv','../data/alldata/df_data_plus_all_prov_xgbstack.csv', 244 | '../data/alldata/df_data_onehot_prov_xgbstack.csv','../data/alldata/df_data_plus_onehot_prov_xgbstack.csv', 245 | '../data/alldata/df_data_all_prov_rfstack.csv','../data/alldata/df_data_plus_all_prov_rfstack.csv', 246 | '../data/alldata/df_data_onehot_prov_rfstack.csv','../data/alldata/df_data_plus_onehot_prov_rfstack.csv', 247 | '../data/alldata/df_data_all_prov_lgbstack.csv','../data/alldata/df_data_plus_all_prov_lgbstack.csv', 248 | '../data/alldata/df_data_onehot_prov_lgbstack.csv','../data/alldata/df_data_plus_onehot_prov_lgbstack.csv', 249 | '../data/alldata/df_data_all_xgbstack_onehot.csv','../data/alldata/df_data_onehot_xgbstack_onehot.csv', 250 | '../data/alldata/df_data_plus_all_xgbstack_onehot.csv','../data/alldata/df_data_plus_onehot_xgbstack_onehot.csv', 251 | '../data/alldata/df_data_all_finaly.csv'] 252 | 253 | 254 | outputfile = ['../xresult/sub_all_prov.csv','../xresult/sub_plus_all_prov.csv', 255 | '../xresult/sub_onehot_prov.csv','../xresult/sub_plus_onehot_prov.csv', 256 | '../xresult/sub_all_xgbstack_prov.csv','../xresult/sub_plus_all_xgbstack_prov.csv', 257 | '../xresult/sub_onehot_xgbstack_prov.csv','../xresult/sub_plus_onehot_xgbstack_prov.csv', 258 | '../xresult/sub_all_rfstack_prov.csv','../xresult/sub_plus_all_rfstack_prov.csv', 259 | '../xresult/sub_onehot_rfstack_prov.csv','../xresult/sub_plus_onehot_rfstack_prov.csv', 260 | '../xresult/sub_all_lgbstack_prov.csv','../xresult/sub_plus_all_lgbstack_prov.csv', 261 | '../xresult/sub_onehot_lgbstack_prov.csv','../xresult/sub_plus_onehot_lgbstack_prov.csv', 262 | 
'../xresult/sub_all_prov_xgbstack_prov.csv','../xresult/sub_plus_all_prov_xgbstack_prov.csv', 263 | '../xresult/sub_onehot_prov_xgbstack_prov.csv','../xresult/sub_plus_onehot_prov_xgbstack_prov.csv', 264 | '../xresult/sub_all_prov_rfstack_prov.csv','../xresult/sub_plus_all_prov_rfstack_prov.csv', 265 | '../xresult/sub_onehot_prov_rfstack_prov.csv','../xresult/sub_plus_onehot_prov_rfstack_prov.csv', 266 | '../xresult/sub_all_prov_lgbstack_prov.csv','../xresult/sub_plus_all_prov_lgbstack_prov.csv', 267 | '../xresult/sub_onehot_prov_lgbstack_prov.csv','../xresult/sub_plus_onehot_prov_lgbstack_prov.csv', 268 | '../xresult/a_prov.csv','../xresult/b_prov.csv', 269 | '../xresult/c_prov.csv','../xresult/d_prov.csv', 270 | '../xresult/finally_prov.csv'] 271 | # 0 - 28,这里是分开计算 272 | for i in range(32,33): 273 | print(i," start --> ",inputfile[i]) 274 | runModelWithPROV(inputfile[i],outputfile[i]) 275 | print(i," end --> ",outputfile[i]) 276 | 277 | 278 | -------------------------------------------------------------------------------- /feature/10_feature.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | import matplotlib.pyplot as plt 10 | 11 | # QUALIFICATION -->QUALIFICATION 12 | # qualification --> qualification 13 | # ADDTYPE --> ADDTYPE 14 | # BEGINDATE --> BEGINDATE 15 | # EXPIRYDATE -->EXPIRYDATE 16 | 17 | # # 2.3.8 失信数据10qualification.csv提取特征 18 | # 1. 企业失信的个数,QUALIFICATION_CNT 19 | # 2. 企业第一个失信数据的日期,QUALIFICATION_FIRST_FIRST_TIME 20 | # 3. 企业最后一个失信数据的日期,QUALIFICATION_FIRST_END_TIME 21 | # 4. 企业第一个失信数据的到期日期,QUALIFICATION_END_FIRST_TIME 22 | # 5. 企业第一个失信数据的日期与其结束日期的差值,QUALIFICATION_FIRST_END_FIRST_DIFF 23 | # 24 | # 5. 企业第一个失信数据的日期与企业注册日期的差值,QUALIFICATION_FIRST_RGYEAR_DIFF 25 | # 6. 企业最后一个失信数据的日期与企业注册日期的差值,QUALIFICATION_END_RGYEAR_DIFF 26 | # 27 | # 7. 企业第一个失信数据的到期日期与企业注册日期的差值,QUALIFICATION_END_RGYEAR_DIFF 28 | # 29 | # 8. 企业第一个失信数据的日期与企业第一次变更的差值,QUALIFICATION_FIRST_CHANGE_FIRST_DIFF 30 | # 9. 企业最后一个失信数据的日期与企业第一次变更的差值,QUALIFICATION_END_CHANGE_FIRST_DIFF 31 | # 32 | # 10. 企业第一个失信数据的日期与企业最后一次变更的差值,QUALIFICATION_FIRST_CHANGE_END_DIFF 33 | # 11. 企业最后一个失信数据的日期与企业最后一次变更的差值,QUALIFICATION_END_CHANGE_END_DIFF 34 | # 35 | # --------------------------------------------------------------------------------------- 36 | # 10. 企业平均每几个月的失信一次, QUALIFICATION_PRE_MONTH_CNT = QUALIFICATION_END_RGYEAR_DIFF / QUALIFICATION_CNT 37 | # 38 | # 11. 企业失信的个数占所有失信个数个平均值的比例,QUALIFICATION_CNT_ALL_RATE 39 | # 40 | # 12. 企业对应的大类HY的平均失信数据的个数,QUALIFICATION_HY_CNT_AVG 41 | # 13. 企业对应大类HY的平均失信个数占所有失信平均个数的比例,QUALIFICATION_HY_CNT_ALL_RATE 42 | # 14. 企业失信的个数占其对应的大类HY的失信平均值的比例,QUALIFICATION_CNT_HY_RATE 43 | # 44 | # --------------------------------------------------------------------------------------- 45 | # 46 | # 12. 企业对应的大类HY的平均失信数据的个数,QUALIFICATION_ETYPE_CNT_AVG 47 | # 13. 企业对应大类HY的平均失信个数占所有失信平均个数的比例,QUALIFICATION_ETYPE_CNT_ALL_RATE 48 | # 14. 企业失信的个数占其对应的大类HY的失信平均值的比例,QUALIFICATION_CNT_ETYPE_RATE 49 | # 50 | # 对应的MAX数据特征 51 | # 复赛新特征,滑动窗口系列的特征,时间段是1年,2年,3年,5年,计算在最近k[1,2,3,5]年之后的数据,主要是个数和次数。 52 | # 时间是开始是2017-08之前的k年 53 | # 1. 
之前k年的变更时间的个数。 54 | # QUALIFICATION_K_OPEN_CNT, QUALIFICATION_K_CLOSE_CNT, 55 | # 56 | 57 | # In[2]: 58 | 59 | df_all = pd.read_csv("../data/alldata/df_data123456789.csv") 60 | df_qualification = pd.read_csv("../data/public/10qualification.csv",encoding='gbk') 61 | 62 | 63 | # In[3]: 64 | 65 | # df_all.info() 66 | # df_all.head() 67 | 68 | 69 | # In[4]: 70 | 71 | 72 | # df_qualification.info() 73 | # df_qualification.head() 74 | 75 | 76 | # In[5]: 77 | 78 | # df_qualification['EXPIRYDATE'].value_counts() 79 | 80 | 81 | # In[6]: 82 | 83 | df_qualification['BEGINDATE'] = df_qualification['BEGINDATE'].map(lambda x:x.replace('年','-').replace('月','')) 84 | df_qualification.loc[df_qualification['EXPIRYDATE'].isnull(),['EXPIRYDATE']] = '2018年12月' 85 | 86 | df_qualification['EXPIRYDATE'] = df_qualification['EXPIRYDATE'].map(lambda x:x.replace('年','-').replace('月','')) 87 | 88 | 89 | def time(x): 90 | y = x[:x.find('/')] 91 | m = int(x[x.find('/')+1:x.rfind('/')]) 92 | if m < 10: m = '0'+str(m) 93 | else: m = str(m) 94 | 95 | return y + '-' + m 96 | 97 | # df_qualification['BEGINDATE'] = df_qualification['BEGINDATE'].apply(time) 98 | # df_qualification['EXPIRYDATE'] = df_qualification['EXPIRYDATE'].apply(time) 99 | 100 | 101 | # In[7]: 102 | 103 | df_qualification = df_qualification.sort_values(['BEGINDATE','EXPIRYDATE']) 104 | 105 | # df_qualification.info() 106 | # df_qualification.head() 107 | 108 | 109 | # In[8]: 110 | 111 | EIDS = set(df_qualification['EID']) 112 | 113 | # print(len(EIDS)) 114 | 115 | columns = df_qualification.columns 116 | df_xqualification = pd.DataFrame(columns=columns) 117 | 118 | # print(columns) 119 | 120 | 121 | # In[9]: 122 | 123 | k = 0 124 | for EID in EIDS: 125 | if k%3000 == 0: 126 | print('第%d次处理--------->',k) 127 | k+=1 128 | tmp = df_qualification[df_qualification['EID'] == EID] 129 | row = [EID,tmp['ADDTYPE'].values,tmp['BEGINDATE'].values,tmp['EXPIRYDATE'].values] 130 | 131 | df_xqualification = df_xqualification.append(pd.Series(row,columns),ignore_index=True) 132 | 133 | 134 | 135 | # In[10]: 136 | 137 | # df_xqualification.info() 138 | # df_xqualification.head() 139 | 140 | 141 | # In[11]: 142 | 143 | df_xqualification['QUALIFICATION_CNT'] = df_xqualification['ADDTYPE'].apply(lambda x: len(x)) 144 | 145 | df_xqualification['QUALIFICATION_FIRST_FIRST_TIME'] = df_xqualification['BEGINDATE'].apply(lambda x: x[0]) 146 | df_xqualification['QUALIFICATION_FIRST_END_TIME'] = df_xqualification['BEGINDATE'].apply(lambda x: x[-1]) 147 | df_xqualification['QUALIFICATION_END_FIRST_TIME'] = df_xqualification['EXPIRYDATE'].apply(lambda x: x[0]) 148 | 149 | 150 | 151 | # In[12]: 152 | 153 | # 2017-08 BRANCH_K_OPEN_CNT, BRANCH_K_CLOSE_CNT 154 | df_xqualification['QUALIFICATION_1_OPEN_CNT'] = df_xqualification['BEGINDATE'].map(lambda x: np.sum(np.array(x) >= '2016-08')) 155 | df_xqualification['QUALIFICATION_2_OPEN_CNT'] = df_xqualification['BEGINDATE'].map(lambda x: np.sum(np.array(x) >= '2015-08')) 156 | df_xqualification['QUALIFICATION_3_OPEN_CNT'] = df_xqualification['BEGINDATE'].map(lambda x: np.sum(np.array(x) >= '2014-08')) 157 | df_xqualification['QUALIFICATION_5_OPEN_CNT'] = df_xqualification['BEGINDATE'].map(lambda x: np.sum(np.array(x) >= '2012-08')) 158 | 159 | df_xqualification['QUALIFICATION_1_CLOSE_CNT'] = df_xqualification['EXPIRYDATE'].map(lambda x: np.sum(np.array(x) >= '2016-08')) 160 | df_xqualification['QUALIFICATION_2_CLOSE_CNT'] = df_xqualification['EXPIRYDATE'].map(lambda x: np.sum(np.array(x) >= '2015-08')) 161 | 
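# 说明:这组滑动窗口计数(下面还有 k=3、k=5 两条)都假设数据快照截止于 2017-08,
# 统计最近 k 年内的日期个数;'YYYY-MM' 格式的字符串按字典序比较与时间先后一致,
# 因此可以直接用 >= 比较。各个特征文件里的同类写法可抽成一个小工具(仅为示意,函数名自拟):
# def window_cnt(dates, cutoff):
#     return int(np.sum(np.array(dates) >= cutoff))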
df_xqualification['QUALIFICATION_3_CLOSE_CNT'] = df_xqualification['EXPIRYDATE'].map(lambda x: np.sum(np.array(x) >= '2014-08'))
162 | df_xqualification['QUALIFICATION_5_CLOSE_CNT'] = df_xqualification['EXPIRYDATE'].map(lambda x: np.sum(np.array(x) >= '2012-08'))
163 |
164 |
165 | # 是否有QUALIFICATION数据,特征 IS_QUALIFICATION
166 |
167 | df_xqualification['IS_QUALIFICATION'] = 1
168 |
169 |
170 | # In[13]:
171 |
172 | df_xqualification.to_csv('../data/public/10qualification_1.csv',index=False,index_label=False)
173 | df_xqualification.columns
174 |
175 |
176 |
177 | # In[14]:
178 |
179 | df_all = pd.merge(df_all,df_xqualification[['EID','QUALIFICATION_CNT','QUALIFICATION_FIRST_FIRST_TIME', 'QUALIFICATION_FIRST_END_TIME',
180 |                                             'QUALIFICATION_END_FIRST_TIME','QUALIFICATION_1_OPEN_CNT','QUALIFICATION_2_OPEN_CNT',
181 |                                             'QUALIFICATION_3_OPEN_CNT','QUALIFICATION_5_OPEN_CNT','QUALIFICATION_1_CLOSE_CNT',
182 |                                             'QUALIFICATION_2_CLOSE_CNT','QUALIFICATION_3_CLOSE_CNT','QUALIFICATION_5_CLOSE_CNT',
183 |                                             'IS_QUALIFICATION']],how='left',on=['EID'])
184 |
185 |
186 | # In[15]:
187 |
188 | # df_all.info()
189 | # df_all.head()
190 |
191 |
192 | # In[16]:
193 |
194 | # 所有有关时间的特征,用注册日期填充,其他的用0填充
195 | df_all['QUALIFICATION_CNT'] = df_all['QUALIFICATION_CNT'].fillna(0)
196 |
197 | df_all['IS_QUALIFICATION'] = df_all['IS_QUALIFICATION'].fillna(0)
198 |
199 | modelist = [ 'QUALIFICATION_FIRST_FIRST_TIME', 'QUALIFICATION_FIRST_END_TIME','QUALIFICATION_END_FIRST_TIME']
200 |
201 | for d in modelist:
202 |     df_all.loc[df_all[d].isnull(),d] = df_all.loc[df_all[d].isnull(),'RGYEAR']
203 |
204 |
205 |
206 | # In[17]:
207 |
208 | df_all[['QUALIFICATION_1_OPEN_CNT','QUALIFICATION_2_OPEN_CNT','QUALIFICATION_3_OPEN_CNT','QUALIFICATION_5_OPEN_CNT']] = df_all[['QUALIFICATION_1_OPEN_CNT','QUALIFICATION_2_OPEN_CNT','QUALIFICATION_3_OPEN_CNT','QUALIFICATION_5_OPEN_CNT']].fillna(0)
209 |
210 |
211 | df_all[['QUALIFICATION_1_CLOSE_CNT','QUALIFICATION_2_CLOSE_CNT','QUALIFICATION_3_CLOSE_CNT','QUALIFICATION_5_CLOSE_CNT']] = df_all[['QUALIFICATION_1_CLOSE_CNT','QUALIFICATION_2_CLOSE_CNT','QUALIFICATION_3_CLOSE_CNT','QUALIFICATION_5_CLOSE_CNT']].fillna(0)
212 |
213 |
214 | # In[18]:
215 |
216 | def timeDiff(x):
217 |     a = x[:x.find(':')]
218 |     b = x[x.find(':')+1:]
219 |     y = int(a[:a.find('-')]) - int(b[:b.find('-')])
220 |     m = int(a[a.find('-')+1:]) - int(b[b.find('-')+1:])
221 |     return y * 12 + m
222 |
223 |
224 | df_all['QUALIFICATION_FIRST_END_FIRST_DIFF'] = (df_all['QUALIFICATION_END_FIRST_TIME'] + ':' + df_all['QUALIFICATION_FIRST_FIRST_TIME']).apply(timeDiff)
225 | df_all['QUALIFICATION_FIRST_RGYEAR_DIFF'] = (df_all['QUALIFICATION_FIRST_FIRST_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff)
226 | df_all['QUALIFICATION_END_RGYEAR_DIFF'] = (df_all['QUALIFICATION_FIRST_END_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff)
227 | df_all['QUALIFICATION_END_RGYEAR_DIFF'] = (df_all['QUALIFICATION_END_FIRST_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff)  # 注意:与上一行同名(文件头的特征列表里第6、7两条本就重名),会覆盖上一行,最终保留的是“到期日期 − 注册日期”这一版
228 |
229 |
230 | df_all['QUALIFICATION_FIRST_CHANGE_FIRST_DIFF'] = (df_all['QUALIFICATION_FIRST_FIRST_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff)
231 | df_all['QUALIFICATION_END_CHANGE_FIRST_DIFF'] = (df_all['QUALIFICATION_FIRST_END_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff)
232 | df_all['QUALIFICATION_FIRST_CHANGE_END_DIFF'] = (df_all['QUALIFICATION_FIRST_FIRST_TIME'] + ':' + df_all['END_CHANGE_TIME']).apply(timeDiff)
233 | df_all['QUALIFICATION_END_CHANGE_END_DIFF'] = (df_all['QUALIFICATION_FIRST_END_TIME'] + ':' + 
df_all['END_CHANGE_TIME']).apply(timeDiff) 234 | 235 | 236 | # --------------------------------------------------------------------------------------- 237 | # 10. 企业平均每几个月的失信一次, QUALIFICATION_PRE_MONTH_CNT = QUALIFICATION_END_RGYEAR_DIFF / QUALIFICATION_CNT 238 | 239 | # 11. 企业失信的个数占所有失信个数个平均值的比例,QUALIFICATION_CNT_ALL_RATE 240 | 241 | # 12. 企业对应的大类HY的平均失信数据的个数,QUALIFICATION_HY_CNT_AVG 242 | # 13. 企业对应大类HY的平均失信个数占所有失信平均个数的比例,QUALIFICATION_HY_CNT_ALL_RATE 243 | # 14. 企业失信的个数占其对应的大类HY的失信平均值的比例,QUALIFICATION_CNT_HY_RATE 244 | 245 | # In[19]: 246 | 247 | df_all['QUALIFICATION_PRE_MONTH_CNT'] = df_all['QUALIFICATION_END_RGYEAR_DIFF'] / df_all['QUALIFICATION_CNT'] 248 | 249 | df_all['QUALIFICATION_CNT_ALL_RATE'] = df_all['QUALIFICATION_CNT'] / df_all['QUALIFICATION_CNT'].mean() 250 | 251 | df_all['QUALIFICATION_CNT_ALL_RATE_MAX'] = df_all['QUALIFICATION_CNT'] / df_all['QUALIFICATION_CNT'].max() 252 | 253 | 254 | 255 | 256 | # In[20]: 257 | 258 | tmp = pd.DataFrame() 259 | 260 | tmp['QUALIFICATION_HY_CNT_AVG'] = df_all.groupby(['HY'],as_index=True,axis=0)['QUALIFICATION_CNT'].mean() 261 | tmp['QUALIFICATION_HY_CNT_ALL_RATE'] = tmp['QUALIFICATION_HY_CNT_AVG'] / df_all['QUALIFICATION_CNT'].mean() 262 | 263 | tmp['QUALIFICATION_HY_CNT_MAX'] = df_all.groupby(['HY'],as_index=True,axis=0)['QUALIFICATION_CNT'].max() 264 | tmp['QUALIFICATION_HY_CNT_ALL_RATE_MAX'] = tmp['QUALIFICATION_HY_CNT_MAX'] / df_all['QUALIFICATION_CNT'].max() 265 | 266 | 267 | tmp['HY'] = tmp.index 268 | 269 | 270 | 271 | # In[21]: 272 | 273 | tmp1 = pd.DataFrame() 274 | 275 | tmp1['QUALIFICATION_ETYPE_CNT_AVG'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['QUALIFICATION_CNT'].mean() 276 | tmp1['QUALIFICATION_ETYPE_CNT_ALL_RATE'] = tmp1['QUALIFICATION_ETYPE_CNT_AVG'] / df_all['QUALIFICATION_CNT'].mean() 277 | 278 | tmp1['QUALIFICATION_ETYPE_CNT_MAX'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['QUALIFICATION_CNT'].max() 279 | tmp1['QUALIFICATION_ETYPE_CNT_ALL_RATE_MAX'] = tmp1['QUALIFICATION_ETYPE_CNT_MAX'] / df_all['QUALIFICATION_CNT'].max() 280 | 281 | 282 | tmp1['ETYPE'] = tmp1.index 283 | 284 | 285 | 286 | # In[22]: 287 | 288 | df_all = pd.merge(df_all,tmp,how='left',on=['HY']) 289 | df_all = pd.merge(df_all,tmp1,how='left',on=['ETYPE']) 290 | 291 | 292 | # In[23]: 293 | 294 | df_all['QUALIFICATION_CNT_HY_RATE'] = df_all['QUALIFICATION_CNT'] / df_all['QUALIFICATION_HY_CNT_AVG'] 295 | df_all['QUALIFICATION_CNT_HY_RATE_MAX'] = df_all['QUALIFICATION_CNT'] / df_all['QUALIFICATION_HY_CNT_MAX'] 296 | 297 | df_all['QUALIFICATION_CNT_ETYPE_RATE'] = df_all['QUALIFICATION_CNT'] / df_all['QUALIFICATION_ETYPE_CNT_AVG'] 298 | df_all['QUALIFICATION_CNT_ETYPE_RATE_MAX'] = df_all['QUALIFICATION_CNT'] / df_all['QUALIFICATION_ETYPE_CNT_MAX'] 299 | 300 | 301 | 302 | # In[24]: 303 | 304 | # df_all.info() 305 | # df_all.head() 306 | 307 | 308 | # In[25]: 309 | 310 | # 得到在df_xchange的所有数据, 311 | df_all = df_all.fillna(0) 312 | df_all.to_csv('../data/alldata/df_data1234567890.csv',index=False,index_label=False) 313 | 314 | 315 | 316 | # In[ ]: 317 | 318 | 319 | 320 | -------------------------------------------------------------------------------- /feature/5_feature.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[6]: 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import seaborn as sns 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | # # 权利数据5right.csv提取特征 13 | # 1. 企业拥有权利的个数,RIGHT_CNT 14 | # 2. 企业拥有权利类型的个数,RIGHT_TYPE_CNT 15 | # 3. 
企业拥有权利类型的比例,RIGHT_TYPE_RATE 16 | # 4. 第一个获得的权利的类型,RIGHT_FIRST_TYPECODE 17 | # 5. 最后一个获得的权利的类型,RIGHT_END_TYPECODE 18 | # 6. 获得最多的权利的类型,RIGHT_TYPECODE_MUCHID 19 | # 20 | # 7. 第一个权利申请日期, RIGHT_FIRST_ASK_TIME 21 | # 8. 第一个权利富于日期, RIGHT_FIRST_FB_TIME 22 | # 9. 最后一个权利申请日期, RIGHT_END_ASK_TIME 23 | # 10. 最后一个权利富于日期, RIGHT_END_FB_TIME 24 | # 25 | # 11. 第一个权利申请日期和权利富于日期的差值,RIGHT_FIRST_ASK_FB_DIFF 26 | # 12. 最后一个权利申请日期和权利富于日期的差值,RIGHT_END_ASK_FB_DIFF 27 | # 13. 第一个和最后一个权利申请日期的差值,RIGHT_FIRST_END_ASK_DIFF 28 | # 14. 第一个和最后一个权利富于日期的差值,RIGHT_FIRST_END_FB_DIFF 29 | # 15. 第一个申请和最后一个权利富于日期的差值,RIGHT_FIRST_ASK_END_FB_DIFF 30 | # 31 | # 16. 第一个权利的申请日期和公司注册时间的差值,RIGHT_FIRST_ASK_RGYEAR_DIFF 32 | # 17. 第一个权利的富于日期和公司注册时间的差值,RIGHT_FIRST_FB_RGYEAR_DIFF 33 | # 18. 最后一个权利的申请日期和公司注册时间的差值,RIGHT_END_ASK_RGYEAR_DIFF 34 | # 19. 最后一个权利的富于日期和公司注册时间的差值,RIGHT_END_FB_RGYEAR_DIFF 35 | # 36 | # 20. 第一个权利的申请日期和公司第一个变动时间的差值,RIGHT_FIRST_ASK_FIRST_CHANGE_DIFF 37 | # 21. 第一个权利的富于日期和公司第一个变动时间的差值,RIGHT_FIRST_FB_FIRST_CHANGE_DIFF 38 | # 22. 最后一个权利的申请日期和公司第一个变动时间的差值,RIGHT_END_ASK_FIRST_CHANGE_DIFF 39 | # 23. 最后一个权利的富于日期和公司第一个变动时间的差值,RIGHT_END_FB_FIRST_CHANGE_DIFF 40 | # 41 | # 24. 第一个权利的申请日期和公司最后一个变动时间的差值,RIGHT_FIRST_ASK_END_CHANGE_DIFF 42 | # 25. 第一个权利的富于日期和公司最后一个变动时间的差值,RIGHT_FIRST_FB_END_CHANGE_DIFF 43 | # 26. 最后一个权利的申请日期和公司最后一个变动时间的差值,RIGHT_END_ASK_END_CHANGE_DIFF 44 | # 27. 最后一个权利的富于日期和公司最后一个变动时间的差值,RIGHT_END_FB_END_CHANGE_DIFF 45 | # 46 | # --------------------------------------------------------------------------------------------------- 47 | # 28. 企业权利的个数占所有权利个数平均值的比例。RIGHT_CNT_ALL_RATE 48 | # 49 | # 29. 企业对应的大类HY的平均权利的个数,RIGHT_HY_CNT_AVG 50 | # 30. 企业对应大类HY的平均权利个数占所有权利平均个数的比例,RIGHT_HY_CNT_ALL_RATE 51 | # 31. 企业权利的个数占其对应的大类HY的平均值的比例,RIGHT_CNT_HY_RATE 52 | # 53 | # ------------------------------------------------------------------------------------------------- 54 | # 55 | # 29. 企业对应的大类ETYPE的平均权利的个数,RIGHT_ETYPE_CNT_AVG 56 | # 30. 企业对应大类ETYPE的平均权利个数占所有权利平均个数的比例,RIGHT_ETYPE_CNT_ALL_RATE 57 | # 31. 企业权利的个数占其对应的大类ETYPE的平均值的比例,RIGHT_CNT_ETYPE_RATE 58 | # 59 | # 添加对应的最大值的特征 60 | # 61 | # 复赛新特征,滑动窗口系列的特征,时间段是1年,2年,3年,5年,计算在最近k[1,2,3,5]年之后的数据,主要是个数和次数。 62 | # 时间是开始是2017-08之前的k年 63 | # 1. 
之前k年内权利申请和赋予的个数。
64 | # RIGHT_K_OPEN_CNT, RIGHT_K_CLOSE_CNT
65 | #
66 |
67 | # In[7]:
68 |
69 | df_all = pd.read_csv("../data/alldata/df_data1234.csv")
70 | df_right = pd.read_csv("../data/public/5right.csv")
71 |
72 |
73 | # In[8]:
74 |
75 | # df_all.info()
76 | # df_all.head()
77 |
78 |
79 | # In[9]:
80 |
81 | # df_right.info()
82 | # df_right.head()
83 |
84 | df_right = df_right.sort_values(['ASKDATE','FBDATE'])
85 |
86 |
87 | # In[10]:
88 |
89 | # set(df_right['RIGHTTYPE'])
90 | def settime(x):
91 |     # 给 'YYYY-MM' 加两个月(40/50 两类权利缺失 FBDATE 时,用申请日期顺延两个月),
92 |     # 月份按 1-12 进位:结果恰好是 12 月时不应再进位到下一年
93 |     y = int(x[:x.find('-')])
94 |     m = int(x[x.find('-')+1:])
95 |     m += 2
96 |     y = y + (m-1)//12
97 |     m = (m-1)%12 + 1
98 |
99 |
100 |     if(m<10):
101 |         return str(y)+"-0"+str(m)
102 |
103 |     return str(y)+"-"+str(m)
104 |
105 |
106 | df_right.loc[df_right[df_right['RIGHTTYPE']==11][df_right['FBDATE'].isnull()].index,'FBDATE'] = df_right[df_right['RIGHTTYPE']==11][df_right['FBDATE'].isnull()]['ASKDATE']  # 链式布尔索引会触发 pandas 的 UserWarning,但按 index 赋值的结果是正确的
107 |
108 | df_right.loc[df_right[df_right['RIGHTTYPE']==40][df_right['FBDATE'].isnull()].index,'FBDATE'] = df_right[df_right['RIGHTTYPE']==40][df_right['FBDATE'].isnull()]['ASKDATE'].apply(settime)
109 | df_right.loc[df_right[df_right['RIGHTTYPE']==50][df_right['FBDATE'].isnull()].index,'FBDATE'] = df_right[df_right['RIGHTTYPE']==50][df_right['FBDATE'].isnull()]['ASKDATE'].apply(settime)
110 |
111 |
112 |
113 | # In[11]:
114 |
115 | EIDS = set(df_right['EID'])
116 |
117 | len(EIDS)
118 |
119 |
120 | # In[12]:
121 |
122 | columns = df_right.columns
123 | df_xright = pd.DataFrame(columns=columns)
124 |
125 | # print(columns)
126 |
127 |
128 | # In[13]:
129 |
130 | k = 0
131 | for EID in EIDS:
132 |     if k%3000 == 0:
133 |         print('第%d次处理--------->' % k)
134 |     k+=1
135 |     tmp = df_right[df_right['EID'] == EID]
136 |     row = [EID,tmp['RIGHTTYPE'].values,tmp['TYPECODE'].values,tmp['ASKDATE'].values,tmp['FBDATE'].values]
137 |
138 |     df_xright = df_xright.append(pd.Series(row,columns),ignore_index=True)
139 |
140 |
141 |
142 | # In[14]:
143 |
144 | # df_xright.info()
145 | # df_xright.head()
146 |
147 |
148 | # In[15]:
149 |
150 | k = len(set(df_right['RIGHTTYPE']))
151 |
152 | set(df_right['RIGHTTYPE'])
153 |
154 |
155 | # In[16]:
156 |
157 | def getTZCnt(x):
158 |     return len(x)
159 |
160 | df_xright['RIGHT_CNT'] = df_xright['RIGHTTYPE'].apply(getTZCnt)
161 |
162 | def cnt(x):
163 |     return len(set(x))
164 |
165 | df_xright['RIGHT_TYPE_CNT'] = df_xright['RIGHTTYPE'].apply(cnt)  # 权利类型去重后的个数
166 |
167 | df_xright['RIGHT_TYPE_RATE'] = df_xright['RIGHT_TYPE_CNT'] / k
168 |
169 |
170 | # In[17]:
171 |
172 | def getFAlterno(x):
173 |     return x[0]
174 |
175 | def getEAlterno(x):
176 |     return x[-1]
177 |
178 | def getMAlterno(x):
179 |     # 返回出现次数最多的权利类型:先排序,再找最长的连续相同段
180 |     x = list(x)
181 |     x.sort()
182 |
183 |     mk = 0
184 |     mm = x[0]
185 |
186 |     k = 1
187 |     n = len(x)
188 |
189 |     for i in range(1,n):
190 |         if x[i] == x[i-1]:
191 |             k+=1
192 |         else:
193 |             if k>mk:
194 |                 mk = k
195 |                 mm = x[i-1]
196 |             k = 1
197 |     if k>mk: mm = x[-1]  # 收尾:最长的一段落在末尾时也要计入
198 |     return mm
199 |
200 | df_xright['RIGHT_FIRST_TYPECODE'] = df_xright['RIGHTTYPE'].apply(getFAlterno)
201 | df_xright['RIGHT_END_TYPECODE'] = df_xright['RIGHTTYPE'].apply(getEAlterno)
202 | df_xright['RIGHT_TYPECODE_MUCHID'] = df_xright['RIGHTTYPE'].apply(getMAlterno)
203 |
204 |
205 | # In[18]:
206 |
207 | df_xright['RIGHT_FIRST_ASK_TIME'] = df_xright['ASKDATE'].apply(lambda x: x[0])
208 | df_xright['RIGHT_FIRST_FB_TIME'] = df_xright['FBDATE'].apply(lambda x: x[0])
209 |
210 | df_xright['RIGHT_END_ASK_TIME'] = df_xright['ASKDATE'].apply(lambda x: x[-1])
211 | df_xright['RIGHT_END_FB_TIME'] = 
df_xright['FBDATE'].apply(lambda x: x[-1]) 212 | 213 | 214 | 215 | # In[ ]: 216 | 217 | 218 | 219 | 220 | # In[19]: 221 | 222 | # 2017-08 BRANCH_K_OPEN_CNT, BRANCH_K_CLOSE_CNT 223 | df_xright['RIGHT_1_OPEN_CNT'] = df_xright['ASKDATE'].map(lambda x: np.sum(np.array(x) >= '2016-08')) 224 | df_xright['RIGHT_2_OPEN_CNT'] = df_xright['ASKDATE'].map(lambda x: np.sum(np.array(x) >= '2015-08')) 225 | df_xright['RIGHT_3_OPEN_CNT'] = df_xright['ASKDATE'].map(lambda x: np.sum(np.array(x) >= '2014-08')) 226 | df_xright['RIGHT_5_OPEN_CNT'] = df_xright['ASKDATE'].map(lambda x: np.sum(np.array(x) >= '2012-08')) 227 | 228 | df_xright['RIGHT_1_CLOSE_CNT'] = df_xright['FBDATE'].map(lambda x: np.sum(np.array(x) >= '2016-08')) 229 | df_xright['RIGHT_2_CLOSE_CNT'] = df_xright['FBDATE'].map(lambda x: np.sum(np.array(x) >= '2015-08')) 230 | df_xright['RIGHT_3_CLOSE_CNT'] = df_xright['FBDATE'].map(lambda x: np.sum(np.array(x) >= '2014-08')) 231 | df_xright['RIGHT_5_CLOSE_CNT'] = df_xright['FBDATE'].map(lambda x: np.sum(np.array(x) >= '2012-08')) 232 | 233 | 234 | 235 | 236 | # In[20]: 237 | 238 | df_xright.to_csv('../data/public/5right_1.csv',index=False,index_label=False) 239 | # df_xright.columns 240 | 241 | 242 | 243 | # In[21]: 244 | 245 | df_all = pd.merge(df_all,df_xright[['EID', 'RIGHT_CNT','RIGHT_TYPE_CNT', 'RIGHT_TYPE_RATE', 'RIGHT_FIRST_TYPECODE', 246 | 'RIGHT_END_TYPECODE', 'RIGHT_TYPECODE_MUCHID', 'RIGHT_FIRST_ASK_TIME', 247 | 'RIGHT_FIRST_FB_TIME', 'RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME', 248 | 'RIGHT_1_OPEN_CNT','RIGHT_2_OPEN_CNT','RIGHT_3_OPEN_CNT','RIGHT_5_OPEN_CNT', 249 | 'RIGHT_1_CLOSE_CNT','RIGHT_2_CLOSE_CNT','RIGHT_3_CLOSE_CNT','RIGHT_5_CLOSE_CNT']],how='left',on=['EID']) 250 | 251 | 252 | # In[22]: 253 | 254 | # 空值填充,根据HY的类别的平均值或者众数进行填充 255 | # ['RIGHT_CNT','RIGHT_TYPE_CNT', 'RIGHT_TYPE_RATE']使用同一个大类别的均值进行填充 256 | # ['RIGHT_FIRST_TYPECODE','RIGHT_END_TYPECODE', 'RIGHT_TYPECODE_MUCHID', 'RIGHT_FIRST_ASK_TIME', 257 | # 'RIGHT_FIRST_FB_TIME', 'RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME']使用同一个大类别的众数进行填充 258 | 259 | HYLIST = set(df_all['HY']) 260 | # print(HYLIST) 261 | 262 | meanlist = ['RIGHT_CNT','RIGHT_TYPE_CNT', 'RIGHT_TYPE_RATE'] 263 | modelist = ['RIGHT_FIRST_TYPECODE','RIGHT_END_TYPECODE', 'RIGHT_TYPECODE_MUCHID', 'RIGHT_FIRST_ASK_TIME', 264 | 'RIGHT_FIRST_FB_TIME', 'RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME'] 265 | 266 | 267 | # In[23]: 268 | 269 | for HY in HYLIST: 270 | # print(df_train['HY'].value_counts()) 271 | for d in meanlist: 272 | df_all.loc[df_all[df_all[d].isnull()][df_all['HY']==HY].index,d] = df_all[df_all['HY']==HY][d].mean() 273 | 274 | for c in modelist: 275 | if(len(df_all[df_all['HY']==HY][c].value_counts().index)==0): continue 276 | df_all.loc[df_all[df_all[c].isnull()][df_all['HY']==HY].index,c] = df_all[df_all['HY']==HY][c].value_counts().index[0] 277 | 278 | 279 | # In[24]: 280 | 281 | df_all[['RIGHT_1_OPEN_CNT','RIGHT_2_OPEN_CNT','RIGHT_3_OPEN_CNT','RIGHT_5_OPEN_CNT']] = df_all[['RIGHT_1_OPEN_CNT','RIGHT_2_OPEN_CNT','RIGHT_3_OPEN_CNT','RIGHT_5_OPEN_CNT']].fillna(0) 282 | 283 | df_all[['RIGHT_1_CLOSE_CNT','RIGHT_2_CLOSE_CNT','RIGHT_3_CLOSE_CNT','RIGHT_5_CLOSE_CNT']] = df_all[['RIGHT_1_CLOSE_CNT','RIGHT_2_CLOSE_CNT','RIGHT_3_CLOSE_CNT','RIGHT_5_CLOSE_CNT']].fillna(0) 284 | 285 | 286 | # In[ ]: 287 | 288 | 289 | 290 | 291 | # In[25]: 292 | 293 | for d in meanlist: 294 | df_all.loc[df_all[d].isnull(),d] = 0 295 | 296 | for c in modelist: 297 | df_all.loc[df_all[c].isnull(),c] = df_all[c].value_counts().index[0] 298 | 299 | 300 | # In[ ]: 301 | 302 | 303 | 304 | 305 | # 
In[26]: 306 | 307 | def timeDiff(x): 308 | a = x[:x.find(':')] 309 | b = x[x.find(':')+1:] 310 | y = int(a[:a.find('-')]) - int(b[:b.find('-')]) 311 | m = int(a[a.find('-')+1:]) - int(b[b.find('-')+1:]) 312 | return y * 12 + m 313 | 314 | 315 | 316 | # In[27]: 317 | 318 | df_all['RIGHT_FIRST_ASK_FB_DIFF'] = (df_all['RIGHT_FIRST_FB_TIME'] + ':' + df_all['RIGHT_FIRST_ASK_TIME']).apply(timeDiff) 319 | df_all['RIGHT_END_ASK_FB_DIFF'] = (df_all['RIGHT_END_FB_TIME'] + ':' + df_all['RIGHT_END_ASK_TIME']).apply(timeDiff) 320 | df_all['RIGHT_FIRST_END_ASK_DIFF'] = (df_all['RIGHT_END_ASK_TIME'] + ':' + df_all['RIGHT_FIRST_ASK_TIME']).apply(timeDiff) 321 | df_all['RIGHT_FIRST_END_FB_DIFF'] = (df_all['RIGHT_END_FB_TIME'] + ':' + df_all['RIGHT_FIRST_FB_TIME']).apply(timeDiff) 322 | df_all['RIGHT_FIRST_ASK_END_FB_DIFF'] = (df_all['RIGHT_END_FB_TIME'] + ':' + df_all['RIGHT_FIRST_ASK_TIME']).apply(timeDiff) 323 | 324 | 325 | # In[28]: 326 | 327 | df_all['RIGHT_FIRST_ASK_RGYEAR_DIFF'] = (df_all['RIGHT_FIRST_ASK_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 328 | df_all['RIGHT_FIRST_FB_RGYEAR_DIFF'] = (df_all['RIGHT_FIRST_FB_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 329 | 330 | df_all['RIGHT_END_ASK_RGYEAR_DIFF'] = (df_all['RIGHT_END_ASK_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 331 | df_all['RIGHT_END_FB_RGYEAR_DIFF'] = (df_all['RIGHT_END_FB_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 332 | 333 | 334 | 335 | # In[29]: 336 | 337 | df_all['RIGHT_FIRST_ASK_FIRST_CHANGE_DIFF'] = (df_all['RIGHT_FIRST_ASK_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff) 338 | df_all['RIGHT_FIRST_FB_FIRST_CHANGE_DIFF'] = (df_all['RIGHT_FIRST_FB_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff) 339 | 340 | df_all['RIGHT_END_ASK_FIRST_CHANGE_DIFF'] = (df_all['RIGHT_END_ASK_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff) 341 | df_all['RIGHT_END_FB_FIRST_CHANGE_DIFF'] = (df_all['RIGHT_END_FB_TIME'] + ':' + df_all['FIRST_CHANGE_TIME']).apply(timeDiff) 342 | 343 | 344 | 345 | # In[30]: 346 | 347 | df_all['RIGHT_FIRST_ASK_END_CHANGE_DIFF'] = (df_all['RIGHT_FIRST_ASK_TIME'] + ':' + df_all['END_CHANGE_TIME']).apply(timeDiff) 348 | df_all['RIGHT_FIRST_FB_END_CHANGE_DIFF'] = (df_all['RIGHT_FIRST_FB_TIME'] + ':' + df_all['END_CHANGE_TIME']).apply(timeDiff) 349 | 350 | df_all['RIGHT_END_ASK_END_CHANGE_DIFF'] = (df_all['RIGHT_END_ASK_TIME'] + ':' + df_all['END_CHANGE_TIME']).apply(timeDiff) 351 | df_all['RIGHT_END_FB_END_CHANGE_DIFF'] = (df_all['RIGHT_END_FB_TIME'] + ':' + df_all['END_CHANGE_TIME']).apply(timeDiff) 352 | 353 | # --------------------------------------------------------------------------------------------------- 354 | # 28. 企业权利的个数占所有权利个数平均值的比例。RIGHT_CNT_ALL_RATE 355 | 356 | # 29. 企业对应的大类HY的平均权利的个数,RIGHT_HY_CNT_AVG 357 | # 30. 企业对应大类HY的平均权利个数占所有权利平均个数的比例,RIGHT_HY_CNT_ALL_RATE 358 | # 31. 
企业权利的个数占其对应的大类HY的平均值的比例,RIGHT_CNT_HY_RATE
359 |
360 | # In[31]:
361 |
362 | df_all['RIGHT_CNT_ALL_RATE'] = df_all['RIGHT_CNT'] / df_all['RIGHT_CNT'].mean()
363 | df_all['RIGHT_CNT_ALL_RATE_MAX'] = df_all['RIGHT_CNT'] / df_all['RIGHT_CNT'].max()
364 |
365 |
366 |
367 | # In[ ]:
368 |
369 |
370 |
371 |
372 | # In[ ]:
373 |
374 |
375 |
376 |
377 | # In[32]:
378 |
379 | tmp = pd.DataFrame()
380 |
381 | tmp['RIGHT_HY_CNT_AVG'] = df_all.groupby(['HY'],as_index=True,axis=0)['RIGHT_CNT'].mean()
382 | tmp['RIGHT_HY_CNT_ALL_RATE'] = tmp['RIGHT_HY_CNT_AVG'] / df_all['RIGHT_CNT'].mean()
383 |
384 | tmp['RIGHT_HY_CNT_MAX'] = df_all.groupby(['HY'],as_index=True,axis=0)['RIGHT_CNT'].max()
385 | tmp['RIGHT_HY_CNT_ALL_RATE_MAX'] = tmp['RIGHT_HY_CNT_MAX'] / df_all['RIGHT_CNT'].max()
386 |
387 |
388 | tmp['HY'] = tmp.index
389 |
390 |
391 | tmp1 = pd.DataFrame()
392 |
393 | tmp1['RIGHT_ETYPE_CNT_AVG'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['RIGHT_CNT'].mean()
394 | tmp1['RIGHT_ETYPE_CNT_ALL_RATE'] = tmp1['RIGHT_ETYPE_CNT_AVG'] / df_all['RIGHT_CNT'].mean()
395 |
396 | tmp1['RIGHT_ETYPE_CNT_MAX'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['RIGHT_CNT'].max()
397 | tmp1['RIGHT_ETYPE_CNT_ALL_RATE_MAX'] = tmp1['RIGHT_ETYPE_CNT_MAX'] / df_all['RIGHT_CNT'].max()
398 |
399 |
400 | tmp1['ETYPE'] = tmp1.index
401 |
402 |
403 |
404 |
405 | # In[33]:
406 |
407 | df_all = pd.merge(df_all,tmp,how='left',on=['HY'])
408 | df_all = pd.merge(df_all,tmp1,how='left',on=['ETYPE'])
409 |
410 |
411 |
412 | # In[ ]:
413 |
414 |
415 |
416 |
417 | # In[34]:
418 |
419 | df_all['RIGHT_CNT_HY_RATE'] = df_all['RIGHT_CNT'] / df_all['RIGHT_HY_CNT_AVG']
420 | df_all['RIGHT_CNT_HY_RATE_MAX'] = df_all['RIGHT_CNT'] / df_all['RIGHT_HY_CNT_MAX']
421 |
422 | df_all['RIGHT_CNT_ETYPE_RATE'] = df_all['RIGHT_CNT'] / df_all['RIGHT_ETYPE_CNT_AVG']
423 | df_all['RIGHT_CNT_ETYPE_RATE_MAX'] = df_all['RIGHT_CNT'] / df_all['RIGHT_ETYPE_CNT_MAX']
424 |
425 |
426 |
427 | # In[35]:
428 |
429 | # df_all.info()
430 | # df_all.head()
431 |
432 |
433 | # In[37]:
434 |
435 | # 填补缺失后,保存到目前为止合并的全部特征数据
436 | df_all = df_all.fillna(0)
437 | df_all.to_csv('../data/alldata/df_data12345.csv',index=False,index_label=False)
438 |
439 |
440 |
441 | # In[ ]:
442 |
443 |
444 |
--------------------------------------------------------------------------------
/feature/9_feature_1.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import seaborn as sns
9 | import matplotlib.pyplot as plt
10 |
11 |
12 | # # 招聘数据9recruit.csv 提取特征
13 | # 1. 企业发布招聘的个数,RECRUIT_CNT
14 | # 2. 企业发布招聘平台的个数,RECRUIT_WZCODE_CNT
15 | # 3. 企业发布的招聘数量的均值,RECRUIT_RECRNUM_AVG
16 | #
17 | # 4. 企业发布的招聘数量的和,RECRUIT_RECRNUM_SUM
18 | #
19 | # 5. 企业发布的招聘数量的最多的个数,RECRUIT_RECRNUM_MAX
20 | # 6. 企业发布的招聘数量的最少的个数,RECRUIT_RECRNUM_MIN
21 | # 7. 企业发布第一个招聘的日期,RECRUIT_FIRST_TIME
22 | # 8. 企业发布最后一个招聘的日期,RECRUIT_END_TIME
23 | #
24 | # 9. 企业发布最后一个招聘的日期和第一个招聘日期的差值,RECRUIT_END_FIRST_TIME_DIFF
25 | # 10. 企业发布最后一个招聘的日期和注册日期的差值,RECRUIT_END_RGYEAR_TIME_DIFF
26 | #
27 | # ---------------------------------------------------------------------------------------
28 | # 11. 企业平均一次招聘几个人, RECRUIT_PRE_MONTH_CNT_AVG = RECRUIT_RECRNUM_SUM / RECRUIT_CNT
29 | # 12. 企业招聘的个数占所有招聘个数的平均值的比例,RECRUIT_CNT_ALL_RATE
30 | #
31 | # 13. 企业对应的大类HY的平均招聘数据的个数,RECRUIT_HY_CNT_AVG
32 | # 14. 企业对应大类HY的平均招聘个数占所有招聘平均个数的比例,RECRUIT_HY_CNT_ALL_RATE
33 | # 15. 企业招聘的个数占其对应的大类HY的招聘平均值的比例,RECRUIT_CNT_HY_RATE
34 | #
35 | #
36 | # 16. 
企业招聘的数量占所有招聘数量平均值的比例,RECRUIT_RECRNUM_AVG_ALL_RATE 37 | # 38 | # 17. 企业对应的大类HY的平均招聘数据的平均,RECRUIT_HY_RECRNUM_AVG_AVG 39 | # 18. 企业对应大类HY的平均招聘数量占所有招聘平均数量的比例,RECRUIT_HY_RECRNUM_AVG_ALL_RATE 40 | # 19. 企业招聘的数量占其对应的大类HY的招聘数量平均值的比例,RECRUIT_RECRNUM_AVG_HY_RATE 41 | # 42 | # 43 | # 20. 企业招聘的数量占所有招聘数量平均值的比例,RECRUIT_RECRNUM_SUM_ALL_RATE 44 | # 45 | # 21. 企业对应的大类HY的平均招聘数据的平均,RECRUIT_HY_RECRNUM_SUM_AVG 46 | # 22. 企业对应大类HY的平均招聘数量占所有招聘平均数量的比例,RECRUIT_HY_RECRNUM_SUM_ALL_RATE 47 | # 23. 企业招聘的数量占其对应的大类HY的招聘数量平均值的比例,RECRUIT_RECRNUM_SUM_HY_RATE 48 | # 49 | # --------------------------------------------------------------------------------------- 50 | # 51 | # 13. 企业对应的大类ETYPE的平均招聘数据的个数,RECRUIT_ETYPE_CNT_AVG 52 | # 14. 企业对应大类ETYPE的平均招聘个数占所有招聘平均个数的比例,RECRUIT_ETYPE_CNT_ALL_RATE 53 | # 15. 企业招聘的个数占其对应的大类ETYPE的招聘平均值的比例,RECRUIT_CNT_ETYPE_RATE 54 | # 55 | # 56 | # 17. 企业对应的大类ETYPE的平均招聘数据的平均,RECRUIT_ETYPE_RECRNUM_AVG_AVG 57 | # 18. 企业对应大类ETYPE的平均招聘数量占所有招聘平均数量的比例,RECRUIT_ETYPE_RECRNUM_AVG_ALL_RATE 58 | # 19. 企业招聘的数量占其对应的大类ETYPE的招聘数量平均值的比例,RECRUIT_RECRNUM_AVG_ETYPE_RATE 59 | # 60 | # 61 | # 21. 企业对应的大类ETYPE的平均招聘数据的平均,RECRUIT_ETYPE_RECRNUM_SUM_AVG 62 | # 22. 企业对应大类ETYPE的平均招聘数量占所有招聘平均数量的比例,RECRUIT_ETYPE_RECRNUM_SUM_ALL_RATE 63 | # 23. 企业招聘的数量占其对应的大类ETYPE的招聘数量平均值的比例,RECRUIT_RECRNUM_SUM_ETYPE_RATE 64 | # 65 | # 66 | # 添加对应的max数据 67 | # 68 | # new feature POSCOD, 首先将 PNUM --> RECRNUM, 删除POSCO的特征 69 | # 70 | # 复赛新特征,滑动窗口系列的特征,时间段是1年,2年,3年,5年,计算在最近k[1,2,3,5]年之后的数据,主要是个数和次数。 71 | # 时间是开始是2017-08之前的k年 72 | # 1. 之前k年的变更时间的个数。 73 | # RECRUIT_K_OPEN_CNT, 74 | # 75 | 76 | # In[4]: 77 | 78 | 79 | 80 | 81 | # In[10]: 82 | 83 | df_all = pd.read_csv("../data/alldata/df_data123456789.csv") 84 | df_recruit = pd.read_csv("../data/public/9recruit.csv") 85 | 86 | 87 | # In[14]: 88 | 89 | # df_all.info() 90 | # df_all.head() 91 | 92 | 93 | # In[3]: 94 | 95 | # df_recruit.info() 96 | # df_recruit.head() 97 | 98 | 99 | 100 | # In[5]: 101 | 102 | df_recruit.rename(columns={'PNUM':'RECRNUM'},inplace=True) 103 | df_recruit.drop('POSCODE',axis=1,inplace=True) 104 | 105 | df_recruit['RECRNUM'] = df_recruit['RECRNUM'].map(lambda x: str(x).replace('若干','30').replace('人','')) 106 | 107 | df_recruit = df_recruit.sort_values(['RECDATE']) 108 | 109 | 110 | # In[6]: 111 | 112 | # df_recruit.info() 113 | # df_recruit.head() 114 | 115 | # df_recruit['WZCODE'].value_counts() 116 | 117 | 118 | # In[7]: 119 | 120 | df_recruit.loc[df_recruit['RECRNUM']=='nan','RECRNUM'] = 0 121 | 122 | 123 | # In[8]: 124 | 125 | df_recruit['RECRNUM'] = df_recruit['RECRNUM'].fillna(0) 126 | df_recruit['RECRNUM'] = df_recruit['RECRNUM'].astype(np.int) 127 | 128 | 129 | # In[9]: 130 | 131 | EIDS = set(df_recruit['EID']) 132 | 133 | # print(len(EIDS)) 134 | 135 | columns = df_recruit.columns 136 | df_xrecruit = pd.DataFrame(columns=columns) 137 | 138 | # print(columns) 139 | 140 | 141 | # In[10]: 142 | 143 | # df_recruit['WZCODE'].value_counts() 144 | 145 | 146 | # In[11]: 147 | 148 | k = 0 149 | for EID in EIDS: 150 | if k%3000 == 0: 151 | print('第%d次处理--------->',k) 152 | k+=1 153 | tmp = df_recruit[df_recruit['EID'] == EID] 154 | row = [EID,tmp['WZCODE'].values,tmp['RECDATE'].values,tmp['RECRNUM'].values] 155 | 156 | df_xrecruit = df_xrecruit.append(pd.Series(row,columns),ignore_index=True) 157 | 158 | 159 | 160 | # In[12]: 161 | 162 | # df_xrecruit.info() 163 | # df_xrecruit.head() 164 | 165 | 166 | # In[13]: 167 | 168 | # df_xrecruit.rename(columns={'RECDATE':'RECDATE1','RECRNUM':'RECRNUM1'},inplace=True) 169 | 170 | # 
df_xrecruit.rename(columns={'RECDATE1':'RECRNUM','RECRNUM1':'RECDATE'},inplace=True) 171 | 172 | 173 | # In[14]: 174 | 175 | def fun(x): 176 | x = list(x) 177 | if 'nan' in x: 178 | x.remove('nan') 179 | 180 | x = np.array(x).astype(int) 181 | 182 | return x 183 | 184 | df_xrecruit['RECRNUM'] = df_xrecruit['RECRNUM'].apply(fun) 185 | 186 | 187 | # In[15]: 188 | 189 | df_xrecruit['RECRUIT_CNT'] = df_xrecruit['WZCODE'].apply(lambda x: len(x)) 190 | df_xrecruit['RECRUIT_WZCODE_CNT'] = df_xrecruit['WZCODE'].apply(lambda x: len(set(x))) 191 | 192 | df_xrecruit['RECRUIT_RECRNUM_AVG'] = df_xrecruit['RECRNUM'].apply(lambda x: np.mean(x)) 193 | df_xrecruit['RECRUIT_RECRNUM_SUM'] = df_xrecruit['RECRNUM'].apply(lambda x: sum(x)) 194 | df_xrecruit['RECRUIT_RECRNUM_MAX'] = df_xrecruit['RECRNUM'].apply(lambda x: max(x)) 195 | df_xrecruit['RECRUIT_RECRNUM_MIN'] = df_xrecruit['RECRNUM'].apply(lambda x: min(x)) 196 | 197 | def getFirstTIme(x): 198 | x = list(x) 199 | x.sort() 200 | return x[0] 201 | 202 | def getEndTIme(x): 203 | x = list(x) 204 | x.sort() 205 | return x[-1] 206 | 207 | df_xrecruit['RECRUIT_FIRST_TIME'] = df_xrecruit['RECDATE'].apply(getFirstTIme) 208 | df_xrecruit['RECRUIT_END_TIME'] = df_xrecruit['RECDATE'].apply(getEndTIme) 209 | 210 | 211 | # In[17]: 212 | 213 | # 2017-08 BRANCH_K_OPEN_CNT, BRANCH_K_CLOSE_CNT 214 | df_xrecruit['RECRUIT_1_OPEN_CNT'] = df_xrecruit['RECDATE'].map(lambda x: np.sum(np.array(x) >= '2016-08')) 215 | df_xrecruit['RECRUIT_2_OPEN_CNT'] = df_xrecruit['RECDATE'].map(lambda x: np.sum(np.array(x) >= '2015-08')) 216 | df_xrecruit['RECRUIT_3_OPEN_CNT'] = df_xrecruit['RECDATE'].map(lambda x: np.sum(np.array(x) >= '2014-08')) 217 | df_xrecruit['RECRUIT_5_OPEN_CNT'] = df_xrecruit['RECDATE'].map(lambda x: np.sum(np.array(x) >= '2012-08')) 218 | 219 | 220 | 221 | # In[ ]: 222 | 223 | 224 | 225 | 226 | # In[18]: 227 | 228 | # df_xrecruit.info() 229 | # df_xrecruit.head() 230 | 231 | 232 | # In[19]: 233 | 234 | df_xrecruit.to_csv('../data/public/9recruit_1.csv',index=False,index_label=False) 235 | df_xrecruit.columns 236 | 237 | 238 | 239 | # In[20]: 240 | 241 | df_all = pd.merge(df_all,df_xrecruit[['EID','RECRUIT_CNT','RECRUIT_WZCODE_CNT', 'RECRUIT_RECRNUM_AVG', 'RECRUIT_RECRNUM_SUM', 242 | 'RECRUIT_RECRNUM_MAX', 'RECRUIT_RECRNUM_MIN', 'RECRUIT_FIRST_TIME', 243 | 'RECRUIT_END_TIME','RECRUIT_1_OPEN_CNT','RECRUIT_2_OPEN_CNT', 244 | 'RECRUIT_3_OPEN_CNT','RECRUIT_5_OPEN_CNT']],how='left',on=['EID']) 245 | 246 | 247 | # In[21]: 248 | 249 | # 空值填充,根据HY的类别的平均值或者众数进行填充 250 | # ['RIGHT_CNT','RIGHT_TYPE_CNT', 'RIGHT_TYPE_RATE']使用同一个大类别的均值进行填充 251 | # ['RIGHT_FIRST_TYPECODE','RIGHT_END_TYPECODE', 'RIGHT_TYPECODE_MUCHID', 'RIGHT_FIRST_ASK_TIME', 252 | # 'RIGHT_FIRST_FB_TIME', 'RIGHT_END_ASK_TIME', 'RIGHT_END_FB_TIME']使用同一个大类别的众数进行填充 253 | 254 | HYLIST = set(df_all['HY']) 255 | # print(HYLIST) 256 | 257 | meanlist = ['RECRUIT_CNT','RECRUIT_WZCODE_CNT', 'RECRUIT_RECRNUM_AVG', 'RECRUIT_RECRNUM_SUM','RECRUIT_RECRNUM_MAX','RECRUIT_RECRNUM_MIN'] 258 | modelist = ['RECRUIT_FIRST_TIME','RECRUIT_END_TIME'] 259 | 260 | 261 | # In[22]: 262 | 263 | for HY in HYLIST: 264 | # print(df_train['HY'].value_counts()) 265 | for d in meanlist: 266 | df_all.loc[df_all[df_all[d].isnull()][df_all['HY']==HY].index,d] = df_all[df_all['HY']==HY][d].mean() 267 | 268 | for c in modelist: 269 | if(len(df_all[df_all['HY']==HY][c].value_counts().index)==0): continue 270 | df_all.loc[df_all[df_all[c].isnull()][df_all['HY']==HY].index,c] = df_all[df_all['HY']==HY][c].value_counts().index[0] 271 | 272 | 273 | # In[23]: 
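# 说明:In[22] 先按 HY 分组,用组内均值/众数填充缺失;本格再对仍为空的行做全局兜底。
# 上面的链式布尔索引 df_all[df_all[d].isnull()][df_all['HY']==HY] 依赖索引对齐,
# 会触发 pandas 告警且较慢;一个更直接的等价写法(仅为示意)是 groupby + transform:
# for d in meanlist:
#     df_all[d] = df_all[d].fillna(df_all.groupby('HY')[d].transform('mean'))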
274 | 275 | for d in meanlist: 276 | df_all.loc[df_all[d].isnull(),d] = 0 277 | 278 | for c in modelist: 279 | df_all.loc[df_all[c].isnull(),c] = df_all[c].value_counts().index[0] 280 | 281 | 282 | # In[24]: 283 | 284 | df_all[['RECRUIT_1_OPEN_CNT','RECRUIT_2_OPEN_CNT','RECRUIT_3_OPEN_CNT','RECRUIT_5_OPEN_CNT']] = df_all[['RECRUIT_1_OPEN_CNT','RECRUIT_2_OPEN_CNT','RECRUIT_3_OPEN_CNT','RECRUIT_5_OPEN_CNT']].fillna(0) 285 | 286 | 287 | # In[25]: 288 | 289 | def timeDiff(x): 290 | a = x[:x.find(':')] 291 | b = x[x.find(':')+1:] 292 | y = int(a[:a.find('-')]) - int(b[:b.find('-')]) 293 | m = int(a[a.find('-')+1:]) - int(b[b.find('-')+1:]) 294 | return y * 12 + m 295 | 296 | 297 | df_all['RECRUIT_END_FIRST_TIME_DIFF'] = (df_all['RECRUIT_END_TIME'] + ':' + df_all['RECRUIT_FIRST_TIME']).apply(timeDiff) 298 | df_all['RECRUIT_END_RGYEAR_TIME_DIFF'] = (df_all['RECRUIT_END_TIME'] + ':' + df_all['RGYEAR']).apply(timeDiff) 299 | 300 | # ------------------------------------------------------------------------------------- 301 | # 11. 企业平均一次的招聘几个, RECRUIT_PRE_MONTH_CNT_AVG = RECRUIT_RECRNUM_SUM / RECRUIT_CNT 302 | # 12. 企业招聘的个数占所有招聘个数个平均值的比例,RECRUIT_CNT_ALL_RATE 303 | 304 | # 13. 企业对应的大类HY的平均招聘数据的个数,RECRUIT_HY_CNT_AVG 305 | # 14. 企业对应大类HY的平均招聘个数占所有招聘平均个数的比例,RECRUIT_HY_CNT_ALL_RATE 306 | # 15. 企业招聘的个数占其对应的大类HY的招聘平均值的比例,RECRUIT_CNT_HY_RATE 307 | 308 | # RECRUIT_RECRNUM_AVG 309 | # 16. 企业招聘的数量占所有招聘数量平均值的比例,RECRUIT_RECRNUM_AVG_ALL_RATE 310 | 311 | # 17. 企业对应的大类HY的平均招聘数据的平均,RECRUIT_HY_RECRNUM_AVG_AVG 312 | # 18. 企业对应大类HY的平均招聘数量占所有招聘平均数量的比例,RECRUIT_HY_RECRNUM_AVG_ALL_RATE 313 | # 19. 企业招聘的数量占其对应的大类HY的招聘数量平均值的比例,RECRUIT_RECRNUM_AVG_HY_RATE 314 | 315 | # RECRUIT_RECRNUM_SUM 316 | # 20. 企业招聘的数量占所有招聘数量平均值的比例,RECRUIT_RECRNUM_SUM_ALL_RATE 317 | 318 | # 21. 企业对应的大类HY的平均招聘数据的平均,RECRUIT_HY_RECRNUM_SUM_AVG 319 | # 22. 企业对应大类HY的平均招聘数量占所有招聘平均数量的比例,RECRUIT_HY_RECRNUM_SUM_ALL_RATE 320 | # 23. 
企业招聘的数量占其对应的大类HY的招聘数量平均值的比例,RECRUIT_RECRNUM_SUM_HY_RATE 321 | 322 | # In[26]: 323 | 324 | df_all['RECRUIT_PRE_MONTH_CNT_AVG'] = df_all['RECRUIT_RECRNUM_SUM'] / df_all['RECRUIT_CNT'] 325 | 326 | df_all['RECRUIT_CNT_ALL_RATE'] = df_all['RECRUIT_CNT'] / df_all['RECRUIT_CNT'].mean() 327 | df_all['RECRUIT_RECRNUM_AVG_ALL_RATE'] = df_all['RECRUIT_RECRNUM_AVG'] / df_all['RECRUIT_RECRNUM_AVG'].mean() 328 | df_all['RECRUIT_RECRNUM_SUM_ALL_RATE'] = df_all['RECRUIT_RECRNUM_SUM'] / df_all['RECRUIT_RECRNUM_SUM'].mean() 329 | 330 | df_all['RECRUIT_CNT_ALL_RATE_MAX'] = df_all['RECRUIT_CNT'] / df_all['RECRUIT_CNT'].max() 331 | df_all['RECRUIT_RECRNUM_AVG_ALL_RATE_MAX'] = df_all['RECRUIT_RECRNUM_AVG'] / df_all['RECRUIT_RECRNUM_AVG'].max() 332 | df_all['RECRUIT_RECRNUM_SUM_ALL_RATE_MAX'] = df_all['RECRUIT_RECRNUM_SUM'] / df_all['RECRUIT_RECRNUM_SUM'].max() 333 | 334 | 335 | 336 | # In[27]: 337 | 338 | 339 | tmp = pd.DataFrame() 340 | 341 | tmp['RECRUIT_HY_CNT_AVG'] = df_all.groupby(['HY'],as_index=True,axis=0)['RECRUIT_CNT'].mean() 342 | tmp['RECRUIT_HY_CNT_ALL_RATE'] = tmp['RECRUIT_HY_CNT_AVG'] / df_all['RECRUIT_CNT'].mean() 343 | 344 | tmp['RECRUIT_HY_RECRNUM_AVG_AVG'] = df_all.groupby(['HY'],as_index=True,axis=0)['RECRUIT_RECRNUM_AVG'].mean() 345 | tmp['RECRUIT_HY_RECRNUM_AVG_ALL_RATE'] = tmp['RECRUIT_HY_RECRNUM_AVG_AVG'] / df_all['RECRUIT_RECRNUM_AVG'].mean() 346 | 347 | tmp['RECRUIT_HY_RECRNUM_SUM_AVG'] = df_all.groupby(['HY'],as_index=True,axis=0)['RECRUIT_RECRNUM_SUM'].mean() 348 | tmp['RECRUIT_HY_RECRNUM_SUM_ALL_RATE'] = tmp['RECRUIT_HY_RECRNUM_SUM_AVG'] / df_all['RECRUIT_RECRNUM_SUM'].mean() 349 | 350 | 351 | 352 | tmp['RECRUIT_HY_CNT_MAX'] = df_all.groupby(['HY'],as_index=True,axis=0)['RECRUIT_CNT'].mean() 353 | tmp['RECRUIT_HY_CNT_ALL_RATE_MAX'] = tmp['RECRUIT_HY_CNT_MAX'] / df_all['RECRUIT_CNT'].mean() 354 | 355 | tmp['RECRUIT_HY_RECRNUM_AVG_MAX'] = df_all.groupby(['HY'],as_index=True,axis=0)['RECRUIT_RECRNUM_AVG'].mean() 356 | tmp['RECRUIT_HY_RECRNUM_AVG_ALL_RATE_MAX'] = tmp['RECRUIT_HY_RECRNUM_AVG_MAX'] / df_all['RECRUIT_RECRNUM_AVG'].mean() 357 | 358 | tmp['RECRUIT_HY_RECRNUM_SUM_MAX'] = df_all.groupby(['HY'],as_index=True,axis=0)['RECRUIT_RECRNUM_SUM'].mean() 359 | tmp['RECRUIT_HY_RECRNUM_SUM_ALL_RATE_MAX'] = tmp['RECRUIT_HY_RECRNUM_SUM_MAX'] / df_all['RECRUIT_RECRNUM_SUM'].mean() 360 | 361 | 362 | tmp['HY'] = tmp.index 363 | 364 | 365 | 366 | # In[28]: 367 | 368 | 369 | tmp1 = pd.DataFrame() 370 | 371 | tmp1['RECRUIT_ETYPE_CNT_AVG'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['RECRUIT_CNT'].mean() 372 | tmp1['RECRUIT_ETYPE_CNT_ALL_RATE'] = tmp1['RECRUIT_ETYPE_CNT_AVG'] / df_all['RECRUIT_CNT'].mean() 373 | 374 | tmp1['RECRUIT_ETYPE_RECRNUM_AVG_AVG'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['RECRUIT_RECRNUM_AVG'].mean() 375 | tmp1['RECRUIT_ETYPE_RECRNUM_AVG_ALL_RATE'] = tmp1['RECRUIT_ETYPE_RECRNUM_AVG_AVG'] / df_all['RECRUIT_RECRNUM_AVG'].mean() 376 | 377 | tmp1['RECRUIT_ETYPE_RECRNUM_SUM_AVG'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['RECRUIT_RECRNUM_SUM'].mean() 378 | tmp1['RECRUIT_ETYPE_RECRNUM_SUM_ALL_RATE'] = tmp1['RECRUIT_ETYPE_RECRNUM_SUM_AVG'] / df_all['RECRUIT_RECRNUM_SUM'].mean() 379 | 380 | 381 | 382 | tmp1['RECRUIT_ETYPE_CNT_MAX'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['RECRUIT_CNT'].mean() 383 | tmp1['RECRUIT_ETYPE_CNT_ALL_RATE_MAX'] = tmp1['RECRUIT_ETYPE_CNT_MAX'] / df_all['RECRUIT_CNT'].mean() 384 | 385 | tmp1['RECRUIT_ETYPE_RECRNUM_AVG_MAX'] = df_all.groupby(['ETYPE'],as_index=True,axis=0)['RECRUIT_RECRNUM_AVG'].mean() 386 | 
393 | 
394 | 
395 | 
396 | # In[29]:
397 | 
398 | df_all = pd.merge(df_all,tmp,how='left',on=['HY'])
399 | df_all = pd.merge(df_all,tmp1,how='left',on=['ETYPE'])
400 | 
401 | 
402 | # In[30]:
403 | 
404 | df_all['RECRUIT_CNT_HY_RATE'] = df_all['RECRUIT_CNT'] / df_all['RECRUIT_HY_CNT_AVG']
405 | df_all['RECRUIT_RECRNUM_AVG_HY_RATE'] = df_all['RECRUIT_RECRNUM_AVG'] / df_all['RECRUIT_HY_RECRNUM_AVG_AVG']
406 | df_all['RECRUIT_RECRNUM_SUM_HY_RATE'] = df_all['RECRUIT_RECRNUM_SUM'] / df_all['RECRUIT_HY_RECRNUM_SUM_AVG']
407 | 
408 | df_all['RECRUIT_CNT_HY_RATE_MAX'] = df_all['RECRUIT_CNT'] / df_all['RECRUIT_HY_CNT_MAX']
409 | df_all['RECRUIT_RECRNUM_AVG_HY_RATE_MAX'] = df_all['RECRUIT_RECRNUM_AVG'] / df_all['RECRUIT_HY_RECRNUM_AVG_MAX']
410 | df_all['RECRUIT_RECRNUM_SUM_HY_RATE_MAX'] = df_all['RECRUIT_RECRNUM_SUM'] / df_all['RECRUIT_HY_RECRNUM_SUM_MAX']
411 | 
412 | 
413 | # In[31]:
414 | 
415 | df_all['RECRUIT_CNT_ETYPE_RATE'] = df_all['RECRUIT_CNT'] / df_all['RECRUIT_ETYPE_CNT_AVG']
416 | df_all['RECRUIT_RECRNUM_AVG_ETYPE_RATE'] = df_all['RECRUIT_RECRNUM_AVG'] / df_all['RECRUIT_ETYPE_RECRNUM_AVG_AVG']
417 | df_all['RECRUIT_RECRNUM_SUM_ETYPE_RATE'] = df_all['RECRUIT_RECRNUM_SUM'] / df_all['RECRUIT_ETYPE_RECRNUM_SUM_AVG']
418 | 
419 | df_all['RECRUIT_CNT_ETYPE_RATE_MAX'] = df_all['RECRUIT_CNT'] / df_all['RECRUIT_ETYPE_CNT_MAX']
420 | df_all['RECRUIT_RECRNUM_AVG_ETYPE_RATE_MAX'] = df_all['RECRUIT_RECRNUM_AVG'] / df_all['RECRUIT_ETYPE_RECRNUM_AVG_MAX']
421 | df_all['RECRUIT_RECRNUM_SUM_ETYPE_RATE_MAX'] = df_all['RECRUIT_RECRNUM_SUM'] / df_all['RECRUIT_ETYPE_RECRNUM_SUM_MAX']
422 | 
423 | 
424 | 
425 | # In[15]:
426 | 
427 | # df_all.info()
428 | # df_all.head()
429 | 
430 | 
431 | # In[16]:
432 | 
433 | # Fill the remaining NaN and save the full feature table; note that fillna(0) does not clear any inf left by zero-denominator ratios.
434 | df_all = df_all.fillna(0)
435 | df_all.to_csv('../data/alldata/df_data123456789.csv',index=False,index_label=False)
436 | 
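# Editor's sketch (illustrative, not part of the original pipeline; the demo
# frame is hypothetical): a defensive pass that also clears the +/-inf values
# that fillna(0) leaves behind, shown on a throwaway frame rather than df_all.

import numpy as np
import pandas as pd

demo = pd.DataFrame({'r': [1.0, np.inf, -np.inf, np.nan]})
demo = demo.replace([np.inf, -np.inf], 0).fillna(0)  # -> r == [1.0, 0.0, 0.0, 0.0]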
--------------------------------------------------------------------------------
/model/.ipynb_checkpoints/model_lr-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "### Predict with LR and naive Bayes on the 50 stacked features generated by the xgbstack step."
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": 1,
13 |    "metadata": {
14 |     "collapsed": true
15 |    },
16 |    "outputs": [],
17 |    "source": [
18 |     "import pandas as pd\n",
19 |     "import numpy as np\n",
20 |     "\n",
21 |     "from sklearn.linear_model import LogisticRegression\n",
22 |     "from sklearn.naive_bayes import GaussianNB"
23 |    ]
24 |   },
25 |   {
26 |    "cell_type": "code",
27 |    "execution_count": 2,
28 |    "metadata": {
29 |     "collapsed": true
30 |    },
31 |    "outputs": [],
32 |    "source": [
33 |     "df_all = pd.read_csv(\"../data/alldata/df_data_plus_all_xgbstack.csv\")\n"
34 |    ]
35 |   },
36 |   {
37 |    "cell_type": "code",
38 |    "execution_count": 3,
39 |    "metadata": {
40 |     "collapsed": false
41 |    },
42 |    "outputs": [
43 |     {
44 |      "data": {
45 |       "text/plain": [
46 |        "Index(['PROV', 'EID', 'RGYEAR', 'HY', 'ZCZB', 'ETYPE', 'MPNUM', 'INUM', 'ENUM',\n",
47 |        "       'FINZB',\n",
48 |        "       ...\n",
49 |        "       'STACKFEATURE40', 'STACKFEATURE41', 'STACKFEATURE42', 'STACKFEATURE43',\n",
50 |        "       'STACKFEATURE44', 'STACKFEATURE45', 'STACKFEATURE46', 'STACKFEATURE47',\n",
51 |        "       'STACKFEATURE48', 'STACKFEATURE49'],\n",
52 |        "      dtype='object', length=593)"
53 |       ]
54 |      },
55 |      "execution_count": 3,
56 |      "metadata": {},
57 |      "output_type": "execute_result"
58 |     }
59 |    ],
60 |    "source": [
61 |     "df_all.columns"
62 |    ]
63 |   },
64 |   {
65 |    "cell_type": "code",
66 |    "execution_count": 4,
67 |    "metadata": {
68 |     "collapsed": true
69 |    },
70 |    "outputs": [],
71 |    "source": [
72 |     "features = ['STACKFEATURE' + str(i) for i in range(50)]\n",
73 |     "features.append('EID')\n",
74 |     "label = \"TARGET\""
75 |    ]
76 |   },
77 |   {
78 |    "cell_type": "code",
79 |    "execution_count": 5,
80 |    "metadata": {
81 |     "collapsed": true
82 |    },
83 |    "outputs": [],
84 |    "source": [
85 |     "def xtrain_and_test(df_all):\n",
86 |     "    '''\n",
87 |     "    Split df_all into the train and test sets by matching EID against the label files.\n",
88 |     "    '''\n",
89 |     "    df_label = pd.read_csv('../data/public/train.csv')\n",
90 |     "    df_test_label = pd.read_csv('../data/public/evaluation_public.csv')\n",
91 |     "\n",
92 |     "    df_label.drop('ENDDATE',axis=1,inplace=True)\n",
93 |     "    \n",
94 |     "    df_train = df_all[df_all['EID'].isin(df_label['EID'])]\n",
95 |     "    df_test = df_all[df_all['EID'].isin(df_test_label['EID'])]\n",
96 |     "    \n",
97 |     "    df_train = pd.merge(df_train,df_label,how='left',on=['EID'])\n",
98 |     "    \n",
99 |     "    return df_train,df_test\n"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 6,
105 |    "metadata": {
106 |     "collapsed": true
107 |    },
108 |    "outputs": [],
109 |    "source": [
110 |     "df_train,df_test = xtrain_and_test(df_all[features])"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": 35,
116 |    "metadata": {
117 |     "collapsed": false
118 |    },
119 |    "outputs": [
120 |     {
121 |      "name": "stdout",
122 |      "output_type": "stream",
123 |      "text": [
124 |       "0.13598211340395117\n",
125 |       "0.8640178865960488\n"
126 |      ]
127 |     }
128 |    ],
129 |    "source": [
130 |     "print(len(df_train[df_train[label]==1]) / len(df_train))\n",
131 |     "print(len(df_train[df_train[label]==0]) / len(df_train))"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 7,
137 |    "metadata": {
138 |     "collapsed": false
139 |    },
140 |    "outputs": [
141 |     {
142 |      "data": {
311 |       "text/plain": [
312 |        "   STACKFEATURE0  STACKFEATURE1  STACKFEATURE2  STACKFEATURE3  STACKFEATURE4  \\\n",
313 |        "0            209            215            198            189            211   \n",
314 |        "1            226            221            120            208            218   \n",
315 |        "2            210            215            198            193            211   \n",
316 |        "3            177            173            190            193            192   \n",
317 |        "4            210            215            198            194            211   \n",
318 |        "\n",
319 |        "   STACKFEATURE5  STACKFEATURE6  STACKFEATURE7  STACKFEATURE8  STACKFEATURE9  \\\n",
320 |        "0            221            196            216            214            215   \n",
321 |        "1            228            216            220            207            212   \n",
322 |        "2            221            199            216            213            215   \n",
323 |        "3            177            199            216            214            215   \n",
324 |        "4            221            200            220            214            215   \n",
325 |        "\n",
326 |        "   ...  STACKFEATURE42  STACKFEATURE43  STACKFEATURE44  STACKFEATURE45  \\\n",
327 |        "0  ...             185             154             201             146   \n",
328 |        "1  ...             196             189             221             142   \n",
329 |        "2  ...             186             157             202             146   \n",
330 |        "3  ...             185             154             196             146   \n",
331 |        "4  ...             199             157             218             145   \n",
332 |        "\n",
333 |        "   STACKFEATURE46  STACKFEATURE47  STACKFEATURE48  STACKFEATURE49      EID  \\\n",
334 |        "0             182             156             188             165   p13836   \n",
335 |        "1             200             183             134             205    p7320   \n",
336 |        "2             154             177             177             162  p120116   \n",
337 |        "3             136             174             177             124  p197225   \n",
338 |        "4             182             200             189             170  p154162   \n",
339 |        "\n",
340 |        "   TARGET  \n",
341 |        "0       0  \n",
342 |        "1       0  \n",
343 |        "2       0  \n",
344 |        "3       0  \n",
345 |        "4       0  \n",
346 |        "\n",
347 |        "[5 rows x 52 columns]"
348 |       ]
349 |      },
350 |      "execution_count": 7,
351 |      "metadata": {},
352 |      "output_type": "execute_result"
353 |     }
354 |    ],
355 |    "source": [
356 |     "df_train.head()"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "code",
361 |    "execution_count": 17,
362 |    "metadata": {
363 |     "collapsed": true
364 |    },
365 |    "outputs": [],
366 |    "source": [
367 |     "gnb = GaussianNB()"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "code",
372 |    "execution_count": null,
373 |    "metadata": {
374 |     "collapsed": false
375 |    },
376 |    "outputs": [
377 |     {
378 |      "name": "stdout",
379 |      "output_type": "stream",
380 |      "text": [
381 |       "[LibLinear]"
382 |      ]
383 |     }
384 |    ],
385 |    "source": [
386 |     "lr = LogisticRegression(verbose=5,n_jobs=3,class_weight={1:0.8640178865960488,0:0.13598211340395117},penalty='l1')  # rebalance: each class weighted by the other class's frequency printed above\n",
387 |     "\n",
388 |     "lr.fit(df_train[features[:50]],df_train[label])\n",
389 |     "gnb.fit(df_train[features[:50]],df_train[label])"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "code",
394 |    "execution_count": 37,
395 |    "metadata": {
396 |     "collapsed": true
397 |    },
398 |    "outputs": [],
399 |    "source": [
400 |     "proba_lr = lr.predict_proba(df_test[features[0:50]])\n",
401 |     "proba_gnb = gnb.predict_proba(df_test[features[0:50]])\n",
402 |     "\n"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 38,
408 |    "metadata": {
409 |     "collapsed": true
410 |    },
411 |    "outputs": [],
412 |    "source": [
413 |     "prob_lr = proba_lr[:,0]  # column 0 is P(TARGET == 0)\n",
414 |     "prob_gnb = proba_gnb[:,0]  # column 0 is P(TARGET == 0)\n"
415 |    ]
416 |   },
417 |   {
418 |    "cell_type": "code",
419 |    "execution_count": 39,
420 |    "metadata": {
421 |     "collapsed": false
422 |    },
423 |    "outputs": [
424 |     {
425 |      "data": {
426 |       "text/plain": [
427 |        "array([ 0.97860149,  0.96988504,  0.99274635, ...,  0.97744624,\n",
428 |        "        0.9670601 ,  0.97457574])"
429 |       ]
430 |      },
431 |      "execution_count": 39,
432 |      "metadata": {},
433 |      "output_type": "execute_result"
434 |     }
435 |    ],
436 |    "source": [
437 |     "prob_lr"
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": 40,
443 |    "metadata": {
444 |     "collapsed": false
445 |    },
446 |    "outputs": [
447 |     {
448 |      "data": {
449 |       "text/plain": [
450 |        "0.61850455532159188"
451 |       ]
452 |      },
453 |      "execution_count": 40,
454 |      "metadata": {},
455 |      "output_type": "execute_result"
456 |     }
457 |    ],
458 |    "source": [
459 | "np.mean(prob_gnb)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 41, 465 | "metadata": { 466 | "collapsed": false 467 | }, 468 | "outputs": [ 469 | { 470 | "data": { 471 | "text/plain": [ 472 | "218247" 473 | ] 474 | }, 475 | "execution_count": 41, 476 | "metadata": {}, 477 | "output_type": "execute_result" 478 | } 479 | ], 480 | "source": [ 481 | "np.sum(prob_lr > 0.5)" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": 26, 487 | "metadata": { 488 | "collapsed": false 489 | }, 490 | "outputs": [ 491 | { 492 | "data": { 493 | "text/plain": [ 494 | "136576" 495 | ] 496 | }, 497 | "execution_count": 26, 498 | "metadata": {}, 499 | "output_type": "execute_result" 500 | } 501 | ], 502 | "source": [ 503 | "np.sum(prob_gnb > 0.5)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": { 510 | "collapsed": true 511 | }, 512 | "outputs": [], 513 | "source": [] 514 | } 515 | ], 516 | "metadata": { 517 | "anaconda-cloud": {}, 518 | "kernelspec": { 519 | "display_name": "Python [conda root]", 520 | "language": "python", 521 | "name": "conda-root-py" 522 | }, 523 | "language_info": { 524 | "codemirror_mode": { 525 | "name": "ipython", 526 | "version": 3 527 | }, 528 | "file_extension": ".py", 529 | "mimetype": "text/x-python", 530 | "name": "python", 531 | "nbconvert_exporter": "python", 532 | "pygments_lexer": "ipython3", 533 | "version": "3.5.2" 534 | } 535 | }, 536 | "nbformat": 4, 537 | "nbformat_minor": 1 538 | } 539 | --------------------------------------------------------------------------------