├── README.md
├── optimizer .py
└── ksb_lgb&nn.ipynb

/README.md:
--------------------------------------------------------------------------------
 1 | # 2019Baai-zhihu-Cup-findexp-4th---
 2 | 4th place solution for the 2019 Zhihu Kanshan Cup
 3 | 
 4 | This repository contains the complete code of the 4th place solution to the expert discovery track of the 2019 Zhihu Kanshan Cup. The core idea is to take the user as the main object of study: we mine useful information from each user's historical behaviour, both features that characterise the user and features that reflect how the user interacts with questions, and use it to predict the user's future behaviour. Concretely, we use boosting-tree models such as LightGBM together with neural-network models commonly used in recommender systems such as xDeepFM, and the final result is obtained by stacking the different models.
 5 | 
 6 | The official invitation log covers the invitations users received over the past month. Each invitation is one sample, labelled by whether the user accepted it (accepted is positive, otherwise negative), and every record carries the invitation timestamp. The task is to use a user's historical invitation records to predict whether that user will accept invitations during the following week. The training and test sets released by the organisers are the invitations that were actually pushed to users after Zhihu's internal recall and ranking modules, so these invitations, the test set included, are already the output of Zhihu's own models. Our understanding is that the training and prediction data handed to participants are therefore inherently biased towards users' preferences. As a consequence, features built directly around the training samples themselves, such as per-user and per-question sample counts, work very well in this competition.
 7 | 
 8 | An invitation itself carries a lot of information: when a question invites a particular user, that alone suggests the user is probably interested in it. We therefore built many invitation-count features, which worked very well offline, but the online improvement was limited. The distribution of invitation counts is fairly stable within the training set, but because the organisers only released half of the validation set, the count features are severely biased with respect to the validation data; this is an important reason why many teams saw a large gap between offline and online scores in the preliminary round. In the preliminary round we assigned each count a weight based on that day's total number of samples and the number of distinct ids appearing that day, and summed the weighted counts, which kept the offline and online scores relatively consistent.
 9 | 
--------------------------------------------------------------------------------
/optimizer .py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import torch
 3 | import itertools as it
 4 | from torch.optim import Optimizer
 5 | class Lookahead(Optimizer):
 6 |     def __init__(self, base_optimizer, alpha=0.5, k=6):
 7 |         if not 0.0 <= alpha <= 1.0:
 8 |             raise ValueError(f'Invalid slow update rate: {alpha}')
 9 |         if not 1 <= k:
10 |             raise ValueError(f'Invalid lookahead steps: {k}')
11 |         self.optimizer = base_optimizer
12 |         self.param_groups = self.optimizer.param_groups
13 |         self.alpha = alpha
14 |         self.k = k
15 |         for group in self.param_groups:
16 |             group["step_counter"] = 0
17 |         self.slow_weights = [[p.clone().detach() for p in group['params']]
18 |                              for group in self.param_groups]
19 | 
20 |         for w in it.chain(*self.slow_weights):
21 |             w.requires_grad = False
22 | 
23 |     def step(self, closure=None):
24 |         loss = None
25 |         if closure is not None:
26 |             loss = closure()
27 |         self.optimizer.step()  # update the fast weights with the wrapped optimizer
28 |         for group, slow_weights in zip(self.param_groups, self.slow_weights):
29 |             group['step_counter'] += 1
30 |             if group['step_counter'] % self.k != 0:
31 |                 continue
32 |             for p, q in zip(group['params'], slow_weights):
33 |                 if p.grad is None:
34 |                     continue
35 |                 q.data.add_(self.alpha, p.data - q.data)  # slow += alpha * (fast - slow)
36 |                 p.data.copy_(q.data)
37 |         return loss
38 | 
39 | class RAdam(Optimizer):
40 |     '''
41 |     A PyTorch implementation of the RAdam optimizer from the paper
42 |     On the Variance of the Adaptive Learning Rate and Beyond.
43 | 44 | https://arxiv.org/abs/1908.03265 45 | Example: 46 | >>> from optimizer import RAdam 47 | >>> optimizer = RAdam(model.parameters(), lr=0.001) 48 | ''' 49 | 50 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 51 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 52 | self.buffer = [[None, None, None] for ind in range(10)] 53 | super(RAdam, self).__init__(params, defaults) 54 | 55 | def __setstate__(self, state): 56 | super(RAdam, self).__setstate__(state) 57 | 58 | def step(self, closure=None): 59 | 60 | loss = None 61 | if closure is not None: 62 | loss = closure() 63 | 64 | for group in self.param_groups: 65 | 66 | for p in group['params']: 67 | if p.grad is None: 68 | continue 69 | grad = p.grad.data.float() 70 | if grad.is_sparse: 71 | raise RuntimeError('RAdam does not support sparse gradients') 72 | 73 | p_data_fp32 = p.data.float() 74 | 75 | state = self.state[p] 76 | 77 | if len(state) == 0: 78 | state['step'] = 0 79 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 80 | state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 81 | else: 82 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 83 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 84 | 85 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 86 | beta1, beta2 = group['betas'] 87 | 88 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 89 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 90 | 91 | state['step'] += 1 92 | buffered = self.buffer[int(state['step'] % 10)] 93 | if state['step'] == buffered[0]: 94 | N_sma, step_size = buffered[1], buffered[2] 95 | else: 96 | buffered[0] = state['step'] 97 | beta2_t = beta2 ** state['step'] 98 | N_sma_max = 2 / (1 - beta2) - 1 99 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 100 | buffered[1] = N_sma 101 | if N_sma > 5: 102 | step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 103 | else: 104 | step_size = group['lr'] / (1 - beta1 ** state['step']) 105 | buffered[2] = step_size 106 | 107 | if group['weight_decay'] != 0: 108 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 109 | 110 | if N_sma > 5: 111 | denom = exp_avg_sq.sqrt().add_(group['eps']) 112 | p_data_fp32.addcdiv_(-step_size, exp_avg, denom) 113 | else: 114 | p_data_fp32.add_(-step_size, exp_avg) 115 | 116 | p.data.copy_(p_data_fp32) 117 | 118 | return loss 119 | 120 | # 121 | class Ralamb(Optimizer): 122 | ''' 123 | Ralamb optimizer (RAdam + LARS trick) 124 | ''' 125 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): 126 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 127 | self.buffer = [[None, None, None] for ind in range(10)] 128 | super(Ralamb, self).__init__(params, defaults) 129 | 130 | def __setstate__(self, state): 131 | super(Ralamb, self).__setstate__(state) 132 | 133 | def step(self, closure=None): 134 | 135 | loss = None 136 | if closure is not None: 137 | loss = closure() 138 | 139 | for group in self.param_groups: 140 | 141 | for p in group['params']: 142 | if p.grad is None: 143 | continue 144 | grad = p.grad.data.float() 145 | if grad.is_sparse: 146 | raise RuntimeError('Ralamb does not support sparse gradients') 147 | 148 | p_data_fp32 = p.data.float() 149 | 150 | state = self.state[p] 151 | 152 | if len(state) == 0: 153 | state['step'] = 0 154 | state['exp_avg'] = torch.zeros_like(p_data_fp32) 155 
| state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) 156 | else: 157 | state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) 158 | state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) 159 | 160 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 161 | beta1, beta2 = group['betas'] 162 | 163 | # Decay the first and second moment running average coefficient 164 | # m_t 165 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 166 | # v_t 167 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 168 | 169 | state['step'] += 1 170 | buffered = self.buffer[int(state['step'] % 10)] 171 | 172 | if state['step'] == buffered[0]: 173 | N_sma, radam_step = buffered[1], buffered[2] 174 | else: 175 | buffered[0] = state['step'] 176 | beta2_t = beta2 ** state['step'] 177 | N_sma_max = 2 / (1 - beta2) - 1 178 | N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) 179 | buffered[1] = N_sma 180 | 181 | # more conservative since it's an approximated value 182 | if N_sma >= 5: 183 | radam_step = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step']) 184 | else: 185 | radam_step = group['lr'] / (1 - beta1 ** state['step']) 186 | buffered[2] = radam_step 187 | 188 | if group['weight_decay'] != 0: 189 | p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) 190 | 191 | weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10) 192 | radam_norm = p_data_fp32.pow(2).sum().sqrt() 193 | if weight_norm == 0 or radam_norm == 0: 194 | trust_ratio = 1 195 | else: 196 | trust_ratio = weight_norm / radam_norm 197 | 198 | state['weight_norm'] = weight_norm 199 | state['adam_norm'] = radam_norm 200 | state['trust_ratio'] = trust_ratio 201 | 202 | # more conservative since it's an approximated value 203 | if N_sma >= 5: 204 | denom = exp_avg_sq.sqrt().add_(group['eps']) 205 | p_data_fp32.addcdiv_(-radam_step * trust_ratio, exp_avg, denom) 206 | else: 207 | p_data_fp32.add_(-radam_step * trust_ratio, exp_avg) 208 | 209 | p.data.copy_(p_data_fp32) 210 | 211 | return loss 212 | 213 | -------------------------------------------------------------------------------- /ksb_lgb&nn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "import warnings\n", 14 | "import random\n", 15 | "import os\n", 16 | "import gc\n", 17 | "import math\n", 18 | "from multiprocessing import Pool,cpu_count\n", 19 | "from sklearn.preprocessing import Normalizer,LabelEncoder,OneHotEncoder,MinMaxScaler\n", 20 | "from sklearn.decomposition import PCA,TruncatedSVD\n", 21 | "from sklearn.externals import joblib\n", 22 | "from sklearn.model_selection import StratifiedKFold,train_test_split\n", 23 | "from sklearn.feature_extraction.text import CountVectorizer\n", 24 | "from sklearn.utils import resample\n", 25 | "from sklearn.metrics import roc_auc_score\n", 26 | "from scipy import sparse\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "from gensim.models import word2vec\n", 29 | "from gensim.scripts.glove2word2vec import glove2word2vec\n", 30 | "from gensim.test.utils import datapath as dpath, get_tmpfile\n", 31 | "import torch \n", 32 | "import codecs\n", 33 | "import xgboost as xgb\n", 34 | "import lightgbm as lgb\n", 35 | "import catboost as catb\n", 36 | "import 
pickle\n", 37 | "import time\n", 38 | "import datetime\n", 39 | "import math\n", 40 | "import scipy.special as special\n", 41 | "import torch\n", 42 | "import torch.nn as nn\n", 43 | "import torch.nn.functional as F\n", 44 | "import tqdm\n", 45 | "from optimizer import Lookahead\n", 46 | "from optimizer import RAdam\n", 47 | "import torch.utils.data as Data\n", 48 | "import codecs\n", 49 | "import sys\n", 50 | "import jieba.posseg\n", 51 | "import jieba.analyse\n", 52 | "import re\n", 53 | "import warnings\n", 54 | "warnings.filterwarnings('ignore')\n", 55 | "datapath = '.....'\n", 56 | "\n", 57 | "t0_train = 3838\n", 58 | "t1_train = 3867\n", 59 | "t0_eval = 3868\n", 60 | "t1_eval = 3874\n", 61 | "t0_a = 3807\n", 62 | "t1_a = 3867\n", 63 | "evalday = 7" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 1, 69 | "metadata": { 70 | "collapsed": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "def memoryOptimization(data,floattype):\n", 75 | " subdata = data.select_dtypes(include = 'int')\n", 76 | " for col in subdata.columns:\n", 77 | " m = subdata[col].max()\n", 78 | " n = subdata[col].min()\n", 79 | " if m < np.power(2,31)-1 and n >= -np.power(2,31):\n", 80 | " if m < np.power(2,15)-1 and n >= -np.power(2,15):\n", 81 | " if m < np.power(2,7)-1 and n >= -np.power(2,7):\n", 82 | " subdata[col] = subdata[col].astype(np.int8)\n", 83 | " else:\n", 84 | " subdata[col] = subdata[col].astype(np.int16)\n", 85 | " else:\n", 86 | " subdata[col] = subdata[col].astype(np.int32)\n", 87 | " data[subdata.columns] = subdata\n", 88 | " subdata = data.select_dtypes(include = 'float')\n", 89 | " data[subdata.columns] = data[subdata.columns].astype(floattype)\n", 90 | "# subdata = data.select_dtypes(include = 'object')\n", 91 | "# data[subdata.columns] = data[subdata.columns].astype('category')\n", 92 | " gc.collect()\n", 93 | " return data\n", 94 | "\n", 95 | "# def down_sample(df,df_feat,rate):#对目标特征下采样,通过给定随机数种子保证每个特征组抽样的负样本是同样的\n", 96 | "# df_majority = df_feat[df['label']==0]\n", 97 | "# df_minority = df_feat[df['label']==1]\n", 98 | "# positive_num = df_minority.shape[0]\n", 99 | "# df_majority_downsampled = resample(df_majority,\n", 100 | "# replace=False, # sample without replacement\n", 101 | "# n_samples=positive_num*rate, # to match minority class\n", 102 | "# random_state=7) # reproducible results\n", 103 | "# df_downsampled = pd.concat([df_majority_downsampled, df_minority],axis = 0,ignore_index = True)\n", 104 | "# del df_majority, df_minority, df_majority_downsampled\n", 105 | "# return df_downsampled\n", 106 | "\n", 107 | "def lgb_train_pre1(train_x,train_y,test_x,categoryfeas,dropfeas,one,save_model): \n", 108 | " train_x = train_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n", 109 | " test_x = test_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n", 110 | " pickle.dump(list(train_x.columns), open(datapath+'data/lgb_fea.pkl', 'wb'))\n", 111 | " params_lgbc ={\n", 112 | " 'boosting_type': 'gbdt',\n", 113 | " 'objective': 'binary', \n", 114 | " 'num_leaves': 41, \n", 115 | " 'learning_rate': 0.1,\n", 116 | " 'feature_fraction': 0.8,\n", 117 | " 'bagging_fraction': 0.8,\n", 118 | " 'bagging_freq': 1,\n", 119 | " 'min_sum_hessian_in_leaf': 10,\n", 120 | " 'num_threads': cpu_count() - 1,\n", 121 | " 'seed': 7, \n", 122 | " 'n_estimators':50000,\n", 123 | " 'max_depth': 6,\n", 124 | " 'subsample':0.9,\n", 125 | " 'subsample_freq':2,\n", 126 | " 'reg_alpha':0, \n", 127 | " 'reg_lambda':2\n", 128 | " # 'device': 'gpu',\n", 129 | " }\n", 130 | " \n", 131 
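"    # The 5-fold loop below produces out-of-fold predictions on the training set (pre_train)\n",
"    # and an averaged prediction on the test set (pre_test); these are the inputs to the\n",
"    # stacking step mentioned in the README.\n",
"    # Note: flag_weight is assumed here to be a global boolean defined elsewhere in the\n",
"    # notebook; when it is True, each row is weighted by (inviteday - 3500) normalised to\n",
"    # mean 1, so invitations from more recent days get proportionally larger sample weights.\n",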
| " pre_train = pd.Series(np.zeros(len(train_y)))\n", 132 | " pre_test = []\n", 133 | " kf = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 2019)\n", 134 | " fold = 1\n", 135 | " for train_index,eval_index in kf.split(train_x,train_y):\n", 136 | " dtrain_x = train_x.loc[train_index,:]\n", 137 | " deval_x = train_x.loc[eval_index,:]\n", 138 | " dtrain_y = train_y[train_index]\n", 139 | " deval_y = train_y[eval_index]\n", 140 | " if flag_weight:\n", 141 | " sample_weight = ((dtrain_x['inviteday']-3500)/(dtrain_x['inviteday']-3500).mean()).values\n", 142 | " else:\n", 143 | " sample_weight = None\n", 144 | " lgbc = lgb.LGBMClassifier(random_state = 2020,**params_lgbc) # np.random.randint(1,3000)\n", 145 | " lgbc.fit(dtrain_x,dtrain_y,eval_set = [(deval_x,deval_y)],eval_names = ['eval'],eval_metric = 'auc',\n", 146 | " early_stopping_rounds = 50,sample_weight = sample_weight,verbose = 100,categorical_feature = categoryfeas)\n", 147 | " pre_train[eval_index] = lgbc.predict_proba(deval_x,num_iteration = lgbc.best_iteration_)[:,1]\n", 148 | " pre_test.append(list(lgbc.predict_proba(test_x,num_iteration = lgbc.best_iteration_)[:,1]))\n", 149 | " if save_model:\n", 150 | " joblib.dump(lgbc, open(datapath+'data/lgb_'+str(params_lgbc['learning_rate'])+'_'+str(fold)+'.pkl', 'wb'))\n", 151 | " fold += 1\n", 152 | " if one:\n", 153 | " break\n", 154 | " pre_test = np.array(pre_test)\n", 155 | " pre_test = np.mean(pre_test,axis = 0)\n", 156 | " \n", 157 | " score = roc_auc_score(train_y,pre_train)\n", 158 | " feas = train_x.columns\n", 159 | " imps = lgbc.feature_importances_\n", 160 | " fea_imp = pd.DataFrame(pd.Series(feas),columns = ['feas'])\n", 161 | " fea_imp['imp'] = imps\n", 162 | " fea_imp = fea_imp.sort_values(by = 'imp',ascending = False)\n", 163 | " del dtrain_x\n", 164 | " del deval_x\n", 165 | " del dtrain_y\n", 166 | " del deval_y\n", 167 | " gc.collect()\n", 168 | " return pre_test,pre_train,score,fea_imp,lgbc.best_iteration_\n", 169 | "def xgb_train_pre1(train_x,train_y,test_x,dropfeas,one,save_model):\n", 170 | " train_x = train_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n", 171 | " test_x = test_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n", 172 | " pickle.dump(list(train_x.columns), open(datapath+'data/xgb_fea.pkl', 'wb'))\n", 173 | " params_lgbc ={ \n", 174 | " 'booster':'gbtree',\n", 175 | " 'learning_rate':0.1,\n", 176 | " 'n_estimators':50000,\n", 177 | " 'max_depth':6,\n", 178 | " 'min_child_weight':3,\n", 179 | " 'gamma':0.1,\n", 180 | " 'subsample':0.9,\n", 181 | " 'colsample_bytree':0.8,\n", 182 | " 'reg_alpha':0, \n", 183 | " 'reg_lambda':2,\n", 184 | " 'objective':'binary:logistic',\n", 185 | " 'nthread':cpu_count() - 1,\n", 186 | " 'scale_pos_weight':1,\n", 187 | " 'seed':7,\n", 188 | "# 'tree_method':'gpu_hist'\n", 189 | " }\n", 190 | " \n", 191 | " pre_train = pd.Series(np.zeros(len(train_y)))\n", 192 | " pre_test = []\n", 193 | " kf = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 2019)\n", 194 | " fold = 1\n", 195 | " for train_index,eval_index in kf.split(train_x,train_y):\n", 196 | " dtrain_x = train_x.loc[train_index,:]\n", 197 | " deval_x = train_x.loc[eval_index,:]\n", 198 | " dtrain_y = train_y[train_index]\n", 199 | " deval_y = train_y[eval_index]\n", 200 | " if flag_weight:\n", 201 | " sample_weight = ((dtrain_x['inviteday']-3500)/(dtrain_x['inviteday']-3500).mean()).values\n", 202 | " else:\n", 203 | " sample_weight = None\n", 204 | " xgbc = xgb.XGBClassifier(random_state = 2019,**params_lgbc)\n", 205 | " 
xgbc.fit(dtrain_x,dtrain_y,eval_set = [(deval_x,deval_y)],eval_metric = 'auc',\n", 206 | " early_stopping_rounds = 50,sample_weight = sample_weight,verbose = 100)\n", 207 | " pre_train[eval_index] = xgbc.predict_proba(deval_x, ntree_limit=xgbc.best_ntree_limit)[:,1]\n", 208 | " pre_test.append(list(xgbc.predict_proba(test_x, ntree_limit=xgbc.best_ntree_limit)[:,1]))\n", 209 | " if save_model:\n", 210 | " joblib.dump(xgbc, open(datapath+'data/xgb_'+str(params_lgbc['learning_rate'])+'_'+str(fold)+'.pkl', 'wb'))\n", 211 | " fold += 1\n", 212 | " if one:\n", 213 | " break\n", 214 | " pre_test = np.array(pre_test)\n", 215 | " pre_test = np.mean(pre_test,axis = 0)\n", 216 | " \n", 217 | " score = roc_auc_score(train_y,pre_train)\n", 218 | " feas = train_x.columns\n", 219 | " imps = xgbc.feature_importances_\n", 220 | " fea_imp = pd.DataFrame(pd.Series(feas),columns = ['feas'])\n", 221 | " fea_imp['imp'] = imps\n", 222 | " fea_imp = fea_imp.sort_values(by = 'imp',ascending = False)\n", 223 | " del dtrain_x\n", 224 | " del deval_x\n", 225 | " del dtrain_y\n", 226 | " del deval_y\n", 227 | " gc.collect()\n", 228 | " return pre_test,pre_train,score,fea_imp,xgbc.best_iteration\n", 229 | "def cat_train_pre1(train_x,train_y,test_x,categoryfeas,dropfeas,one,save_model):\n", 230 | " train_x = train_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n", 231 | " test_x = test_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n", 232 | " pickle.dump(list(train_x.columns), open(datapath+'data/cat_fea.pkl', 'wb'))\n", 233 | " params_lgbc ={ \n", 234 | " 'learning_rate':0.1,\n", 235 | " 'n_estimators':50000,\n", 236 | " 'max_depth':6,\n", 237 | "# 'subsample':0.9,\n", 238 | " 'l2_leaf_reg':2,\n", 239 | " 'objective':'Logloss',\n", 240 | " 'scale_pos_weight':1,\n", 241 | " 'eval_metric':'AUC',\n", 242 | " 'colsample_bylevel':0.8\n", 243 | " }\n", 244 | " \n", 245 | " pre_train = pd.Series(np.zeros(len(train_y)))\n", 246 | " pre_test = []\n", 247 | " kf = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 2019)\n", 248 | " fold = 1\n", 249 | " for train_index,eval_index in kf.split(train_x,train_y):\n", 250 | " dtrain_x = train_x.loc[train_index,:]\n", 251 | " deval_x = train_x.loc[eval_index,:]\n", 252 | " dtrain_y = train_y[train_index]\n", 253 | " deval_y = train_y[eval_index]\n", 254 | " catbc = catb.CatBoostClassifier(random_state = 2019,**params_lgbc)\n", 255 | " catbc.fit(dtrain_x,dtrain_y,eval_set = [(deval_x,deval_y)],cat_features = categoryfeas, # eval_metric = 'auc',\n", 256 | " early_stopping_rounds = 50,sample_weight = sample_weight,verbose = 100)\n", 257 | " pre_train[eval_index] = catbc.predict_proba(deval_x)[:,1]\n", 258 | " pre_test.append(list(catbc.predict_proba(test_x)[:,1]))\n", 259 | " if save_model:\n", 260 | " joblib.dump(catbc, open(datapath+'data/cat_'+str(params_lgbc['learning_rate'])+'_'+str(fold)+'.pkl', 'wb'))\n", 261 | " fold += 1\n", 262 | " if one:\n", 263 | " break\n", 264 | " pre_test = np.array(pre_test)\n", 265 | " pre_test = np.mean(pre_test,axis = 0)\n", 266 | " \n", 267 | " score = roc_auc_score(train_y,pre_train)\n", 268 | " feas = train_x.columns\n", 269 | " imps = catbc.get_feature_importance()\n", 270 | " fea_imp = pd.DataFrame(pd.Series(feas),columns = ['feas'])\n", 271 | " fea_imp['imp'] = imps\n", 272 | " fea_imp = fea_imp.sort_values(by = 'imp',ascending = False)\n", 273 | " del dtrain_x\n", 274 | " del deval_x\n", 275 | " del dtrain_y\n", 276 | " del deval_y\n", 277 | " gc.collect()\n", 278 | " return 
pre_test,pre_train,score,fea_imp,catbc.get_best_iteration()\n", 279 | "\n", 280 | "def parallelize_dataframe(df,func):\n", 281 | " df_split = np.array_split(df,20)#cpu_count()\n", 282 | " pool = Pool(20)#cpu_count()\n", 283 | " df = pd.concat(pool.map(func, df_split))\n", 284 | " pool.close()\n", 285 | " pool.join()\n", 286 | " return df\n", 287 | "\n", 288 | "def mostliketheme(x):\n", 289 | " if x == '-1':\n", 290 | " return '-1'\n", 291 | " for theme in iter(x.strip().split(',')):\n", 292 | " theme = theme.strip().split(':')\n", 293 | " try:\n", 294 | " if float(theme[1])>biggestlike:\n", 295 | " biggestlike = theme[1]\n", 296 | " mostliketheme = theme[0]\n", 297 | " except:\n", 298 | " biggestlike = theme[1]\n", 299 | " mostliketheme = theme[0]\n", 300 | " return mostliketheme\n", 301 | "\n", 302 | "def getweekday(data):\n", 303 | " return data%7" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "# .数据处理" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "## 一、预处理" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "### 词向量\n", 325 | "\n", 326 | "处理词向量文件" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": true 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "data_word = pd.read_csv(datapath+'data/word_vectors_64d.txt',sep = ' ',header = None,\n", 338 | " names = ['word_'+str(i) for i in range(64)])\n", 339 | "\n", 340 | "d = data_word['word_0'].apply(lambda x:x.split('\\t'))\n", 341 | "data_word['wordId'] = d.apply(lambda x:x[0])\n", 342 | "data_word['word_0'] = d.apply(lambda x:x[1]).astype(np.float32)\n", 343 | "data_word = memoryOptimization(data_word,np.float32)\n", 344 | "\n", 345 | "data_word.to_csv(datapath+'data/word_vector.csv',header = True,index = False)\n", 346 | "del data_word\n", 347 | "del d\n", 348 | "gc.collect()" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### 单字向量\n", 356 | "\n", 357 | "处理单字向量文件" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "data_letter = pd.read_csv(datapath+'data/single_word_vectors_64d.txt',sep = ' ',header = None,\n", 369 | " names = ['letter_'+str(i) for i in range(64)])\n", 370 | "\n", 371 | "d = data_letter['letter_0'].apply(lambda x:x.split('\\t'))\n", 372 | "data_letter['letterId'] = d.apply(lambda x:x[0])\n", 373 | "data_letter['letter_0'] = d.apply(lambda x:x[1]).astype(np.float32)\n", 374 | "data_letter = memoryOptimization(data_letter,np.float32)\n", 375 | "\n", 376 | "data_letter.to_csv(datapath+'data/letter_vector.csv',header = True,index = False)\n", 377 | "del data_letter\n", 378 | "del d\n", 379 | "gc.collect()" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "### 话题向量\n", 387 | "\n", 388 | "处理话题向量文件" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": { 395 | "collapsed": true 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "data_theme = pd.read_csv(datapath+'data/topic_vectors_64d.txt',sep = ' ',header = None,\n", 400 | " names = ['theme_'+str(i) for i in range(64)])\n", 401 | "\n", 402 | "d = data_theme['theme_0'].apply(lambda x:x.split('\\t'))\n", 403 | "data_theme['themeId'] = d.apply(lambda 
x:x[0])\n", 404 | "data_theme['theme_0'] = d.apply(lambda x:x[1]).astype(np.float32)\n", 405 | "data_theme = memoryOptimization(data_theme,np.float32)\n", 406 | "data_theme.to_csv(datapath+'data/theme_vector.csv',header = True,index = False)\n" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "利用pca将64维的话题向量进行压缩为22维,以进一步去除噪声" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": { 420 | "collapsed": true 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "normalizer = Normalizer(copy = False)\n", 425 | "data_pca = data_theme[['theme_'+str(i) for i in range(64)]]\n", 426 | "data_pca = normalizer.fit_transform(data_pca)\n", 427 | "n = int(64*0.35)\n", 428 | "svd = TruncatedSVD(n_components = n)\n", 429 | "data_pca = svd.fit_transform(data_pca)\n", 430 | "data_pca= pd.DataFrame(data_pca,columns = ['theme_'+str(i) for i in range(n)])\n", 431 | "data_pca['themeId'] = data_theme['themeId']\n", 432 | "\n", 433 | "data_pca.to_csv(datapath+'data/theme_vector_pca.csv',header = True,index = False)\n", 434 | "del data_theme\n", 435 | "del data_pca\n", 436 | "del d\n", 437 | "gc.collect()" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "### 回答记录文件\n", 445 | "\n", 446 | "处理回答记录文件,根据回答时间字段生成回答的天、时字段" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "columns = ['answerId','qId','writerId','answertime','content_letters','content_words','good_bool','recommend_bool','yuanzhuo_bool',\n", 458 | " 'picture_bool','vedio_bool','wordnum','likenum','cancellikenum','commentnum','collectnum','3qnum','jubaonum','unhelpnum',\n", 459 | " 'unlikenum']\n", 460 | "data_answer = pd.read_csv(datapath+'data/answer_info.txt',sep = '\\t',header = None,names = columns)\n", 461 | "data_answer['answerday'] = np.nan\n", 462 | "data_answer['answerhour'] = np.nan\n", 463 | "data_answer['answerday'] = data_answer['answertime'].apply(lambda x: int(x.split('-')[0][1:]))\n", 464 | "data_answer['answerhour'] = data_answer['answertime'].apply(lambda x: int(x.split('-')[1][1:]))\n", 465 | "data_answer = memoryOptimization(data_answer,np.float32)\n", 466 | "\n", 467 | "data_answer.to_csv(datapath+'data/data_answer.csv',header = True,index = False)\n", 468 | "del data_answer\n", 469 | "gc.collect()" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": {}, 475 | "source": [ 476 | "### 问题文件\n", 477 | "\n", 478 | "处理问题信息文件,根据问题创建时间字段生成问题创建的的天、时字段 " 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": { 485 | "collapsed": true 486 | }, 487 | "outputs": [], 488 | "source": [ 489 | "columns = ['qId','createtime','title_letters','title_words','describe_letters','describe_words','themeId']\n", 490 | "data_question = pd.read_csv(datapath+'data/question_info.txt',sep = '\\t',header = None,names = columns)\n", 491 | "\n", 492 | "data_question['createday'] = np.nan\n", 493 | "data_question['createhour'] = np.nan\n", 494 | "data_question['createday'] = data_question['createtime'].apply(lambda x: int(x.split('-')[0][1:]))\n", 495 | "data_question['createhour'] = data_question['createtime'].apply(lambda x: int(x.split('-')[0][1:]))\n", 496 | "data_question = memoryOptimization(data_question,np.float32)\n", 497 | "\n", 498 | "data_question.to_csv(datapath+'data/data_question.csv',header = True,index = 
False)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": {}, 504 | "source": [ 505 | "问题绑定的话题相当于问题的tag字段,直接做countvector成类似onehot的形式维度过大,所以这里采用利用给定话题64维向量pca降维压缩并聚合的方式表示问题tag,利用pca后得到的话题的22维embedding,得到问题关于话题的一个embdeding表示,具体方法是: \n", 506 | "1)将问题绑定的多个话题的对应22维em加和取平均 \n", 507 | "2)如果问题绑定的话题为缺失,即‘-1’,则对应的emb为nan \n", 508 | "代码如下:" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": { 515 | "collapsed": true 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "t2v_pca = pd.read_csv(datapath+'data/theme_vector_pca.csv')\n", 520 | "cols = list(t2v_pca.columns)\n", 521 | "cols.remove('themeId')\n", 522 | "cols = ['themeId']+cols\n", 523 | "t2v_pca = t2v_pca[cols]\n", 524 | "dic_t2v_pca = {}\n", 525 | "for row in iter(t2v_pca.values):\n", 526 | " dic_t2v_pca[row[0]] = row[1:]\n", 527 | "\n", 528 | "def get_data_themeembs(data):\n", 529 | " result = []\n", 530 | " for themes in iter(data['themeId'].values):\n", 531 | " if themes == '-1':\n", 532 | " result.append([])\n", 533 | " continue\n", 534 | " cur = np.zeros(int(64*0.35))\n", 535 | " themes = themes.split(',')\n", 536 | " for theme in iter(themes):\n", 537 | " cur = cur + dic_t2v_pca[theme]\n", 538 | " cur = cur/len(themes)\n", 539 | " result.append(list(cur))\n", 540 | " return pd.DataFrame(result,columns = ['themeId'+str(i) for i in range(int(64*0.35))])\n", 541 | "\n", 542 | "q_theme = parallelize_dataframe(data_question,get_data_themeembs)\n", 543 | "q_theme.to_csv(datapath+'data/question_theme.csv',header = True,index = False)\n", 544 | "\n", 545 | "del data_question\n", 546 | "del q_theme\n", 547 | "del t2w_pca\n", 548 | "gc.collect()" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "对问题title跟描述利用TFIDF进行过滤" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": { 562 | "collapsed": true 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "def gen_idf(corpus,outputfile):\n", 567 | " ignored = {'', ' ', '', '。', ':', ',', ')', '(', '!', '?', '”', '“'}\n", 568 | " id_freq = {}\n", 569 | " i = 0\n", 570 | " for doc in corpus:\n", 571 | " doc = set(x for x in doc if x not in ignored)\n", 572 | " for x in doc:\n", 573 | " id_freq[x] = id_freq.get(x, 0) + 1\n", 574 | " i += 1\n", 575 | " with open(outputfile, 'w', encoding='utf-8') as f:\n", 576 | " for key, value in id_freq.items():\n", 577 | " f.write(key + ' ' + str(math.log(i / value, 2)) + '\\n')\n", 578 | " \n", 579 | "class IDFLoader(object):\n", 580 | " def __init__(self, idf_path):\n", 581 | " self.idf_path = idf_path\n", 582 | " self.idf_freq = {} # idf\n", 583 | " self.mean_idf = 0.0 # 均值\n", 584 | " self.load_idf()\n", 585 | "\n", 586 | " def load_idf(self): # 从文件中载入idf\n", 587 | " cnt = 0\n", 588 | " with open(self.idf_path, 'r', encoding='utf-8') as f:\n", 589 | " for line in f:\n", 590 | " try:\n", 591 | " word, freq = line.strip().split(' ')\n", 592 | " cnt += 1\n", 593 | " except Exception as e:\n", 594 | " pass\n", 595 | " self.idf_freq[word] = float(freq)\n", 596 | "\n", 597 | " print('Vocabularies loaded: %d' % cnt)\n", 598 | " self.mean_idf = sum(self.idf_freq.values()) / cnt\n", 599 | " \n", 600 | "class TFIDF(object):\n", 601 | " def __init__(self, idf_path):\n", 602 | " self.idf_loader = IDFLoader(idf_path)\n", 603 | " self.idf_freq = self.idf_loader.idf_freq\n", 604 | " self.mean_idf = self.idf_loader.mean_idf\n", 605 | "\n", 606 | " def extract_sentence_keywords(self, 
sentence,filter_word=None,topK=None,all_tfidf = False): # 提取关键词\n", 607 | " # 过滤\n", 608 | " #seg_list = segment(sentence)\n", 609 | " seg_list = [x for x in sentence if len(x)>1]\n", 610 | " freq = {}\n", 611 | " for w in seg_list:\n", 612 | " freq[w] = freq.get(w, 0.0) + 1.0\n", 613 | " total = sum(freq.values())\n", 614 | "\n", 615 | " for k in freq: # 计算 TF-IDF\n", 616 | " freq[k] *= self.idf_freq.get(k, self.mean_idf) / total\n", 617 | "\n", 618 | "\n", 619 | " tags = sorted(freq, key=freq.__getitem__, reverse=True) # 排序\n", 620 | " if filter_word!=None:\n", 621 | " tags = [x for x in tags if x not in filter_word]\n", 622 | " if topK!=None:\n", 623 | " if all_tfidf:\n", 624 | " return tags[:topK],freq\n", 625 | " else:\n", 626 | " return tags[:topK]\n", 627 | " else:\n", 628 | " if all_tfidf:\n", 629 | " return tags,freq\n", 630 | " else:\n", 631 | " return tags\n", 632 | " \n", 633 | " def extract_corpus_keywords(self, corpus, filter_word=None,topK=None,all_tfidf = False): # 提取关键词\n", 634 | " # 过滤\n", 635 | " #seg_list = segment(sentence)\n", 636 | " all_tags = []\n", 637 | " all_freq = []\n", 638 | " for sentence in corpus:\n", 639 | " seg_list = [x for x in sentence if len(x)>1]\n", 640 | " freq = {}\n", 641 | " for w in seg_list:\n", 642 | " freq[w] = freq.get(w, 0.0) + 1.0\n", 643 | " total = sum(freq.values())\n", 644 | "\n", 645 | " for k in freq: # 计算 TF-IDF\n", 646 | " freq[k] *= self.idf_freq.get(k, self.mean_idf) / total\n", 647 | " if all_tfidf:\n", 648 | " all_freq.append(freq)\n", 649 | " tags = sorted(freq, key=freq.__getitem__, reverse=True) # 排序\n", 650 | " if filter_word!=None:\n", 651 | " tags = [x for x in tags if x not in filter_word]\n", 652 | " if topK!=None:\n", 653 | " all_tags.append(tags[:topK])\n", 654 | " else:\n", 655 | " all_tags.append(tags)\n", 656 | " if all_tfidf: \n", 657 | " return all_tags,all_freq\n", 658 | " else:\n", 659 | " return all_tags\n", 660 | "\n", 661 | "\n", 662 | "question_info = pd.read_csv(datapath+'data/question_info.txt', header=None, sep='\\t')\n", 663 | "question_info.columns = ['问题id','问题创建时间','问题标题单字编码','问题标题切词编码','问题描述单字编码','问题描述切词编码','问题绑定话题']\n", 664 | "question_info['len'] = question_info['问题标题切词编码'].apply(lambda x:len(x.split(',')))\n", 665 | "question_info['len'].max()\n", 666 | "\n", 667 | "def text(row):\n", 668 | " text = row.问题标题切词编码.split(',')\n", 669 | " if row.问题描述切词编码!=str(-1):\n", 670 | " text.extend(row.问题描述切词编码.split(','))\n", 671 | " return text\n", 672 | "question_info['text'] = question_info.apply(lambda row:text(row),axis=1)\n", 673 | "question_info['title'] = question_info['问题标题切词编码'].apply(lambda x:x.split(','))\n", 674 | "\n", 675 | "out_file = datapath+'data/问题标题.idf.txt'\n", 676 | "if not os.path.exists(out_file):\n", 677 | " corpus = question_info.title.values.tolist()\n", 678 | " gen_idf(corpus,out_file)\n", 679 | "tdidf = TFIDF(out_file)\n", 680 | "x = question_info.title.values.tolist()\n", 681 | "tags = tdidf.extract_corpus_keywords(x)\n", 682 | "question_info['tfidf_title'] = tags\n", 683 | "del x,tags\n", 684 | "gc.collect()\n", 685 | "\n", 686 | "out_file = datapath+'data/问题标题描述.idf.txt'\n", 687 | "if not os.path.exists(out_file):\n", 688 | " corpus = question_info.text.values.tolist()\n", 689 | " gen_idf(corpus,out_file)\n", 690 | "tdidf = TFIDF(out_file)\n", 691 | "x = question_info.text.values.tolist()\n", 692 | "tags = tdidf.extract_corpus_keywords(x)\n", 693 | "question_info['tfidf_text'] = tags\n", 694 | "demo_data = 
question_info[['问题id','len','tfidf_text','tfidf_title','问题描述切词编码']]\n", 695 | "\n", 696 | "def get_topk(row):\n", 697 | " ### title\n", 698 | "# if row.问题标题切词编码==str(-1):\n", 699 | "# return\n", 700 | " n = round(row.len/3)\n", 701 | " if n==0:\n", 702 | " n=1\n", 703 | " row['title_topk'] = row.tfidf_title[:n]\n", 704 | " if row.问题描述切词编码==str(-1):\n", 705 | " row['text_topk'] = row.tfidf_text[:n]\n", 706 | " else:\n", 707 | " if row.tfidf_title[0]=='-1':\n", 708 | " m = min(4,round(2*len(row.tfidf_text)/3))\n", 709 | " row['text_topk'] = row.tfidf_text[:m]\n", 710 | " else:\n", 711 | " row['text_topk'] = row.tfidf_text[:round(2*row.len/3)]\n", 712 | " return row\n", 713 | "\n", 714 | "\n", 715 | "def parallelize_dataframe(df, func):\n", 716 | " df_split = np.array_split(df, 16)\n", 717 | " pool = Pool(16)\n", 718 | " df = pd.concat(pool.map(func, df_split))\n", 719 | " pool.close()\n", 720 | " pool.join()\n", 721 | " return df\n", 722 | "def get_topk_all(df):\n", 723 | " df = df.apply(lambda row:get_topk(row),axis=1)\n", 724 | " return df\n", 725 | "\n", 726 | "demo_data = parallelize_dataframe(demo_data, get_topk_all)\n", 727 | "demo_data['问题标题切词编码']=question_info['问题标题切词编码']\n", 728 | "\n", 729 | "def clean_1(ls):\n", 730 | " if len(ls)==1 and ls[0]=='-1':\n", 731 | " return ls\n", 732 | " ls = [x for x in ls if x!='-1']\n", 733 | " return ls\n", 734 | "\n", 735 | "x = demo_data[[\"问题id\",'title_topk','text_topk']]\n", 736 | "x['text_topk'] = x['text_topk'].apply(lambda x:clean_1(x))\n", 737 | "x['title_topk'] = x['title_topk'].apply(lambda s:','.join(s))\n", 738 | "x['text_topk'] = x['text_topk'].apply(lambda s:','.join(s))\n", 739 | "x.to_csv(datapath+'data/data_q_title_tfidf.csv')" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "### 用户文件\n", 747 | "\n", 748 | "处理用户信息文件, \n", 749 | "drop掉keywords等没有意义的列; \n", 750 | "将类别特征中的‘unknown’替换为nan; \n", 751 | "构建用户最感兴趣的主题;" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": { 758 | "collapsed": true 759 | }, 760 | "outputs": [], 761 | "source": [ 762 | "columns = ['writerId','sex','keywords','publishrank','heatrank','registertype','platform','activity','bool_A','bool_B','bool_C',\n", 763 | " 'bool_D','bool_E','category_A','category_B','category_C','category_D','category_E','yanzhi','attentionthemes','likethemes']\n", 764 | "data_writer = pd.read_csv(datapath+'data/member_info.txt',sep = '\\t',names = columns)\n", 765 | "data_writer.drop(['keywords','publishrank', 'heatrank', 'registertype', 'platform'],axis = 1)\n", 766 | "data_writer[data_writer.select_dtypes(include = 'object').columns] = data_writer.select_dtypes(include = 'object').applymap(lambda x: float('nan') if x == 'unknown' else x)\n", 767 | "\n", 768 | "def mostliketheme(x):\n", 769 | " if x == '-1':\n", 770 | " return '-1'\n", 771 | " for theme in iter(x.strip().split(',')):\n", 772 | " theme = theme.strip().split(':')\n", 773 | " try:\n", 774 | " if float(theme[1])>biggestlike:\n", 775 | " biggestlike = theme[1]\n", 776 | " mostliketheme = theme[0]\n", 777 | " except:\n", 778 | " biggestlike = theme[1]\n", 779 | " mostliketheme = theme[0]\n", 780 | " return mostliketheme\n", 781 | "\n", 782 | "data_writer['mostliketheme'] = data_writer['likethemes'].apply(mostliketheme)\n", 783 | "data_writer = memoryOptimization(data_writer,np.float32)\n", 784 | "data_writer.to_csv(datapath+'data/data_writer.csv',header = True,index = False)" 785 | ] 786 | }, 787 | { 788 | "cell_type": 
"markdown", 789 | "metadata": {}, 790 | "source": [ 791 | "与问题文件类似的,利用话题的22维emb分别得到用户关于用户关注话题的emb以及关于用户感兴趣的话题的emb, \n", 792 | "其中感兴趣话题的聚合方式是加权平均,权重是用户对该话题感兴趣程度" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": null, 798 | "metadata": { 799 | "collapsed": true 800 | }, 801 | "outputs": [], 802 | "source": [ 803 | "t2v_pca = pd.read_csv(datapath+'data/theme_vector_pca.csv')\n", 804 | "cols = list(t2v_pca.columns)\n", 805 | "cols.remove('themeId')\n", 806 | "cols = ['themeId']+cols\n", 807 | "t2v_pca = t2v_pca[cols]\n", 808 | "dic_t2v_pca = {}\n", 809 | "for row in iter(t2v_pca.values):\n", 810 | " dic_t2v_pca[row[0]] = row[1:]\n", 811 | "\n", 812 | "def get_data_themeembs(data):\n", 813 | " result = []\n", 814 | " for themes in iter(data['attentionthemes'].values):\n", 815 | " if themes == '-1':\n", 816 | " result.append([])\n", 817 | " continue\n", 818 | " cur = np.zeros(int(64*0.35))\n", 819 | " themes = themes.split(',')\n", 820 | " for theme in iter(themes):\n", 821 | " cur = cur + dic_t2v_pca[theme]\n", 822 | " cur = cur/len(themes)\n", 823 | " result.append(list(cur))\n", 824 | " return pd.DataFrame(result,columns = ['attentionthemes'+str(i) for i in range(int(64*0.35))])\n", 825 | "\n", 826 | "writer_attentionthemes = parallelize_dataframe(data_writer,get_data_themeembs)\n", 827 | "writer_attentionthemes.to_csv(datapath+'data/writer_attentiontheme.csv',header = True,index = False)\n", 828 | "\n", 829 | "def get_data_themeembs_weight(data):\n", 830 | " def f(s,data_v,leix):\n", 831 | " result = np.zeros(22)\n", 832 | " if s == '-1':\n", 833 | " return result\n", 834 | " s = s.strip().split(',')\n", 835 | " for t in iter(s):\n", 836 | " t = t.strip().split(':')\n", 837 | " result = result+data_v.loc[data_v[leix+'Id'] == t[0],[leix+'_'+str(i) for i in range(22)]].values[0]*float(t[1])\n", 838 | " try:\n", 839 | " result = result/len(s)\n", 840 | " except:\n", 841 | " pass\n", 842 | " return result\n", 843 | " data_v = t2v_pca\n", 844 | " col = 'likethemes'\n", 845 | " leix = 'theme'\n", 846 | " return pd.DataFrame(list(data.apply(lambda x:f(x,data_v,leix))),columns = [col+str(i) for i in range(int(64*0.35))])\n", 847 | "\n", 848 | "writer_likethemes = parallelize_dataframe(data_writer['likethemes'],get_data_themeembs_weight)\n", 849 | "writer_likethemes.to_csv(datapath+'data/writer_liketheme.csv',header = True,index = False)\n", 850 | "\n", 851 | "del data_writer\n", 852 | "del writer_likethemes\n", 853 | "del writer_attentionthemes\n", 854 | "gc.collect()" 855 | ] 856 | }, 857 | { 858 | "cell_type": "markdown", 859 | "metadata": {}, 860 | "source": [ 861 | "### 训练集\n", 862 | "处理训练集,根据邀请时间字段生成邀请的天、时字段 " 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": null, 868 | "metadata": { 869 | "collapsed": true 870 | }, 871 | "outputs": [], 872 | "source": [ 873 | "data_invite = pd.read_csv(datapath+'data/invite_info.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime','label'])\n", 874 | "data_invite['inviteday'] = np.nan\n", 875 | "data_invite['invitehour'] = np.nan\n", 876 | "data_invite['inviteday'] = data_invite.invitetime.apply(lambda x: int(x.split('-')[0][1:]))\n", 877 | "data_invite['invitehour'] = data_invite.invitetime.apply(lambda x: int(x.split('-')[1][1:]))\n", 878 | "data_invite = memoryOptimization(data_invite,np.float32)" 879 | ] 880 | }, 881 | { 882 | "cell_type": "markdown", 883 | "metadata": {}, 884 | "source": [ 885 | "根据官方的说法一个问题只会对同一个用户邀请一次,训练集中存在2k个左右重复的样本,将这部分样本去重,保留邀请时间最早的样本" 886 | ] 887 | }, 888 | { 
889 | "cell_type": "code", 890 | "execution_count": null, 891 | "metadata": { 892 | "collapsed": true 893 | }, 894 | "outputs": [], 895 | "source": [ 896 | "data_invite['index'] = data_invite.index\n", 897 | "data_invite = data_invite.sort_values(by = ['inviteday','invitehour']).reset_index(drop = True).drop_duplicates(['qId','writerId'],keep = 'first')\n", 898 | "data_invite = data_invite.sort_values(by = 'index').reset_index(drop = True)\n", 899 | "del data_invite['index']" 900 | ] 901 | }, 902 | { 903 | "cell_type": "markdown", 904 | "metadata": {}, 905 | "source": [ 906 | "删除回答和邀请时间>10天的正样本" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": null, 912 | "metadata": { 913 | "collapsed": true 914 | }, 915 | "outputs": [], 916 | "source": [ 917 | "data_answer = pd.read_csv(datapath+'data/data_answer.csv')[['qId','writerId','answerday','answerhour']]\n", 918 | "data_invite = data_invite.merge(data_answer[['qId','writerId','answerday','answerhour']],on = ['writerId','qId'],how = 'left')\n", 919 | "data_invite['deltday'] = data_invite['answerday']-data_invite['inviteday']\n", 920 | "data_invite = data_invite[~(data_invite['deltday']>10)].reset_index(drop = True)\n", 921 | "del data_invite['answerday']\n", 922 | "del data_invite['answerhour']\n", 923 | "del data_invite['deltday']\n", 924 | "del data_answer\n", 925 | "data_invite.to_csv(datapath+'data/data_invite.csv',header = True,index = False)\n", 926 | "del data_invite\n", 927 | "gc.collect()" 928 | ] 929 | }, 930 | { 931 | "cell_type": "markdown", 932 | "metadata": {}, 933 | "source": [ 934 | "### 测试集1&2" 935 | ] 936 | }, 937 | { 938 | "cell_type": "markdown", 939 | "metadata": {}, 940 | "source": [ 941 | "处理测试集1和测试集2,根据邀请时间字段生成邀请的天、时字段" 942 | ] 943 | }, 944 | { 945 | "cell_type": "code", 946 | "execution_count": null, 947 | "metadata": { 948 | "collapsed": true 949 | }, 950 | "outputs": [], 951 | "source": [ 952 | "data_eval = pd.read_csv(datapath+'data/invite_info_evaluate_1.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime','label'])\n", 953 | "data_eval['inviteday'] = np.nan\n", 954 | "data_eval['invitehour'] = np.nan\n", 955 | "data_eval['inviteday'] = data_eval.invitetime.apply(lambda x: int(x.split('-')[0][1:]))\n", 956 | "data_eval['invitehour'] = data_eval.invitetime.apply(lambda x: int(x.split('-')[1][1:]))\n", 957 | "data_eval = memoryOptimization(data_eval,np.float64)\n", 958 | "data_eval.to_csv(datapath+'data/data_invite_eval.csv',header = True,index = False)\n", 959 | "del data_eval\n", 960 | "\n", 961 | "data_test = pd.read_csv(datapath+'data/invite_info_evaluate_2_0926.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime','label'])\n", 962 | "data_test['inviteday'] = np.nan\n", 963 | "data_test['invitehour'] = np.nan\n", 964 | "data_test['inviteday'] = data_test.invitetime.apply(lambda x: int(x.split('-')[0][1:]))\n", 965 | "data_test['invitehour'] = data_test.invitetime.apply(lambda x: int(x.split('-')[1][1:]))\n", 966 | "data_test = memoryOptimization(data_test,np.float64)\n", 967 | "data_test.to_csv(datapath+'data/data_invite_test.csv',header = True,index = False)\n", 968 | "del data_test" 969 | ] 970 | }, 971 | { 972 | "cell_type": "markdown", 973 | "metadata": {}, 974 | "source": [ 975 | "## 二、数据拼接 \n", 976 | "拼接train、test1、test2得到完整数据data,将用户信息和问题信息merge到data上" 977 | ] 978 | }, 979 | { 980 | "cell_type": "code", 981 | "execution_count": null, 982 | "metadata": { 983 | "collapsed": true 984 | }, 985 | "outputs": [], 986 | "source": [ 987 | "data_train = 
pd.read_csv(datapath+'data/data_invite.csv')#train\n", 988 | "data_test1 = pd.read_csv(datapath+'data/data_invite_eval.csv')#test1\n", 989 | "data_test2 = pd.read_csv(datapath+'data/data_invite_test.csv')#test2\n", 990 | "data_train['type'] = 'train'\n", 991 | "data_test1['type'] = 'test1'\n", 992 | "data_test2['type'] = 'test2'\n", 993 | "data = pd.concat([data_train,data_test1,data_test2],axis = 0,ignore_index = True)\n", 994 | "data['label'] = data['label'].fillna(-1)\n", 995 | "data = data.drop(['invitetime'],axis = 1)\n", 996 | "data = memoryOptimization(data,np.float32)\n", 997 | "del data_test1\n", 998 | "del data_test2\n", 999 | "del data_train\n", 1000 | "gc.collect()\n", 1001 | "\n", 1002 | "data_question = pd.read_csv(datapath+'data/data_question.csv')\n", 1003 | "question_theme = pd.read_csv(datapath+'data/question_theme.csv')\n", 1004 | "data_question = pd.concat([data_question,question_theme],axis = 1)\n", 1005 | "data_question = data_question.drop(['createtime','title_letters','title_words','describe_letters','describe_words'],axis = 1)\n", 1006 | "data_question = memoryOptimization(data_question,np.float32)\n", 1007 | "del question_theme\n", 1008 | "\n", 1009 | "data_writer = pd.read_csv(datapath+'data/data_writer.csv')\n", 1010 | "writer_attentiontheme = pd.read_csv(datapath+'data/writer_attentiontheme.csv')\n", 1011 | "data_writer = pd.concat([data_writer,writer_attentiontheme],axis = 1)\n", 1012 | "data_writer = memoryOptimization(data_writer,np.float32)\n", 1013 | "del writer_attentiontheme\n", 1014 | "gc.collect()\n", 1015 | "\n", 1016 | "data = pd.merge(data,data_question,how = 'left',on = 'qId')\n", 1017 | "data = pd.merge(data,data_writer,how = 'left',on = 'writerId')\n", 1018 | "data['inviteallhour'] = (data['inviteday']-3800)*24+data['invitehour']\n", 1019 | "data['inviteweekday'] = getweekday(data['inviteday'])\n", 1020 | "data['createweekday'] = getweekday(data['createday'])\n", 1021 | "del data_question\n", 1022 | "del data_writer\n", 1023 | "gc.collect()" 1024 | ] 1025 | }, 1026 | { 1027 | "cell_type": "markdown", 1028 | "metadata": {}, 1029 | "source": [ 1030 | "# .特征工程\n", 1031 | "\n", 1032 | "## 一、单一侧特征(用户侧、问题侧)" 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "markdown", 1037 | "metadata": {}, 1038 | "source": [ 1039 | "### 1、计数类特征" 1040 | ] 1041 | }, 1042 | { 1043 | "cell_type": "code", 1044 | "execution_count": null, 1045 | "metadata": { 1046 | "collapsed": true 1047 | }, 1048 | "outputs": [], 1049 | "source": [ 1050 | "df = data[[]]" 1051 | ] 1052 | }, 1053 | { 1054 | "cell_type": "markdown", 1055 | "metadata": {}, 1056 | "source": [ 1057 | "1)滑窗统计特征,对id及类别特征统计过去7天的邀请数,反应过去一周的邀请情况" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "code", 1062 | "execution_count": null, 1063 | "metadata": { 1064 | "collapsed": true 1065 | }, 1066 | "outputs": [], 1067 | "source": [ 1068 | "def get_crossfeas_inv(data,fea1,fea2):\n", 1069 | " dataf = data[[fea1,fea2]].copy()\n", 1070 | " bool_s = (~dataf[fea1].isna())&(~dataf[fea2].isna())\n", 1071 | " dataf['cross'] = np.nan\n", 1072 | " dataf.loc[bool_s,'cross'] = dataf.loc[bool_s,fea1].apply(str)+'_'+dataf.loc[bool_s,fea2].apply(str)\n", 1073 | " return dataf['cross'].values\n", 1074 | "\n", 1075 | "def lastndayinvite(dataf,n,feas,use_weight):\n", 1076 | " dicfea = feas[0]\n", 1077 | " if len(feas)>1:\n", 1078 | " fea = dicfea\n", 1079 | " for i in feas[1:]:\n", 1080 | " fea = fea+'_'+i\n", 1081 | " dataf[fea] = get_crossfeas_inv(dataf,feas[0],feas[1])\n", 1082 | " else:\n", 1083 | " fea = dicfea\n", 1084 | " \n", 1085 | 
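"    # The use_weight branch implements the reweighting described in the README: each day d gets\n",
"    # weight = mean over days of (invites per distinct id) / (that same ratio on day d), so days\n",
"    # with unusually many invites per id are scaled down before the 7-day counts are summed.\n",
"    # Example: day A has 100 invites over 50 ids (2.0 per id), day B has 300 over 50 (6.0 per id);\n",
"    # the mean ratio is 4.0, so the weights are 4.0/2.0 = 2.0 and 4.0/6.0 ≈ 0.67 respectively.\n",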
" if use_weight:\n", 1086 | " gps = dataf.groupby(['inviteday'])\n", 1087 | " dic = {}\n", 1088 | " for gp_id in iter(dataf['inviteday'].unique()):\n", 1089 | " gp = gps.get_group(gp_id)\n", 1090 | " dic[gp_id] = gp.shape[0]/gp[dicfea].nunique()\n", 1091 | " dic = pd.Series(dic)\n", 1092 | " dic = (dic.mean()/dic).round(3).to_dict()\n", 1093 | " \n", 1094 | " data_gps = dataf.groupby([fea,'inviteday']).size().astype(float).reset_index()\n", 1095 | " if use_weight:\n", 1096 | " values_0 = []\n", 1097 | " for row in iter(data_gps[['inviteday',0]].values):\n", 1098 | " values_0.append(dic[row[0]]*row[1])\n", 1099 | "\n", 1100 | " data_gps[0] = values_0\n", 1101 | " data_gps = data_gps.rename(columns = {0:'size'})\n", 1102 | " \n", 1103 | " result = dataf[[]]\n", 1104 | " result['val'] = np.nan\n", 1105 | " for day in iter(dataf['inviteday'].unique()):\n", 1106 | " result.loc[dataf['inviteday'] == day,'val'] = get_invite_count(data_gps[(data_gps['inviteday']<=day)&(data_gps['inviteday']>day-n)],dataf[dataf['inviteday'] == day],fea)\n", 1107 | " result.loc[(~dataf[fea].isna())&(result['val'].isna()),'val'] = 0\n", 1108 | " \n", 1109 | " if len(feas)>1:\n", 1110 | " del dataf[fea]\n", 1111 | " gc.collect()\n", 1112 | " \n", 1113 | " return result['val'].values\n", 1114 | "\n", 1115 | "def get_invite_count(df_train,df_test,fea):\n", 1116 | " df_train = df_train[[fea,'size']].groupby(fea).sum()['size'].reset_index()\n", 1117 | " new_fea_name = fea+'_count'\n", 1118 | " df_train.columns = [fea,new_fea_name]\n", 1119 | " df_test = df_test[[fea]].merge(df_train,on = fea,how = 'left')\n", 1120 | " \n", 1121 | " return df_test[new_fea_name].values\n", 1122 | "\n", 1123 | "#7天邀请数,代表用户被邀请的频率\n", 1124 | "n = 7\n", 1125 | "for fea in ['qId','writerId']+['category_C','sex','activity']:#'invitehour','createhour','activity','bool_A','bool_B','bool_C','bool_D','bool_E','category_A','yanzhi'\n", 1126 | " df[fea+'_last%sday_count' %n] = lastndayinvite(data,n,[fea],use_weight = False)\n", 1127 | " print(fea+' is ok')" 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "markdown", 1132 | "metadata": {}, 1133 | "source": [ 1134 | "对盐值等频分箱后作为类别变量处理,也可以等间距分箱、卡方分箱等,或者直接取整后作为类别变量; \n", 1135 | "另外invitehour等时间类数值特征也可以分箱,这里只考虑了盐值" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "execution_count": 8, 1141 | "metadata": { 1142 | "collapsed": true 1143 | }, 1144 | "outputs": [], 1145 | "source": [ 1146 | "def split_data(dataf,col,split_num):\n", 1147 | "# dataf = dataf.copy()\n", 1148 | " count = dataf.shape[0]\n", 1149 | " n = math.ceil(count/split_num)\n", 1150 | " split_index = [i*n for i in range(1,split_num)]\n", 1151 | " values = sorted(list(dataf[col]))\n", 1152 | " split_point = [values[i] for i in split_index]\n", 1153 | " split_point = sorted(list(set(split_point)))\n", 1154 | " return split_point\n", 1155 | "\n", 1156 | "def get_group(x,split_bin):\n", 1157 | " n = len(split_bin)\n", 1158 | " if x <= min(split_bin):\n", 1159 | " return min(split_bin)\n", 1160 | " elif x> max(split_bin):\n", 1161 | " return max(split_bin)+max(split_bin)/n\n", 1162 | " else:\n", 1163 | " for i in range(n-1):\n", 1164 | " if split_bin[i] < x <= split_bin[i+1]:\n", 1165 | " return split_bin[i+1]\n", 1166 | "\n", 1167 | "points = split_data(data,'yanzhi',split_num = 10)\n", 1168 | "data['yanzhi_d'] = data['yanzhi'].apply(lambda x:get_group(x,points)).apply(int)\n", 1169 | "df['yanzhi_d_last%sday_count' %n] = lastndayinvite(data,n,['yanzhi_d'],use_weight = False)\n", 1170 | "print('yanzhi_d is ok')\n", 1171 | "del 
data['yanzhi_d']\n", 1172 | "gc.collect()" 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "markdown", 1177 | "metadata": {}, 1178 | "source": [ 1179 | "2)滑窗统计特征,过去三天每天分别的邀请量计数统计,反应的是对应用户或问题近期的邀请情况" 1180 | ] 1181 | }, 1182 | { 1183 | "cell_type": "code", 1184 | "execution_count": null, 1185 | "metadata": { 1186 | "collapsed": true 1187 | }, 1188 | "outputs": [], 1189 | "source": [ 1190 | "def get_last3dayinvitenum(dataf,df,fea,use_weight):\n", 1191 | " if use_weight:\n", 1192 | " gps = dataf.groupby(['inviteday'])\n", 1193 | " dic = {}\n", 1194 | " for gp_id in iter(dataf['inviteday'].unique()):\n", 1195 | " gp = gps.get_group(gp_id)\n", 1196 | " dic[gp_id] = gp.shape[0]/gp[fea].nunique()\n", 1197 | " dic = pd.Series(dic)\n", 1198 | " dic = (dic.mean()/dic).round(3).to_dict()\n", 1199 | " \n", 1200 | " data_gps = dataf.groupby(['inviteday',fea]).size().astype(float)\n", 1201 | " if use_weight:\n", 1202 | " for day in iter(dataf['inviteday'].unique()):#range(t0_eval,t1_eval+1):\n", 1203 | " data_gps[day] = data_gps[day]*dic[day]\n", 1204 | " data_gps = data_gps.reset_index().rename(columns = {0:'size'})\n", 1205 | " \n", 1206 | " for i in [fea+'_last3invnum'+str(i) for i in range(3)]+[fea+'_curdayinvnum']:\n", 1207 | " df[i] = np.nan\n", 1208 | " dic_result = {}\n", 1209 | " pool = Pool(10)\n", 1210 | " for day in iter(dataf['inviteday'].unique()):\n", 1211 | " dic_result[day] = pool.apply_async(func = get_last3invnum,args = (data_gps[(data_gps['inviteday']<=day)&(data_gps['inviteday']>=day-3)],dataf[dataf['inviteday'] == day],fea,day,))\n", 1212 | " pool.close()\n", 1213 | " pool.join()\n", 1214 | " \n", 1215 | " for day in iter(dataf['inviteday'].unique()):\n", 1216 | " df.loc[dataf['inviteday'] == day,[fea+'_last3invnum'+str(i) for i in range(3)]+[fea+'_curdayinvnum']] = dic_result[day].get()\n", 1217 | " \n", 1218 | " return df\n", 1219 | "\n", 1220 | "def get_last3invnum(data_train,data_test,fea,day):\n", 1221 | " data_train = data_train[data_train[fea].isin(data_test[fea].unique())]\n", 1222 | " gps = data_train.groupby(fea)\n", 1223 | " dic_fea = {}\n", 1224 | " daylist = [day-3,day-2,day-1,day]\n", 1225 | " for val in iter(data_train.reset_index()[fea].unique()):\n", 1226 | " gp = gps.get_group(val)\n", 1227 | " dic_val = gp['size']\n", 1228 | " dic_val.index = gp['inviteday'].values\n", 1229 | " dic_fea[val] = []\n", 1230 | " for day in iter(daylist):\n", 1231 | " try:\n", 1232 | " dic_fea[val].append(dic_val[day])\n", 1233 | " except:\n", 1234 | " dic_fea[val].append(0)\n", 1235 | " \n", 1236 | " dic_fea = pd.DataFrame(dic_fea).T.reset_index()\n", 1237 | " dic_fea.columns = [fea]+[fea+'_last3invnum'+str(i) for i in range(3)]+[fea+'_curdayinvnum']\n", 1238 | " data_test = data_test.merge(dic_fea,on = fea,how = 'left')\n", 1239 | " return data_test[[fea+'_last3invnum'+str(i) for i in range(3)]+[fea+'_curdayinvnum']].values\n", 1240 | "\n", 1241 | "for fea in ['qId','writerId']:#,\n", 1242 | " df = get_last3dayinvitenum(data,df,fea,use_weight = False)\n", 1243 | " print(fea+' is ok')\n", 1244 | "\n", 1245 | "df['qId_last3+1invnum_mean'] = df[['qId_last3invnum0','qId_last3invnum1','qId_last3invnum2','qId_curdayinvnum']].mean(axis = 1)\n", 1246 | "df['qId_last3+1invnum_std'] = df[['qId_last3invnum0','qId_last3invnum1','qId_last3invnum2','qId_curdayinvnum']].std(axis = 1)\n", 1247 | "df['writerId_last3+1invnum_mean'] = df[['writerId_last3invnum0','writerId_last3invnum1','writerId_last3invnum2','writerId_curdayinvnum']].mean(axis = 1)\n", 1248 | "df['writerId_last3+1invnum_std'] = 
df[['writerId_last3invnum0','writerId_last3invnum1','writerId_last3invnum2','writerId_curdayinvnum']].std(axis = 1)\n", 1249 | "df = df.drop(['qId_curdayinvnum','writerId_curdayinvnum'],axis = 1)\n", 1250 | "gc.collect()" 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "markdown", 1255 | "metadata": {}, 1256 | "source": [ 1257 | "3)当天的邀请数计数特征,当天的邀请数量" 1258 | ] 1259 | }, 1260 | { 1261 | "cell_type": "code", 1262 | "execution_count": null, 1263 | "metadata": { 1264 | "collapsed": true 1265 | }, 1266 | "outputs": [], 1267 | "source": [ 1268 | "def get_curdayinvitenum(dataf,fea,use_weight):\n", 1269 | " if use_weight:\n", 1270 | " gps = dataf.groupby(['inviteday'])\n", 1271 | " dic = {}\n", 1272 | " for gp_id in iter(dataf['inviteday'].unique()):\n", 1273 | " gp = gps.get_group(gp_id)\n", 1274 | " dic[gp_id] = gp.shape[0]/gp[fea].nunique()\n", 1275 | " dic = pd.Series(dic)\n", 1276 | " dic = (dic.mean()/dic).round(3).to_dict()\n", 1277 | " \n", 1278 | " data_gps = dataf.groupby(['inviteday',fea]).size().astype('float')\n", 1279 | " if use_weight:\n", 1280 | " for day in iter(dataf['inviteday'].unique()):#range(t0_eval,t1_eval+1):\n", 1281 | " data_gps[day] = data_gps[day]*dic[day]\n", 1282 | " data_gps = data_gps.reset_index().rename(columns = {0:'size'})\n", 1283 | " \n", 1284 | " result = data.merge(data_gps,on = [fea,'inviteday'],how = 'left')['size'].values\n", 1285 | " \n", 1286 | " return result\n", 1287 | "\n", 1288 | "for fea in ['qId','writerId','invitehour','createhour','createday','createweekday',\n", 1289 | " 'sex','activity','bool_D','category_C','category_E','yanzhi','mostliketheme']:\n", 1290 | " df[fea+'_curdayinv_count'] = get_curdayinvitenum(data,fea,use_weight = False)\n", 1291 | " print(fea+' is ok')" 1292 | ] 1293 | }, 1294 | { 1295 | "cell_type": "markdown", 1296 | "metadata": {}, 1297 | "source": [ 1298 | "4)全局计数特征:对类别特征,包括id,统计整个数据集的邀请数;虽然已经构造了历史计数特征,但全局的统计仍然有意义" 1299 | ] 1300 | }, 1301 | { 1302 | "cell_type": "code", 1303 | "execution_count": null, 1304 | "metadata": { 1305 | "collapsed": true 1306 | }, 1307 | "outputs": [], 1308 | "source": [ 1309 | "def get_alldata_count(data, fea, new_column_name,use_weight):#构造类别特征数量统计特征\n", 1310 | " if use_weight:\n", 1311 | " gps = data.groupby('inviteday')\n", 1312 | " dic = {}\n", 1313 | " for gp_id in iter(data['inviteday'].unique()):\n", 1314 | " gp = gps.get_group(gp_id)\n", 1315 | " dic[gp_id] = gp.shape[0]/gp[fea].nunique()\n", 1316 | " dic = pd.Series(dic)\n", 1317 | " dic = (dic.mean()/dic).round(3).to_dict()\n", 1318 | " \n", 1319 | " dataf = data[[fea,'inviteday']].groupby([fea,'inviteday']).size().astype(float).reset_index()\n", 1320 | " if use_weight:\n", 1321 | " values_0 = []\n", 1322 | " for row in iter(dataf[['inviteday',0]].values):\n", 1323 | " values_0.append(dic[row[0]]*row[1])\n", 1324 | " dataf[0] = values_0\n", 1325 | " dataf = dataf[[fea,0]].groupby(fea).sum()[0].reset_index()\n", 1326 | " \n", 1327 | " dataf = dataf.rename(columns = {0:new_column_name})\n", 1328 | " dataf = data.merge(dataf, on = fea, how = \"left\") \n", 1329 | " return dataf[new_column_name]\n", 1330 | "\n", 1331 | "for fea in ['qId','createday','createhour','invitehour','writerId','yanzhi','mostliketheme']:\n", 1332 | " df['%s_count' % fea] = get_alldata_count(data,fea,'%s_count' % fea,use_weight = False)\n", 1333 | " print(fea+' is ok')" 1334 | ] 1335 | }, 1336 | { 1337 | "cell_type": "markdown", 1338 | "metadata": {}, 1339 | "source": [ 1340 | "5)用户或问题id的历史统计特征关于问题或用户id的平均值 \n", 1341 | "实际该部分属于用户跟问题的交叉,而不是单一侧" 1342 | ] 1343 | }, 
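The cell that follows relies on grouping one Series by another, index-aligned Series (`df[...].groupby(data['qId']).transform(np.mean)`), which broadcasts each group's mean back onto every row. A minimal sketch of that pattern on toy, made-up data:

```python
import numpy as np
import pandas as pd

# Index-aligned as in the notebook: `data` holds the ids, `df` holds the already-built count features.
data = pd.DataFrame({'qId': ['q1', 'q1', 'q2']})
df   = pd.DataFrame({'writerId_last7day_count': [10.0, 30.0, 5.0]})

# For every row, the mean of writerId_last7day_count over all rows that share the same qId.
df['writerId_last7count_gp_qId'] = (
    df['writerId_last7day_count'].groupby(data['qId']).transform(np.mean)
)
print(df)   # rows 0 and 1 (both q1) get 20.0, row 2 (q2) keeps 5.0
```

Because the grouping key comes from `data` while the values come from `df`, the two frames must share the same index; that holds in the notebook since `df` was created as `data[[]]` and both stay aligned with the concatenated train/test data.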
1344 | { 1345 | "cell_type": "code", 1346 | "execution_count": null, 1347 | "metadata": { 1348 | "collapsed": true 1349 | }, 1350 | "outputs": [], 1351 | "source": [ 1352 | "#过去7天的统计聚合\n", 1353 | "df['writerId_last7count_gp_qId'] = df['writerId_last7day_count'].groupby(data['qId']).transform(np.mean)\n", 1354 | "df['qId_last7count_gp_writerId'] = df['qId_last7day_count'].groupby(data['writerId']).transform(np.mean)\n", 1355 | "\n", 1356 | "#全局的统计集合\n", 1357 | "df['writerId_count_gp_qId'] = df['writerId_count'].groupby(data['qId']).transform(np.mean)\n", 1358 | "df['qId_count_gp_writerId'] = df['qId_count'].groupby(data['writerId']).transform(np.mean)\n", 1359 | "\n", 1360 | "#当天的统计聚合\n", 1361 | "df['writerId_curcount_gp_qId'] = df['writerId_curdayinv_count'].groupby(data['qId']).transform(np.mean)\n", 1362 | "df['qId_curcount_gp_writerId'] = df['qId_curdayinv_count'].groupby(data['writerId']).transform(np.mean)" 1363 | ] 1364 | }, 1365 | { 1366 | "cell_type": "code", 1367 | "execution_count": null, 1368 | "metadata": { 1369 | "collapsed": true 1370 | }, 1371 | "outputs": [], 1372 | "source": [ 1373 | "df = memoryOptimization(df,np.float32)\n", 1374 | "with open(datapath+'df/df_1.1.pkl','wb') as f:\n", 1375 | " pickle.dump(df,f)\n", 1376 | "del df" 1377 | ] 1378 | }, 1379 | { 1380 | "cell_type": "markdown", 1381 | "metadata": {}, 1382 | "source": [ 1383 | "### 2、目标编码特征\n", 1384 | " 因为回答记录文件的记录时长是两个月,所以该部分利用回答记录文件统计历史的回答计数,利用训练集的标签统计历史点击率特征; \n", 1385 | " 构造该部分特征时注意数据时间穿越(泄露)问题;" 1386 | ] 1387 | }, 1388 | { 1389 | "cell_type": "code", 1390 | "execution_count": null, 1391 | "metadata": { 1392 | "collapsed": true 1393 | }, 1394 | "outputs": [], 1395 | "source": [ 1396 | "df = data[[]]" 1397 | ] 1398 | }, 1399 | { 1400 | "cell_type": "code", 1401 | "execution_count": null, 1402 | "metadata": { 1403 | "collapsed": true 1404 | }, 1405 | "outputs": [], 1406 | "source": [ 1407 | "data_answer = pd.read_csv(datapath+'data/data_answer.csv')\n", 1408 | "# data_answer = data_answer[['qId','writerId','answerday','answerhour']]\n", 1409 | "data_question = pd.read_csv(datapath+'data/data_question.csv')\n", 1410 | "data_writer = pd.read_csv(datapath+'data/data_writer.csv')\n", 1411 | "data_answer = pd.merge(data_answer,data_question,on = 'qId',how = 'left')\n", 1412 | "data_answer = pd.merge(data_answer,data_writer,on = 'writerId',how = 'left')\n", 1413 | "\n", 1414 | "data_answer = data_answer.rename(columns = {'answerhour':'invitehour','answerday':'inviteday'})\n", 1415 | "#为了便于构造特征,将问题回答记录中的回答时间名称改为邀请时间\n", 1416 | "data_answer['inviteweekday'] = getweekday(data_answer['inviteday'])\n", 1417 | "data_answer['createweekday'] = getweekday(data_answer['createday'])\n", 1418 | "data_answer['label'] = 1\n", 1419 | "del data_writer\n", 1420 | "del data_question\n", 1421 | "gc.collect()" 1422 | ] 1423 | }, 1424 | { 1425 | "cell_type": "markdown", 1426 | "metadata": {}, 1427 | "source": [ 1428 | "1)历史第七天用户和问题的目标编码特征,滑窗统计特征,包括回答数量以及邀请的接受率,id类特征构造的特征结果特别稀疏" 1429 | ] 1430 | }, 1431 | { 1432 | "cell_type": "code", 1433 | "execution_count": 12, 1434 | "metadata": { 1435 | "collapsed": true 1436 | }, 1437 | "outputs": [], 1438 | "source": [ 1439 | "def slidewindow(dataf,data_dic,dayfea,func):\n", 1440 | " dataf = data.copy()\n", 1441 | " dataf['day_lastweek'] = dataf[dayfea[0]]-7\n", 1442 | " dic = data_dic.groupby(dayfea)['label'].agg(func).reset_index()\n", 1443 | " dic = dic.rename(columns = {dayfea[0]:'day_lastweek','label':'count'})\n", 1444 | " dataf = pd.merge(dataf,dic,on = ['day_lastweek']+dayfea[1:],how = 
'left')\n", 1445 | " dataf.loc[(~dataf[datafea[1]].isna())&(dataf['count'].isna()),'count'] = 0\n", 1446 | " return dataf['count'].values\n", 1447 | "\n", 1448 | "for fea in ['qId','writerId']+['activity','category_A','category_D']:\n", 1449 | " for dayfea in ['inviteday','createday']:\n", 1450 | " str_week = dayfea.split('day')[0]+'weekday'\n", 1451 | " df['%s_%slastweek2label_count' % (fea,dayfea)] = slidewindow(data,data_answer,[dayfea,fea],func = np.sum)\n", 1452 | " df['%s_%slastweek2label_rate' % (fea,dayfea)] = slidewindow(data,data[data['label'] != -1],[dayfea,fea],func = np.mean)" 1453 | ] 1454 | }, 1455 | { 1456 | "cell_type": "markdown", 1457 | "metadata": {}, 1458 | "source": [ 1459 | "2)历史目标编码特征,滑窗统计特征,labelcount选取时间窗口为3周,labelctr选取为整个历史训练集" 1460 | ] 1461 | }, 1462 | { 1463 | "cell_type": "code", 1464 | "execution_count": null, 1465 | "metadata": { 1466 | "collapsed": true 1467 | }, 1468 | "outputs": [], 1469 | "source": [ 1470 | "def get_crossfeas_tar(data,data_asw,fea1,fea2):\n", 1471 | " dataf = pd.concat([data[[fea1,fea2]],data_asw[[fea1,fea2]]],axis = 0).reset_index(drop = True)\n", 1472 | " bool_s = (~dataf[fea1].isna())&(~dataf[fea2].isna())\n", 1473 | " dataf['cross'] = np.nan\n", 1474 | " dataf.loc[bool_s,'cross'] = dataf.loc[bool_s,fea1].apply(str)+'_'+dataf.loc[bool_s,fea2].apply(str)\n", 1475 | " cross = dataf['cross'].values\n", 1476 | " c1 = cross[:data.shape[0]]\n", 1477 | " c2 = cross[data.shape[0]:]\n", 1478 | " return c1,c2\n", 1479 | "\n", 1480 | "def targetencoder(dataf,data_asw,df,feas):\n", 1481 | " dicfea = feas[0]\n", 1482 | " if len(feas)>1:\n", 1483 | " fea = dicfea\n", 1484 | " for i in feas[1:]:\n", 1485 | " fea = fea+'_'+i\n", 1486 | " dataf[fea],data_asw[fea] = get_crossfeas_tar(dataf,data_asw,feas[0],feas[1])\n", 1487 | " else:\n", 1488 | " fea = dicfea\n", 1489 | "\n", 1490 | "# dataf = dataf.copy()\n", 1491 | " gps_asw = data_asw.groupby('inviteday')\n", 1492 | " dic_asw = {}\n", 1493 | " for gp_id in iter(data_asw['inviteday'].unique()):\n", 1494 | " gp = gps_asw.get_group(gp_id)\n", 1495 | " dic_asw[gp_id] = gp.shape[0]/gp[dicfea].nunique()\n", 1496 | " dic_asw = pd.Series(dic_asw)\n", 1497 | " dic_asw = (dic_asw.mean()/dic_asw).round(3).to_dict()\n", 1498 | "\n", 1499 | " data_asw = data_asw[[fea,'inviteday']].groupby([fea,'inviteday']).size().reset_index()\n", 1500 | " values_0 = []\n", 1501 | " for row in iter(data_asw[['inviteday',0]].values):\n", 1502 | " values_0.append(dic_asw[row[0]]*row[1])\n", 1503 | " data_asw[0] = values_0\n", 1504 | " data_asw = data_asw.rename(columns = {0:'size'})\n", 1505 | " \n", 1506 | " gps_inv = dataf.groupby('inviteday')\n", 1507 | " dic_inv = {}\n", 1508 | " for gp_id in iter(dataf['inviteday'].unique()):\n", 1509 | " gp = gps_inv.get_group(gp_id)\n", 1510 | " dic_inv[gp_id] = gp.shape[0]/gp[dicfea].nunique()\n", 1511 | " dic_inv = pd.Series(dic_inv)\n", 1512 | " dic_inv = (dic_inv.mean()/dic_inv).round(3).to_dict()\n", 1513 | " \n", 1514 | " data_gps = dataf[[fea,'inviteday','label']].groupby([fea,\n", 1515 | " 'inviteday']).agg(['sum','count'])['label'].reset_index()\n", 1516 | " values_sum = []\n", 1517 | " values_count = []\n", 1518 | " for row in iter(data_gps[['inviteday','sum','count']].values):\n", 1519 | " values_sum.append(dic_inv[row[0]]*row[1])\n", 1520 | " values_count.append(dic_inv[row[0]]*row[2])\n", 1521 | " data_gps['sum'] = values_sum\n", 1522 | " data_gps['count'] = values_count\n", 1523 | " \n", 1524 | " dataf['timegroup'] = (dataf['inviteday']/3).apply(int)\n", 1525 | " 
data_gps['timegroup'] = (data_gps['inviteday']/3).apply(int)\n", 1526 | " data_train = dataf[dataf['label'] != -1]\n", 1527 | " data_test = dataf[dataf['label'] == -1] \n", 1528 | " \n", 1529 | " df[fea+'_label_count'] = np.nan#np.zeros(dataf.shape[0])\n", 1530 | " daylen_asw = 21\n", 1531 | " for day in iter(data_train['inviteday'].unique()):\n", 1532 | " df.loc[dataf['inviteday'] == day,fea+'_label_count'] = get_label_count(data_asw[(data_asw['inviteday']=day-daylen_asw)],dataf[dataf['inviteday'] == day],fea)\n", 1533 | " df.loc[data_test.index,fea+'_label_count'] = get_label_count(data_asw[(data_asw['inviteday']=t0_eval-daylen_asw)],data_test,fea)\n", 1534 | " df.loc[(~dataf[fea].isna())&(df[fea+'_label_count'].isna()),fea+'_label_count'] = 0\n", 1535 | " \n", 1536 | " df[fea+'_label_ctr'] = np.nan#np.zeros(dataf.shape[0])\n", 1537 | " for gp in iter(data_train['timegroup'].unique()):\n", 1538 | " df.loc[(dataf['timegroup'] == gp)&(dataf['label'] != -1),fea+'_label_ctr'] = get_label_ctr(data_gps[data_gps['timegroup'] < gp],data_train[data_train['timegroup'] == gp],fea)\n", 1539 | " df.loc[data_test.index,fea+'_label_ctr'] = get_label_ctr(data_gps[data_gps['inviteday']1:\n", 1543 | " del dataf[fea]\n", 1544 | " del data_asw[fea]\n", 1545 | " del dataf['timegroup']\n", 1546 | " del data_train\n", 1547 | " del data_test\n", 1548 | " gc.collect()\n", 1549 | " \n", 1550 | " return df\n", 1551 | "\n", 1552 | "def get_label_count(df_train,df_test,fea):\n", 1553 | " df_train = df_train[[fea,'size']].groupby(fea).sum()['size'].reset_index()\n", 1554 | " new_fea_name = fea+'_label_count'\n", 1555 | " df_train = df_train[[fea,'size']].rename(columns = {'size':new_fea_name})\n", 1556 | " df_test = df_test.merge(df_train,on = fea,how = 'left')\n", 1557 | " \n", 1558 | " return df_test[new_fea_name].values\n", 1559 | "\n", 1560 | "def get_label_ctr(df_train,df_test,fea):\n", 1561 | " df_train = df_train[[fea,'sum','count']].groupby(fea).sum()[['sum','count']].reset_index()\n", 1562 | " new_fea_name = fea+'_label_ctr'\n", 1563 | " df_train[new_fea_name] = (df_train['sum'] + 1) / (df_train['count'] + 1)\n", 1564 | " df_train = df_train[[fea,new_fea_name]]\n", 1565 | " df_test = df_test.merge(df_train,on = fea,how = 'left')\n", 1566 | " \n", 1567 | " return df_test[new_fea_name].values " 1568 | ] 1569 | }, 1570 | { 1571 | "cell_type": "code", 1572 | "execution_count": null, 1573 | "metadata": { 1574 | "collapsed": true 1575 | }, 1576 | "outputs": [], 1577 | "source": [ 1578 | "for fea in ['qId','createhour','invitehour','mostliketheme','writerId','yanzhi','category_C','category_D','activity']:\n", 1579 | " df = targetencoder(data,data_answer,df,[fea])\n", 1580 | " print(fea+' is ok')" 1581 | ] 1582 | }, 1583 | { 1584 | "cell_type": "markdown", 1585 | "metadata": {}, 1586 | "source": [ 1587 | "3)多标签类别特征的目标编码 \n", 1588 | "对多个tag的统计结果进行排序,取topk个作为k列特征,这样处理的目的在于过滤标签,许多标签是冗余的" 1589 | ] 1590 | }, 1591 | { 1592 | "cell_type": "code", 1593 | "execution_count": null, 1594 | "metadata": { 1595 | "collapsed": true 1596 | }, 1597 | "outputs": [], 1598 | "source": [ 1599 | "def targetencoder_multi(dataf,data_asw,df,fea):\n", 1600 | " dataf['timegroup'] = (dataf['inviteday']/3).apply(int)\n", 1601 | " data_train = dataf[dataf['label'] != -1]\n", 1602 | " data_test = dataf[dataf['label'] == -1]\n", 1603 | " daylen_asw = 21\n", 1604 | " for i in range(3):\n", 1605 | " df[fea+'_label_count%s' %i] = np.nan#np.zeros(data.shape[0])\n", 1606 | " for day in iter(data_train['inviteday'].unique()):\n", 1607 | " 
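# For multi-valued tag columns (comma-separated theme/word ids) the helper below
# counts every tag over the previous daylen_asw (21) days of the answer log, then,
# per sample, sorts the counts of that sample's own tags in descending order and
# keeps the top 3 as three separate columns; tags never seen in the window are
# simply skipped, which filters out the many redundant tags.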
df.loc[dataf['inviteday'] == day,[fea+'_label_count'+str(i) for i in range(3)]] = get_multi_label_count(data_asw[(data_asw['inviteday']=day-daylen_asw)],dataf[dataf['inviteday'] == day],fea)#\n", 1608 | " df.loc[data_test.index,[fea+'_label_count'+str(i) for i in range(3)]] = get_multi_label_count(data_asw[(data_asw['inviteday']=t0_eval-daylen_asw)],data_test,fea)#\n", 1609 | "\n", 1610 | " for i in range(3):\n", 1611 | " df[fea+'_label_ctr%s' %i] = np.nan#np.zeros(data.shape[0])\n", 1612 | " for gp in iter(data_train['timegroup'].unique()):\n", 1613 | " try:\n", 1614 | " df.loc[(dataf['timegroup'] == gp)&(dataf['label'] != -1),[fea+'_label_ctr'+str(i) for i in range(3)]] = get_multi_label_ctr(data_train[data_train['timegroup'] < gp],data_train[data_train['timegroup'] == gp],fea)\n", 1615 | " except:\n", 1616 | " pass\n", 1617 | " df.loc[data_test.index,[fea+'_label_ctr'+str(i) for i in range(3)]] = get_multi_label_ctr(data_train,data_test,fea)\n", 1618 | " del dataf['timegroup']\n", 1619 | " \n", 1620 | " return df\n", 1621 | "\n", 1622 | "def get_multi_label_count(df_train,df_test,fea):\n", 1623 | " countall = {}\n", 1624 | " for row in iter(df_train[fea].values):\n", 1625 | " for theme in iter(row.strip().split(',')):\n", 1626 | " try:\n", 1627 | " countall[theme] = countall[theme] + 1\n", 1628 | " except:\n", 1629 | " countall[theme] = 1\n", 1630 | "\n", 1631 | " result_count = []\n", 1632 | " for row in iter(df_test[fea].values):\n", 1633 | " row_count = []\n", 1634 | " for theme in iter(row.strip().split(',')):\n", 1635 | " try:\n", 1636 | " row_count.append(countall[theme])\n", 1637 | " except:\n", 1638 | " pass\n", 1639 | " row_count.sort(reverse = True) \n", 1640 | " result_count.append(row_count[:3])\n", 1641 | " \n", 1642 | " result_count = pd.DataFrame(result_count)\n", 1643 | " for i in range(3):\n", 1644 | " if i not in result_count.columns:\n", 1645 | " result_count[i] = np.nan\n", 1646 | " return result_count.values\n", 1647 | "\n", 1648 | "def get_multi_label_ctr(df_train,df_test,fea):\n", 1649 | " count1 = {}\n", 1650 | " countall = {}\n", 1651 | " for row in iter(df_train[[fea,'label']].values):\n", 1652 | " for theme in iter(row[0].strip().split(',')):\n", 1653 | " try:\n", 1654 | " countall[theme] = countall[theme] + 1\n", 1655 | " except:\n", 1656 | " countall[theme] = 1\n", 1657 | " if row[1] == 1:\n", 1658 | " try:\n", 1659 | " count1[theme] = count1[theme] + 1\n", 1660 | " except:\n", 1661 | " count1[theme] = 1\n", 1662 | "\n", 1663 | " result_ctr = []\n", 1664 | " for row in iter(df_test[fea].values):\n", 1665 | " row_ctr = []\n", 1666 | " for theme in iter(row.strip().split(',')):\n", 1667 | " try:\n", 1668 | " row_ctr.append((count1[theme]+1)/(countall[theme]+1))\n", 1669 | " except:\n", 1670 | " pass\n", 1671 | " row_ctr.sort(reverse = True)\n", 1672 | " result_ctr.append(row_ctr[:3])\n", 1673 | " \n", 1674 | " result_ctr = pd.DataFrame(result_ctr)\n", 1675 | " for i in range(3):\n", 1676 | " if i not in result_ctr.columns:\n", 1677 | " result_ctr[i] = np.nan\n", 1678 | " return result_ctr.values " 1679 | ] 1680 | }, 1681 | { 1682 | "cell_type": "code", 1683 | "execution_count": null, 1684 | "metadata": { 1685 | "collapsed": true, 1686 | "scrolled": true 1687 | }, 1688 | "outputs": [], 1689 | "source": [ 1690 | "for fea in ['themeId','attentionthemes']:\n", 1691 | " df = targetencoder_multi(data,data_answer,df,fea)\n", 1692 | " print(fea+' is ok')" 1693 | ] 1694 | }, 1695 | { 1696 | "cell_type": "markdown", 1697 | "metadata": {}, 1698 | "source": [ 1699 | 
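The CTR-style encodings above (`get_label_ctr`, `get_multi_label_ctr`) both use add-one smoothing of the form `(accepted + 1) / (shown + 1)`, so an id or tag seen only once without acceptance gets 0.5 rather than a hard 0, and the estimate converges to the raw rate as counts grow. A tiny illustration (the numbers are made up):

```python
def smoothed_ctr(accepted: int, shown: int) -> float:
    # Add-one (Laplace) smoothing, as in get_label_ctr / get_multi_label_ctr.
    return (accepted + 1) / (shown + 1)

print(smoothed_ctr(0, 1))     # 0.5   instead of a hard 0.0 after a single rejection
print(smoothed_ctr(1, 2))     # ~0.67 instead of 0.5
print(smoothed_ctr(50, 100))  # ~0.505, close to the raw rate once counts are large
```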
"对问题的标题利用tdidf过滤的结果作为问题的多标签类别特征,进行多标签目标编码:" 1700 | ] 1701 | }, 1702 | { 1703 | "cell_type": "code", 1704 | "execution_count": null, 1705 | "metadata": { 1706 | "collapsed": true 1707 | }, 1708 | "outputs": [], 1709 | "source": [ 1710 | "data_question = pd.read_csv(datapath+'data_q_title_tfidf.csv')\n", 1711 | "data_question = data_question[['问题id','title_topk','text_topk']].rename(columns = {'问题id':'qId','title_topk':'title_words_tfidf0','text_topk':'title_words_tfidf1'})\n", 1712 | "\n", 1713 | "data = data.merge(data_question[['qId','title_words_tfidf1']],on = 'qId',how = 'left')\n", 1714 | "data_answer = data_answer.merge(data_question[['qId','title_words_tfidf1']],on = 'qId',how = 'left')\n", 1715 | "del data_question\n", 1716 | "\n", 1717 | "df = targetencoder_multi(data,data_answer,df,'title_words_tfidf1')\n", 1718 | "del data['title_words_tfidf1']\n", 1719 | "del data_answer['title_words_tfidf1']\n", 1720 | "\n", 1721 | "df['title_words_tfidf1_label_count_mean'] = df[['title_words_tfidf1_label_count0','title_words_tfidf1_label_count1','title_words_tfidf1_label_count2']].mean(axis = 1)\n", 1722 | "df['title_words_tfidf1_label_ctr_mean'] = df[['title_words_tfidf1_label_ctr0','title_words_tfidf1_label_ctr1','title_words_tfidf1_label_ctr2']].mean(axis = 1)\n", 1723 | "\n", 1724 | "cols = ['title_words_tfidf1_label_ctr0','title_words_tfidf1_label_ctr1', 'title_words_tfidf1_label_ctr2','title_words_tfidf1_label_ctr_mean']\n", 1725 | "df[cols] = df[cols]*1000" 1726 | ] 1727 | }, 1728 | { 1729 | "cell_type": "markdown", 1730 | "metadata": {}, 1731 | "source": [ 1732 | "4)分组统计特征,用户或问题id的目标编码特征关于问题或用户id的平均值 " 1733 | ] 1734 | }, 1735 | { 1736 | "cell_type": "code", 1737 | "execution_count": null, 1738 | "metadata": { 1739 | "collapsed": true 1740 | }, 1741 | "outputs": [], 1742 | "source": [ 1743 | "df['writerId_label_count_gp_qId'] = df['writerId_label_count'].groupby(data['qId']).transform(np.mean)\n", 1744 | "df['qId_label_count_gp_writerId'] = df['qId_label_count'].groupby(data['writerId']).transform(np.mean)\n", 1745 | "\n", 1746 | "df['writerId_label_ctr_gp_qId'] = df['writerId_label_ctr'].groupby(data['qId']).transform(np.mean)\n", 1747 | "df['qId_label_ctr_gp_writerId'] = df['qId_label_ctr'].groupby(data['writerId']).transform(np.mean)" 1748 | ] 1749 | }, 1750 | { 1751 | "cell_type": "code", 1752 | "execution_count": null, 1753 | "metadata": { 1754 | "collapsed": true 1755 | }, 1756 | "outputs": [], 1757 | "source": [ 1758 | "df = memoryOptimization(df,np.float32)\n", 1759 | "with open(datapath+'df/df_1.2.pkl','wb') as f:\n", 1760 | " pickle.dump(df,f)\n", 1761 | "del df\n", 1762 | "del data_answer\n", 1763 | "gc.collect()" 1764 | ] 1765 | }, 1766 | { 1767 | "cell_type": "markdown", 1768 | "metadata": {}, 1769 | "source": [ 1770 | "### 3、其它特征" 1771 | ] 1772 | }, 1773 | { 1774 | "cell_type": "code", 1775 | "execution_count": null, 1776 | "metadata": { 1777 | "collapsed": true 1778 | }, 1779 | "outputs": [], 1780 | "source": [ 1781 | "df = data[[]]" 1782 | ] 1783 | }, 1784 | { 1785 | "cell_type": "markdown", 1786 | "metadata": {}, 1787 | "source": [ 1788 | "1)邀请间隔时长:构造每条样本的问题id最近一次发出邀请到现在的时间间隔以及用户id最近一次被邀请到现在的时间间隔(单位小时)" 1789 | ] 1790 | }, 1791 | { 1792 | "cell_type": "code", 1793 | "execution_count": 13, 1794 | "metadata": { 1795 | "collapsed": true 1796 | }, 1797 | "outputs": [], 1798 | "source": [ 1799 | "def get_hourlenfromlastinv(dataf,df,fea):\n", 1800 | " result = []\n", 1801 | " last_invite = {}\n", 1802 | " for row in iter(dataf[[fea,'inviteallhour']].sort_values(by = 
'inviteallhour',ascending = True).values):\n", 1803 | " try:\n", 1804 | " result.append(last_invite[row[0]])\n", 1805 | " except:\n", 1806 | " result.append(-1)\n", 1807 | " last_invite[row[0]] = row[1]\n", 1808 | "\n", 1809 | " df1 = dataf[[]]\n", 1810 | " df1['seq'] = dataf.sort_values(by = 'inviteallhour').index\n", 1811 | " df1['lastinvitehour_'+fea] = result\n", 1812 | " df1 = df1.sort_values(by = 'seq')\n", 1813 | "\n", 1814 | " df1 = df1.reset_index(drop = True)\n", 1815 | " del df1['seq']\n", 1816 | " df = pd.concat([df,df1],axis = 1)\n", 1817 | " del df1\n", 1818 | " gc.collect()\n", 1819 | "\n", 1820 | " df.loc[df['lastinvitehour_'+fea] == -1,'lastinvitehour_'+fea] = None\n", 1821 | " df['hourlenfromlastinvite_'+fea] = dataf['inviteallhour'] - df['lastinvitehour_'+fea]\n", 1822 | " del df['lastinvitehour_'+fea]\n", 1823 | " \n", 1824 | " return df\n", 1825 | "\n", 1826 | "df = get_hourlenfromlastinv(data,df,'qId')\n", 1827 | "df = get_hourlenfromlastinv(data,df,'writerId')" 1828 | ] 1829 | }, 1830 | { 1831 | "cell_type": "markdown", 1832 | "metadata": {}, 1833 | "source": [ 1834 | "2)时效性:问题从创建当前邀请的存在时长,反应问题的时效性" 1835 | ] 1836 | }, 1837 | { 1838 | "cell_type": "code", 1839 | "execution_count": null, 1840 | "metadata": { 1841 | "collapsed": true 1842 | }, 1843 | "outputs": [], 1844 | "source": [ 1845 | "df['q_life'] = data['inviteday'] - data['createday']" 1846 | ] 1847 | }, 1848 | { 1849 | "cell_type": "markdown", 1850 | "metadata": {}, 1851 | "source": [ 1852 | "3)活跃度:问题或用户近期的活跃度,用id近期有活动的天数表示,分为邀请活跃度以及回答活跃度,对于问题来说是问题近期是否有被邀请或者被回答,对于用户来说代表了用户近期是否更愿意回答或接收邀请" 1853 | ] 1854 | }, 1855 | { 1856 | "cell_type": "code", 1857 | "execution_count": null, 1858 | "metadata": { 1859 | "collapsed": true 1860 | }, 1861 | "outputs": [], 1862 | "source": [ 1863 | "def get_activeday(dataf,data_asw,df,fea):\n", 1864 | " df[fea+'_activeday_inv'] = np.nan\n", 1865 | " for day in iter(dataf['inviteday'].unique()):\n", 1866 | " df.loc[dataf['inviteday'] == day,fea+'_activeday_inv'] = get_daynum(dataf[dataf['inviteday']=day-daylen_asw)],dataf[dataf['inviteday'] == day],idfea,fea,dic) \n", 2111 | " result[dataf['label'] == -1] = get_multi_count(data_asw[(data_asw['inviteday']=day-daylen_asw)],dataf[dataf['label'] == -1],idfea,fea,dic)\n", 2112 | " \n", 2113 | " result.columns = [idfea+'_'+fea+'_lastnday_labelcount' +str(i) for i in range(5)]\n", 2114 | " \n", 2115 | " return result\n", 2116 | "\n", 2117 | "def get_multi_count(df_train,df_test,idfea,fea,dic):\n", 2118 | " \n", 2119 | " day_max = df_test['inviteday'].min()\n", 2120 | " countall = {}\n", 2121 | " for row in iter(df_train[[idfea,fea,'inviteday']].values):\n", 2122 | " if row[1] == '-1':\n", 2123 | " continue\n", 2124 | " t = (1.5-0.3*np.floor((day_max-row[2])/3))#*dic[row[2]]#1#\n", 2125 | " if t<0.5:\n", 2126 | " t = 0.6\n", 2127 | " for theme in iter(row[1].strip().split(',')):\n", 2128 | " theme = row[0]+'_'+theme\n", 2129 | " try:\n", 2130 | " countall[theme] = countall[theme] + t\n", 2131 | " except:\n", 2132 | " countall[theme] = t\n", 2133 | " \n", 2134 | " result_count = []\n", 2135 | " for row in iter(df_test[[idfea,fea]].values):\n", 2136 | " row_count = []\n", 2137 | " if row[1] == '-1':\n", 2138 | " result_count.append([np.nan])\n", 2139 | " continue\n", 2140 | " for theme in iter(row[1].strip().split(',')):\n", 2141 | " theme = row[0]+'_'+theme\n", 2142 | " try:\n", 2143 | " row_count.append(countall[theme])\n", 2144 | " except:\n", 2145 | " row_count.append(0)\n", 2146 | " row_count.sort(reverse = True) \n", 2147 | " 
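# The decayed counts collected above use countall keys of the form
# "<id value>_<tag>" (here the writer id plus a theme/word id), so they are per
# (id, tag) pair. Each historical record contributes a weight
# t = 1.5 - 0.3*floor((day_max - record_day)/3): the weight steps down by 0.3
# every 3 days and never falls below 0.6, so recent behaviour counts more than
# old behaviour. The five largest decayed counts per sample are kept below as
# five columns.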
result_count.append(row_count[:5])\n", 2148 | " \n", 2149 | " result_count = pd.DataFrame(result_count)\n", 2150 | " for i in range(5):\n", 2151 | " if i not in result_count.columns:\n", 2152 | " result_count[i] = np.nan\n", 2153 | " del countall\n", 2154 | " return result_count.values\n", 2155 | "\n", 2156 | "def lastndaylabelctr_multi(dataf,data_asw,idfea,fea): \n", 2157 | "\n", 2158 | " result = dataf[[]]\n", 2159 | " result['val0'] = np.nan\n", 2160 | " result['val1'] = np.nan\n", 2161 | " result['val2'] = np.nan\n", 2162 | " result['val3'] = np.nan\n", 2163 | " result['val4'] = np.nan\n", 2164 | "\n", 2165 | " for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n", 2166 | " result[dataf['inviteday'] == day] = get_multi_ctr(data_asw[(data_asw['inviteday']=day-n)],dataf[dataf['inviteday'] == day],day,n,'themeId','themeId',q2themes,{},dic_themes,{}) \n", 2491 | " df.loc[dataf['label'] == -1,scorefea] = get_sim_itemCF(data_asw[(data_asw['inviteday']=t0_eval-n)],dataf[dataf['label'] == -1],t0_eval,n,'themeId','themeId',q2themes,{},dic_themes,{})\n", 2492 | " return df\n", 2493 | "\n", 2494 | "def writerlastndayperformance_itemCF_pool(dataf,data_asw,data_train_noclick,df,q2themes,q2words,dic_themes,dic_words): \n", 2495 | " \n", 2496 | " #######sim_theme######\n", 2497 | " \n", 2498 | " scorefea = 'simhis_base_theme_theme_itemcf'\n", 2499 | " df[scorefea] = np.nan\n", 2500 | " n = 30\n", 2501 | "\n", 2502 | " pool = Pool(2)\n", 2503 | " result = {}\n", 2504 | " result[t0_eval] = pool.apply_async(\n", 2505 | " func = get_sim_itemCF,\n", 2506 | " args = (data_asw.loc[(data_asw['inviteday']=t0_eval-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['label'] == -1,['writerId','qId']],t0_eval,n,'themeId','themeId',q2themes.copy(),{},dic_themes.copy(),{},False,)\n", 2507 | " )\n", 2508 | " for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n", 2509 | " result[day] = pool.apply_async(\n", 2510 | " func = get_sim_itemCF,\n", 2511 | " args = (data_asw.loc[(data_asw['inviteday']=day-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['inviteday'] == day,['writerId','qId']],day,n,'themeId','themeId',q2themes.copy(),{},dic_themes.copy(),{},False,)\n", 2512 | " )\n", 2513 | " pool.close()\n", 2514 | " pool.join()\n", 2515 | " for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n", 2516 | " df.loc[dataf['inviteday'] == day,scorefea] = result[day].get() \n", 2517 | " df.loc[dataf['label'] == -1,scorefea] = result[t0_eval].get()\n", 2518 | " with open(datapath+'newfea_copy/sim_theme.pkl','wb') as f:\n", 2519 | " pickle.dump(df,f)\n", 2520 | "\n", 2521 | " #######sim_title######\n", 2522 | " \n", 2523 | " scorefea = 'simhis_base_title_title_itemcf'\n", 2524 | " df[scorefea] = np.nan\n", 2525 | " n = 30\n", 2526 | "\n", 2527 | " pool = Pool(2)\n", 2528 | " result = {}\n", 2529 | " result[t0_eval] = pool.apply_async(\n", 2530 | " func = get_sim_itemCF,\n", 2531 | " args = (data_asw.loc[(data_asw['inviteday']=t0_eval-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['label'] == -1,['writerId','qId']],t0_eval,n,'title_words_tfidf1','title_words_tfidf1',q2words.copy(),{},dic_words.copy(),{},False,)\n", 2532 | " )\n", 2533 | " for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n", 2534 | " result[day] = pool.apply_async(\n", 2535 | " func = get_sim_itemCF,\n", 2536 | " args = 
(data_asw.loc[(data_asw['inviteday']=day-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['inviteday'] == day,['writerId','qId']],day,n,'title_words_tfidf1','title_words_tfidf1',q2words.copy(),{},dic_words.copy(),{},False,)\n", 2537 | " )\n", 2538 | " \n", 2539 | " pool.close()\n", 2540 | " pool.join()\n", 2541 | " for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n", 2542 | " df.loc[dataf['inviteday'] == day,scorefea] = result[day].get() \n", 2543 | " df.loc[dataf['label'] == -1,scorefea] = result[t0_eval].get()\n", 2544 | " with open(datapath+'newfea_copy/sim_title.pkl','wb') as f:\n", 2545 | " pickle.dump(df,f)\n", 2546 | " \n", 2547 | " ######sim_noclick########\n", 2548 | " \n", 2549 | " scorefea = 'simhis_base_theme_theme_noclick'\n", 2550 | " df[scorefea] = np.nan\n", 2551 | " n = 15\n", 2552 | "\n", 2553 | " data_asw = data_train_noclick[data_train_noclick['label'] == 0].reset_index(drop = True)\n", 2554 | " pool = Pool(2)\n", 2555 | " result = {}\n", 2556 | " result[t0_eval] = pool.apply_async(\n", 2557 | " func = get_sim_itemCF,\n", 2558 | " args = (data_asw.loc[(data_asw['inviteday']=t0_eval-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['label'] == -1,['writerId','qId']].copy(),t0_eval,n,'themeId','themeId',q2themes.copy(),{},dic_themes.copy(),{},True,)\n", 2559 | " )\n", 2560 | " for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n", 2561 | " result[day] = pool.apply_async(\n", 2562 | " func = get_sim_itemCF,\n", 2563 | " args = (data_asw.loc[(data_asw['inviteday']=day-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['inviteday'] == day,['writerId','qId']].copy(),day,n,'themeId','themeId',q2themes.copy(),{},dic_themes.copy(),{},True,)\n", 2564 | " )\n", 2565 | " pool.close()\n", 2566 | " pool.join()\n", 2567 | " for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n", 2568 | " df.loc[dataf['inviteday'] == day,scorefea] = result[day].get() \n", 2569 | " df.loc[dataf['label'] == -1,scorefea] = result[t0_eval].get()\n", 2570 | " \n", 2571 | " return df\n", 2572 | "\n", 2573 | "\n", 2574 | "def get_sim_itemCF(data_train,data_test,day,n,hisfea,curfea,q2hisfea,q2curfea,dic_his,dic_cur,noclick):\n", 2575 | " #day:待计算数据邀请日\n", 2576 | " #n:统计时长\n", 2577 | " #hisfea:历史聚合特征(theme/word)\n", 2578 | " #curfea:当前比较特征(theme/word)\n", 2579 | " #q2hisfea:问题到hisfea的dic\n", 2580 | " #q2curfea:问题到curfea的dic\n", 2581 | " #dic_cur:当前训练接的theme/word的emb\n", 2582 | " #dic_his:历史回答的theme/word的emb\n", 2583 | " if curfea == hisfea:\n", 2584 | " dic_cur = dic_his\n", 2585 | " q2curfea = q2hisfea\n", 2586 | " if noclick:\n", 2587 | " pathfold = hisfea+'_noclick'\n", 2588 | " else:\n", 2589 | " pathfold = hisfea\n", 2590 | " \n", 2591 | " if os.path.exists(datapath+'dic_q2hisfea/%s/his_%s_%s.pkl' %(pathfold,day,n)) and os.path.exists(datapath+'dic_q2hisfea/%s/his_delts_%s_%s.pkl' %(pathfold,day,n)):\n", 2592 | " with open(datapath+'dic_q2hisfea/%s/his_%s_%s.pkl' %(pathfold,day,n),'rb') as f:\n", 2593 | " dic_t = pickle.load(f)\n", 2594 | " with open(datapath+'dic_q2hisfea/%s/his_delts_%s_%s.pkl' %(pathfold,day,n),'rb') as f:\n", 2595 | " dic_delt = pickle.load(f)\n", 2596 | " else:\n", 2597 | " dic_t = {}\n", 2598 | " dic_delt = {}\n", 2599 | " data_train = data_train[data_train['writerId'].isin(data_test['writerId'].unique())].groupby('writerId')\n", 2600 | " for writerid in iter(data_train.size().index):\n", 2601 | " dic_t[writerid] = []#可能为kong\n", 2602 | " dic_delt[writerid] = 
[]\n", 2603 | " gp = data_train.get_group(writerid)\n", 2604 | " gp = gp.sort_values(by = ['inviteday','invitehour'],ascending = False)#.iloc[:10,:]\n", 2605 | " for row in iter(gp[['inviteday','qId']].values):\n", 2606 | " themes = q2hisfea[row[1]]\n", 2607 | " if themes[0] == '-1':\n", 2608 | " continue\n", 2609 | " dic_t[writerid].append(themes)\n", 2610 | " dic_delt[writerid].append(day-row[0])\n", 2611 | " with open(datapath+'dic_q2hisfea/%s/his_%s_%s.pkl' %(pathfold,day,n),'wb') as f:\n", 2612 | " pickle.dump(dic_t,f)\n", 2613 | " with open(datapath+'dic_q2hisfea/%s/his_delts_%s_%s.pkl' %(pathfold,day,n),'wb') as f:\n", 2614 | " pickle.dump(dic_delt,f)\n", 2615 | " \n", 2616 | " result = []\n", 2617 | " for row in iter(data_test[['writerId','qId']].values):\n", 2618 | " try:\n", 2619 | " list_t = dic_t[row[0]]\n", 2620 | "# list_delt = dic_delt[row[0]]\n", 2621 | " except:\n", 2622 | " result.append(np.nan)#无历史记录\n", 2623 | " continue\n", 2624 | " curthemes = q2curfea[row[1]]\n", 2625 | " if curthemes[0] == '-1':#当前问题t缺失\n", 2626 | " result.append(np.nan)\n", 2627 | " continue\n", 2628 | " \n", 2629 | " if len(list_t) == 0:#有历史记录,但历史记录的问题的t都是-1\n", 2630 | " result.append(np.nan)\n", 2631 | " continue\n", 2632 | " \n", 2633 | " sim_between_his_cur = 0\n", 2634 | " for index,histhemes in iter(enumerate(list_t)):#循环历史问题\n", 2635 | "# deltday = list_delt[index]\n", 2636 | "# weight = \n", 2637 | " sims_between_q = []\n", 2638 | " for histheme in iter(histhemes):#循环问题的t\n", 2639 | " for curtheme in iter(curthemes):#循环当前问题的t\n", 2640 | " sims_between_q.append(cos_sim(dic_his[histheme],dic_cur[curtheme]))\n", 2641 | " q_sim = np.sort(sims_between_q)[-3:].mean()\n", 2642 | " if q_sim > 0.7:#simhold,相当于取与该问题相似的问题的topk与历史问题取交集\n", 2643 | " sim_between_his_cur += q_sim#*weight,关于间隔时间的权\n", 2644 | " result.append(sim_between_his_cur)\n", 2645 | " \n", 2646 | " return result\n", 2647 | "\n", 2648 | "#话题64维emb索引字典\n", 2649 | "theme_vec = pd.read_csv(datapath+'data/theme_vector.csv')\n", 2650 | "cols = theme_vec.columns.tolist()\n", 2651 | "cols.remove('themeId')\n", 2652 | "cols = ['themeId']+cols\n", 2653 | "theme_vec = theme_vec[cols]\n", 2654 | "dic_themes = {}\n", 2655 | "for row in iter(theme_vec.values):\n", 2656 | " dic_themes[row[0]] = row[1:]\n", 2657 | "\n", 2658 | "#问题绑定话题的索引字典\n", 2659 | "data_question = pd.read_csv(datapath+'data/data_question.csv')[['qId','themeId']]\n", 2660 | "q2themes = {}\n", 2661 | "for row in iter(data_question[['qId','themeId']].values):\n", 2662 | " q2themes[row[0]] = row[1].split(',')\n", 2663 | "del data_question\n", 2664 | "\n", 2665 | "#切词64维emb索引字典\n", 2666 | "word_vec = pd.read_csv(datapath+'data/word_vector.csv')\n", 2667 | "cols = word_vec.columns.tolist()\n", 2668 | "cols.remove('wordId')\n", 2669 | "cols = ['wordId']+cols\n", 2670 | "word_vec = word_vec[cols]\n", 2671 | "dic_words = {}\n", 2672 | "for row in iter(word_vec.values):\n", 2673 | " dic_words[row[0]] = row[1:]\n", 2674 | "\n", 2675 | "#问题标题&描述切词过滤后的索引字典\n", 2676 | "data_question = pd.read_csv(datapath+'data_q_title_tfidf.csv')\n", 2677 | "data_question = data_question[['问题id','title_topk','text_topk']].rename(columns = {'问题id':'qId',\n", 2678 | " 'title_topk':'title_words_tfidf0','text_topk':'title_words_tfidf1'})\n", 2679 | "q2words = {}\n", 2680 | "for row in iter(data_question[['qId','title_words_tfidf1']].values):\n", 2681 | " q2words[row[0]] = row[1].split(',')\n", 2682 | "del data_question\n", 2683 | "\n", 2684 | "df = 
writerlastndayperformance_itemCF_pool(data,data_answer,data,df,q2themes,q2words,dic_themes,dic_words)\n", 2685 | "\n", 2686 | "del data_answer\n", 2687 | "gc.collect()" 2688 | ] 2689 | }, 2690 | { 2691 | "cell_type": "markdown", 2692 | "metadata": {}, 2693 | "source": [ 2694 | "2)用户关注/感兴趣话题与当前问题绑定话题的相似度" 2695 | ] 2696 | }, 2697 | { 2698 | "cell_type": "code", 2699 | "execution_count": null, 2700 | "metadata": { 2701 | "collapsed": true 2702 | }, 2703 | "outputs": [], 2704 | "source": [ 2705 | "#用户与当前问题的theme相似度\n", 2706 | "def get_user_theme_emb(data_writer):\n", 2707 | " \n", 2708 | " user_topic_fea_64 = []\n", 2709 | " for index,row in data_writer.iterrows():\n", 2710 | " if row['attentionthemes'] == '-1' and row['likethemes'] == '-1':\n", 2711 | " user_topic_fea_64.append(np.zeros(64))\n", 2712 | " if row['attentionthemes'] != '-1':\n", 2713 | " ft = row['attentionthemes'].strip().split(',')\n", 2714 | " temp = np.zeros(64)\n", 2715 | " for t in iter(ft):\n", 2716 | " temp = temp + t2v[t]\n", 2717 | " temp = temp/len(ft)\n", 2718 | " user_topic_fea_64.append(temp)\n", 2719 | " if row['attentionthemes'] == '-1' and row['likethemes'] != '-1':\n", 2720 | " it = row['likethemes'].strip().split(',')\n", 2721 | " temp = np.zeros(64)\n", 2722 | " # w = []\n", 2723 | " # for t in it:\n", 2724 | " # w.append(float(t.split(':')[1]))\n", 2725 | " # for t in it:\n", 2726 | " # wei = float(t.split(':')[1]) / sum(w)\n", 2727 | " # temp += wei*t2v_norm[int(t.split(':')[0][1:])]\n", 2728 | " for t in iter(it):\n", 2729 | " temp = temp + t2v[t.split(':')[0]]\n", 2730 | " temp = temp/len(it)\n", 2731 | " user_topic_fea_64.append(temp)\n", 2732 | " user_topic_fea_64 = pd.DataFrame(user_topic_fea_64,columns = ['writer_theme_emb%i' %i for i in range(64)])\n", 2733 | " return user_topic_fea_64\n", 2734 | "\n", 2735 | "def get_question_theme_emb(data_q):\n", 2736 | " result = []\n", 2737 | " for themes in iter(data_q['themeId']):\n", 2738 | " if themes == '-1':\n", 2739 | " result.append(np.zeros(64))\n", 2740 | " continue\n", 2741 | " themes = themes.split(',')\n", 2742 | " emb = np.zeros(64)\n", 2743 | " for theme in iter(themes):\n", 2744 | " emb = emb + t2v[theme]\n", 2745 | " emb = emb/len(themes)\n", 2746 | " result.append(emb)\n", 2747 | " result = pd.DataFrame(result,columns = ['q_theme_emb%i' %i for i in range(64)])\n", 2748 | " return result\n", 2749 | "\n", 2750 | "def cos_sim(vector_a, vector_b):\n", 2751 | "\n", 2752 | " vector_a = np.mat(vector_a)\n", 2753 | " vector_b = np.mat(vector_b)\n", 2754 | " num = float(vector_a * vector_b.T)\n", 2755 | " denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)\n", 2756 | " cos = num / denom\n", 2757 | " sim = 0.5 + 0.5 * cos\n", 2758 | " return sim\n", 2759 | "\n", 2760 | "def get_sim_theme_between_q_writer(data):\n", 2761 | " result = []\n", 2762 | " for row in iter(data.values):\n", 2763 | " result.append(cos_sim(row[:64],row[64:]))\n", 2764 | " \n", 2765 | " return pd.Series(result)\n", 2766 | "\n", 2767 | "theme_vec = pd.read_csv(datapath+'data/theme_vector.csv')\n", 2768 | "cols = theme_vec.columns.tolist()\n", 2769 | "cols.remove('themeId')\n", 2770 | "cols = ['themeId']+cols\n", 2771 | "theme_vec = theme_vec[cols]\n", 2772 | "t2v = {}\n", 2773 | "for row in iter(theme_vec.values):\n", 2774 | " t2v[row[0]] = row[1:]\n", 2775 | "\n", 2776 | "data_writer = pd.read_csv(datapath+'data/data_writer.csv')\n", 2777 | "user_topic_fea_64 = parallelize_dataframe(data_writer,get_user_theme_emb).reset_index(drop = True)\n", 2778 | 
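# cos_sim above rescales cosine similarity from [-1, 1] to [0, 1] via
# 0.5 + 0.5*cos, so orthogonal vectors score 0.5. The user vector built by
# get_user_theme_emb is the unweighted mean of the 64-d embeddings of the themes
# the user follows (falling back to the themes the user likes when the followed
# list is missing), the question vector is the mean of the embeddings of its
# bound themes, and their similarity is stored as sim_theme_between_q_writer.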
"user_topic_fea_64['writerId'] = data_writer['writerId']\n", 2779 | "data = data.merge(user_topic_fea_64,on = 'writerId',how = 'left')\n", 2780 | "\n", 2781 | "data_question = pd.read_csv(datapath+'data/data_question.csv')\n", 2782 | "q_topic_fea_64 = parallelize_dataframe(data_question,get_question_theme_emb).reset_index(drop = True)\n", 2783 | "q_topic_fea_64['qId'] = data_question['qId']\n", 2784 | "data = data.merge(q_topic_fea_64,on = 'qId',how = 'left')\n", 2785 | "\n", 2786 | "cols = ['writer_theme_emb%i' %i for i in range(64)]+['q_theme_emb%i' %i for i in range(64)]\n", 2787 | "df['sim_theme_between_q_writer'] = parallelize_dataframe(data[cols],get_sim_theme_between_q_writer).values\n", 2788 | "data = data.drop(cols,axis = 1)\n", 2789 | "del data_question\n", 2790 | "del data_writer\n", 2791 | "del theme_vec\n", 2792 | "del t2v\n", 2793 | "del user_topic_fea_64\n", 2794 | "del q_topic_fea_64\n", 2795 | "gc.collect()" 2796 | ] 2797 | }, 2798 | { 2799 | "cell_type": "markdown", 2800 | "metadata": {}, 2801 | "source": [ 2802 | "3)用户关注/感兴趣话题与当前问题绑定话题的重合统计" 2803 | ] 2804 | }, 2805 | { 2806 | "cell_type": "code", 2807 | "execution_count": null, 2808 | "metadata": { 2809 | "collapsed": true 2810 | }, 2811 | "outputs": [], 2812 | "source": [ 2813 | "def samethemenum_atten(row):\n", 2814 | " row = row.values\n", 2815 | " if row[0] == '-1' or row[1] == '-1':\n", 2816 | " return np.nan\n", 2817 | " row0 = row[0].strip().split(',')\n", 2818 | " row1 = row[1].strip().split(',')\n", 2819 | " num = 0\n", 2820 | " for theme in iter(row0):\n", 2821 | " if theme in row1:\n", 2822 | " num = num+1\n", 2823 | " return num\n", 2824 | "def samethemenum_like(row):\n", 2825 | " row = row.values\n", 2826 | " if row[0] == '-1' or row[1] == '-1':\n", 2827 | " return [np.nan,np.nan]\n", 2828 | " row0 = row[0].strip().split(',')\n", 2829 | " row1 = row[1].strip().split(',')\n", 2830 | " num = 0\n", 2831 | " quannum = 0\n", 2832 | " for theme in iter(row1):\n", 2833 | " theme = theme.strip().split(':')\n", 2834 | " if theme[0] in row0:\n", 2835 | " num = num+1\n", 2836 | " quannum = quannum+float(theme[1])\n", 2837 | "\n", 2838 | " return [num,quannum]\n", 2839 | "\n", 2840 | "def p1(dataf):\n", 2841 | " return dataf.apply(samethemenum_like,axis = 1)\n", 2842 | "df['samenum_like'] = np.nan\n", 2843 | "df['sameqnum_like'] = np.nan\n", 2844 | "df[['samenum_like','sameqnum_like']] = parallelize_dataframe(data[['themeId','likethemes']],p1).values\n", 2845 | "\n", 2846 | "def p2(dataf):\n", 2847 | " return dataf.apply(samethemenum_atten,axis = 1)\n", 2848 | "df['samenum_atten'] = parallelize_dataframe(data[['themeId','attentionthemes']],p2).values" 2849 | ] 2850 | }, 2851 | { 2852 | "cell_type": "code", 2853 | "execution_count": null, 2854 | "metadata": { 2855 | "collapsed": true 2856 | }, 2857 | "outputs": [], 2858 | "source": [ 2859 | "df = memoryOptimization(df,np.float32)\n", 2860 | "with open(datapath+'df/df_2.4.pkl','wb') as f:\n", 2861 | " pickle.dump(df,f)\n", 2862 | "del df" 2863 | ] 2864 | }, 2865 | { 2866 | "cell_type": "markdown", 2867 | "metadata": {}, 2868 | "source": [ 2869 | "### 三、用户历史反馈统计特征" 2870 | ] 2871 | }, 2872 | { 2873 | "cell_type": "code", 2874 | "execution_count": null, 2875 | "metadata": { 2876 | "collapsed": true 2877 | }, 2878 | "outputs": [], 2879 | "source": [ 2880 | "df = data[[]]" 2881 | ] 2882 | }, 2883 | { 2884 | "cell_type": "code", 2885 | "execution_count": null, 2886 | "metadata": { 2887 | "collapsed": true 2888 | }, 2889 | "outputs": [], 2890 | "source": [ 2891 | "def 
ans_quality(dataf,data_asw,df,feas,id_fea): \n", 2892 | " data_train = dataf[dataf['label'] != -1]\n", 2893 | " data_test = dataf[dataf['label'] == -1] \n", 2894 | " if (len(id_fea) > 1):\n", 2895 | " for fea in feas:\n", 2896 | " df[id_fea[1]+'_'+fea+'_sum'] = np.nan\n", 2897 | " df[id_fea[1]+'_'+fea+'_mean'] = np.nan\n", 2898 | " df[id_fea[1]+'_'+fea+'_max'] = np.nan\n", 2899 | " daylen_asw = 21\n", 2900 | " for day in iter(data_train['inviteday'].unique()):\n", 2901 | " df.loc[dataf['inviteday'] == day,[id_fea[1]+'_'+fea+'_sum',id_fea[1]+'_'+fea+'_mean',id_fea[1]+'_'+fea+'_max']] = \\\n", 2902 | " get_values(data_asw[(data_asw['inviteday']=day-daylen_asw)\n", 2903 | "# df[[id_fea[1]+'_'+fea+'_sum',id_fea[1]+'_'+fea+'_mean',id_fea[1]+'_'+fea+'_max']] = \\\n", 2904 | " df.loc[data_test.index,[id_fea[1]+'_'+fea+'_sum',id_fea[1]+'_'+fea+'_mean',id_fea[1]+'_'+fea+'_max']] = \\\n", 2905 | " get_values(data_asw[(data_asw['inviteday']=t0_eval-daylen_asw)\n", 2906 | " # df.loc[(~dataf[fea].isna())&(df[fea+'_label_count'].isna()),fea+'_label_count'] = 0\n", 2907 | " else:\n", 2908 | " for fea in feas:\n", 2909 | " df[fea+'_sum'] = np.nan\n", 2910 | " df[fea+'_mean'] = np.nan\n", 2911 | " df[fea+'_max'] = np.nan\n", 2912 | " daylen_asw = 21\n", 2913 | " for day in iter(data_train['inviteday'].unique()):\n", 2914 | " df.loc[dataf['inviteday'] == day,[fea+'_sum',fea+'_mean',fea+'_max']] = \\\n", 2915 | " get_values(data_asw[(data_asw['inviteday']=day-daylen_asw)\n", 2916 | "# df[[fea+'_sum',fea+'_mean',fea+'_max']] = \\\n", 2917 | " df.loc[data_test.index,[fea+'_sum',fea+'_mean',fea+'_max']] = \\\n", 2918 | " get_values(data_asw[(data_asw['inviteday']=t0_eval-daylen_asw)\n", 2919 | " # df.loc[(~dataf[fea].isna())&(df[fea+'_label_count'].isna()),fea+'_label_count'] = 0\n", 2920 | " del data_train\n", 2921 | " del data_test\n", 2922 | " gc.collect()\n", 2923 | " \n", 2924 | " return df\n", 2925 | "def get_values(df_train,df_test,fea,id_fea):\n", 2926 | " df_train = df_train[[fea]+id_fea].groupby(id_fea)[fea].agg(['sum','mean','max']).reset_index()\n", 2927 | " new_fea_name = id_fea+[fea+'_sum',fea+'_mean',fea+'_max']\n", 2928 | " df_train.columns=new_fea_name\n", 2929 | " df_test = df_test.merge(df_train,on = id_fea, how = 'left')\n", 2930 | " return df_test[[fea+'_sum',fea+'_mean',fea+'_max']].values\n", 2931 | "\n", 2932 | "# 用户回答质量统计\n", 2933 | "ans_quality_cols = ['collectnum', 'commentnum', 'good_bool','picture_bool', 'recommend_bool','yuanzhuo_bool', 'video_bool', 'unhelpnum','wordnum','unlikenum', 'cancellikenum','jubaonum', \\\n", 2934 | " '3qnum', 'likenum']\n", 2935 | "id_fea = ['writerId']#writerId\n", 2936 | "df = ans_quality(data[id_fea+['inviteday','label']], data_answer,df, ans_quality_cols, id_fea)" 2937 | ] 2938 | }, 2939 | { 2940 | "cell_type": "code", 2941 | "execution_count": null, 2942 | "metadata": { 2943 | "collapsed": true 2944 | }, 2945 | "outputs": [], 2946 | "source": [ 2947 | "df = memoryOptimization(df,np.float32)\n", 2948 | "with open(datapath+'df/df_3.pkl','wb') as f:\n", 2949 | " pickle.dump(df,f)\n", 2950 | "del df" 2951 | ] 2952 | }, 2953 | { 2954 | "cell_type": "markdown", 2955 | "metadata": {}, 2956 | "source": [ 2957 | "# .NN特征构建" 2958 | ] 2959 | }, 2960 | { 2961 | "cell_type": "code", 2962 | "execution_count": null, 2963 | "metadata": { 2964 | "collapsed": true 2965 | }, 2966 | "outputs": [], 2967 | "source": [ 2968 | "df = data[[]]" 2969 | ] 2970 | }, 2971 | { 2972 | "cell_type": "markdown", 2973 | "metadata": {}, 2974 | "source": [ 2975 | "### 1、加载问题关键字\n", 2976 
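The file loaded in the next cell (`data_q_title_tfidf.csv`) is precomputed offline; purely as an illustration of the tf-idf filtering described just below (keep the highest-weighted tokens per question), here is a sketch on a toy tokenised corpus. The vectorizer settings and the 1/3 ratio are assumptions for the sketch, not the exact offline pipeline:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["w12 w7 w7 w3", "w5 w12 w9 w9 w9 w1"]        # already-tokenised titles (toy data)
vec = TfidfVectorizer(token_pattern=r"\S+")
X = vec.fit_transform(docs)                          # (n_docs, n_vocab) tf-idf matrix
vocab = np.array(vec.get_feature_names_out())

filtered = []
for i in range(X.shape[0]):
    row = X.getrow(i).toarray().ravel()
    k = max(1, int(np.ceil((row > 0).sum() / 3)))    # keep roughly the top 1/3 of the tokens
    filtered.append(",".join(vocab[np.argsort(row)[::-1][:k]]))
print(filtered)                                      # comma-separated keyword list per question
```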
| "\n", 2977 | "问题关键字构建方式:利用tf-idf过滤,对于只有标题的取最重要的1/3,对于有描述的取总的2/3" 2978 | ] 2979 | }, 2980 | { 2981 | "cell_type": "code", 2982 | "execution_count": null, 2983 | "metadata": { 2984 | "collapsed": true 2985 | }, 2986 | "outputs": [], 2987 | "source": [ 2988 | "data_question = pd.read_csv('data/data_q_title_tfidf.csv')\n", 2989 | "data_question.columns = ['index', 'qId', 'title_words', 'describe_words']\n", 2990 | "df['all_words'] = data[['qId']].merge(data_question[['describe_words', 'qId']],on = 'qId',how = 'left')['describe_words'].values\n", 2991 | "del data_question\n", 2992 | "gc.collect()" 2993 | ] 2994 | }, 2995 | { 2996 | "cell_type": "markdown", 2997 | "metadata": {}, 2998 | "source": [ 2999 | "### 2、基于用户历史回答信息得到的用户感兴趣的关键字 / 主题信息\n", 3000 | "利用提供的answer文件,利用当前用户的历史回答信息(注意时序),得到用户感兴趣的关键字、主题信息,并按照出现频率为不同的主题、关键字设置权重,出现频率越高代表用户对这个主题的关注越高,设置的权重越高" 3001 | ] 3002 | }, 3003 | { 3004 | "cell_type": "code", 3005 | "execution_count": null, 3006 | "metadata": { 3007 | "collapsed": true 3008 | }, 3009 | "outputs": [], 3010 | "source": [ 3011 | "def getHistInfo(data, data_ans, info_type, dayn=30):\n", 3012 | "\n", 3013 | " def getUserhistTopic_all(dataf, data_ans, info_type, dayn=30, testflag=False):\n", 3014 | " rs = dataf[[]]\n", 3015 | " \n", 3016 | " rs[info_type[0]] = np.nan \n", 3017 | " result = {}\n", 3018 | " for day in tqdm.tqdm_notebook(dataf.loc[dataf['label']!=-1, 'inviteday'].unique()):\n", 3019 | " rs.loc[dataf['inviteday']==day, info_type[0]] = getlastndayHisttopic(data_ans.loc[(data_ans['answerday']=day-dayn)],\n", 3020 | " dataf.loc[dataf['inviteday']==day], info_type[1]) \n", 3021 | "\n", 3022 | " rs.loc[dataf['label']==-1, info_type[0]] = getlastndayHisttopic(data_ans.loc[(data_ans['answerday']<3868)&(data_ans['answerday']>=3868-dayn)],\n", 3023 | " dataf[dataf['label']==-1], info_type[1])\n", 3024 | " return rs\n", 3025 | "\n", 3026 | " #得到过去历史信息 \n", 3027 | " def getlastndayHisttopic(data_ans, dataf, fea): \n", 3028 | " gps = data_ans[['writerId', fea]].groupby('writerId')\n", 3029 | " rs = []\n", 3030 | " temp_dic = {}\n", 3031 | " for uid in dataf['writerId'].values:\n", 3032 | " topics = {}\n", 3033 | " try:\n", 3034 | " if uid in temp_dic:\n", 3035 | " rs.append(temp_dic[uid]) \n", 3036 | " else: \n", 3037 | " for i in gps.get_group(uid)[fea]:\n", 3038 | " tps = i.split(',')\n", 3039 | " for tp in tps:\n", 3040 | " if tp not in topics:\n", 3041 | " topics[tp] = 1\n", 3042 | " else:\n", 3043 | " topics[tp] += 1 \n", 3044 | "\n", 3045 | " topics = pd.Series(topics)\n", 3046 | " topics =(topics/topics.mean()).round(3).to_dict()\n", 3047 | " topics = sorted(topics.items(),key=lambda x: x[1], reverse=True)[:100]\n", 3048 | " temp = []\n", 3049 | " for t in topics:\n", 3050 | " temp.append(t[0]+ ':' +str(t[1]))\n", 3051 | " rs.append(','.join(temp)) \n", 3052 | " temp_dic[uid] = ','.join(temp)\n", 3053 | " except:\n", 3054 | " rs.append('-1')\n", 3055 | " temp_dic[uid] = '-1'\n", 3056 | " return rs\n", 3057 | " \n", 3058 | " type_dic = {'topics':['hist_user_themes', 'themeId'], 'words':['hist_user_unlike_themes', 'words_list']}\n", 3059 | " user_topic = getUserhistTopic_all(data, data_ans, type_dic[info_type], dayn=dayn)\n", 3060 | " return user_topic\n", 3061 | "\n", 3062 | "data_ans = pd.read_csv(datapath+'data/data_answer.csv')\n", 3063 | "data_question = pd.read_csv(datapath+'data/data_question.csv')\n", 3064 | "data_ans = data_ans.merge(data_question[['qId', 'themeId']],on = 'qId',how = 'left')\n", 3065 | "# 加载word\n", 3066 | "data_question = 
pd.read_csv(datapath+'data/data_q_title_tfidf.csv')\n", 3067 | "data_question.columns = ['index', 'qId', 'title_words', 'describe_words']\n", 3068 | "data_ans = data_ans.merge(data_question[['describe_words', 'qId']],on = 'qId',how = 'left').rename(columns = {'describe_words':'words_list'})\n", 3069 | "\n", 3070 | "# data_test 表示要处理的数据\n", 3071 | "df['user_topics'] = getHistInfo(data, data_ans, 'topics')\n", 3072 | "df['user_words'] = getHistInfo(data, data_ans, 'words')" 3073 | ] 3074 | }, 3075 | { 3076 | "cell_type": "markdown", 3077 | "metadata": {}, 3078 | "source": [ 3079 | "### 3、基于用户历史未回答信息得到的用户未点击问题的关键字 / 主题信息\n", 3080 | "利用当前用户的历史未回答的信息(注意时序),得到用户讨厌的关键字、主题信息,并按照出现频率为不同的主题、关键字设置权重,出现频率越高代表用户对这个主题的讨厌度越高,设置的权重越高" 3081 | ] 3082 | }, 3083 | { 3084 | "cell_type": "code", 3085 | "execution_count": null, 3086 | "metadata": { 3087 | "collapsed": true 3088 | }, 3089 | "outputs": [], 3090 | "source": [ 3091 | "def getUnlikeInfo(data, data_ans, info_type, dayn=7): # data[data['label']==0]\n", 3092 | " \n", 3093 | " def getUserhistTopic_F_all(dataf, data_ans, info_type, dayn=7):\n", 3094 | " rs = dataf[[]]\n", 3095 | " rs[info_type[0]] = np.nan \n", 3096 | " result = {}\n", 3097 | "\n", 3098 | " for day in tqdm.tqdm_notebook(dataf.loc[dataf['label']!=-1, 'inviteday'].unique()):\n", 3099 | " if day>3838:\n", 3100 | " rs.loc[dataf['inviteday']==day, info_type[0]] = getlastndayHisttopic(data_ans.loc[(data_ans['inviteday']=day-dayn)],\n", 3101 | " dataf.loc[dataf['inviteday']==day], info_type[0])\n", 3102 | "\n", 3103 | " rs.loc[dataf['label']==-1, info_type[0]] = getlastndayHisttopic(data_ans.loc[(data_ans['inviteday']<3868)&(data_ans['inviteday']>=3868-dayn)],\n", 3104 | " dataf[dataf['label']==-1], info_type[1])\n", 3105 | " return rs\n", 3106 | "\n", 3107 | " #得到过去历史信息 \n", 3108 | " def getlastndayHisttopic(data_ans, dataf, fea): \n", 3109 | " gps = data_ans[['writerId', fea]].groupby('writerId')\n", 3110 | " rs = []\n", 3111 | " temp_dic = {}\n", 3112 | " for uid in dataf['writerId'].values:\n", 3113 | " topics = {}\n", 3114 | " try:\n", 3115 | " if uid in temp_dic:\n", 3116 | " rs.append(temp_dic[uid]) \n", 3117 | " else: \n", 3118 | " for i in gps.get_group(uid)[fea]:\n", 3119 | " tps = i.split(',')\n", 3120 | " for tp in tps:\n", 3121 | " if tp not in topics:\n", 3122 | " topics[tp] = 1\n", 3123 | " else:\n", 3124 | " topics[tp] += 1 \n", 3125 | "\n", 3126 | " topics = pd.Series(topics)\n", 3127 | " topics =(topics/topics.mean()).round(3).to_dict()\n", 3128 | " topics = sorted(topics.items(),key=lambda x: x[1], reverse=True)[:100]\n", 3129 | " temp = []\n", 3130 | " for t in topics:\n", 3131 | " temp.append(t[0]+ ':' +str(t[1]))\n", 3132 | " rs.append(','.join(temp)) \n", 3133 | " temp_dic[uid] = ','.join(temp)\n", 3134 | "\n", 3135 | " except:\n", 3136 | " #print('error')\n", 3137 | " rs.append('-1')\n", 3138 | " return rs\n", 3139 | " \n", 3140 | " type_dic = {'topics':['hist_user_unlike_themes', 'themeId'], 'words':['hist_user_unlike_themes', 'words_list']}\n", 3141 | " user_unlike_topic = getUserhistTopic_F_all(data, data_ans, type_dic[info_type], dayn=dayn)\n", 3142 | " return user_unlike_topic\n", 3143 | "\n", 3144 | "unlikeData = data[data['label']==0] ## 用户未回答(label=0)的数据\n", 3145 | "df['user_unlike_topic'] = getUnlikeTopics(data, unlikeData, 'topics')\n", 3146 | "df['user_unlike_word'] = getUnlikeWord(data, unlikeData, 'words')" 3147 | ] 3148 | }, 3149 | { 3150 | "cell_type": "code", 3151 | "execution_count": null, 3152 | "metadata": { 3153 | "collapsed": true 3154 | }, 3155 | 
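Both `getHistInfo` and `getUnlikeInfo` above serialise a user's historical tags the same way: count each tag over the window, divide by the mean count, keep the top 100, and join them as `tag:weight` strings. A minimal sketch with a hypothetical history list:

```python
import pandas as pd

history_themes = ["T1,T2", "T1,T3", "T1", "T2"]      # one user's recent records (toy data)
tags = pd.Series([t for row in history_themes for t in row.split(",")])
counts = tags.value_counts()                         # already sorted by frequency
weights = (counts / counts.mean()).round(3)          # frequency relative to the mean
encoded = ",".join(f"{t}:{w}" for t, w in weights.head(100).items())
print(encoded)                                       # 'T1:1.5,T2:1.0,T3:0.5'
```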
"outputs": [], 3156 | "source": [ 3157 | "df = memoryOptimization(df,np.float32)\n", 3158 | "with open(datapath+'df/df_nn.pkl','wb') as f:\n", 3159 | " pickle.dump(df,f)\n", 3160 | "del df" 3161 | ] 3162 | }, 3163 | { 3164 | "cell_type": "markdown", 3165 | "metadata": {}, 3166 | "source": [ 3167 | "### 4、deepWalk\n", 3168 | "\n", 3169 | "经过分析,邀请本身就蕴含有丰富的信息,这里利用deepwalk得到社交网络信息,提取问题跟用户邀请的信息,将用户和问题放在同一空间 \n", 3170 | "(本部分代码参考19年腾讯赛冠军队伍开源)" 3171 | ] 3172 | }, 3173 | { 3174 | "cell_type": "code", 3175 | "execution_count": null, 3176 | "metadata": { 3177 | "collapsed": true 3178 | }, 3179 | "outputs": [], 3180 | "source": [ 3181 | "def get_sentences(dataf,present,target):\n", 3182 | " sentences=[]\n", 3183 | " dic={}\n", 3184 | " day=0\n", 3185 | "\n", 3186 | " for item in iter(dataf[['inviteday',present,target]].values):\n", 3187 | " if day!=item[0]:\n", 3188 | " for key in iter(dic):\n", 3189 | " sentences.append(dic[key])\n", 3190 | " dic={}\n", 3191 | " day=item[0]\n", 3192 | " try:\n", 3193 | " dic[item[1]].append(str(item[2]))\n", 3194 | " except:\n", 3195 | " dic[item[1]]=[str(item[2])]\n", 3196 | " for key in iter(dic):\n", 3197 | " sentences.append(dic[key]) \n", 3198 | " random.shuffle(sentences)\n", 3199 | " return sentences\n", 3200 | "\n", 3201 | "def get_w2v(data_w2v, model,fea,Type, flag):\n", 3202 | " dic = {'qId':'item','writerId':'user'}\n", 3203 | " values = data_w2v[fea].unique()\n", 3204 | " w2v=[]\n", 3205 | " for v in iter(values): \n", 3206 | " try:\n", 3207 | " a=[str(v)]\n", 3208 | " if flag:\n", 3209 | " v = dic[fea]+'_'+str(v)\n", 3210 | " a.extend(model[str(v)])\n", 3211 | " w2v.append(a)\n", 3212 | " except:\n", 3213 | " pass\n", 3214 | " return pd.DataFrame(w2v,columns = [fea]+[fea+'_'+Type+str(i) for i in range(32)])\n", 3215 | "\n", 3216 | "def get_sentences_deepwalk(log,f1,f2):\n", 3217 | " #构建图\n", 3218 | " dic={}\n", 3219 | " for item in iter(log[[f1,f2]].values):\n", 3220 | "# try:\n", 3221 | "# str(item[1])\n", 3222 | "# str(item[0])\n", 3223 | "# except:\n", 3224 | "# continue\n", 3225 | " try:\n", 3226 | " dic['item_'+str((item[1]))].add('user_'+str((item[0])))\n", 3227 | " except:\n", 3228 | " dic['item_'+str((item[1]))]=set(['user_'+str((item[0]))])\n", 3229 | " try:\n", 3230 | " dic['user_'+str((item[0]))].add('item_'+str((item[1])))\n", 3231 | " except:\n", 3232 | " dic['user_'+str((item[0]))]=set(['item_'+str((item[1]))])\n", 3233 | " dic_cont={}\n", 3234 | " for key in iter(dic):\n", 3235 | " dic[key]=list(dic[key])\n", 3236 | " dic_cont[key]=len(dic[key])\n", 3237 | " print(\"creating\") \n", 3238 | " #构建路径\n", 3239 | " path_length=10 \n", 3240 | " sentences=[]\n", 3241 | " length=[]\n", 3242 | " for key in iter(dic):\n", 3243 | " sentence=[key]\n", 3244 | " while len(sentence)!=path_length:\n", 3245 | " key=dic[sentence[-1]][random.randint(0,dic_cont[sentence[-1]]-1)]\n", 3246 | " if len(sentence)>=2 and key == sentence[-2]:\n", 3247 | " break\n", 3248 | " else:\n", 3249 | " sentence.append(key)\n", 3250 | " sentences.append(sentence)\n", 3251 | " length.append(len(sentence))\n", 3252 | " if len(sentences)%100000==0:\n", 3253 | " print(len(sentences))\n", 3254 | " print(np.mean(length))\n", 3255 | " print(len(sentences))\n", 3256 | " random.shuffle(sentences)\n", 3257 | " return sentences\n", 3258 | "\n", 3259 | "model_deepwalk_sentence = get_sentences_deepwalk(data.sort_values(by='inviteday'), 'writerId', 'qId',)\n", 3260 | "model_deepwalk = word2vec.Word2Vec(model_deepwalk_sentence,size = 32,window = 4,min_count = 1,sg = 1,workers = 15,iter = 
20)\n", 3261 | "print('model ok')\n", 3262 | "deepw_mId = get_w2v(data, model_deepwalk,'writerId','dw',flag = True)\n", 3263 | "deepw_qId = get_w2v(data, model_deepwalk,'qId','dw',flag = True)\n", 3264 | "deepw_mId.head()\n", 3265 | "\n", 3266 | "with open(datapath+'data/m_dwdf.pkl', 'wb') as f:\n", 3267 | " pickle.dump(deepw_mId, f)\n", 3268 | "with open(datapath+'data/q_dwdf.pkl', 'wb') as f:\n", 3269 | " pickle.dump(deepw_qId, f)" 3270 | ] 3271 | }, 3272 | { 3273 | "cell_type": "markdown", 3274 | "metadata": {}, 3275 | "source": [ 3276 | "### 5、类别映射字典\n", 3277 | "将字符串类型的特征转化为数字" 3278 | ] 3279 | }, 3280 | { 3281 | "cell_type": "code", 3282 | "execution_count": null, 3283 | "metadata": { 3284 | "collapsed": true 3285 | }, 3286 | "outputs": [], 3287 | "source": [ 3288 | "single_features = ['invitehour', 'createhour', 'inviteweekday', 'createweekday', 'sex', 'activity', 'bool_A',\n", 3289 | " 'bool_B', 'bool_C', 'bool_D', 'bool_E', 'category_E', 'category_A', 'category_B', 'category_C', 'category_D']\n", 3290 | "dic = {}\n", 3291 | "temp = data[data['label']!=-1]\n", 3292 | "for s in tqdm.tqdm_notebook(single_features):\n", 3293 | " dic[s] = {}\n", 3294 | " dic[s]['unk'] = 0\n", 3295 | " for i in temp[s].unique():\n", 3296 | " dic[s][str(i)] = len(dic[s])\n", 3297 | " \n", 3298 | "print('singlefeatures ok')\n", 3299 | "with open(datapath+'/data/dic_all.pkl', 'wb') as f:\n", 3300 | " pickle.dump(dic, f)" 3301 | ] 3302 | }, 3303 | { 3304 | "cell_type": "markdown", 3305 | "metadata": {}, 3306 | "source": [ 3307 | "### 6、相关文件\n", 3308 | "- word_weight_64.pkl 利用提供的word embedding 得到的嵌入矩阵\n", 3309 | "- word2index:word映射index文件\n", 3310 | "- topic_weight_64:利用提供的topic embedding 得到的嵌入矩阵\n", 3311 | "- topic2index:topic映射index的文件" 3312 | ] 3313 | }, 3314 | { 3315 | "cell_type": "code", 3316 | "execution_count": null, 3317 | "metadata": { 3318 | "collapsed": true 3319 | }, 3320 | "outputs": [], 3321 | "source": [ 3322 | "#### word_weight_64, word2index\n", 3323 | "text = codecs.open(datapath+'data/word_vectors_64d.txt').readlines()\n", 3324 | "text = [i.replace('\\t',' ') for i in text]\n", 3325 | "wf = codecs.open(datapath+'data/word_vectors_suntp.txt', 'w')\n", 3326 | "for i in text:\n", 3327 | " wf.write(i)\n", 3328 | "\n", 3329 | "glove_file = dpath(datapath+'data/word_vectors_suntp.txt', )\n", 3330 | "tmp_file = get_tmpfile(datapath+\"data/word_vectors_suntp_w2v.txt\")\n", 3331 | "glove2word2vec(glove_file, tmp_file)\n", 3332 | "vectors = Vectors(name='word_vectors_suntp_w2v.txt', cache=datapath+'/data')\n", 3333 | "\n", 3334 | "topic2index = dict()\n", 3335 | "topic2index['unk'] = 0\n", 3336 | "topic2index['pad'] = 1\n", 3337 | "t = vectors.stoi\n", 3338 | "for i in t:\n", 3339 | " topic2index[i] = t[i]+2\n", 3340 | " \n", 3341 | "a = torch.Tensor(2, 64).uniform_(-1,1)\n", 3342 | "weight = torch.cat([a,vectors.vectors], dim=0)\n", 3343 | "print(weight.size())\n", 3344 | "with open(datapath+'data/word_weight_64.pkl', 'wb') as f:\n", 3345 | " pickle.dump(weight, f)\n", 3346 | " \n", 3347 | "with open(datapath+'data/word2index.pkl', 'wb') as f:\n", 3348 | " pickle.dump(topic2index, f)\n", 3349 | " \n", 3350 | "###########topic_weight_64.pkl, topic2index\n", 3351 | "\n", 3352 | "text = codecs.open(datapath+'data/topic_vectors_64d.txt').readlines()\n", 3353 | "text = [i.replace('\\t',' ') for i in text]\n", 3354 | "wf = codecs.open(datapath+'data/topic_vectors_64d_sun.txt', 'w')\n", 3355 | "for i in text:\n", 3356 | " wf.write(i)\n", 3357 | "\n", 3358 | "glove_file = 
dpath(datapath+'data/topic_vectors_64d_sun.txt')\n", 3359 | "tmp_file = get_tmpfile(datapath+\"data/topic_vectors_64d_sun_w2v.txt\")\n", 3360 | "glove2word2vec(glove_file, tmp_file)\n", 3361 | "vectors = Vectors(name='topic_vectors_64d_sun_w2v.txt', cache=datapath+'/data')\n", 3362 | "\n", 3363 | "topic2index = dict()\n", 3364 | "topic2index['unk'] = 0\n", 3365 | "topic2index['pad'] = 1\n", 3366 | "t = vectors.stoi\n", 3367 | "for i in t:\n", 3368 | " topic2index[i] = t[i]+2\n", 3369 | " \n", 3370 | "a = torch.Tensor(2, 64).uniform_(-1,1)\n", 3371 | "weight = torch.cat([a,vectors.vectors], dim=0)\n", 3372 | "with open(datapath+'data/topic_weight_64.pkl', 'wb') as f:\n", 3373 | " pickle.dump(weight, f)\n", 3374 | " \n", 3375 | "with open(datapath+'data/topic2index.pkl', 'wb') as f:\n", 3376 | " pickle.dump(topic2index, f)" 3377 | ] 3378 | }, 3379 | { 3380 | "cell_type": "markdown", 3381 | "metadata": {}, 3382 | "source": [ 3383 | "# .模型训练(树模型&NN)" 3384 | ] 3385 | }, 3386 | { 3387 | "cell_type": "markdown", 3388 | "metadata": {}, 3389 | "source": [ 3390 | "## 1、稠密特征拼接" 3391 | ] 3392 | }, 3393 | { 3394 | "cell_type": "code", 3395 | "execution_count": null, 3396 | "metadata": { 3397 | "collapsed": true 3398 | }, 3399 | "outputs": [], 3400 | "source": [ 3401 | "#labelEncoder\n", 3402 | "labelfeatures = ['sex','activity','bool_A','bool_B','bool_C','bool_D','bool_E','category_E']\n", 3403 | "data[labelfeatures] = data[labelfeatures].fillna('-1')\n", 3404 | "for feature in labelfeatures:\n", 3405 | " le = LabelEncoder()\n", 3406 | " try:\n", 3407 | " data[feature] = le.fit_transform(data[feature].apply(int))\n", 3408 | " except:\n", 3409 | " data[feature] = le.fit_transform(data[feature])\n", 3410 | "\n", 3411 | "#拼接df\n", 3412 | "for feagp in ['1.1','1.2','1.3','2.1','2.2','2.3','2.4','3']:\n", 3413 | " with open(datapath+'data/df_%s.pkl' %feagp,'rb') as f:\n", 3414 | " df = pickle.load(f)\n", 3415 | " data = pd.concat([data.df],axis = 1)\n", 3416 | " print(feagp+' is ok')\n", 3417 | " del df\n", 3418 | "\n", 3419 | "del data['themeId']\n", 3420 | "del data['attentionthemes']\n", 3421 | "del data['likethemes']\n", 3422 | "data = memoryOptimization(data,np.float32)" 3423 | ] 3424 | }, 3425 | { 3426 | "cell_type": "markdown", 3427 | "metadata": {}, 3428 | "source": [ 3429 | "## 2、树模型训练" 3430 | ] 3431 | }, 3432 | { 3433 | "cell_type": "code", 3434 | "execution_count": null, 3435 | "metadata": { 3436 | "collapsed": true 3437 | }, 3438 | "outputs": [], 3439 | "source": [ 3440 | "train = data[data['label'] != -1]\n", 3441 | "# train = down_sample(train,train,rate = 3)\n", 3442 | "test = data[data['label'] == -1].reset_index(drop = True)\n", 3443 | "train_x = train.drop(['label'],axis = 1)\n", 3444 | "train_y = train['label']\n", 3445 | "test_x = test.drop(['label'],axis = 1)\n", 3446 | "del train\n", 3447 | "del test\n", 3448 | "gc.collect()\n", 3449 | "\n", 3450 | "mfeas = ['category_A','category_B','category_D','category_C','mostliketheme']\n", 3451 | "catefeas = ['sex','activity','bool_A','bool_B','bool_C','bool_D','bool_E','category_E']\n", 3452 | "\n", 3453 | "#lgb\n", 3454 | "pre_test,pre_train,score,fea_imp,iternum = lgb_train_pre1(train_x.drop(['qId','writerId'],axis = 1),train_y,\n", 3455 | " test_x.drop(['qId','writerId'],axis = 1),catefeas,dropfeas = list(set(['inviteweekday','createweekday']+mfeas)&set(list(train_x.columns))),one = False,save_model=True)\n", 3456 | "\n", 3457 | "pre_train.to_csv(datapath+'data/lgb_0.1.csv', index=False)\n", 3458 | "sub_sample = 
pd.read_csv(datapath+'invite_info_evaluate_2_0926.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime'])\n", 3459 | "sub_sample['label'] = pre_test\n", 3460 | "sub_sample.to_csv(datapath+'data/lgb_0.1.txt',sep = '\\t',header = False,index = False)\n", 3461 | "\n", 3462 | "#cat\n", 3463 | "pre_test,pre_train,score,fea_imp,iternum = cat_train_pre1(train_x.drop(['qId','writerId'],axis = 1),train_y,\n", 3464 | " test_x.drop(['qId','writerId'],axis = 1),catefeas,dropfeas = list(set(['inviteweekday','createweekday']+mfeas)&set(list(train_x.columns))),one = False,save_model=True)\n", 3465 | "pre_train.to_csv(datapath+'data/cat_0.1.csv', index=False)\n", 3466 | "sub_sample = pd.read_csv(datapath+'invite_info_evaluate_2_0926.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime'])\n", 3467 | "sub_sample['label'] = pre_test\n", 3468 | "sub_sample.to_csv(datapath+'data/cat_0.1.txt',sep = '\\t',header = False,index = False)\n", 3469 | "\n", 3470 | "#xgb\n", 3471 | "pre_test,pre_train,score,fea_imp,iternum = xgb_train_pre1(train_x.drop(['qId','writerId'],axis = 1),train_y,\n", 3472 | " test_x.drop(['qId','writerId'],axis = 1),dropfeas = list(set(['inviteweekday','createweekday']+mfeas+catefeas)&set(list(train_x.columns))),one = False,save_model=True)\n", 3473 | "pre_train.to_csv(datapath+'data/xgb_0.1.csv', index=False)\n", 3474 | "sub_sample = pd.read_csv(datapath+'invite_info_evaluate_2_0926.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime'])\n", 3475 | "sub_sample['label'] = pre_test\n", 3476 | "sub_sample.to_csv(datapath+'data/xgb_0.1.txt',sep = '\\t',header = False,index = False)" 3477 | ] 3478 | }, 3479 | { 3480 | "cell_type": "markdown", 3481 | "metadata": {}, 3482 | "source": [ 3483 | "## 3、nn部分特征拼接" 3484 | ] 3485 | }, 3486 | { 3487 | "cell_type": "markdown", 3488 | "metadata": {}, 3489 | "source": [ 3490 | "除了上面用到的稠密特征外,nn部分还用到了其它特征" 3491 | ] 3492 | }, 3493 | { 3494 | "cell_type": "code", 3495 | "execution_count": null, 3496 | "metadata": { 3497 | "collapsed": true 3498 | }, 3499 | "outputs": [], 3500 | "source": [ 3501 | "#nn\n", 3502 | "with open(datapath+'data/df_nn.pkl','rb') as f:\n", 3503 | " df = pickle.load(f)\n", 3504 | "data = pd.concat([data.df],axis = 1)\n", 3505 | "del df\n", 3506 | "\n", 3507 | "with open(datapath+'data/m_dwdf.pkl', 'rb') as f:\n", 3508 | " m_dwdf = pickle.load(f)\n", 3509 | "with open(datapath+'data/q_dwdf.pkl', 'rb') as f:\n", 3510 | " q_dwdf = pickle.load(f) \n", 3511 | "data = pd.merge(data, m_dwdf, on='writerId', how='left')\n", 3512 | "data = pd.merge(data, q_dwdf, on='qId', how='left')\n", 3513 | "\n", 3514 | "del m_dwdf\n", 3515 | "del q_dwdf\n", 3516 | "gc.collect()\n", 3517 | "data = memoryOptimization(data,np.float32)" 3518 | ] 3519 | }, 3520 | { 3521 | "cell_type": "markdown", 3522 | "metadata": {}, 3523 | "source": [ 3524 | "### 特征处理\n", 3525 | "- 1、类别特征处理, 转化成数值,方便进行embedding\n", 3526 | "- 2、数值特征归一化 \n", 3527 | "- 3、deepwalk 信息拼接" 3528 | ] 3529 | }, 3530 | { 3531 | "cell_type": "code", 3532 | "execution_count": null, 3533 | "metadata": { 3534 | "collapsed": true 3535 | }, 3536 | "outputs": [], 3537 | "source": [ 3538 | "#类别特征处理, 转化成数值,方便进行embedding\n", 3539 | "with open(datapath+'data/dic_all.pkl', 'rb') as f:\n", 3540 | " dic = pickle.load(f)\n", 3541 | "\n", 3542 | "single_features = ['invitehour', 'createhour', 'inviteweekday', 'createweekday', 'sex', 'activity', 'bool_A',\n", 3543 | " 'bool_B', 'bool_C', 'bool_D', 'bool_E', 'category_E', 'category_A', 'category_B', 'category_C', 
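Both feature-joining cells above (the `df_%s.pkl` loop and the `df_nn.pkl` cell) read `data = pd.concat([data.df], axis=1)`, which would raise an `AttributeError` because `df` is not an attribute of `data`; presumably the intended call is `pd.concat([data, df], axis=1)`, i.e. a row-aligned column-wise join. A sketch of the intended loop, reusing `datapath` and `data` from the earlier cells:

```python
import pickle
import pandas as pd

# Column-wise join of the precomputed feature blocks onto the base frame.
# Assumes each df_<group>.pkl is row-aligned with `data` (same length and order),
# which is what axis=1 concatenation relies on.
for feagp in ['1.1', '1.2', '1.3', '2.1', '2.2', '2.3', '2.4', '3']:
    with open(datapath + 'data/df_%s.pkl' % feagp, 'rb') as f:
        df = pickle.load(f)
    data = pd.concat([data, df], axis=1)   # note: [data, df], not [data.df]
    del df
```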
'category_D']\n", 3544 | "for i in single_features:\n", 3545 | " data[i] = data[i].apply(lambda x: dic[i][str(x)] if str(x) in dic[i] else 0)\n", 3546 | "\n", 3547 | "#数值特征归一化 \n", 3548 | "new_dense = ['qId_bool_B_last7day_count', 'writerId_title_words_deltop150_lastnday_labelcount1', 'hourlenfromlastinvite_qId', 'writerId_createhour_last7day_count_gp_qId', 'collect_sum', 'thanks_max', 'qId_mostliketheme_last7day_count', 'qId_bool_C_label_count', 'thumbs_up_max', 'writerId_createweekday_label_ctr', 'qId_count', 'title_words_tfidf1_label_count1', 'writerId_themeId_lastnday_labelcount3', 'mostliketheme_curdayinv_count', 'themeId_label_ctr1', 'writerId_title_words_deltop150_lastnday_labelctr0', 'writerId_themeId_lastnday_labelctr_mean', 'isgood_sum', 'qId_label_count_gp_writerId', 'qId_invitehour_last7day_count_gp_writerId', 'nohelp_mean', 'activity_curdayinv_count', 'writerId_last3invnum2', 'qId_count_gp_writerId', 'writerId_title_words_deltop150_lastnday_labelctr2', 'qId_sex_last7day_count', 'createhour_curdayinv_count', 'qId_label_ctr_gp_writerId', 'q_class_300_bool_B_label_count', 'category_E_curdayinv_count', 'writerId_count', 'qId_inviteweekday_label_ctr', 'qId_last3+1invnum_mean', 'qId_category_B_last7day_count_gp_writerId', 'qId_bool_D_label_ctr', 'q_class_300_category_D_label_count', 'themeId_label_count0', 'q_class_300_bool_C_label_count', 'themeId_label_ctr2', 'writerId_themeId_lastnday_labelcount4', 'qId_invitehour_label_ctr', 'category_A_createdaylastweek2label_count', 'writerId_inviteweekday_label_count', 'qId_invitedaylastweek2label_count', 'category_A_invitedaylastweek2label_rate', 'title_words_tfidf1_label_count2', 'attentionthemes_label_ctr1', 'writerId_themeId_lastnday_labelctr0', 'writerId_title_words_deltop150_lastnday_labelctr4', 'comment_sum', 'quzan_max', 'qId_invitehour_last7day_count', 'writerId_createday_last7day_count', 'qId_activeday_inv', 'qId_bool_B_last7day_count_gp_writerId', 'attentionthemes_label_count1', 'qId_last7day_count', 'writerId_createdaylastweek2label_count', 'qId_mostliketheme_last7day_count_gp_writerId', 'category_D_label_ctr', 'themeId_label_count2', 'writerId_title_words_deltop150_lastnday_labelcount2', 'qId_category_B_last7day_count', 'writerId_invitehour_label_count', 'qId_bool_E_last7day_count_gp_writerId', 'opposition_max', 'writerId_title_words_deltop150_lastnday_labelctr3', 'istabel_sum', 'q_class_300_bool_E_label_ctr', 'istabel_mean', 'mostliketheme_count', 'writerId_q_class_300_label_count', 'writerId_themeId_lastnday_labelctr3', 'createday_curdayinv_count', 'qId_last3+1invnum_std', 'invitehour_count', 'qId_last7count_gp_writerId', 'category_D_createdaylastweek2label_rate', 'quzan_sum', 'writerId_themeId_lastnday_labelcount1', 'writerId_createhour_last7day_count', 'writerId_createhour_label_count', 'writerId_last3invnum1', 'category_C_last7day_count', 'writerId_curcount_gp_qId', 'qId_bool_D_label_count', 'qId_bool_D_last7day_count', 'sex_last7day_count', 'activity_createdaylastweek2label_rate', 'writerId_createdaylastweek2label_rate', 'yanzhi_count', 'activity_createdaylastweek2label_count', 'qId_activeday_asw', 'category_C_label_ctr', 'writerId_last3+1invnum_std', 'writerId_themeId_lastnday_labelctr1', 'category_A_invitedaylastweek2label_count', 'thumbs_up_mean', 'qId_label_count', 'writerId_label_count', 'writerId_createday_last7day_count_gp_qId', 'thumbs_up_sum', 'report_mean', 'qId_bool_A_last7day_count_gp_writerId', 'writerId_createweekday_last7day_count_gp_qId', 'qId_bool_B_label_ctr', 'samenum_like', 'isvideo_max', 
'simhis_base_theme_theme_itemcf', 'writerId_invitehour_label_ctr', 'title_words_tfidf1_label_ctr2', 'writerId_q_class_300_label_ctr', 'qId_last3invnum2', 'writerId_curdayinv_count', 'writerId_themeId_lastnday_labelcount0', 'qId_activity_last7day_count_gp_writerId', 'qId_bool_C_last7day_count', 'title_words_tfidf1_label_ctr0', 'writerId_invitedaylastweek2label_count', 'writerId_title_words_deltop150_lastnday_labelcount4', 'writerId_themeId_lastnday_labelcount_sum', 'opposition_sum', 'simhis_base_theme_theme_noclick', 'writerId_themeId_lastnday_labelcount2', 'qId_category_E_last7day_count_gp_writerId', 'writerId_last7count_gp_qId', 'writerId_createweekday_last7day_count', 'qId_bool_C_label_ctr', 'q_class_300_bool_B_label_ctr', 'title_words_tfidf1_label_count0', 'qId_category_C_last7day_count_gp_writerId', 'qId_category_E_last7day_count', 'topic_sim', 'mostliketheme_label_ctr', 'isvideo_sum', 'q_class_300_category_D_label_ctr', 'writerId_themeId_lastnday_labelctr2', 'comment_max', 'quzan_mean', 'qId_bool_E_label_ctr', 'createweekday_curdayinv_count', 'hourlenfromlastinvite_writerId', 'qId_bool_A_last7day_count', 'createhour_label_ctr', 'attentionthemes_label_count2', 'isrec_mean', 'writerId_last7day_count', 'length_max', 'qId_category_D_last7day_count_gp_writerId', 'writerId_label_ctr_gp_qId', 'writerId_count_gp_qId', 'qId_curdayinv_count', 'writerId_inviteweekday_label_ctr', 'writerId_activeday_asw', 'writerId_themeId_lastnday_labelcount_mean', 'writerId_activeday_inv', 'writerId_invitehour_last7day_count_gp_qId', 'category_D_invitedaylastweek2label_count', 'collect_max', 'attentionthemes_label_ctr2', 'q_class_300_bool_A_label_ctr', 'report_sum', 'createhour_count', 'title_words_tfidf1_label_ctr_mean', 'writerId_invitedaylastweek2label_rate', 'istabel_max', 'bool_D_curdayinv_count', 'yanzhi_label_ctr', 'isimage_sum', 'qId_invitedaylastweek2label_rate', 'category_A_createdaylastweek2label_rate', 'comment_mean', 'q_life', 'length_mean', 'activity_label_ctr', 'qId_invitehour_label_count', 'createhour_label_count', 'category_D_createdaylastweek2label_count', 'qId_yanzhi_last7day_count', 'themeId_label_count1', 'isimage_max', 'writerId_label_count_gp_qId', 'yanzhi_curdayinv_count', 'writerId_createweekday_label_count', 'title_words_tfidf1_label_count_mean', 'writerId_title_words_deltop150_lastnday_labelcount3', 'invitehour_label_count', 'mostliketheme_label_count', 'activity_last7day_count', 'qId_last3invnum1', 'nohelp_sum', 'createday_count', 'qId_yanzhi_last7day_count_gp_writerId', 'qId_inviteweekday_label_count', 'writerId_label_ctr', 'writerId_createhour_label_ctr', 'qId_bool_B_label_count', 'length_sum', 'qId_category_D_last7day_count', 'q_class_300_bool_E_label_count', 'q_class_300_bool_A_label_count', 'qId_category_C_last7day_count', 'writerId_title_words_deltop150_lastnday_labelcount0', 'writerId_last3+1invnum_mean', 'isrec_sum', 'thanks_sum', 'invitehour_curdayinv_count', 'writerId_last3invnum0', 'yanzhi_label_count', 'qId_sex_last7day_count_gp_writerId', 'qId_last3invnum0', 'samenum_atten', 'themeId_label_ctr0', 'simhis_base_title_title_itemcf', 'qId_bool_E_last7day_count', 'category_D_invitedaylastweek2label_rate', 'writerId_title_words_deltop150_lastnday_labelctr1', 'collect_mean', 'attentionthemes_label_ctr0', 'qId_label_ctr', 'attentionthemes_label_count0', 'q_class_300_bool_C_label_ctr', 'isimage_mean', 'qId_category_A_last7day_count', 'yanzhi_d_last7day_count', 'nohelp_max', 'qId_bool_E_label_count', 'invitehour_label_ctr', 'writerId_invitehour_last7day_count', 
'qId_bool_D_last7day_count_gp_writerId', 'qId_activity_last7day_count', 'isgood_max', 'qId_category_A_last7day_count_gp_writerId', 'thanks_mean', 'report_max', 'isgood_mean', 'category_C_curdayinv_count', 'isvideo_mean', 'opposition_mean', 'qId_curcount_gp_writerId', 'isrec_max', 'title_words_tfidf1_label_ctr1', 'qId_bool_C_last7day_count_gp_writerId', 'writerId_themeId_lastnday_labelctr4']\n", 3549 | "num_dic = {}\n", 3550 | "for fea in tqdm.tqdm_notebook(new_dense):\n", 3551 | " try:\n", 3552 | " scaler_val = data[fea][~data[fea].isnull()].values\n", 3553 | " scaler = StandardScaler().fit(scaler_val.reshape((len(scaler_val), 1)))\n", 3554 | " num_dic[fea] = scaler\n", 3555 | " data[fea].fillna(scaler.mean_[0], inplace=True)\n", 3556 | " data[fea] = scaler.transform(data[fea].values.reshape((len(data), 1))).reshape((len(data),)).tolist()\n", 3557 | " except:\n", 3558 | " print(fea)\n", 3559 | "del scaler_val, scaler\n", 3560 | "gc.collect()" 3561 | ] 3562 | }, 3563 | { 3564 | "cell_type": "markdown", 3565 | "metadata": {}, 3566 | "source": [ 3567 | "## 4、nn 训练" 3568 | ] 3569 | }, 3570 | { 3571 | "cell_type": "markdown", 3572 | "metadata": {}, 3573 | "source": [ 3574 | "### Model" 3575 | ] 3576 | }, 3577 | { 3578 | "cell_type": "code", 3579 | "execution_count": null, 3580 | "metadata": { 3581 | "collapsed": true 3582 | }, 3583 | "outputs": [], 3584 | "source": [ 3585 | "import torch\n", 3586 | "import torch.nn as nn\n", 3587 | "import pandas as pd\n", 3588 | "import random\n", 3589 | "import pickle\n", 3590 | "import torch.nn.functional as F\n", 3591 | "import tqdm\n", 3592 | "from sklearn.metrics import roc_auc_score\n", 3593 | "from optimizer import Lookahead\n", 3594 | "from optimizer import RAdam\n", 3595 | "import numpy as np\n", 3596 | "import os\n", 3597 | "import torch.utils.data as Data\n", 3598 | "import gc\n", 3599 | "from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler\n", 3600 | "from sklearn.model_selection import train_test_split, StratifiedKFold\n", 3601 | "import codecs" 3602 | ] 3603 | }, 3604 | { 3605 | "cell_type": "code", 3606 | "execution_count": null, 3607 | "metadata": { 3608 | "collapsed": true 3609 | }, 3610 | "outputs": [], 3611 | "source": [ 3612 | "#mask generate\n", 3613 | "def sequence_mask(embed, feature):\n", 3614 | " \n", 3615 | " mask = (feature!=1).unsqueeze(-1).expand_as(embed).float()\n", 3616 | " return embed*mask\n", 3617 | "\n", 3618 | "class TextCNN(nn.Module):\n", 3619 | " def __init__(self, args):\n", 3620 | " super(TextCNN, self).__init__()\n", 3621 | " self.args = args\n", 3622 | "\n", 3623 | " chanel_num = 1\n", 3624 | " filter_num = args['filter_num']\n", 3625 | " filter_sizes = args['filter_sizes']\n", 3626 | "\n", 3627 | " vocabulary_size = args['vocabulary_size']\n", 3628 | " embedding_dimension = args['embedding_dim']\n", 3629 | " self.embedding = nn.Embedding(vocabulary_size, embedding_dimension)\n", 3630 | " #self.embedding = self.embedding.from_pretrained(args.vectors, freeze=False)\n", 3631 | " self.embedding.weight.data.copy_(args['pretrained_weight'])\n", 3632 | " \n", 3633 | " self.convs = nn.ModuleList([nn.Conv2d(1, filter_num, (size, embedding_dimension)) for size in filter_sizes])\n", 3634 | " self.dropout = nn.Dropout(args['dropout'])\n", 3635 | " #self.fc = nn.Linear(len(filter_sizes)*filter_num, class_num)\n", 3636 | " self.feature_num = len(filter_sizes)*filter_num*4\n", 3637 | " def forward(self, x, y):\n", 3638 | "\n", 3639 | " x = self.embedding(x)\n", 3640 | " x = x.unsqueeze(1)\n", 3641 | " y = 
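The normalization loop above fits one `StandardScaler` per dense feature on the non-null values, imputes missing entries with that scaler's mean, and then standardizes the whole column. A self-contained sketch of the same pattern on a toy column (the column name and values are hypothetical):

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

toy = pd.DataFrame({'qId_count': [1.0, 3.0, np.nan, 5.0]})

scalers = {}
for fea in ['qId_count']:
    vals = toy[fea][~toy[fea].isnull()].values
    scaler = StandardScaler().fit(vals.reshape(-1, 1))        # fit on observed values only
    scalers[fea] = scaler                                     # kept so the test set can reuse it
    toy[fea] = toy[fea].fillna(scaler.mean_[0])               # impute with the fitted mean
    toy[fea] = scaler.transform(toy[fea].values.reshape(-1, 1)).ravel()

print(toy['qId_count'].tolist())   # the mean-imputed row standardizes to ~0
```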
self.embedding(y)\n", 3642 | " y = y.unsqueeze(1)\n", 3643 | " \n", 3644 | " x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]\n", 3645 | " x = [F.max_pool1d(item, item.size(2)).squeeze(2) for item in x]\n", 3646 | " x = torch.cat(x, 1)\n", 3647 | " y = [F.relu(conv(y)).squeeze(3) for conv in self.convs]\n", 3648 | " y = [F.max_pool1d(item, item.size(2)).squeeze(2) for item in y]\n", 3649 | " y = torch.cat(y, 1)\n", 3650 | " rs = torch.cat([x, y, torch.abs(x-y), x*y], 1)\n", 3651 | " return rs\n", 3652 | "\n", 3653 | "class xDeepFM(nn.Module):\n", 3654 | " \n", 3655 | " def __init__(self, params):\n", 3656 | " super(xDeepFM, self).__init__()\n", 3657 | " self.device = params['device']\n", 3658 | " #self.mlp_input_dim = params['field_size'] * params['embedding_size']\n", 3659 | " self.k = params['k'] ##\n", 3660 | " self.dic = params['dic'] # 字典\n", 3661 | " self.single_features = params['single_features'] #\n", 3662 | " self.muti_features = params['muti_features']#\n", 3663 | " self.num_features = params['num_features']\n", 3664 | " self.cross_features = params['cross_features']\n", 3665 | " self.topic_features = params['topic_features']\n", 3666 | " self.other_features = params['other_features']\n", 3667 | " self.l2 = params['l2']\n", 3668 | " self.norm = params['normNum']\n", 3669 | " self.usetext = params['usetext']\n", 3670 | " self.usecin = params['usecin']\n", 3671 | " self.usetopic = params['usetopic']\n", 3672 | " self.useword = params['useword']\n", 3673 | " self.word_features = params['word_features']\n", 3674 | " #textcnn\n", 3675 | " if params['usetext']:\n", 3676 | " self.textcnn = TextCNN(params['textargs'])\n", 3677 | " self.textlen = self.textcnn.feature_num\n", 3678 | " \n", 3679 | " #mem\n", 3680 | " self.usemem = params['usemem']\n", 3681 | " if self.usemem:\n", 3682 | " mem_dim = 0 #64*2\n", 3683 | " \n", 3684 | " self.linear1 = nn.Linear(128, 64)\n", 3685 | " self.sig = nn.Linear(128+64,64)\n", 3686 | " self.mem1 = nn.Embedding(len(self.dic['writerId']), 64)\n", 3687 | " self.mem0 = nn.Embedding(len(self.dic['writerId']), 64)\n", 3688 | " else:\n", 3689 | " mem_dim = 0\n", 3690 | " \n", 3691 | " first_orders = nn.ModuleDict()\n", 3692 | " second_orders = nn.ModuleDict()\n", 3693 | " ## feature -index\n", 3694 | " feature_index = {}\n", 3695 | " for s in self.single_features+self.num_features + self.other_features:\n", 3696 | " feature_index[s] = [len(feature_index), len(feature_index)+1]\n", 3697 | "\n", 3698 | " if 'writerId' in self.single_features:\n", 3699 | " self.single_features.remove('writerId') \n", 3700 | " \n", 3701 | " temp_index = 0\n", 3702 | " if self.usetopic:\n", 3703 | " temp_index = len(feature_index)+100\n", 3704 | " feature_index['attentionthemes'] = [len(feature_index), temp_index] \n", 3705 | " feature_index['themeId'] = [temp_index, temp_index+13] \n", 3706 | " feature_index['likethemes'] = [temp_index+13, temp_index+13+10]\n", 3707 | " feature_index['likethemes_att'] = [temp_index+13+10, temp_index+13+10+10]\n", 3708 | " #hist_user_themes\n", 3709 | " feature_index['hist_user_themes'] = [temp_index+13+10+10, temp_index+13+10+10+10]\n", 3710 | " feature_index['hist_user_themes_att'] = [temp_index+13+10+10+10, temp_index+13+10+10+10+10] \n", 3711 | " if self.usetext:\n", 3712 | " feature_index['m_interest_topic'] = feature_index['likethemes']\n", 3713 | " feature_index['q_topic'] = feature_index['themeId'] \n", 3714 | " if self.useword:\n", 3715 | " if temp_index>0:\n", 3716 | " feature_index['hist_user_words'] = [temp_index+13+10+10+10+10, 
temp_index+13+10+10+10+10 + 20 ] # 20\n", 3717 | " feature_index['hist_user_words_att'] = [temp_index+13+10+10+10+10 + 20, temp_index+13+10+10+10+10 + 20 + 20 ]\n", 3718 | " feature_index['all_words'] = [temp_index+13+10+10+10+10 + 20 + 20, temp_index+13+10+10+10+10 + 20 + 20 + 10 ] # 20 \n", 3719 | " feature_index['hist_user_unlike_themes'] = [temp_index+13+10+10+10+10 + 20 + 20 + 10, temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 ] # 20\n", 3720 | " feature_index['hist_user_unlike_themes_att'] = [temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10, temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 + 10 ] # 20\n", 3721 | " feature_index['hist_user_unlike_words'] = [temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 + 10, temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 + 10 + 20 ] # 20\n", 3722 | " feature_index['hist_user_unlike_words_att'] = [temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 + 10 + 20, temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 + 10 + 20 +20 ] # 20\n", 3723 | " \n", 3724 | " self.feature_index = feature_index ## index\n", 3725 | " \n", 3726 | " for s in self.single_features:\n", 3727 | " first_orders[s] = nn.Embedding(len(self.dic[s]), 1)\n", 3728 | " nn.init.normal_(first_orders[s].weight, mean=0, std=0.0001)\n", 3729 | " second_orders[s] = nn.Embedding(len(self.dic[s]), self.k)\n", 3730 | " nn.init.normal_(second_orders[s].weight, mean=0, std=0.0001)\n", 3731 | " \n", 3732 | " for s in self.muti_features:\n", 3733 | " first_orders[s] = nn.Embedding(len(self.dic[s])+2, 1)\n", 3734 | " nn.init.normal_(first_orders[s].weight, mean=0, std=0.0001)\n", 3735 | " second_orders[s] = nn.Embedding(len(self.dic[s])+2, self.k)\n", 3736 | " nn.init.normal_(second_orders[s].weight, mean=0, std=0.0001)\n", 3737 | " \n", 3738 | " self.first_orders = first_orders.to(self.device)\n", 3739 | " self.second_orders = second_orders.to(self.device)\n", 3740 | " \n", 3741 | " self.norm_num = nn.ModuleDict()\n", 3742 | " for s in self.num_features:\n", 3743 | " self.norm_num[s] = nn.BatchNorm1d(1)\n", 3744 | " \n", 3745 | " ######################################################dnn\n", 3746 | " self.p = params['p'] # drop_out\n", 3747 | " self.layers = params['layers']\n", 3748 | " self.input_dim = (len(self.single_features)+ len(self.muti_features) + len(self.topic_features) + len(self.word_features)) * self.k + len(self.num_features) + mem_dim #*self.k #* self.k #+ 2* self.k\n", 3749 | " self.deep_layers = nn.Sequential()\n", 3750 | " net_dims = [self.input_dim]+self.layers\n", 3751 | " for i in range(len(self.layers)):\n", 3752 | " self.deep_layers.add_module('fc%d' % (i+1), nn.Linear(net_dims[i], net_dims[i+1]))\n", 3753 | " self.deep_layers.add_module('bn%d' % (i+1), nn.BatchNorm1d(net_dims[i+1]))\n", 3754 | " self.deep_layers.add_module('relu%d' % (i+1), nn.ReLU()) \n", 3755 | " self.deep_layers.add_module('dropout%d' % (i+1), nn.Dropout(self.p)) \n", 3756 | " for name, tensor in self.deep_layers.named_parameters():\n", 3757 | " if 'weight' in name:\n", 3758 | " nn.init.normal_(tensor, mean=0, std=0.0001)\n", 3759 | " self.deep_layers = self.deep_layers.to(self.device)\n", 3760 | " \n", 3761 | " ## topic \n", 3762 | " if params['usetopic']:\n", 3763 | " self.topic_weight = nn.Embedding(params['textargs']['vocabulary_size'], 64)\n", 3764 | " self.topic_weight.weight.data.copy_(params['textargs']['pretrained_weight'])\n", 3765 | " self.topic_linear = nn.Sequential(nn.Linear(64, self.k), nn.ReLU())\n", 3766 | " \n", 3767 | " #word\n", 3768 | " if self.useword:\n", 3769 | " temp_weight = 
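The deep tower above is built from the `layers` list as repeated Linear → BatchNorm1d → ReLU → Dropout blocks whose weights are initialized from N(0, 1e-4). A compact sketch of the same builder with hypothetical dimensions:

```python
import torch
import torch.nn as nn

def build_deep_tower(input_dim, layers, p=0.1):
    """Linear -> BatchNorm1d -> ReLU -> Dropout per hidden layer, mirroring the cell above."""
    tower = nn.Sequential()
    dims = [input_dim] + layers
    for i in range(len(layers)):
        tower.add_module('fc%d' % (i + 1), nn.Linear(dims[i], dims[i + 1]))
        tower.add_module('bn%d' % (i + 1), nn.BatchNorm1d(dims[i + 1]))
        tower.add_module('relu%d' % (i + 1), nn.ReLU())
        tower.add_module('dropout%d' % (i + 1), nn.Dropout(p))
    for name, tensor in tower.named_parameters():
        if 'weight' in name:
            nn.init.normal_(tensor, mean=0, std=0.0001)   # small-variance init, as above
    return tower

tower = build_deep_tower(input_dim=32, layers=[64, 32])
print(tower(torch.randn(8, 32)).shape)                    # torch.Size([8, 32])
```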
params['word2index']#word_weight\n", 3770 | " self.word_weight = nn.Embedding(len(params['word2index']), 64)\n", 3771 | " self.word_weight.weight.data.copy_(params['word_weight'])\n", 3772 | " self.word_linear = nn.Sequential(nn.Linear(64, self.k), nn.ReLU())\n", 3773 | " \n", 3774 | " #############################################################cin \n", 3775 | " self.num_field = len(self.cross_features)+ len(self.muti_features) + len(self.topic_features) + len(self.word_features) #+ len(self.num_features)#+ 2\n", 3776 | " self.conv1ds = nn.ModuleList()\n", 3777 | " self.cin_layers = params['cin_layers']\n", 3778 | " cin_layers_dims = [self.num_field]+self.cin_layers\n", 3779 | " self.split_half = params['split_half']\n", 3780 | " self.hidden_dims_split_half = [self.num_field]\n", 3781 | " prev_dim = 0\n", 3782 | " for i in range(len(self.cin_layers)):\n", 3783 | " self.conv1ds.append(nn.Conv1d(cin_layers_dims[0]*self.hidden_dims_split_half[-1], cin_layers_dims[i+1], 1))\n", 3784 | " if self.split_half and i != len(self.cin_layers)-1:\n", 3785 | " self.hidden_dims_split_half.append(cin_layers_dims[i+1] // 2)\n", 3786 | " prev_dim += cin_layers_dims[i+1] // 2\n", 3787 | " else:\n", 3788 | " self.hidden_dims_split_half.append(cin_layers_dims[i+1])\n", 3789 | " prev_dim += cin_layers_dims[i+1]\n", 3790 | " self.conv1ds = self.conv1ds.to(self.device)\n", 3791 | " \n", 3792 | " if self.usetext:\n", 3793 | " textlen = self.textlen\n", 3794 | " else:\n", 3795 | " textlen = 0\n", 3796 | "\n", 3797 | " self.output = nn.Sequential(nn.Linear(prev_dim+self.layers[-1]+ textlen + len(self.other_features), 512), # no-linear\n", 3798 | " nn.BatchNorm1d(512),\n", 3799 | " nn.ReLU(),\n", 3800 | " nn.Dropout(self.p),\n", 3801 | " nn.Linear(512, 256),\n", 3802 | " nn.BatchNorm1d(256),\n", 3803 | " nn.ReLU(),\n", 3804 | " nn.Dropout(self.p),\n", 3805 | " nn.Linear(256,64),\n", 3806 | " nn.BatchNorm1d(64),\n", 3807 | " nn.ReLU(),\n", 3808 | " nn.Dropout(self.p))\n", 3809 | " self.end = nn.Sequential(nn.Linear(64, 1),) \n", 3810 | " \n", 3811 | " def forward(self, input_x):\n", 3812 | " embed1 = {}\n", 3813 | " embed2 = {}\n", 3814 | " norm_num = {}\n", 3815 | " for s in self.single_features: \n", 3816 | " embed1[s] = self.first_orders[s](input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()).squeeze(-1) #B * 1 \n", 3817 | " embed2[s] = self.second_orders[s](input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()) #b*1*k\n", 3818 | " \n", 3819 | " if self.usetopic:\n", 3820 | " topics = {}\n", 3821 | " for s in self.topic_features:\n", 3822 | " if s not in ['hist_user_themes', 'likethemes', 'hist_user_unlike_themes']:#'likethemes', \n", 3823 | " temp = (torch.sum(sequence_mask(self.topic_weight(input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()),\n", 3824 | " input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()), dim=1)/(torch.sum((input_x[:,self.feature_index[s][0]:self.feature_index[s][1]]!=1).float()+1e-10, dim=-1).unsqueeze(-1)))#.unsqueeze(1)# b * 1 * k\n", 3825 | " topics[s] = self.topic_linear(temp).unsqueeze(1)\n", 3826 | " else: \n", 3827 | " s_att = s+'_att'\n", 3828 | " s_value = self.topic_weight(input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long())\n", 3829 | " s_att_val = input_x[:,self.feature_index[s_att][0]:self.feature_index[s_att][1]].unsqueeze(-1).expand_as(s_value).float()\n", 3830 | " temp = torch.sum(s_value * s_att_val, 
dim=1)/(torch.sum((input_x[:,self.feature_index[s_att][0]:self.feature_index[s_att][1]]).float()+1e-10, dim=-1).unsqueeze(-1)) \n", 3831 | " temp = temp +1e-12\n", 3832 | " topics[s] = self.topic_linear(temp).unsqueeze(1)\n", 3833 | " \n", 3834 | " if self.useword:\n", 3835 | " words = {}\n", 3836 | " for s in self.word_features:\n", 3837 | " if s in ['all_words']: \n", 3838 | " temp = (torch.sum(sequence_mask(self.word_weight(input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()),\n", 3839 | " input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()), dim=1)/(torch.sum((input_x[:,self.feature_index[s][0]:self.feature_index[s][1]]!=1).float()+1e-10, dim=-1).unsqueeze(-1)))#.unsqueeze(1)# b * 1 * k\n", 3840 | " words[s] = self.word_linear(temp).unsqueeze(1)\n", 3841 | " else:\n", 3842 | " s_att = s+'_att'\n", 3843 | " s_value = self.word_weight(input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long())\n", 3844 | " s_att_val = input_x[:,self.feature_index[s_att][0]:self.feature_index[s_att][1]].unsqueeze(-1).expand_as(s_value).float()\n", 3845 | " temp = torch.sum(s_value * s_att_val, dim=1)/(torch.sum((input_x[:,self.feature_index[s_att][0]:self.feature_index[s_att][1]]).float()+1e-10, dim=-1).unsqueeze(-1)) \n", 3846 | " temp = temp +1e-12\n", 3847 | " words[s] = self.word_linear(temp).unsqueeze(1)\n", 3848 | " ##other\n", 3849 | " others_num = []\n", 3850 | " for s in self.other_features:\n", 3851 | " others_num.append(input_x[:,self.feature_index[s][0]:self.feature_index[s][1]])\n", 3852 | " others_num = torch.cat(others_num, dim=1)\n", 3853 | " \n", 3854 | " if self.norm: \n", 3855 | " for s in self.num_features:\n", 3856 | " norm_num[s] = self.norm_num[s](input_x[:,self.feature_index[s][0]:self.feature_index[s][1]])\n", 3857 | " else:\n", 3858 | " for s in self.num_features:\n", 3859 | " norm_num[s] = input_x[:,self.feature_index[s][0]:self.feature_index[s][1]]\n", 3860 | " \n", 3861 | " \n", 3862 | " ## mem\n", 3863 | " if self.usemem:\n", 3864 | " mem1 = self.mem1(input_x[:,self.feature_index['writerId'][0]:self.feature_index['writerId'][1]].long()).squeeze(1)\n", 3865 | " mem0 = self.mem0(input_x[:,self.feature_index['writerId'][0]:self.feature_index['writerId'][1]].long()).squeeze(1)\n", 3866 | " \n", 3867 | " # ###########################以下是MLP\n", 3868 | " y = []\n", 3869 | " input_size = 0\n", 3870 | " for s in embed2:\n", 3871 | " y.append(embed2[s])\n", 3872 | " #topic\n", 3873 | " if self.usetopic:\n", 3874 | " for s in topics:\n", 3875 | " y.append(topics[s]) \n", 3876 | " #words\n", 3877 | " if self.useword:\n", 3878 | " for s in words:\n", 3879 | " y.append(words[s])\n", 3880 | " \n", 3881 | " y = torch.cat(y,1)\n", 3882 | " input_size += len(embed2)* self.k\n", 3883 | " if self.usetopic:\n", 3884 | " if self.useword:\n", 3885 | " y = torch.reshape(y, [-1, (len(embed2)+len(topics)+len(words)) * self.k])\n", 3886 | " else:\n", 3887 | " y = torch.reshape(y, [-1, (len(embed2)+len(topics)) * self.k])\n", 3888 | " else:\n", 3889 | " y = torch.reshape(y, [-1, len(embed2)* self.k])\n", 3890 | " \n", 3891 | " temp = []\n", 3892 | " temp.append(y) \n", 3893 | " \n", 3894 | " for s in self.num_features:\n", 3895 | " temp.append(norm_num[s]) \n", 3896 | " x = torch.cat(temp, -1)\n", 3897 | " \n", 3898 | " ## dnn_logits\n", 3899 | " dnn_logits = self.deep_layers(x) \n", 3900 | " # ##########################################################CIN \n", 3901 | " x = []\n", 3902 | " for s in self.muti_features+self.cross_features:\n", 3903 | " 
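Topic and word bags are pooled in two ways in the forward pass above: plain bags take a masked mean over non-pad positions (pad index 1, via `sequence_mask`), while bags that carry weights (`likethemes`, `hist_user_themes`, and the unlike variants) take a weighted sum normalized by the total weight. A small self-contained sketch of both poolings with toy ids and weights:

```python
import torch
import torch.nn as nn

def sequence_mask(embed, ids):
    # Zero out embedding rows whose id equals the pad index (1), as defined above.
    mask = (ids != 1).unsqueeze(-1).expand_as(embed).float()
    return embed * mask

emb = nn.Embedding(10, 4)
ids = torch.tensor([[3, 5, 1, 1]])                     # one bag: two real topics + padding
att = torch.tensor([[0.7, 0.3, 0.0, 0.0]])             # weights for the weighted variant

vecs = emb(ids)                                        # (1, 4, 4)

# Unweighted bag: mean over the non-pad positions only.
pooled_mean = sequence_mask(vecs, ids).sum(dim=1) / (
    (ids != 1).float().sum(dim=-1, keepdim=True) + 1e-10)

# Weighted bag: attention-weighted sum, normalized by the total weight.
pooled_att = (vecs * att.unsqueeze(-1)).sum(dim=1) / (att.sum(dim=-1, keepdim=True) + 1e-10)

print(pooled_mean.shape, pooled_att.shape)             # both (1, 4)
```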
x.append(embed2[s])\n", 3904 | " \n", 3905 | " if self.usetopic:\n", 3906 | " for s in self.topic_features:\n", 3907 | " x.append(topics[s])\n", 3908 | " \n", 3909 | " #word\n", 3910 | " if self.useword:\n", 3911 | " for s in self.word_features:\n", 3912 | " x.append(words[s])\n", 3913 | " \n", 3914 | " x0 = torch.cat(x, 1)\n", 3915 | " res = []\n", 3916 | " x_list = [x0]\n", 3917 | " for k in range(1, len(self.cin_layers)+1):\n", 3918 | " z_k = torch.einsum('bhd,bmd->bhmd', x_list[-1], x_list[0])\n", 3919 | " z_k = z_k.reshape(x0.shape[0], x_list[-1].shape[1] * x0.shape[1], x0.shape[2])\n", 3920 | " x_k = self.conv1ds[k-1](z_k)\n", 3921 | " x_k = torch.relu(x_k)\n", 3922 | " if self.split_half and k != len(self.cin_layers):\n", 3923 | " next_hidden, hi = torch.split(x_k, x_k.shape[1] // 2, 1)\n", 3924 | " else:\n", 3925 | " next_hidden, hi = x_k, x_k\n", 3926 | "\n", 3927 | " x_list.append(next_hidden)\n", 3928 | " res.append(hi)\n", 3929 | "\n", 3930 | " res = torch.cat(res, dim=1)\n", 3931 | " res = torch.sum(res, dim=2)\n", 3932 | "\n", 3933 | " if self.usetext:\n", 3934 | " textinfo = self.textcnn(input_x[:,self.feature_index['q_topic'][0]:self.feature_index['q_topic'][1]].long(),\n", 3935 | " input_x[:,self.feature_index['m_interest_topic'][0]:self.feature_index['m_interest_topic'][1]].long()) \n", 3936 | " allput = torch.cat([dnn_logits, res, textinfo], dim=1)\n", 3937 | " else: \n", 3938 | " allput = torch.cat([dnn_logits, res, others_num], dim=1) \n", 3939 | " score = self.output(allput)\n", 3940 | " \n", 3941 | " if self.usemem:\n", 3942 | " with torch.no_grad():\n", 3943 | " mem_n = score.detach() \n", 3944 | " m_info = F.tanh(self.linear1(torch.cat([mem0, mem1], dim=-1))) \n", 3945 | " return self.end(score), mem_n, mem0, mem1 \n", 3946 | " else:\n", 3947 | " return self.end(score)" 3948 | ] 3949 | }, 3950 | { 3951 | "cell_type": "markdown", 3952 | "metadata": {}, 3953 | "source": [ 3954 | "### util\n", 3955 | "主要对word,topic进行处理,将word,topic映射到对应的数值,方便进行embedding" 3956 | ] 3957 | }, 3958 | { 3959 | "cell_type": "code", 3960 | "execution_count": null, 3961 | "metadata": { 3962 | "collapsed": true 3963 | }, 3964 | "outputs": [], 3965 | "source": [ 3966 | "## pad 1 加入topics信息\n", 3967 | "def deal_text(textdata, params, maxlen=10):\n", 3968 | " temp = []\n", 3969 | " topic2ix = params['textargs']['topic2index']\n", 3970 | " \n", 3971 | " for text in textdata:\n", 3972 | " tps = str(text).split(',')\n", 3973 | " if '-1' in text:\n", 3974 | " rs = [1]* maxlen\n", 3975 | " else:\n", 3976 | " rs = list(map(lambda x: topic2ix[x],tps))\n", 3977 | " if len(rs)<=maxlen:\n", 3978 | " rs += [1]*(maxlen-len(rs))\n", 3979 | " else:\n", 3980 | " rs = rs[:maxlen]\n", 3981 | " temp.append(rs)\n", 3982 | " \n", 3983 | " return np.array(temp)\n", 3984 | "\n", 3985 | "def deal_text2(textdata, params):\n", 3986 | " temp = []\n", 3987 | " topic2ix = params['textargs']['topic2index']\n", 3988 | " weight = params['textargs']['pretrained_weight']\n", 3989 | " for text in textdata:\n", 3990 | " tps = str(text).split(',')\n", 3991 | " if '-1' in text:\n", 3992 | " rs = np.array([0]*64)\n", 3993 | " else:\n", 3994 | " rs = list(map(lambda x: np.array(weight[topic2ix[x]]), tps))\n", 3995 | " rs = np.mean(np.array(rs),axis=0)\n", 3996 | " temp.append(rs)\n", 3997 | " \n", 3998 | " \n", 3999 | " return np.array(temp)\n", 4000 | "\n", 4001 | "# 处理带权重的topics\n", 4002 | "def deal_text3(textdata, params, maxlen=10):\n", 4003 | " temp = []\n", 4004 | " topic2ix = params['textargs']['topic2index']\n", 4005 | " 
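The CIN block above takes the outer product of the current hidden field map with the original field map via `torch.einsum`, flattens the two field axes into channels, compresses them back with a kernel-size-1 Conv1d, and (with `split_half`) passes half of the maps on while sum-pooling the other half into the output vector. A minimal sketch of one such interaction step; the field counts and embedding size are hypothetical:

```python
import torch
import torch.nn as nn

B, H0, D = 2, 5, 8            # batch, number of input fields, embedding size
H1 = 6                        # feature maps produced by the first CIN layer

x0 = torch.randn(B, H0, D)    # stacked field embeddings, as x0 = torch.cat(x, 1) above
conv = nn.Conv1d(H0 * H0, H1, kernel_size=1)

# Outer product of every pair of fields, per embedding dimension ...
z1 = torch.einsum('bhd,bmd->bhmd', x0, x0)         # (B, H0, H0, D)
# ... flattened so each (field_i, field_j) pair becomes one input channel ...
z1 = z1.reshape(B, H0 * H0, D)
# ... and compressed to H1 feature maps with a 1x1 convolution.
x1 = torch.relu(conv(z1))                          # (B, H1, D)

# With split_half, half the maps feed the next layer and half are sum-pooled
# over the embedding dimension to contribute to the CIN output vector.
next_hidden, direct = torch.split(x1, H1 // 2, dim=1)
cin_out = direct.sum(dim=2)                        # (B, H1 // 2)
print(x1.shape, cin_out.shape)
```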
temp_att = []\n", 4006 | " ix = 0\n", 4007 | "\n", 4008 | " for text in textdata:\n", 4009 | " tps = str(text).split(',')\n", 4010 | " ix += 1\n", 4011 | " if '-1' in text:\n", 4012 | " rs = [1] * maxlen\n", 4013 | " rs_att = [0.0] * maxlen\n", 4014 | " else:\n", 4015 | " tps = list(filter(lambda x:'Infinity' not in x , tps))\n", 4016 | " rs = list(map(lambda x: topic2ix[x.split(':')[0]],tps)) \n", 4017 | " rs_att = list(map(lambda x: float(x.split(':')[1]),tps))\n", 4018 | " \n", 4019 | " if len(rs) <= maxlen:\n", 4020 | " rs += [1]*(maxlen-len(rs))\n", 4021 | " rs_att += [0.0] * (maxlen-len(rs_att))\n", 4022 | " else:\n", 4023 | " rs = rs[:maxlen]\n", 4024 | " rs_att = rs_att[:maxlen]\n", 4025 | " \n", 4026 | " temp.append(rs)\n", 4027 | " temp_att.append(rs_att)\n", 4028 | " \n", 4029 | " temp = np.array(temp)\n", 4030 | " temp_att = np.array(temp_att)\n", 4031 | " \n", 4032 | " return np.concatenate([temp,temp_att], axis=-1)\n", 4033 | "\n", 4034 | "# 权重word\n", 4035 | "def deal_word(textdata, params, maxlen=10):\n", 4036 | " temp = []\n", 4037 | " topic2ix = params['word2index']\n", 4038 | " temp_att = []\n", 4039 | " ix = 0\n", 4040 | "\n", 4041 | " for text in textdata:\n", 4042 | " tps = str(text).split(',')\n", 4043 | " ix += 1\n", 4044 | " if '-1' in text:\n", 4045 | " rs = [1] * maxlen\n", 4046 | " rs_att = [0.0] * maxlen\n", 4047 | " else:\n", 4048 | " tps = list(filter(lambda x:'Infinity' not in x , tps))\n", 4049 | " rs = list(map(lambda x: topic2ix[x.split(':')[0]],tps)) \n", 4050 | " rs_att = list(map(lambda x: float(x.split(':')[1]),tps))\n", 4051 | " \n", 4052 | " if len(rs) <= maxlen:\n", 4053 | " rs += [1]*(maxlen-len(rs))\n", 4054 | " rs_att += [0.0] * (maxlen-len(rs_att))\n", 4055 | " else:\n", 4056 | " rs = rs[:maxlen]\n", 4057 | " rs_att = rs_att[:maxlen]\n", 4058 | " \n", 4059 | " temp.append(rs)\n", 4060 | " temp_att.append(rs_att)\n", 4061 | " \n", 4062 | " temp = np.array(temp)\n", 4063 | " temp_att = np.array(temp_att)\n", 4064 | " \n", 4065 | " return np.concatenate([temp,temp_att], axis=-1)\n", 4066 | "\n", 4067 | "# 非权重\n", 4068 | "def deal_word2(textdata, params, maxlen=10):\n", 4069 | " temp = []\n", 4070 | " topic2ix = params['word2index']\n", 4071 | " \n", 4072 | " for text in textdata:\n", 4073 | " tps = str(text).split(',')\n", 4074 | " if '-1' in text:\n", 4075 | " rs = [1]* maxlen\n", 4076 | " else:\n", 4077 | " rs = list(map(lambda x: topic2ix[x],tps))\n", 4078 | " if len(rs)<=maxlen:\n", 4079 | " rs += [1]*(maxlen-len(rs))\n", 4080 | " else:\n", 4081 | " rs = rs[:maxlen]\n", 4082 | " temp.append(rs)\n", 4083 | " \n", 4084 | " return np.array(temp) \n", 4085 | "\n", 4086 | "import os\n", 4087 | "def setup_seed(seed):\n", 4088 | " random.seed(seed)\n", 4089 | " os.environ['PYTHONHASHSEED'] = str(seed)\n", 4090 | " np.random.seed(seed)\n", 4091 | " torch.manual_seed(seed)\n", 4092 | " torch.cuda.manual_seed(seed)\n", 4093 | " torch.backends.cudnn.deterministic = True" 4094 | ] 4095 | }, 4096 | { 4097 | "cell_type": "markdown", 4098 | "metadata": {}, 4099 | "source": [ 4100 | "### train test\n", 4101 | "本部分主要是模型的训练(使用Lookahead+Adam 优化器),验证,预测" 4102 | ] 4103 | }, 4104 | { 4105 | "cell_type": "code", 4106 | "execution_count": null, 4107 | "metadata": { 4108 | "collapsed": true 4109 | }, 4110 | "outputs": [], 4111 | "source": [ 4112 | "def eval(model, devloader, params):\n", 4113 | " preds = [] \n", 4114 | " print('eval')\n", 4115 | " model.eval()\n", 4116 | " trues = []\n", 4117 | " for x,y in devloader:\n", 4118 | " with torch.no_grad():\n", 4119 | " x 
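`deal_text3` and `deal_word` above parse comma-separated `token:weight` strings into a fixed-length id row (padded with the pad index 1) and a matching weight row (padded with 0.0), concatenated side by side so the model can later slice them apart via `feature_index`. A tiny sketch of that parsing for a single record; the token names and mapping are hypothetical:

```python
import numpy as np

token2ix = {'unk': 0, 'pad': 1, 'T123': 2, 'T456': 3}
maxlen = 4

record = 'T123:0.8,T456:0.2'                 # a '-1' record would mean an empty bag
tps = [t for t in record.split(',') if 'Infinity' not in t]
ids = [token2ix.get(t.split(':')[0], 0) for t in tps]
att = [float(t.split(':')[1]) for t in tps]

ids += [1] * (maxlen - len(ids))             # pad ids with the pad index 1
att += [0.0] * (maxlen - len(att))           # pad weights with 0.0
row = np.array(ids[:maxlen] + att[:maxlen])  # ids and weights concatenated, as above

print(row)                                   # [2. 3. 1. 1. 0.8 0.2 0. 0.]
```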
= x.to(params['device']).float()\n", 4120 | " if params['usemem']:\n", 4121 | " score, _, _, _ = model(x)\n", 4122 | " else:\n", 4123 | " score = model(x)\n", 4124 | " preds+=score.cpu().reshape(-1).tolist()\n", 4125 | " trues+=y.cpu().reshape(-1).tolist()\n", 4126 | " auc = roc_auc_score(trues, preds)\n", 4127 | " print('auc: ', auc)\n", 4128 | " return auc\n", 4129 | "\n", 4130 | "temp_x = 0\n", 4131 | "def train(params, trainset, devset, foldix=0):\n", 4132 | " x = []\n", 4133 | " for i in params['single_features']+ params['num_features']+ params['other_features']:\n", 4134 | " x.append(np.expand_dims(trainset[i], axis=1))\n", 4135 | " if params['usetext']:\n", 4136 | " x.append(deal_text(trainset['attentionthemes'], params, maxlen=10))\n", 4137 | " x.append(deal_text(trainset['themeId'], params, maxlen=10))\n", 4138 | " if params['usetopic']:\n", 4139 | " x.append(deal_text(trainset['attentionthemes'], params, maxlen=100)) \n", 4140 | " x.append(deal_text(trainset['themeId'], params, maxlen=13))\n", 4141 | " x.append(deal_text3(trainset['likethemes'], params, maxlen=10))#w_topics1\n", 4142 | " #hist_user_themes_att\n", 4143 | " x.append(deal_text3(trainset['hist_user_themes'], params, maxlen=10))\n", 4144 | " if params['useword']:\n", 4145 | " x.append(deal_word(trainset['hist_user_words'], params, maxlen=20))\n", 4146 | " x.append(deal_word2(trainset['all_words'], params, maxlen=10))\n", 4147 | "\n", 4148 | " #hist_user_unlike_themes\n", 4149 | " x.append(deal_text3(trainset['hist_user_unlike_themes'], params, maxlen=10))\n", 4150 | " #hist_user_unlike_words\n", 4151 | " x.append(deal_word(trainset['hist_user_unlike_words'], params, maxlen=20))\n", 4152 | " \n", 4153 | " train_tensor_data = Data.TensorDataset(torch.from_numpy(np.concatenate(x, axis=-1)),\n", 4154 | " torch.from_numpy(np.expand_dims(trainset['label'], axis=1))) \n", 4155 | "\n", 4156 | " trainloader = Data.DataLoader(\n", 4157 | " dataset=train_tensor_data,\n", 4158 | " batch_size=params['batch_size'],\n", 4159 | " shuffle=not params['usemem'],\n", 4160 | " num_workers=0,\n", 4161 | " )\n", 4162 | " del trainset\n", 4163 | " gc.collect()\n", 4164 | " print('train load ok')\n", 4165 | " x_val = [] \n", 4166 | " for i in params['single_features']+ params['num_features']+ params['other_features']:\n", 4167 | " x_val.append(np.expand_dims(devset[i], axis=1))\n", 4168 | " if params['usetext']:\n", 4169 | " x_val.append(deal_text(devset['attentionthemes'], params, maxlen=10))\n", 4170 | " x_val.append(deal_text(devset['themeId'], params, maxlen=10)) \n", 4171 | " \n", 4172 | " if params['usetopic']:\n", 4173 | " x_val.append(deal_text(devset['attentionthemes'], params, maxlen=100)) \n", 4174 | " x_val.append(deal_text(devset['themeId'], params, maxlen=13))\n", 4175 | " x_val.append(deal_text3(devset['likethemes'], params, maxlen=10))\n", 4176 | " x_val.append(deal_text3(devset['hist_user_themes'], params, maxlen=10))\n", 4177 | " if params['useword']:#\n", 4178 | " x_val.append(deal_word(devset['hist_user_words'], params, maxlen=20))\n", 4179 | " x_val.append(deal_word2(devset['all_words'], params, maxlen=10))\n", 4180 | "\n", 4181 | " x_val.append(deal_text3(devset['hist_user_unlike_themes'], params, maxlen=10))\n", 4182 | " #hist_user_unlike_words\n", 4183 | " x_val.append(deal_word(devset['hist_user_unlike_words'], params, maxlen=20))\n", 4184 | " \n", 4185 | " dev_tensor_data = Data.TensorDataset(torch.from_numpy(np.concatenate(x_val, axis=-1)),\n", 4186 | " torch.from_numpy(np.expand_dims(devset['label'], axis=1))) \n", 
4187 | "\n", 4188 | " devloader = Data.DataLoader(\n", 4189 | " dataset=dev_tensor_data,\n", 4190 | " batch_size=params['batch_size'],\n", 4191 | " shuffle=False,\n", 4192 | " num_workers=0,\n", 4193 | " )\n", 4194 | " print('dev loader ok')\n", 4195 | " model = xDeepFM(params)\n", 4196 | " model.to(params['device'])\n", 4197 | " base_optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'], weight_decay=params['l2'])\n", 4198 | " optimizer = Lookahead(base_optimizer=base_optimizer,k=5,alpha=0.5)\n", 4199 | " best_auc = 0\n", 4200 | " criterion = nn.MSELoss() ## mem\n", 4201 | " maxauc = 0\n", 4202 | " for epoch in tqdm.tqdm_notebook(range(params['epochs'])):\n", 4203 | " all_loss = 0\n", 4204 | " i=0 \n", 4205 | " all_loss_mem = 0\n", 4206 | " for x, y in tqdm.tqdm_notebook(trainloader):\n", 4207 | " x = x.to(params['device']).float()\n", 4208 | " y = y.to(params['device']).float() \n", 4209 | " if params['usemem']:\n", 4210 | " score, mem_n, mem0, mem1 = model(x)\n", 4211 | " loss = F.binary_cross_entropy_with_logits(score, y)\n", 4212 | " mem = (1-y.expand_as(mem_n)) * mem0 + y.expand_as(mem_n) * mem1\n", 4213 | " memloss = criterion(mem, mem_n)\n", 4214 | " loss = loss + memloss\n", 4215 | " all_loss_mem += memloss.detach().cpu().item()\n", 4216 | " else:\n", 4217 | " score = model(x) \n", 4218 | " loss = F.binary_cross_entropy_with_logits(score, y)\n", 4219 | " \n", 4220 | " optimizer.zero_grad()\n", 4221 | " loss.backward()\n", 4222 | " i+=1\n", 4223 | " optimizer.step()\n", 4224 | " all_loss += loss.detach().cpu().item()\n", 4225 | " if i % params['num_display_steps']==0:\n", 4226 | " if params['usemem']:\n", 4227 | " #print('mem_loss:', all_loss_mem/params['num_display_steps'])\n", 4228 | " all_loss_mem = 0\n", 4229 | " all_loss = 0\n", 4230 | " auc = eval(model, devloader, params)\n", 4231 | " model.train()\n", 4232 | " if auc > maxauc:\n", 4233 | " maxauc = auc\n", 4234 | " torch.save(model.state_dict(), datapath+'data/'+'_auc_'+ str(auc)+'_'+str(foldix)+'.pth')\n", 4235 | " print(datapath+'data/'+'_auc_'+ str(auc)+'_'+str(foldix)+'.pth saved!')\n", 4236 | "\n", 4237 | " return datapath+'data/'+'_auc_'+ str(maxauc)+'_'+str(foldix)+'.pth'\n", 4238 | "\n", 4239 | "def test(params, testset, testpath, onefold=False):\n", 4240 | " model = xDeepFM(params)\n", 4241 | " model.load_state_dict(torch.load(testpath))\n", 4242 | " model.to(params['device'])\n", 4243 | " \n", 4244 | " x = []\n", 4245 | " for i in params['single_features']+ params['num_features'] + params['other_features']:\n", 4246 | " x.append(np.expand_dims(testset[i], axis=1))\n", 4247 | " if params['usetext']:\n", 4248 | " x.append(deal_text(testset['attentionthemes'], params, maxlen=10))\n", 4249 | " x.append(deal_text(testset['themeId'], params, maxlen=10)) \n", 4250 | "\n", 4251 | " if params['usetopic']:\n", 4252 | " x.append(deal_text(testset['attentionthemes'], params, maxlen=100)) \n", 4253 | " x.append(deal_text(testset['themeId'], params, maxlen=13))\n", 4254 | " x.append(deal_text3(testset['likethemes'], params, maxlen=10))#w_topics1\n", 4255 | " x.append(deal_text3(testset['hist_user_themes'], params, maxlen=10))\n", 4256 | "\n", 4257 | " if params['useword']:\n", 4258 | " x.append(deal_word(testset['hist_user_words'], params, maxlen=20))\n", 4259 | " x.append(deal_word2(testset['all_words'], params, maxlen=10))\n", 4260 | " #hist_user_unlike_themes\n", 4261 | " x.append(deal_text3(testset['hist_user_unlike_themes'], params, maxlen=10))\n", 4262 | " #hist_user_unlike_words\n", 4263 | " 
x.append(deal_word(testset['hist_user_unlike_words'], params, maxlen=20))\n", 4264 | " \n", 4265 | " test_tensor_data = Data.TensorDataset(torch.from_numpy(np.concatenate(x, axis=-1)),\n", 4266 | " torch.from_numpy(np.expand_dims(testset['label'], axis=1))) \n", 4267 | "\n", 4268 | " testloader = Data.DataLoader(\n", 4269 | " dataset=test_tensor_data,\n", 4270 | " batch_size=params['batch_size'],\n", 4271 | " shuffle=False,\n", 4272 | " num_workers=0,\n", 4273 | " )\n", 4274 | " preds = []\n", 4275 | " model.eval()\n", 4276 | " for x, y in testloader:\n", 4277 | " x = x.to(params['device']).float()\n", 4278 | " score = model(x)\n", 4279 | " score = torch.sigmoid(score)\n", 4280 | " preds+=score.cpu().reshape(-1).tolist()\n", 4281 | " if onefold: \n", 4282 | " testdata = pd.read_csv(datapath+'data/m_q_invite_test.csv') \n", 4283 | " testdata['label'] = pd.Series(preds)\n", 4284 | " testdata[['q_id', 'm_id', 'invite_time', 'label']].to_csv(datapath+'submit.txt', index = None, sep = '\\t', header=None)\n", 4285 | " print('test finished!')\n", 4286 | " return preds" 4287 | ] 4288 | }, 4289 | { 4290 | "cell_type": "markdown", 4291 | "metadata": {}, 4292 | "source": [ 4293 | "### Fold\n", 4294 | "本部分包括随机5折、按时间顺序5折、5折的测试,对比随机5折和按时间顺序5折效果,发现随机5折的效果更好" 4295 | ] 4296 | }, 4297 | { 4298 | "cell_type": "code", 4299 | "execution_count": null, 4300 | "metadata": { 4301 | "collapsed": true 4302 | }, 4303 | "outputs": [], 4304 | "source": [ 4305 | "def train_fold(trainset, params, fold=5):\n", 4306 | " train_y = trainset['label']\n", 4307 | " train_x = trainset\n", 4308 | " rs_list = []\n", 4309 | " pre_train = pd.Series(np.zeros(len(train_y)))\n", 4310 | " \n", 4311 | " kf = StratifiedKFold(n_splits = fold,shuffle = True,random_state = 2019)\n", 4312 | " for ix,(train_index,eval_index) in enumerate(kf.split(train_x,train_y)):\n", 4313 | " dtrain_x = train_x.loc[train_index,:]\n", 4314 | " deval_x = train_x.loc[eval_index,:]\n", 4315 | " rs = train(params, dtrain_x, deval_x, ix)\n", 4316 | " pre_train[eval_index] = test(params, deval_x, rs) \n", 4317 | " rs_list.append(rs)\n", 4318 | " \n", 4319 | " train_pre = trainset[['qId', 'writerId', 'inviteday', 'label', 'invitehour']]\n", 4320 | " train_pre['pre_label'] = pre_train\n", 4321 | " with open(datapath + 'train-fold-3zhou.pkl', 'wb') as f:\n", 4322 | " pickle.dump(train_pre, f, protocol=4)\n", 4323 | " \n", 4324 | " file = codecs.open(datapath + 'fold_list_3zhou.txt', 'w')\n", 4325 | " file.write(','.join(rs_list))\n", 4326 | " print('train -fold end!')\n", 4327 | "\n", 4328 | "def train_fold_time(trainset, params, fold=5):\n", 4329 | " train_x = trainset\n", 4330 | " rs_list = []\n", 4331 | " pre_train = pd.Series(np.zeros(len(train_x)))\n", 4332 | " \n", 4333 | " start = train_x['inviteday'].min()\n", 4334 | " block_len = np.ceil(float(3868-start)/fold)\n", 4335 | " for i in range(5): \n", 4336 | " bool_eval = (train_x['inviteday']>=start)&(train_x['inviteday']