├── README.md
├── optimizer .py
└── ksb_lgb&nn.ipynb


/README.md:
--------------------------------------------------------------------------------
1 | # 2019Baai-zhihu-Cup-findexp-4th---
2 | 2019年知乎看山杯第四名
3 | 
4 |    本文为2019年知乎看山杯专家发现算法第四名方案的完整代码。核心思想是以用户为主要研究方向，从用户的历史行为挖掘有用信息，包括体现用户特性的一些信息和反应用户跟问题交互的信息去预测用户未来行为。具体采用了LightGBM等提升树模型跟xDeepFm等推荐系统常用的神经网络模型，最终结果同stacking融合不同模型得到。
5 | 
6 |    官方在邀请记录文件中提供了过去一个月来用户被邀请的记录，每条邀请是一条样本，并且标注了用户是否接受了本次邀请作为样本的标签，接受为正反之为负样本，邀请记录中还包含有邀请的时间戳。通过用户历史的邀请记录去预测未来一周用户是否会接受邀请，官方给出的训练集以及测试集实际上是经过知乎内部的召回跟排序模块之后实际推送给用户的结果，所以这些邀请实际上本身就是知乎内部模型选择后的结果，包括测试集也是，我们理解的是参赛者拿到的训练跟预测数据分布本身就是有偏的，更加偏向于用户的喜好。所以本次比赛中围绕训练集样本本身去构造特征就会有很好的效果，比如用户、问题的样本数等。
7 | 
8 |    邀请本身就蕴含着很多信息，当一个问题邀请某一个用户时本身就说明该用户很可能对这个问题是感兴趣的，所以我们构造了许多邀请计数特征，并在线下取得了很好的效果，但线上的提升却是有限，邀请数量的分布在训练集中是基本稳定的，但因为官方只给出了一半的验证集，所以计数特征对验证集来说是严重有偏的，这也是初赛很多参赛选手的线下线上分数差距大的一个重要原因。我们在初赛时给每一个计数赋予了一个关于当天样本总数跟当天出现的id数的权重后再加和从而使得线上线下能够相对稳定。
9 | 


--------------------------------------------------------------------------------
/optimizer .py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import torch
  3 | import itertools as it
  4 | from torch.optim import Optimizer
  5 | class Lookahead(Optimizer):
  6 |     def __init__(self, base_optimizer,alpha=0.5, k=6):
  7 |         if not 0.0 <= alpha <= 1.0:
  8 |             raise ValueError('Invalid slow update rate: {alpha}')
  9 |         if not 1 <= k:
 10 |             raise ValueError('Invalid lookahead steps: {k}')
 11 |         self.optimizer = base_optimizer
 12 |         self.param_groups = self.optimizer.param_groups
 13 |         self.alpha = alpha
 14 |         self.k = k
 15 |         for group in self.param_groups:
 16 |             group["step_counter"] = 0
 17 |         self.slow_weights = [[p.clone().detach() for p in group['params']]
 18 |                                 for group in self.param_groups]
 19 | 
 20 |         for w in it.chain(*self.slow_weights):
 21 |             w.requires_grad = False
 22 | 
 23 |     def step(self, closure=None):
 24 |         loss = None
 25 |         if closure is not None:
 26 |             loss = closure()
 27 |         loss = self.optimizer.step()
 28 |         for group,slow_weights in zip(self.param_groups,self.slow_weights):
 29 |             group['step_counter'] += 1
 30 |             if group['step_counter'] % self.k != 0:
 31 |                 continue
 32 |             for p,q in zip(group['params'],slow_weights):
 33 |                 if p.grad is None:
 34 |                     continue
 35 |                 q.data.add_(self.alpha,p.data - q.data)
 36 |                 p.data.copy_(q.data)
 37 |         return loss
 38 | 
 39 | class RAdam(Optimizer):
 40 |     '''
 41 |     a PyTorch implementation of the RAdam Optimizer from th paper
 42 |     On the Variance of the Adaptive Learning Rate and Beyond.
 43 | 
 44 |     https://arxiv.org/abs/1908.03265
 45 |     Example:
 46 |         >>> from optimizer import RAdam
 47 |         >>> optimizer = RAdam(model.parameters(), lr=0.001)
 48 |     '''
 49 | 
 50 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
 51 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
 52 |         self.buffer = [[None, None, None] for ind in range(10)]
 53 |         super(RAdam, self).__init__(params, defaults)
 54 | 
 55 |     def __setstate__(self, state):
 56 |         super(RAdam, self).__setstate__(state)
 57 | 
 58 |     def step(self, closure=None):
 59 | 
 60 |         loss = None
 61 |         if closure is not None:
 62 |             loss = closure()
 63 | 
 64 |         for group in self.param_groups:
 65 | 
 66 |             for p in group['params']:
 67 |                 if p.grad is None:
 68 |                     continue
 69 |                 grad = p.grad.data.float()
 70 |                 if grad.is_sparse:
 71 |                     raise RuntimeError('RAdam does not support sparse gradients')
 72 | 
 73 |                 p_data_fp32 = p.data.float()
 74 | 
 75 |                 state = self.state[p]
 76 | 
 77 |                 if len(state) == 0:
 78 |                     state['step'] = 0
 79 |                     state['exp_avg'] = torch.zeros_like(p_data_fp32)
 80 |                     state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
 81 |                 else:
 82 |                     state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
 83 |                     state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
 84 | 
 85 |                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
 86 |                 beta1, beta2 = group['betas']
 87 | 
 88 |                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
 89 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
 90 | 
 91 |                 state['step'] += 1
 92 |                 buffered = self.buffer[int(state['step'] % 10)]
 93 |                 if state['step'] == buffered[0]:
 94 |                     N_sma, step_size = buffered[1], buffered[2]
 95 |                 else:
 96 |                     buffered[0] = state['step']
 97 |                     beta2_t = beta2 ** state['step']
 98 |                     N_sma_max = 2 / (1 - beta2) - 1
 99 |                     N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
100 |                     buffered[1] = N_sma
101 |                     if N_sma > 5:
102 |                         step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
103 |                     else:
104 |                         step_size = group['lr'] / (1 - beta1 ** state['step'])
105 |                     buffered[2] = step_size
106 | 
107 |                 if group['weight_decay'] != 0:
108 |                     p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
109 | 
110 |                 if N_sma > 5:
111 |                     denom = exp_avg_sq.sqrt().add_(group['eps'])
112 |                     p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
113 |                 else:
114 |                     p_data_fp32.add_(-step_size, exp_avg)
115 | 
116 |                 p.data.copy_(p_data_fp32)
117 | 
118 |         return loss
119 | 
120 | #
121 | class Ralamb(Optimizer):
122 |     '''
123 |     Ralamb optimizer (RAdam + LARS trick)
124 |     '''
125 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
126 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
127 |         self.buffer = [[None, None, None] for ind in range(10)]
128 |         super(Ralamb, self).__init__(params, defaults)
129 | 
130 |     def __setstate__(self, state):
131 |         super(Ralamb, self).__setstate__(state)
132 | 
133 |     def step(self, closure=None):
134 | 
135 |         loss = None
136 |         if closure is not None:
137 |             loss = closure()
138 | 
139 |         for group in self.param_groups:
140 | 
141 |             for p in group['params']:
142 |                 if p.grad is None:
143 |                     continue
144 |                 grad = p.grad.data.float()
145 |                 if grad.is_sparse:
146 |                     raise RuntimeError('Ralamb does not support sparse gradients')
147 | 
148 |                 p_data_fp32 = p.data.float()
149 | 
150 |                 state = self.state[p]
151 | 
152 |                 if len(state) == 0:
153 |                     state['step'] = 0
154 |                     state['exp_avg'] = torch.zeros_like(p_data_fp32)
155 |                     state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
156 |                 else:
157 |                     state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
158 |                     state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
159 | 
160 |                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
161 |                 beta1, beta2 = group['betas']
162 | 
163 |                 # Decay the first and second moment running average coefficient
164 |                 # m_t
165 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
166 |                 # v_t
167 |                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
168 | 
169 |                 state['step'] += 1
170 |                 buffered = self.buffer[int(state['step'] % 10)]
171 | 
172 |                 if state['step'] == buffered[0]:
173 |                     N_sma, radam_step = buffered[1], buffered[2]
174 |                 else:
175 |                     buffered[0] = state['step']
176 |                     beta2_t = beta2 ** state['step']
177 |                     N_sma_max = 2 / (1 - beta2) - 1
178 |                     N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
179 |                     buffered[1] = N_sma
180 | 
181 |                     # more conservative since it's an approximated value
182 |                     if N_sma >= 5:
183 |                         radam_step = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
184 |                     else:
185 |                         radam_step = group['lr'] / (1 - beta1 ** state['step'])
186 |                     buffered[2] = radam_step
187 | 
188 |                 if group['weight_decay'] != 0:
189 |                     p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32)
190 | 
191 |                 weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10)
192 |                 radam_norm = p_data_fp32.pow(2).sum().sqrt()
193 |                 if weight_norm == 0 or radam_norm == 0:
194 |                     trust_ratio = 1
195 |                 else:
196 |                     trust_ratio = weight_norm / radam_norm
197 | 
198 |                 state['weight_norm'] = weight_norm
199 |                 state['adam_norm'] = radam_norm
200 |                 state['trust_ratio'] = trust_ratio
201 | 
202 |                 # more conservative since it's an approximated value
203 |                 if N_sma >= 5:
204 |                     denom = exp_avg_sq.sqrt().add_(group['eps'])
205 |                     p_data_fp32.addcdiv_(-radam_step * trust_ratio, exp_avg, denom)
206 |                 else:
207 |                     p_data_fp32.add_(-radam_step * trust_ratio, exp_avg)
208 | 
209 |                 p.data.copy_(p_data_fp32)
210 | 
211 |         return loss
212 | 
213 | 


--------------------------------------------------------------------------------
/ksb_lgb&nn.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 6,
   6 |    "metadata": {
   7 |     "collapsed": true
   8 |    },
   9 |    "outputs": [],
  10 |    "source": [
  11 |     "import numpy as np\n",
  12 |     "import pandas as pd\n",
  13 |     "import warnings\n",
  14 |     "import random\n",
  15 |     "import os\n",
  16 |     "import gc\n",
  17 |     "import math\n",
  18 |     "from multiprocessing import Pool,cpu_count\n",
  19 |     "from sklearn.preprocessing import Normalizer,LabelEncoder,OneHotEncoder,MinMaxScaler\n",
  20 |     "from sklearn.decomposition import PCA,TruncatedSVD\n",
  21 |     "from sklearn.externals import joblib\n",
  22 |     "from sklearn.model_selection import StratifiedKFold,train_test_split\n",
  23 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
  24 |     "from sklearn.utils import resample\n",
  25 |     "from sklearn.metrics import roc_auc_score\n",
  26 |     "from scipy import sparse\n",
  27 |     "import matplotlib.pyplot as plt\n",
  28 |     "from gensim.models import word2vec\n",
  29 |     "from gensim.scripts.glove2word2vec import glove2word2vec\n",
  30 |     "from gensim.test.utils import datapath as dpath, get_tmpfile\n",
  31 |     "import torch \n",
  32 |     "import codecs\n",
  33 |     "import xgboost as xgb\n",
  34 |     "import lightgbm as lgb\n",
  35 |     "import catboost as catb\n",
  36 |     "import pickle\n",
  37 |     "import time\n",
  38 |     "import datetime\n",
  39 |     "import math\n",
  40 |     "import scipy.special as special\n",
  41 |     "import torch\n",
  42 |     "import torch.nn as nn\n",
  43 |     "import torch.nn.functional as F\n",
  44 |     "import tqdm\n",
  45 |     "from optimizer import Lookahead\n",
  46 |     "from optimizer import RAdam\n",
  47 |     "import torch.utils.data as Data\n",
  48 |     "import codecs\n",
  49 |     "import sys\n",
  50 |     "import jieba.posseg\n",
  51 |     "import jieba.analyse\n",
  52 |     "import re\n",
  53 |     "import warnings\n",
  54 |     "warnings.filterwarnings('ignore')\n",
  55 |     "datapath = '.....'\n",
  56 |     "\n",
  57 |     "t0_train = 3838\n",
  58 |     "t1_train = 3867\n",
  59 |     "t0_eval = 3868\n",
  60 |     "t1_eval = 3874\n",
  61 |     "t0_a = 3807\n",
  62 |     "t1_a = 3867\n",
  63 |     "evalday = 7"
  64 |    ]
  65 |   },
  66 |   {
  67 |    "cell_type": "code",
  68 |    "execution_count": 1,
  69 |    "metadata": {
  70 |     "collapsed": true
  71 |    },
  72 |    "outputs": [],
  73 |    "source": [
  74 |     "def memoryOptimization(data,floattype):\n",
  75 |     "    subdata = data.select_dtypes(include = 'int')\n",
  76 |     "    for col in subdata.columns:\n",
  77 |     "        m = subdata[col].max()\n",
  78 |     "        n = subdata[col].min()\n",
  79 |     "        if m < np.power(2,31)-1 and n >=  -np.power(2,31):\n",
  80 |     "            if m < np.power(2,15)-1 and n >=  -np.power(2,15):\n",
  81 |     "                if m < np.power(2,7)-1 and n >=  -np.power(2,7):\n",
  82 |     "                    subdata[col] = subdata[col].astype(np.int8)\n",
  83 |     "                else:\n",
  84 |     "                    subdata[col] = subdata[col].astype(np.int16)\n",
  85 |     "            else:\n",
  86 |     "                subdata[col] = subdata[col].astype(np.int32)\n",
  87 |     "    data[subdata.columns] = subdata\n",
  88 |     "    subdata = data.select_dtypes(include = 'float')\n",
  89 |     "    data[subdata.columns] = data[subdata.columns].astype(floattype)\n",
  90 |     "#     subdata = data.select_dtypes(include = 'object')\n",
  91 |     "#     data[subdata.columns] = data[subdata.columns].astype('category')\n",
  92 |     "    gc.collect()\n",
  93 |     "    return data\n",
  94 |     "\n",
  95 |     "# def down_sample(df,df_feat,rate):#对目标特征下采样，通过给定随机数种子保证每个特征组抽样的负样本是同样的\n",
  96 |     "#     df_majority = df_feat[df['label']==0]\n",
  97 |     "#     df_minority = df_feat[df['label']==1]\n",
  98 |     "#     positive_num = df_minority.shape[0]\n",
  99 |     "#     df_majority_downsampled = resample(df_majority,\n",
 100 |     "#                                      replace=False,  # sample without replacement\n",
 101 |     "#                                      n_samples=positive_num*rate,  # to match minority class\n",
 102 |     "#                                      random_state=7)  # reproducible results\n",
 103 |     "#     df_downsampled = pd.concat([df_majority_downsampled, df_minority],axis = 0,ignore_index = True)\n",
 104 |     "#     del df_majority, df_minority, df_majority_downsampled\n",
 105 |     "#     return df_downsampled\n",
 106 |     "\n",
 107 |     "def lgb_train_pre1(train_x,train_y,test_x,categoryfeas,dropfeas,one,save_model):    \n",
 108 |     "    train_x = train_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n",
 109 |     "    test_x = test_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n",
 110 |     "    pickle.dump(list(train_x.columns), open(datapath+'data/lgb_fea.pkl', 'wb'))\n",
 111 |     "    params_lgbc ={\n",
 112 |     "        'boosting_type': 'gbdt',\n",
 113 |     "        'objective': 'binary',  \n",
 114 |     "        'num_leaves': 41, \n",
 115 |     "        'learning_rate': 0.1,\n",
 116 |     "        'feature_fraction': 0.8,\n",
 117 |     "        'bagging_fraction': 0.8,\n",
 118 |     "        'bagging_freq': 1,\n",
 119 |     "        'min_sum_hessian_in_leaf': 10,\n",
 120 |     "        'num_threads': cpu_count() - 1,\n",
 121 |     "        'seed': 7, \n",
 122 |     "        'n_estimators':50000,\n",
 123 |     "        'max_depth': 6,\n",
 124 |     "        'subsample':0.9,\n",
 125 |     "        'subsample_freq':2,\n",
 126 |     "        'reg_alpha':0, \n",
 127 |     "        'reg_lambda':2\n",
 128 |     "        # 'device': 'gpu',\n",
 129 |     "    }\n",
 130 |     "    \n",
 131 |     "    pre_train = pd.Series(np.zeros(len(train_y)))\n",
 132 |     "    pre_test = []\n",
 133 |     "    kf = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 2019)\n",
 134 |     "    fold = 1\n",
 135 |     "    for train_index,eval_index in kf.split(train_x,train_y):\n",
 136 |     "        dtrain_x = train_x.loc[train_index,:]\n",
 137 |     "        deval_x = train_x.loc[eval_index,:]\n",
 138 |     "        dtrain_y = train_y[train_index]\n",
 139 |     "        deval_y = train_y[eval_index]\n",
 140 |     "        if flag_weight:\n",
 141 |     "            sample_weight = ((dtrain_x['inviteday']-3500)/(dtrain_x['inviteday']-3500).mean()).values\n",
 142 |     "        else:\n",
 143 |     "            sample_weight = None\n",
 144 |     "        lgbc = lgb.LGBMClassifier(random_state = 2020,**params_lgbc) # np.random.randint(1,3000)\n",
 145 |     "        lgbc.fit(dtrain_x,dtrain_y,eval_set = [(deval_x,deval_y)],eval_names = ['eval'],eval_metric = 'auc',\n",
 146 |     "                 early_stopping_rounds = 50,sample_weight = sample_weight,verbose = 100,categorical_feature = categoryfeas)\n",
 147 |     "        pre_train[eval_index] = lgbc.predict_proba(deval_x,num_iteration = lgbc.best_iteration_)[:,1]\n",
 148 |     "        pre_test.append(list(lgbc.predict_proba(test_x,num_iteration = lgbc.best_iteration_)[:,1]))\n",
 149 |     "        if save_model:\n",
 150 |     "            joblib.dump(lgbc, open(datapath+'data/lgb_'+str(params_lgbc['learning_rate'])+'_'+str(fold)+'.pkl', 'wb'))\n",
 151 |     "        fold += 1\n",
 152 |     "        if one:\n",
 153 |     "            break\n",
 154 |     "    pre_test = np.array(pre_test)\n",
 155 |     "    pre_test = np.mean(pre_test,axis = 0)\n",
 156 |     "    \n",
 157 |     "    score = roc_auc_score(train_y,pre_train)\n",
 158 |     "    feas = train_x.columns\n",
 159 |     "    imps = lgbc.feature_importances_\n",
 160 |     "    fea_imp = pd.DataFrame(pd.Series(feas),columns = ['feas'])\n",
 161 |     "    fea_imp['imp'] = imps\n",
 162 |     "    fea_imp = fea_imp.sort_values(by = 'imp',ascending = False)\n",
 163 |     "    del dtrain_x\n",
 164 |     "    del deval_x\n",
 165 |     "    del dtrain_y\n",
 166 |     "    del deval_y\n",
 167 |     "    gc.collect()\n",
 168 |     "    return pre_test,pre_train,score,fea_imp,lgbc.best_iteration_\n",
 169 |     "def xgb_train_pre1(train_x,train_y,test_x,dropfeas,one,save_model):\n",
 170 |     "    train_x = train_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n",
 171 |     "    test_x = test_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n",
 172 |     "    pickle.dump(list(train_x.columns), open(datapath+'data/xgb_fea.pkl', 'wb'))\n",
 173 |     "    params_lgbc ={        \n",
 174 |     "        'booster':'gbtree',\n",
 175 |     "        'learning_rate':0.1,\n",
 176 |     "        'n_estimators':50000,\n",
 177 |     "        'max_depth':6,\n",
 178 |     "        'min_child_weight':3,\n",
 179 |     "        'gamma':0.1,\n",
 180 |     "        'subsample':0.9,\n",
 181 |     "        'colsample_bytree':0.8,\n",
 182 |     "        'reg_alpha':0, \n",
 183 |     "        'reg_lambda':2,\n",
 184 |     "        'objective':'binary:logistic',\n",
 185 |     "        'nthread':cpu_count() - 1,\n",
 186 |     "        'scale_pos_weight':1,\n",
 187 |     "        'seed':7,\n",
 188 |     "#         'tree_method':'gpu_hist'\n",
 189 |     "    }\n",
 190 |     "    \n",
 191 |     "    pre_train = pd.Series(np.zeros(len(train_y)))\n",
 192 |     "    pre_test = []\n",
 193 |     "    kf = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 2019)\n",
 194 |     "    fold = 1\n",
 195 |     "    for train_index,eval_index in kf.split(train_x,train_y):\n",
 196 |     "        dtrain_x = train_x.loc[train_index,:]\n",
 197 |     "        deval_x = train_x.loc[eval_index,:]\n",
 198 |     "        dtrain_y = train_y[train_index]\n",
 199 |     "        deval_y = train_y[eval_index]\n",
 200 |     "        if flag_weight:\n",
 201 |     "            sample_weight = ((dtrain_x['inviteday']-3500)/(dtrain_x['inviteday']-3500).mean()).values\n",
 202 |     "        else:\n",
 203 |     "            sample_weight = None\n",
 204 |     "        xgbc = xgb.XGBClassifier(random_state = 2019,**params_lgbc)\n",
 205 |     "        xgbc.fit(dtrain_x,dtrain_y,eval_set = [(deval_x,deval_y)],eval_metric = 'auc',\n",
 206 |     "                 early_stopping_rounds = 50,sample_weight = sample_weight,verbose = 100)\n",
 207 |     "        pre_train[eval_index] = xgbc.predict_proba(deval_x, ntree_limit=xgbc.best_ntree_limit)[:,1]\n",
 208 |     "        pre_test.append(list(xgbc.predict_proba(test_x, ntree_limit=xgbc.best_ntree_limit)[:,1]))\n",
 209 |     "        if save_model:\n",
 210 |     "            joblib.dump(xgbc, open(datapath+'data/xgb_'+str(params_lgbc['learning_rate'])+'_'+str(fold)+'.pkl', 'wb'))\n",
 211 |     "        fold += 1\n",
 212 |     "        if one:\n",
 213 |     "            break\n",
 214 |     "    pre_test = np.array(pre_test)\n",
 215 |     "    pre_test = np.mean(pre_test,axis = 0)\n",
 216 |     "    \n",
 217 |     "    score = roc_auc_score(train_y,pre_train)\n",
 218 |     "    feas = train_x.columns\n",
 219 |     "    imps = xgbc.feature_importances_\n",
 220 |     "    fea_imp = pd.DataFrame(pd.Series(feas),columns = ['feas'])\n",
 221 |     "    fea_imp['imp'] = imps\n",
 222 |     "    fea_imp = fea_imp.sort_values(by = 'imp',ascending = False)\n",
 223 |     "    del dtrain_x\n",
 224 |     "    del deval_x\n",
 225 |     "    del dtrain_y\n",
 226 |     "    del deval_y\n",
 227 |     "    gc.collect()\n",
 228 |     "    return pre_test,pre_train,score,fea_imp,xgbc.best_iteration\n",
 229 |     "def cat_train_pre1(train_x,train_y,test_x,categoryfeas,dropfeas,one,save_model):\n",
 230 |     "    train_x = train_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n",
 231 |     "    test_x = test_x.drop(dropfeas+['inviteday','inviteallhour'],axis = 1)\n",
 232 |     "    pickle.dump(list(train_x.columns), open(datapath+'data/cat_fea.pkl', 'wb'))\n",
 233 |     "    params_lgbc ={        \n",
 234 |     "        'learning_rate':0.1,\n",
 235 |     "        'n_estimators':50000,\n",
 236 |     "        'max_depth':6,\n",
 237 |     "#         'subsample':0.9,\n",
 238 |     "        'l2_leaf_reg':2,\n",
 239 |     "        'objective':'Logloss',\n",
 240 |     "        'scale_pos_weight':1,\n",
 241 |     "        'eval_metric':'AUC',\n",
 242 |     "        'colsample_bylevel':0.8\n",
 243 |     "    }\n",
 244 |     "    \n",
 245 |     "    pre_train = pd.Series(np.zeros(len(train_y)))\n",
 246 |     "    pre_test = []\n",
 247 |     "    kf = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 2019)\n",
 248 |     "    fold = 1\n",
 249 |     "    for train_index,eval_index in kf.split(train_x,train_y):\n",
 250 |     "        dtrain_x = train_x.loc[train_index,:]\n",
 251 |     "        deval_x = train_x.loc[eval_index,:]\n",
 252 |     "        dtrain_y = train_y[train_index]\n",
 253 |     "        deval_y = train_y[eval_index]\n",
 254 |     "        catbc = catb.CatBoostClassifier(random_state = 2019,**params_lgbc)\n",
 255 |     "        catbc.fit(dtrain_x,dtrain_y,eval_set = [(deval_x,deval_y)],cat_features = categoryfeas, # eval_metric = 'auc',\n",
 256 |     "                 early_stopping_rounds = 50,sample_weight = sample_weight,verbose = 100)\n",
 257 |     "        pre_train[eval_index] = catbc.predict_proba(deval_x)[:,1]\n",
 258 |     "        pre_test.append(list(catbc.predict_proba(test_x)[:,1]))\n",
 259 |     "        if save_model:\n",
 260 |     "            joblib.dump(catbc, open(datapath+'data/cat_'+str(params_lgbc['learning_rate'])+'_'+str(fold)+'.pkl', 'wb'))\n",
 261 |     "        fold += 1\n",
 262 |     "        if one:\n",
 263 |     "            break\n",
 264 |     "    pre_test = np.array(pre_test)\n",
 265 |     "    pre_test = np.mean(pre_test,axis = 0)\n",
 266 |     "    \n",
 267 |     "    score = roc_auc_score(train_y,pre_train)\n",
 268 |     "    feas = train_x.columns\n",
 269 |     "    imps = catbc.get_feature_importance()\n",
 270 |     "    fea_imp = pd.DataFrame(pd.Series(feas),columns = ['feas'])\n",
 271 |     "    fea_imp['imp'] = imps\n",
 272 |     "    fea_imp = fea_imp.sort_values(by = 'imp',ascending = False)\n",
 273 |     "    del dtrain_x\n",
 274 |     "    del deval_x\n",
 275 |     "    del dtrain_y\n",
 276 |     "    del deval_y\n",
 277 |     "    gc.collect()\n",
 278 |     "    return pre_test,pre_train,score,fea_imp,catbc.get_best_iteration()\n",
 279 |     "\n",
 280 |     "def parallelize_dataframe(df,func):\n",
 281 |     "    df_split = np.array_split(df,20)#cpu_count()\n",
 282 |     "    pool = Pool(20)#cpu_count()\n",
 283 |     "    df = pd.concat(pool.map(func, df_split))\n",
 284 |     "    pool.close()\n",
 285 |     "    pool.join()\n",
 286 |     "    return df\n",
 287 |     "\n",
 288 |     "def mostliketheme(x):\n",
 289 |     "    if x == '-1':\n",
 290 |     "        return '-1'\n",
 291 |     "    for theme in iter(x.strip().split(',')):\n",
 292 |     "        theme = theme.strip().split(':')\n",
 293 |     "        try:\n",
 294 |     "            if float(theme[1])>biggestlike:\n",
 295 |     "                biggestlike = theme[1]\n",
 296 |     "                mostliketheme = theme[0]\n",
 297 |     "        except:\n",
 298 |     "            biggestlike = theme[1]\n",
 299 |     "            mostliketheme = theme[0]\n",
 300 |     "    return mostliketheme\n",
 301 |     "\n",
 302 |     "def getweekday(data):\n",
 303 |     "    return data%7"
 304 |    ]
 305 |   },
 306 |   {
 307 |    "cell_type": "markdown",
 308 |    "metadata": {},
 309 |    "source": [
 310 |     "# .数据处理"
 311 |    ]
 312 |   },
 313 |   {
 314 |    "cell_type": "markdown",
 315 |    "metadata": {},
 316 |    "source": [
 317 |     "## 一、预处理"
 318 |    ]
 319 |   },
 320 |   {
 321 |    "cell_type": "markdown",
 322 |    "metadata": {},
 323 |    "source": [
 324 |     "### 词向量\n",
 325 |     "\n",
 326 |     "处理词向量文件"
 327 |    ]
 328 |   },
 329 |   {
 330 |    "cell_type": "code",
 331 |    "execution_count": null,
 332 |    "metadata": {
 333 |     "collapsed": true
 334 |    },
 335 |    "outputs": [],
 336 |    "source": [
 337 |     "data_word = pd.read_csv(datapath+'data/word_vectors_64d.txt',sep = ' ',header = None,\n",
 338 |     "                        names = ['word_'+str(i) for i in range(64)])\n",
 339 |     "\n",
 340 |     "d = data_word['word_0'].apply(lambda x:x.split('\\t'))\n",
 341 |     "data_word['wordId'] = d.apply(lambda x:x[0])\n",
 342 |     "data_word['word_0'] = d.apply(lambda x:x[1]).astype(np.float32)\n",
 343 |     "data_word = memoryOptimization(data_word,np.float32)\n",
 344 |     "\n",
 345 |     "data_word.to_csv(datapath+'data/word_vector.csv',header = True,index = False)\n",
 346 |     "del data_word\n",
 347 |     "del d\n",
 348 |     "gc.collect()"
 349 |    ]
 350 |   },
 351 |   {
 352 |    "cell_type": "markdown",
 353 |    "metadata": {},
 354 |    "source": [
 355 |     "### 单字向量\n",
 356 |     "\n",
 357 |     "处理单字向量文件"
 358 |    ]
 359 |   },
 360 |   {
 361 |    "cell_type": "code",
 362 |    "execution_count": null,
 363 |    "metadata": {
 364 |     "collapsed": true
 365 |    },
 366 |    "outputs": [],
 367 |    "source": [
 368 |     "data_letter = pd.read_csv(datapath+'data/single_word_vectors_64d.txt',sep = ' ',header = None,\n",
 369 |     "                        names = ['letter_'+str(i) for i in range(64)])\n",
 370 |     "\n",
 371 |     "d = data_letter['letter_0'].apply(lambda x:x.split('\\t'))\n",
 372 |     "data_letter['letterId'] = d.apply(lambda x:x[0])\n",
 373 |     "data_letter['letter_0'] = d.apply(lambda x:x[1]).astype(np.float32)\n",
 374 |     "data_letter = memoryOptimization(data_letter,np.float32)\n",
 375 |     "\n",
 376 |     "data_letter.to_csv(datapath+'data/letter_vector.csv',header = True,index = False)\n",
 377 |     "del data_letter\n",
 378 |     "del d\n",
 379 |     "gc.collect()"
 380 |    ]
 381 |   },
 382 |   {
 383 |    "cell_type": "markdown",
 384 |    "metadata": {},
 385 |    "source": [
 386 |     "### 话题向量\n",
 387 |     "\n",
 388 |     "处理话题向量文件"
 389 |    ]
 390 |   },
 391 |   {
 392 |    "cell_type": "code",
 393 |    "execution_count": null,
 394 |    "metadata": {
 395 |     "collapsed": true
 396 |    },
 397 |    "outputs": [],
 398 |    "source": [
 399 |     "data_theme = pd.read_csv(datapath+'data/topic_vectors_64d.txt',sep = ' ',header = None,\n",
 400 |     "                        names = ['theme_'+str(i) for i in range(64)])\n",
 401 |     "\n",
 402 |     "d = data_theme['theme_0'].apply(lambda x:x.split('\\t'))\n",
 403 |     "data_theme['themeId'] = d.apply(lambda x:x[0])\n",
 404 |     "data_theme['theme_0'] = d.apply(lambda x:x[1]).astype(np.float32)\n",
 405 |     "data_theme = memoryOptimization(data_theme,np.float32)\n",
 406 |     "data_theme.to_csv(datapath+'data/theme_vector.csv',header = True,index = False)\n"
 407 |    ]
 408 |   },
 409 |   {
 410 |    "cell_type": "markdown",
 411 |    "metadata": {},
 412 |    "source": [
 413 |     "利用pca将64维的话题向量进行压缩为22维，以进一步去除噪声"
 414 |    ]
 415 |   },
 416 |   {
 417 |    "cell_type": "code",
 418 |    "execution_count": null,
 419 |    "metadata": {
 420 |     "collapsed": true
 421 |    },
 422 |    "outputs": [],
 423 |    "source": [
 424 |     "normalizer = Normalizer(copy = False)\n",
 425 |     "data_pca = data_theme[['theme_'+str(i) for i in range(64)]]\n",
 426 |     "data_pca = normalizer.fit_transform(data_pca)\n",
 427 |     "n = int(64*0.35)\n",
 428 |     "svd = TruncatedSVD(n_components = n)\n",
 429 |     "data_pca = svd.fit_transform(data_pca)\n",
 430 |     "data_pca= pd.DataFrame(data_pca,columns = ['theme_'+str(i) for i in range(n)])\n",
 431 |     "data_pca['themeId'] = data_theme['themeId']\n",
 432 |     "\n",
 433 |     "data_pca.to_csv(datapath+'data/theme_vector_pca.csv',header = True,index = False)\n",
 434 |     "del data_theme\n",
 435 |     "del data_pca\n",
 436 |     "del d\n",
 437 |     "gc.collect()"
 438 |    ]
 439 |   },
 440 |   {
 441 |    "cell_type": "markdown",
 442 |    "metadata": {},
 443 |    "source": [
 444 |     "### 回答记录文件\n",
 445 |     "\n",
 446 |     "处理回答记录文件，根据回答时间字段生成回答的天、时字段"
 447 |    ]
 448 |   },
 449 |   {
 450 |    "cell_type": "code",
 451 |    "execution_count": null,
 452 |    "metadata": {
 453 |     "collapsed": true
 454 |    },
 455 |    "outputs": [],
 456 |    "source": [
 457 |     "columns = ['answerId','qId','writerId','answertime','content_letters','content_words','good_bool','recommend_bool','yuanzhuo_bool',\n",
 458 |     "           'picture_bool','vedio_bool','wordnum','likenum','cancellikenum','commentnum','collectnum','3qnum','jubaonum','unhelpnum',\n",
 459 |     "           'unlikenum']\n",
 460 |     "data_answer = pd.read_csv(datapath+'data/answer_info.txt',sep = '\\t',header = None,names = columns)\n",
 461 |     "data_answer['answerday'] = np.nan\n",
 462 |     "data_answer['answerhour'] = np.nan\n",
 463 |     "data_answer['answerday'] = data_answer['answertime'].apply(lambda x: int(x.split('-')[0][1:]))\n",
 464 |     "data_answer['answerhour'] = data_answer['answertime'].apply(lambda x: int(x.split('-')[1][1:]))\n",
 465 |     "data_answer = memoryOptimization(data_answer,np.float32)\n",
 466 |     "\n",
 467 |     "data_answer.to_csv(datapath+'data/data_answer.csv',header = True,index = False)\n",
 468 |     "del data_answer\n",
 469 |     "gc.collect()"
 470 |    ]
 471 |   },
 472 |   {
 473 |    "cell_type": "markdown",
 474 |    "metadata": {},
 475 |    "source": [
 476 |     "### 问题文件\n",
 477 |     "\n",
 478 |     "处理问题信息文件，根据问题创建时间字段生成问题创建的的天、时字段  "
 479 |    ]
 480 |   },
 481 |   {
 482 |    "cell_type": "code",
 483 |    "execution_count": null,
 484 |    "metadata": {
 485 |     "collapsed": true
 486 |    },
 487 |    "outputs": [],
 488 |    "source": [
 489 |     "columns = ['qId','createtime','title_letters','title_words','describe_letters','describe_words','themeId']\n",
 490 |     "data_question = pd.read_csv(datapath+'data/question_info.txt',sep = '\\t',header = None,names = columns)\n",
 491 |     "\n",
 492 |     "data_question['createday'] = np.nan\n",
 493 |     "data_question['createhour'] = np.nan\n",
 494 |     "data_question['createday'] = data_question['createtime'].apply(lambda x: int(x.split('-')[0][1:]))\n",
 495 |     "data_question['createhour'] = data_question['createtime'].apply(lambda x: int(x.split('-')[0][1:]))\n",
 496 |     "data_question = memoryOptimization(data_question,np.float32)\n",
 497 |     "\n",
 498 |     "data_question.to_csv(datapath+'data/data_question.csv',header = True,index = False)"
 499 |    ]
 500 |   },
 501 |   {
 502 |    "cell_type": "markdown",
 503 |    "metadata": {},
 504 |    "source": [
 505 |     "问题绑定的话题相当于问题的tag字段，直接做countvector成类似onehot的形式维度过大，所以这里采用利用给定话题64维向量pca降维压缩并聚合的方式表示问题tag，利用pca后得到的话题的22维embedding，得到问题关于话题的一个embdeding表示，具体方法是：  \n",
 506 |     "1）将问题绑定的多个话题的对应22维em加和取平均  \n",
 507 |     "2）如果问题绑定的话题为缺失，即‘-1’，则对应的emb为nan  \n",
 508 |     "代码如下："
 509 |    ]
 510 |   },
 511 |   {
 512 |    "cell_type": "code",
 513 |    "execution_count": null,
 514 |    "metadata": {
 515 |     "collapsed": true
 516 |    },
 517 |    "outputs": [],
 518 |    "source": [
 519 |     "t2v_pca = pd.read_csv(datapath+'data/theme_vector_pca.csv')\n",
 520 |     "cols = list(t2v_pca.columns)\n",
 521 |     "cols.remove('themeId')\n",
 522 |     "cols = ['themeId']+cols\n",
 523 |     "t2v_pca = t2v_pca[cols]\n",
 524 |     "dic_t2v_pca = {}\n",
 525 |     "for row in iter(t2v_pca.values):\n",
 526 |     "    dic_t2v_pca[row[0]] = row[1:]\n",
 527 |     "\n",
 528 |     "def get_data_themeembs(data):\n",
 529 |     "    result = []\n",
 530 |     "    for themes in iter(data['themeId'].values):\n",
 531 |     "        if themes == '-1':\n",
 532 |     "            result.append([])\n",
 533 |     "            continue\n",
 534 |     "        cur = np.zeros(int(64*0.35))\n",
 535 |     "        themes = themes.split(',')\n",
 536 |     "        for theme in iter(themes):\n",
 537 |     "            cur = cur + dic_t2v_pca[theme]\n",
 538 |     "        cur = cur/len(themes)\n",
 539 |     "        result.append(list(cur))\n",
 540 |     "    return pd.DataFrame(result,columns = ['themeId'+str(i) for i in range(int(64*0.35))])\n",
 541 |     "\n",
 542 |     "q_theme = parallelize_dataframe(data_question,get_data_themeembs)\n",
 543 |     "q_theme.to_csv(datapath+'data/question_theme.csv',header = True,index = False)\n",
 544 |     "\n",
 545 |     "del data_question\n",
 546 |     "del q_theme\n",
 547 |     "del t2w_pca\n",
 548 |     "gc.collect()"
 549 |    ]
 550 |   },
 551 |   {
 552 |    "cell_type": "markdown",
 553 |    "metadata": {},
 554 |    "source": [
 555 |     "对问题title跟描述利用TFIDF进行过滤"
 556 |    ]
 557 |   },
 558 |   {
 559 |    "cell_type": "code",
 560 |    "execution_count": null,
 561 |    "metadata": {
 562 |     "collapsed": true
 563 |    },
 564 |    "outputs": [],
 565 |    "source": [
 566 |     "def gen_idf(corpus,outputfile):\n",
 567 |     "    ignored = {'', ' ', '', '。', '：', '，', '）', '（', '！', '?', '”', '“'}\n",
 568 |     "    id_freq = {}\n",
 569 |     "    i = 0\n",
 570 |     "    for doc in corpus:\n",
 571 |     "        doc = set(x for x in doc if x not in ignored)\n",
 572 |     "        for x in doc:\n",
 573 |     "            id_freq[x] = id_freq.get(x, 0) + 1\n",
 574 |     "        i += 1\n",
 575 |     "    with open(outputfile, 'w', encoding='utf-8') as f:\n",
 576 |     "        for key, value in id_freq.items():\n",
 577 |     "            f.write(key + ' ' + str(math.log(i / value, 2)) + '\\n')\n",
 578 |     "            \n",
 579 |     "class IDFLoader(object):\n",
 580 |     "    def __init__(self, idf_path):\n",
 581 |     "        self.idf_path = idf_path\n",
 582 |     "        self.idf_freq = {}     # idf\n",
 583 |     "        self.mean_idf = 0.0    # 均值\n",
 584 |     "        self.load_idf()\n",
 585 |     "\n",
 586 |     "    def load_idf(self):       # 从文件中载入idf\n",
 587 |     "        cnt = 0\n",
 588 |     "        with open(self.idf_path, 'r', encoding='utf-8') as f:\n",
 589 |     "            for line in f:\n",
 590 |     "                try:\n",
 591 |     "                    word, freq = line.strip().split(' ')\n",
 592 |     "                    cnt += 1\n",
 593 |     "                except Exception as e:\n",
 594 |     "                    pass\n",
 595 |     "                self.idf_freq[word] = float(freq)\n",
 596 |     "\n",
 597 |     "        print('Vocabularies loaded: %d' % cnt)\n",
 598 |     "        self.mean_idf = sum(self.idf_freq.values()) / cnt\n",
 599 |     "        \n",
 600 |     "class TFIDF(object):\n",
 601 |     "    def __init__(self, idf_path):\n",
 602 |     "        self.idf_loader = IDFLoader(idf_path)\n",
 603 |     "        self.idf_freq = self.idf_loader.idf_freq\n",
 604 |     "        self.mean_idf = self.idf_loader.mean_idf\n",
 605 |     "\n",
 606 |     "    def extract_sentence_keywords(self, sentence,filter_word=None,topK=None,all_tfidf = False):    # 提取关键词\n",
 607 |     "        # 过滤\n",
 608 |     "        #seg_list = segment(sentence)\n",
 609 |     "        seg_list = [x for x in sentence if len(x)>1]\n",
 610 |     "        freq = {}\n",
 611 |     "        for w in seg_list:\n",
 612 |     "            freq[w] = freq.get(w, 0.0) + 1.0\n",
 613 |     "        total = sum(freq.values())\n",
 614 |     "\n",
 615 |     "        for k in freq:   # 计算 TF-IDF\n",
 616 |     "            freq[k] *= self.idf_freq.get(k, self.mean_idf) / total\n",
 617 |     "\n",
 618 |     "\n",
 619 |     "        tags = sorted(freq, key=freq.__getitem__, reverse=True)  # 排序\n",
 620 |     "        if filter_word!=None:\n",
 621 |     "            tags = [x for x in tags if x not in filter_word]\n",
 622 |     "        if topK!=None:\n",
 623 |     "            if all_tfidf:\n",
 624 |     "                return tags[:topK],freq\n",
 625 |     "            else:\n",
 626 |     "                return tags[:topK]\n",
 627 |     "        else:\n",
 628 |     "            if all_tfidf:\n",
 629 |     "                return tags,freq\n",
 630 |     "            else:\n",
 631 |     "                return tags\n",
 632 |     "            \n",
 633 |     "    def extract_corpus_keywords(self, corpus, filter_word=None,topK=None,all_tfidf = False):    # 提取关键词\n",
 634 |     "        # 过滤\n",
 635 |     "        #seg_list = segment(sentence)\n",
 636 |     "        all_tags = []\n",
 637 |     "        all_freq = []\n",
 638 |     "        for sentence in corpus:\n",
 639 |     "            seg_list = [x for x in sentence if len(x)>1]\n",
 640 |     "            freq = {}\n",
 641 |     "            for w in seg_list:\n",
 642 |     "                freq[w] = freq.get(w, 0.0) + 1.0\n",
 643 |     "            total = sum(freq.values())\n",
 644 |     "\n",
 645 |     "            for k in freq:   # 计算 TF-IDF\n",
 646 |     "                freq[k] *= self.idf_freq.get(k, self.mean_idf) / total\n",
 647 |     "            if all_tfidf:\n",
 648 |     "                all_freq.append(freq)\n",
 649 |     "            tags = sorted(freq, key=freq.__getitem__, reverse=True)  # 排序\n",
 650 |     "            if filter_word!=None:\n",
 651 |     "                tags = [x for x in tags if x not in filter_word]\n",
 652 |     "            if topK!=None:\n",
 653 |     "                 all_tags.append(tags[:topK])\n",
 654 |     "            else:\n",
 655 |     "                all_tags.append(tags)\n",
 656 |     "        if all_tfidf:            \n",
 657 |     "            return all_tags,all_freq\n",
 658 |     "        else:\n",
 659 |     "            return all_tags\n",
 660 |     "\n",
 661 |     "\n",
 662 |     "question_info = pd.read_csv(datapath+'data/question_info.txt', header=None, sep='\\t')\n",
 663 |     "question_info.columns = ['问题id','问题创建时间','问题标题单字编码','问题标题切词编码','问题描述单字编码','问题描述切词编码','问题绑定话题']\n",
 664 |     "question_info['len'] = question_info['问题标题切词编码'].apply(lambda x:len(x.split(',')))\n",
 665 |     "question_info['len'].max()\n",
 666 |     "\n",
 667 |     "def text(row):\n",
 668 |     "    text = row.问题标题切词编码.split(',')\n",
 669 |     "    if row.问题描述切词编码!=str(-1):\n",
 670 |     "        text.extend(row.问题描述切词编码.split(','))\n",
 671 |     "    return text\n",
 672 |     "question_info['text'] = question_info.apply(lambda row:text(row),axis=1)\n",
 673 |     "question_info['title'] =  question_info['问题标题切词编码'].apply(lambda x:x.split(','))\n",
 674 |     "\n",
 675 |     "out_file = datapath+'data/问题标题.idf.txt'\n",
 676 |     "if not os.path.exists(out_file):\n",
 677 |     "    corpus = question_info.title.values.tolist()\n",
 678 |     "    gen_idf(corpus,out_file)\n",
 679 |     "tdidf = TFIDF(out_file)\n",
 680 |     "x = question_info.title.values.tolist()\n",
 681 |     "tags = tdidf.extract_corpus_keywords(x)\n",
 682 |     "question_info['tfidf_title'] = tags\n",
 683 |     "del x,tags\n",
 684 |     "gc.collect()\n",
 685 |     "\n",
 686 |     "out_file = datapath+'data/问题标题描述.idf.txt'\n",
 687 |     "if not os.path.exists(out_file):\n",
 688 |     "    corpus = question_info.text.values.tolist()\n",
 689 |     "    gen_idf(corpus,out_file)\n",
 690 |     "tdidf = TFIDF(out_file)\n",
 691 |     "x = question_info.text.values.tolist()\n",
 692 |     "tags = tdidf.extract_corpus_keywords(x)\n",
 693 |     "question_info['tfidf_text'] = tags\n",
 694 |     "demo_data = question_info[['问题id','len','tfidf_text','tfidf_title','问题描述切词编码']]\n",
 695 |     "\n",
 696 |     "def get_topk(row):\n",
 697 |     "    ### title\n",
 698 |     "#     if row.问题标题切词编码==str(-1):\n",
 699 |     "#         return\n",
 700 |     "    n = round(row.len/3)\n",
 701 |     "    if n==0:\n",
 702 |     "        n=1\n",
 703 |     "    row['title_topk'] = row.tfidf_title[:n]\n",
 704 |     "    if row.问题描述切词编码==str(-1):\n",
 705 |     "        row['text_topk'] = row.tfidf_text[:n]\n",
 706 |     "    else:\n",
 707 |     "        if row.tfidf_title[0]=='-1':\n",
 708 |     "            m = min(4,round(2*len(row.tfidf_text)/3))\n",
 709 |     "            row['text_topk'] = row.tfidf_text[:m]\n",
 710 |     "        else:\n",
 711 |     "            row['text_topk'] = row.tfidf_text[:round(2*row.len/3)]\n",
 712 |     "    return row\n",
 713 |     "\n",
 714 |     "\n",
 715 |     "def parallelize_dataframe(df, func):\n",
 716 |     "    df_split = np.array_split(df, 16)\n",
 717 |     "    pool = Pool(16)\n",
 718 |     "    df = pd.concat(pool.map(func, df_split))\n",
 719 |     "    pool.close()\n",
 720 |     "    pool.join()\n",
 721 |     "    return df\n",
 722 |     "def get_topk_all(df):\n",
 723 |     "    df = df.apply(lambda row:get_topk(row),axis=1)\n",
 724 |     "    return df\n",
 725 |     "\n",
 726 |     "demo_data = parallelize_dataframe(demo_data, get_topk_all)\n",
 727 |     "demo_data['问题标题切词编码']=question_info['问题标题切词编码']\n",
 728 |     "\n",
 729 |     "def clean_1(ls):\n",
 730 |     "    if len(ls)==1 and ls[0]=='-1':\n",
 731 |     "        return ls\n",
 732 |     "    ls = [x for x in ls if x!='-1']\n",
 733 |     "    return ls\n",
 734 |     "\n",
 735 |     "x = demo_data[[\"问题id\",'title_topk','text_topk']]\n",
 736 |     "x['text_topk'] = x['text_topk'].apply(lambda x:clean_1(x))\n",
 737 |     "x['title_topk'] = x['title_topk'].apply(lambda s:','.join(s))\n",
 738 |     "x['text_topk'] = x['text_topk'].apply(lambda s:','.join(s))\n",
 739 |     "x.to_csv(datapath+'data/data_q_title_tfidf.csv')"
 740 |    ]
 741 |   },
 742 |   {
 743 |    "cell_type": "markdown",
 744 |    "metadata": {},
 745 |    "source": [
 746 |     "### 用户文件\n",
 747 |     "\n",
 748 |     "处理用户信息文件,  \n",
 749 |     "drop掉keywords等没有意义的列；  \n",
 750 |     "将类别特征中的‘unknown’替换为nan；  \n",
 751 |     "构建用户最感兴趣的主题；"
 752 |    ]
 753 |   },
 754 |   {
 755 |    "cell_type": "code",
 756 |    "execution_count": null,
 757 |    "metadata": {
 758 |     "collapsed": true
 759 |    },
 760 |    "outputs": [],
 761 |    "source": [
 762 |     "columns = ['writerId','sex','keywords','publishrank','heatrank','registertype','platform','activity','bool_A','bool_B','bool_C',\n",
 763 |     "          'bool_D','bool_E','category_A','category_B','category_C','category_D','category_E','yanzhi','attentionthemes','likethemes']\n",
 764 |     "data_writer = pd.read_csv(datapath+'data/member_info.txt',sep = '\\t',names = columns)\n",
 765 |     "data_writer.drop(['keywords','publishrank', 'heatrank', 'registertype', 'platform'],axis = 1)\n",
 766 |     "data_writer[data_writer.select_dtypes(include = 'object').columns] = data_writer.select_dtypes(include = 'object').applymap(lambda x: float('nan') if x == 'unknown' else x)\n",
 767 |     "\n",
 768 |     "def mostliketheme(x):\n",
 769 |     "    if x == '-1':\n",
 770 |     "        return '-1'\n",
 771 |     "    for theme in iter(x.strip().split(',')):\n",
 772 |     "        theme = theme.strip().split(':')\n",
 773 |     "        try:\n",
 774 |     "            if float(theme[1])>biggestlike:\n",
 775 |     "                biggestlike = theme[1]\n",
 776 |     "                mostliketheme = theme[0]\n",
 777 |     "        except:\n",
 778 |     "            biggestlike = theme[1]\n",
 779 |     "            mostliketheme = theme[0]\n",
 780 |     "    return mostliketheme\n",
 781 |     "\n",
 782 |     "data_writer['mostliketheme'] = data_writer['likethemes'].apply(mostliketheme)\n",
 783 |     "data_writer = memoryOptimization(data_writer,np.float32)\n",
 784 |     "data_writer.to_csv(datapath+'data/data_writer.csv',header = True,index = False)"
 785 |    ]
 786 |   },
 787 |   {
 788 |    "cell_type": "markdown",
 789 |    "metadata": {},
 790 |    "source": [
 791 |     "与问题文件类似的，利用话题的22维emb分别得到用户关于用户关注话题的emb以及关于用户感兴趣的话题的emb，  \n",
 792 |     "其中感兴趣话题的聚合方式是加权平均，权重是用户对该话题感兴趣程度"
 793 |    ]
 794 |   },
 795 |   {
 796 |    "cell_type": "code",
 797 |    "execution_count": null,
 798 |    "metadata": {
 799 |     "collapsed": true
 800 |    },
 801 |    "outputs": [],
 802 |    "source": [
 803 |     "t2v_pca = pd.read_csv(datapath+'data/theme_vector_pca.csv')\n",
 804 |     "cols = list(t2v_pca.columns)\n",
 805 |     "cols.remove('themeId')\n",
 806 |     "cols = ['themeId']+cols\n",
 807 |     "t2v_pca = t2v_pca[cols]\n",
 808 |     "dic_t2v_pca = {}\n",
 809 |     "for row in iter(t2v_pca.values):\n",
 810 |     "    dic_t2v_pca[row[0]] = row[1:]\n",
 811 |     "\n",
 812 |     "def get_data_themeembs(data):\n",
 813 |     "    result = []\n",
 814 |     "    for themes in iter(data['attentionthemes'].values):\n",
 815 |     "        if themes == '-1':\n",
 816 |     "            result.append([])\n",
 817 |     "            continue\n",
 818 |     "        cur = np.zeros(int(64*0.35))\n",
 819 |     "        themes = themes.split(',')\n",
 820 |     "        for theme in iter(themes):\n",
 821 |     "            cur = cur + dic_t2v_pca[theme]\n",
 822 |     "        cur = cur/len(themes)\n",
 823 |     "        result.append(list(cur))\n",
 824 |     "    return pd.DataFrame(result,columns = ['attentionthemes'+str(i) for i in range(int(64*0.35))])\n",
 825 |     "\n",
 826 |     "writer_attentionthemes = parallelize_dataframe(data_writer,get_data_themeembs)\n",
 827 |     "writer_attentionthemes.to_csv(datapath+'data/writer_attentiontheme.csv',header = True,index = False)\n",
 828 |     "\n",
 829 |     "def get_data_themeembs_weight(data):\n",
 830 |     "    def f(s,data_v,leix):\n",
 831 |     "        result = np.zeros(22)\n",
 832 |     "        if s == '-1':\n",
 833 |     "            return result\n",
 834 |     "        s = s.strip().split(',')\n",
 835 |     "        for t in iter(s):\n",
 836 |     "            t = t.strip().split(':')\n",
 837 |     "            result = result+data_v.loc[data_v[leix+'Id'] == t[0],[leix+'_'+str(i) for i in range(22)]].values[0]*float(t[1])\n",
 838 |     "        try:\n",
 839 |     "            result = result/len(s)\n",
 840 |     "        except:\n",
 841 |     "            pass\n",
 842 |     "        return result\n",
 843 |     "    data_v = t2v_pca\n",
 844 |     "    col = 'likethemes'\n",
 845 |     "    leix = 'theme'\n",
 846 |     "    return pd.DataFrame(list(data.apply(lambda x:f(x,data_v,leix))),columns = [col+str(i) for i in range(int(64*0.35))])\n",
 847 |     "\n",
 848 |     "writer_likethemes = parallelize_dataframe(data_writer['likethemes'],get_data_themeembs_weight)\n",
 849 |     "writer_likethemes.to_csv(datapath+'data/writer_liketheme.csv',header = True,index = False)\n",
 850 |     "\n",
 851 |     "del data_writer\n",
 852 |     "del writer_likethemes\n",
 853 |     "del writer_attentionthemes\n",
 854 |     "gc.collect()"
 855 |    ]
 856 |   },
 857 |   {
 858 |    "cell_type": "markdown",
 859 |    "metadata": {},
 860 |    "source": [
 861 |     "### 训练集\n",
 862 |     "处理训练集，根据邀请时间字段生成邀请的天、时字段 "
 863 |    ]
 864 |   },
 865 |   {
 866 |    "cell_type": "code",
 867 |    "execution_count": null,
 868 |    "metadata": {
 869 |     "collapsed": true
 870 |    },
 871 |    "outputs": [],
 872 |    "source": [
 873 |     "data_invite = pd.read_csv(datapath+'data/invite_info.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime','label'])\n",
 874 |     "data_invite['inviteday'] = np.nan\n",
 875 |     "data_invite['invitehour'] = np.nan\n",
 876 |     "data_invite['inviteday'] = data_invite.invitetime.apply(lambda x: int(x.split('-')[0][1:]))\n",
 877 |     "data_invite['invitehour'] = data_invite.invitetime.apply(lambda x: int(x.split('-')[1][1:]))\n",
 878 |     "data_invite = memoryOptimization(data_invite,np.float32)"
 879 |    ]
 880 |   },
 881 |   {
 882 |    "cell_type": "markdown",
 883 |    "metadata": {},
 884 |    "source": [
 885 |     "根据官方的说法一个问题只会对同一个用户邀请一次，训练集中存在2k个左右重复的样本，将这部分样本去重，保留邀请时间最早的样本"
 886 |    ]
 887 |   },
 888 |   {
 889 |    "cell_type": "code",
 890 |    "execution_count": null,
 891 |    "metadata": {
 892 |     "collapsed": true
 893 |    },
 894 |    "outputs": [],
 895 |    "source": [
 896 |     "data_invite['index'] = data_invite.index\n",
 897 |     "data_invite = data_invite.sort_values(by = ['inviteday','invitehour']).reset_index(drop = True).drop_duplicates(['qId','writerId'],keep = 'first')\n",
 898 |     "data_invite = data_invite.sort_values(by = 'index').reset_index(drop = True)\n",
 899 |     "del data_invite['index']"
 900 |    ]
 901 |   },
 902 |   {
 903 |    "cell_type": "markdown",
 904 |    "metadata": {},
 905 |    "source": [
 906 |     "删除回答和邀请时间>10天的正样本"
 907 |    ]
 908 |   },
 909 |   {
 910 |    "cell_type": "code",
 911 |    "execution_count": null,
 912 |    "metadata": {
 913 |     "collapsed": true
 914 |    },
 915 |    "outputs": [],
 916 |    "source": [
 917 |     "data_answer = pd.read_csv(datapath+'data/data_answer.csv')[['qId','writerId','answerday','answerhour']]\n",
 918 |     "data_invite = data_invite.merge(data_answer[['qId','writerId','answerday','answerhour']],on = ['writerId','qId'],how = 'left')\n",
 919 |     "data_invite['deltday'] = data_invite['answerday']-data_invite['inviteday']\n",
 920 |     "data_invite = data_invite[~(data_invite['deltday']>10)].reset_index(drop = True)\n",
 921 |     "del data_invite['answerday']\n",
 922 |     "del data_invite['answerhour']\n",
 923 |     "del data_invite['deltday']\n",
 924 |     "del data_answer\n",
 925 |     "data_invite.to_csv(datapath+'data/data_invite.csv',header = True,index = False)\n",
 926 |     "del data_invite\n",
 927 |     "gc.collect()"
 928 |    ]
 929 |   },
 930 |   {
 931 |    "cell_type": "markdown",
 932 |    "metadata": {},
 933 |    "source": [
 934 |     "### 测试集1&2"
 935 |    ]
 936 |   },
 937 |   {
 938 |    "cell_type": "markdown",
 939 |    "metadata": {},
 940 |    "source": [
 941 |     "处理测试集1和测试集2，根据邀请时间字段生成邀请的天、时字段"
 942 |    ]
 943 |   },
 944 |   {
 945 |    "cell_type": "code",
 946 |    "execution_count": null,
 947 |    "metadata": {
 948 |     "collapsed": true
 949 |    },
 950 |    "outputs": [],
 951 |    "source": [
 952 |     "data_eval = pd.read_csv(datapath+'data/invite_info_evaluate_1.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime','label'])\n",
 953 |     "data_eval['inviteday'] = np.nan\n",
 954 |     "data_eval['invitehour'] = np.nan\n",
 955 |     "data_eval['inviteday'] = data_eval.invitetime.apply(lambda x: int(x.split('-')[0][1:]))\n",
 956 |     "data_eval['invitehour'] = data_eval.invitetime.apply(lambda x: int(x.split('-')[1][1:]))\n",
 957 |     "data_eval = memoryOptimization(data_eval,np.float64)\n",
 958 |     "data_eval.to_csv(datapath+'data/data_invite_eval.csv',header = True,index = False)\n",
 959 |     "del data_eval\n",
 960 |     "\n",
 961 |     "data_test = pd.read_csv(datapath+'data/invite_info_evaluate_2_0926.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime','label'])\n",
 962 |     "data_test['inviteday'] = np.nan\n",
 963 |     "data_test['invitehour'] = np.nan\n",
 964 |     "data_test['inviteday'] = data_test.invitetime.apply(lambda x: int(x.split('-')[0][1:]))\n",
 965 |     "data_test['invitehour'] = data_test.invitetime.apply(lambda x: int(x.split('-')[1][1:]))\n",
 966 |     "data_test = memoryOptimization(data_test,np.float64)\n",
 967 |     "data_test.to_csv(datapath+'data/data_invite_test.csv',header = True,index = False)\n",
 968 |     "del data_test"
 969 |    ]
 970 |   },
 971 |   {
 972 |    "cell_type": "markdown",
 973 |    "metadata": {},
 974 |    "source": [
 975 |     "## 二、数据拼接  \n",
 976 |     "拼接train、test1、test2得到完整数据data，将用户信息和问题信息merge到data上"
 977 |    ]
 978 |   },
 979 |   {
 980 |    "cell_type": "code",
 981 |    "execution_count": null,
 982 |    "metadata": {
 983 |     "collapsed": true
 984 |    },
 985 |    "outputs": [],
 986 |    "source": [
 987 |     "data_train = pd.read_csv(datapath+'data/data_invite.csv')#train\n",
 988 |     "data_test1 = pd.read_csv(datapath+'data/data_invite_eval.csv')#test1\n",
 989 |     "data_test2 = pd.read_csv(datapath+'data/data_invite_test.csv')#test2\n",
 990 |     "data_train['type'] = 'train'\n",
 991 |     "data_test1['type'] = 'test1'\n",
 992 |     "data_test2['type'] = 'test2'\n",
 993 |     "data = pd.concat([data_train,data_test1,data_test2],axis = 0,ignore_index = True)\n",
 994 |     "data['label'] = data['label'].fillna(-1)\n",
 995 |     "data = data.drop(['invitetime'],axis = 1)\n",
 996 |     "data = memoryOptimization(data,np.float32)\n",
 997 |     "del data_test1\n",
 998 |     "del data_test2\n",
 999 |     "del data_train\n",
1000 |     "gc.collect()\n",
1001 |     "\n",
1002 |     "data_question = pd.read_csv(datapath+'data/data_question.csv')\n",
1003 |     "question_theme = pd.read_csv(datapath+'data/question_theme.csv')\n",
1004 |     "data_question = pd.concat([data_question,question_theme],axis = 1)\n",
1005 |     "data_question = data_question.drop(['createtime','title_letters','title_words','describe_letters','describe_words'],axis = 1)\n",
1006 |     "data_question = memoryOptimization(data_question,np.float32)\n",
1007 |     "del question_theme\n",
1008 |     "\n",
1009 |     "data_writer = pd.read_csv(datapath+'data/data_writer.csv')\n",
1010 |     "writer_attentiontheme = pd.read_csv(datapath+'data/writer_attentiontheme.csv')\n",
1011 |     "data_writer = pd.concat([data_writer,writer_attentiontheme],axis = 1)\n",
1012 |     "data_writer = memoryOptimization(data_writer,np.float32)\n",
1013 |     "del writer_attentiontheme\n",
1014 |     "gc.collect()\n",
1015 |     "\n",
1016 |     "data = pd.merge(data,data_question,how = 'left',on = 'qId')\n",
1017 |     "data = pd.merge(data,data_writer,how = 'left',on = 'writerId')\n",
1018 |     "data['inviteallhour'] = (data['inviteday']-3800)*24+data['invitehour']\n",
1019 |     "data['inviteweekday'] = getweekday(data['inviteday'])\n",
1020 |     "data['createweekday'] = getweekday(data['createday'])\n",
1021 |     "del data_question\n",
1022 |     "del data_writer\n",
1023 |     "gc.collect()"
1024 |    ]
1025 |   },
1026 |   {
1027 |    "cell_type": "markdown",
1028 |    "metadata": {},
1029 |    "source": [
1030 |     "# .特征工程\n",
1031 |     "\n",
1032 |     "## 一、单一侧特征（用户侧、问题侧）"
1033 |    ]
1034 |   },
1035 |   {
1036 |    "cell_type": "markdown",
1037 |    "metadata": {},
1038 |    "source": [
1039 |     "### 1、计数类特征"
1040 |    ]
1041 |   },
1042 |   {
1043 |    "cell_type": "code",
1044 |    "execution_count": null,
1045 |    "metadata": {
1046 |     "collapsed": true
1047 |    },
1048 |    "outputs": [],
1049 |    "source": [
1050 |     "df = data[[]]"
1051 |    ]
1052 |   },
1053 |   {
1054 |    "cell_type": "markdown",
1055 |    "metadata": {},
1056 |    "source": [
1057 |     "1）滑窗统计特征，对id及类别特征统计过去7天的邀请数，反应过去一周的邀请情况"
1058 |    ]
1059 |   },
1060 |   {
1061 |    "cell_type": "code",
1062 |    "execution_count": null,
1063 |    "metadata": {
1064 |     "collapsed": true
1065 |    },
1066 |    "outputs": [],
1067 |    "source": [
1068 |     "def get_crossfeas_inv(data,fea1,fea2):\n",
1069 |     "    dataf = data[[fea1,fea2]].copy()\n",
1070 |     "    bool_s = (~dataf[fea1].isna())&(~dataf[fea2].isna())\n",
1071 |     "    dataf['cross'] = np.nan\n",
1072 |     "    dataf.loc[bool_s,'cross'] = dataf.loc[bool_s,fea1].apply(str)+'_'+dataf.loc[bool_s,fea2].apply(str)\n",
1073 |     "    return dataf['cross'].values\n",
1074 |     "\n",
1075 |     "def lastndayinvite(dataf,n,feas,use_weight):\n",
1076 |     "    dicfea = feas[0]\n",
1077 |     "    if len(feas)>1:\n",
1078 |     "        fea = dicfea\n",
1079 |     "        for i in feas[1:]:\n",
1080 |     "            fea = fea+'_'+i\n",
1081 |     "        dataf[fea] = get_crossfeas_inv(dataf,feas[0],feas[1])\n",
1082 |     "    else:\n",
1083 |     "        fea = dicfea\n",
1084 |     "    \n",
1085 |     "    if use_weight:\n",
1086 |     "        gps = dataf.groupby(['inviteday'])\n",
1087 |     "        dic = {}\n",
1088 |     "        for gp_id in iter(dataf['inviteday'].unique()):\n",
1089 |     "            gp = gps.get_group(gp_id)\n",
1090 |     "            dic[gp_id] = gp.shape[0]/gp[dicfea].nunique()\n",
1091 |     "        dic = pd.Series(dic)\n",
1092 |     "        dic = (dic.mean()/dic).round(3).to_dict()\n",
1093 |     "    \n",
1094 |     "    data_gps = dataf.groupby([fea,'inviteday']).size().astype(float).reset_index()\n",
1095 |     "    if use_weight:\n",
1096 |     "        values_0 = []\n",
1097 |     "        for row in iter(data_gps[['inviteday',0]].values):\n",
1098 |     "            values_0.append(dic[row[0]]*row[1])\n",
1099 |     "\n",
1100 |     "        data_gps[0] = values_0\n",
1101 |     "    data_gps = data_gps.rename(columns = {0:'size'})\n",
1102 |     "    \n",
1103 |     "    result = dataf[[]]\n",
1104 |     "    result['val'] = np.nan\n",
1105 |     "    for day in iter(dataf['inviteday'].unique()):\n",
1106 |     "        result.loc[dataf['inviteday'] == day,'val'] = get_invite_count(data_gps[(data_gps['inviteday']<=day)&(data_gps['inviteday']>day-n)],dataf[dataf['inviteday'] == day],fea)\n",
1107 |     "    result.loc[(~dataf[fea].isna())&(result['val'].isna()),'val'] = 0\n",
1108 |     "    \n",
1109 |     "    if len(feas)>1:\n",
1110 |     "        del dataf[fea]\n",
1111 |     "    gc.collect()\n",
1112 |     "    \n",
1113 |     "    return result['val'].values\n",
1114 |     "\n",
1115 |     "def get_invite_count(df_train,df_test,fea):\n",
1116 |     "    df_train = df_train[[fea,'size']].groupby(fea).sum()['size'].reset_index()\n",
1117 |     "    new_fea_name = fea+'_count'\n",
1118 |     "    df_train.columns = [fea,new_fea_name]\n",
1119 |     "    df_test = df_test[[fea]].merge(df_train,on = fea,how = 'left')\n",
1120 |     "    \n",
1121 |     "    return df_test[new_fea_name].values\n",
1122 |     "\n",
1123 |     "#7天邀请数，代表用户被邀请的频率\n",
1124 |     "n = 7\n",
1125 |     "for fea in ['qId','writerId']+['category_C','sex','activity']:#'invitehour','createhour','activity','bool_A','bool_B','bool_C','bool_D','bool_E','category_A','yanzhi'\n",
1126 |     "    df[fea+'_last%sday_count' %n] = lastndayinvite(data,n,[fea],use_weight = False)\n",
1127 |     "    print(fea+' is ok')"
1128 |    ]
1129 |   },
1130 |   {
1131 |    "cell_type": "markdown",
1132 |    "metadata": {},
1133 |    "source": [
1134 |     "对盐值等频分箱后作为类别变量处理，也可以等间距分箱、卡方分箱等，或者直接取整后作为类别变量；  \n",
1135 |     "另外invitehour等时间类数值特征也可以分箱，这里只考虑了盐值"
1136 |    ]
1137 |   },
1138 |   {
1139 |    "cell_type": "code",
1140 |    "execution_count": 8,
1141 |    "metadata": {
1142 |     "collapsed": true
1143 |    },
1144 |    "outputs": [],
1145 |    "source": [
1146 |     "def split_data(dataf,col,split_num):\n",
1147 |     "#     dataf = dataf.copy()\n",
1148 |     "    count = dataf.shape[0]\n",
1149 |     "    n = math.ceil(count/split_num)\n",
1150 |     "    split_index = [i*n for i in range(1,split_num)]\n",
1151 |     "    values = sorted(list(dataf[col]))\n",
1152 |     "    split_point = [values[i] for i in split_index]\n",
1153 |     "    split_point = sorted(list(set(split_point)))\n",
1154 |     "    return split_point\n",
1155 |     "\n",
1156 |     "def get_group(x,split_bin):\n",
1157 |     "    n = len(split_bin)\n",
1158 |     "    if x <= min(split_bin):\n",
1159 |     "        return min(split_bin)\n",
1160 |     "    elif x> max(split_bin):\n",
1161 |     "        return max(split_bin)+max(split_bin)/n\n",
1162 |     "    else:\n",
1163 |     "        for i in range(n-1):\n",
1164 |     "            if split_bin[i] < x <= split_bin[i+1]:\n",
1165 |     "                return split_bin[i+1]\n",
1166 |     "\n",
1167 |     "points = split_data(data,'yanzhi',split_num = 10)\n",
1168 |     "data['yanzhi_d'] = data['yanzhi'].apply(lambda x:get_group(x,points)).apply(int)\n",
1169 |     "df['yanzhi_d_last%sday_count' %n] = lastndayinvite(data,n,['yanzhi_d'],use_weight = False)\n",
1170 |     "print('yanzhi_d is ok')\n",
1171 |     "del data['yanzhi_d']\n",
1172 |     "gc.collect()"
1173 |    ]
1174 |   },
1175 |   {
1176 |    "cell_type": "markdown",
1177 |    "metadata": {},
1178 |    "source": [
1179 |     "2）滑窗统计特征，过去三天每天分别的邀请量计数统计，反应的是对应用户或问题近期的邀请情况"
1180 |    ]
1181 |   },
1182 |   {
1183 |    "cell_type": "code",
1184 |    "execution_count": null,
1185 |    "metadata": {
1186 |     "collapsed": true
1187 |    },
1188 |    "outputs": [],
1189 |    "source": [
1190 |     "def get_last3dayinvitenum(dataf,df,fea,use_weight):\n",
1191 |     "    if use_weight:\n",
1192 |     "        gps = dataf.groupby(['inviteday'])\n",
1193 |     "        dic = {}\n",
1194 |     "        for gp_id in iter(dataf['inviteday'].unique()):\n",
1195 |     "            gp = gps.get_group(gp_id)\n",
1196 |     "            dic[gp_id] = gp.shape[0]/gp[fea].nunique()\n",
1197 |     "        dic = pd.Series(dic)\n",
1198 |     "        dic = (dic.mean()/dic).round(3).to_dict()\n",
1199 |     "        \n",
1200 |     "    data_gps = dataf.groupby(['inviteday',fea]).size().astype(float)\n",
1201 |     "    if use_weight:\n",
1202 |     "        for day in iter(dataf['inviteday'].unique()):#range(t0_eval,t1_eval+1):\n",
1203 |     "            data_gps[day] = data_gps[day]*dic[day]\n",
1204 |     "    data_gps = data_gps.reset_index().rename(columns = {0:'size'})\n",
1205 |     "    \n",
1206 |     "    for i in [fea+'_last3invnum'+str(i) for i in range(3)]+[fea+'_curdayinvnum']:\n",
1207 |     "        df[i] = np.nan\n",
1208 |     "    dic_result = {}\n",
1209 |     "    pool = Pool(10)\n",
1210 |     "    for day in iter(dataf['inviteday'].unique()):\n",
1211 |     "        dic_result[day] = pool.apply_async(func = get_last3invnum,args = (data_gps[(data_gps['inviteday']<=day)&(data_gps['inviteday']>=day-3)],dataf[dataf['inviteday'] == day],fea,day,))\n",
1212 |     "    pool.close()\n",
1213 |     "    pool.join()\n",
1214 |     "    \n",
1215 |     "    for day in iter(dataf['inviteday'].unique()):\n",
1216 |     "        df.loc[dataf['inviteday'] == day,[fea+'_last3invnum'+str(i) for i in range(3)]+[fea+'_curdayinvnum']] = dic_result[day].get()\n",
1217 |     "    \n",
1218 |     "    return df\n",
1219 |     "\n",
1220 |     "def get_last3invnum(data_train,data_test,fea,day):\n",
1221 |     "    data_train = data_train[data_train[fea].isin(data_test[fea].unique())]\n",
1222 |     "    gps = data_train.groupby(fea)\n",
1223 |     "    dic_fea = {}\n",
1224 |     "    daylist = [day-3,day-2,day-1,day]\n",
1225 |     "    for val in iter(data_train.reset_index()[fea].unique()):\n",
1226 |     "        gp = gps.get_group(val)\n",
1227 |     "        dic_val = gp['size']\n",
1228 |     "        dic_val.index = gp['inviteday'].values\n",
1229 |     "        dic_fea[val] = []\n",
1230 |     "        for day in iter(daylist):\n",
1231 |     "            try:\n",
1232 |     "                dic_fea[val].append(dic_val[day])\n",
1233 |     "            except:\n",
1234 |     "                dic_fea[val].append(0)\n",
1235 |     "        \n",
1236 |     "    dic_fea = pd.DataFrame(dic_fea).T.reset_index()\n",
1237 |     "    dic_fea.columns = [fea]+[fea+'_last3invnum'+str(i) for i in range(3)]+[fea+'_curdayinvnum']\n",
1238 |     "    data_test = data_test.merge(dic_fea,on = fea,how = 'left')\n",
1239 |     "    return data_test[[fea+'_last3invnum'+str(i) for i in range(3)]+[fea+'_curdayinvnum']].values\n",
1240 |     "\n",
1241 |     "for  fea in ['qId','writerId']:#,\n",
1242 |     "    df = get_last3dayinvitenum(data,df,fea,use_weight = False)\n",
1243 |     "    print(fea+' is ok')\n",
1244 |     "\n",
1245 |     "df['qId_last3+1invnum_mean'] = df[['qId_last3invnum0','qId_last3invnum1','qId_last3invnum2','qId_curdayinvnum']].mean(axis = 1)\n",
1246 |     "df['qId_last3+1invnum_std'] = df[['qId_last3invnum0','qId_last3invnum1','qId_last3invnum2','qId_curdayinvnum']].std(axis = 1)\n",
1247 |     "df['writerId_last3+1invnum_mean'] = df[['writerId_last3invnum0','writerId_last3invnum1','writerId_last3invnum2','writerId_curdayinvnum']].mean(axis = 1)\n",
1248 |     "df['writerId_last3+1invnum_std'] = df[['writerId_last3invnum0','writerId_last3invnum1','writerId_last3invnum2','writerId_curdayinvnum']].std(axis = 1)\n",
1249 |     "df = df.drop(['qId_curdayinvnum','writerId_curdayinvnum'],axis = 1)\n",
1250 |     "gc.collect()"
1251 |    ]
1252 |   },
1253 |   {
1254 |    "cell_type": "markdown",
1255 |    "metadata": {},
1256 |    "source": [
1257 |     "3）当天的邀请数计数特征，当天的邀请数量"
1258 |    ]
1259 |   },
1260 |   {
1261 |    "cell_type": "code",
1262 |    "execution_count": null,
1263 |    "metadata": {
1264 |     "collapsed": true
1265 |    },
1266 |    "outputs": [],
1267 |    "source": [
1268 |     "def get_curdayinvitenum(dataf,fea,use_weight):\n",
1269 |     "    if use_weight:\n",
1270 |     "        gps = dataf.groupby(['inviteday'])\n",
1271 |     "        dic = {}\n",
1272 |     "        for gp_id in iter(dataf['inviteday'].unique()):\n",
1273 |     "            gp = gps.get_group(gp_id)\n",
1274 |     "            dic[gp_id] = gp.shape[0]/gp[fea].nunique()\n",
1275 |     "        dic = pd.Series(dic)\n",
1276 |     "        dic = (dic.mean()/dic).round(3).to_dict()\n",
1277 |     "    \n",
1278 |     "    data_gps = dataf.groupby(['inviteday',fea]).size().astype('float')\n",
1279 |     "    if use_weight:\n",
1280 |     "        for day in iter(dataf['inviteday'].unique()):#range(t0_eval,t1_eval+1):\n",
1281 |     "            data_gps[day] = data_gps[day]*dic[day]\n",
1282 |     "    data_gps = data_gps.reset_index().rename(columns = {0:'size'})\n",
1283 |     "    \n",
1284 |     "    result = data.merge(data_gps,on = [fea,'inviteday'],how = 'left')['size'].values\n",
1285 |     "    \n",
1286 |     "    return result\n",
1287 |     "\n",
1288 |     "for fea in ['qId','writerId','invitehour','createhour','createday','createweekday',\n",
1289 |     "            'sex','activity','bool_D','category_C','category_E','yanzhi','mostliketheme']:\n",
1290 |     "    df[fea+'_curdayinv_count'] = get_curdayinvitenum(data,fea,use_weight = False)\n",
1291 |     "    print(fea+' is ok')"
1292 |    ]
1293 |   },
1294 |   {
1295 |    "cell_type": "markdown",
1296 |    "metadata": {},
1297 |    "source": [
1298 |     "4）全局计数特征：对类别特征，包括id，统计整个数据集的邀请数；虽然已经构造了历史计数特征，但全局的统计仍然有意义"
1299 |    ]
1300 |   },
1301 |   {
1302 |    "cell_type": "code",
1303 |    "execution_count": null,
1304 |    "metadata": {
1305 |     "collapsed": true
1306 |    },
1307 |    "outputs": [],
1308 |    "source": [
1309 |     "def get_alldata_count(data, fea, new_column_name,use_weight):#构造类别特征数量统计特征\n",
1310 |     "    if use_weight:\n",
1311 |     "        gps = data.groupby('inviteday')\n",
1312 |     "        dic = {}\n",
1313 |     "        for gp_id in iter(data['inviteday'].unique()):\n",
1314 |     "            gp = gps.get_group(gp_id)\n",
1315 |     "            dic[gp_id] = gp.shape[0]/gp[fea].nunique()\n",
1316 |     "        dic = pd.Series(dic)\n",
1317 |     "        dic = (dic.mean()/dic).round(3).to_dict()\n",
1318 |     "        \n",
1319 |     "    dataf = data[[fea,'inviteday']].groupby([fea,'inviteday']).size().astype(float).reset_index()\n",
1320 |     "    if use_weight:\n",
1321 |     "        values_0 = []\n",
1322 |     "        for row in iter(dataf[['inviteday',0]].values):\n",
1323 |     "            values_0.append(dic[row[0]]*row[1])\n",
1324 |     "        dataf[0] = values_0\n",
1325 |     "    dataf = dataf[[fea,0]].groupby(fea).sum()[0].reset_index()\n",
1326 |     "    \n",
1327 |     "    dataf = dataf.rename(columns = {0:new_column_name})\n",
1328 |     "    dataf = data.merge(dataf, on = fea, how = \"left\") \n",
1329 |     "    return dataf[new_column_name]\n",
1330 |     "\n",
1331 |     "for fea in ['qId','createday','createhour','invitehour','writerId','yanzhi','mostliketheme']:\n",
1332 |     "    df['%s_count' % fea] = get_alldata_count(data,fea,'%s_count' % fea,use_weight = False)\n",
1333 |     "    print(fea+' is ok')"
1334 |    ]
1335 |   },
1336 |   {
1337 |    "cell_type": "markdown",
1338 |    "metadata": {},
1339 |    "source": [
1340 |     "5）用户或问题id的历史统计特征关于问题或用户id的平均值  \n",
1341 |     "实际该部分属于用户跟问题的交叉，而不是单一侧"
1342 |    ]
1343 |   },
1344 |   {
1345 |    "cell_type": "code",
1346 |    "execution_count": null,
1347 |    "metadata": {
1348 |     "collapsed": true
1349 |    },
1350 |    "outputs": [],
1351 |    "source": [
1352 |     "#过去7天的统计聚合\n",
1353 |     "df['writerId_last7count_gp_qId'] = df['writerId_last7day_count'].groupby(data['qId']).transform(np.mean)\n",
1354 |     "df['qId_last7count_gp_writerId'] = df['qId_last7day_count'].groupby(data['writerId']).transform(np.mean)\n",
1355 |     "\n",
1356 |     "#全局的统计集合\n",
1357 |     "df['writerId_count_gp_qId'] = df['writerId_count'].groupby(data['qId']).transform(np.mean)\n",
1358 |     "df['qId_count_gp_writerId'] = df['qId_count'].groupby(data['writerId']).transform(np.mean)\n",
1359 |     "\n",
1360 |     "#当天的统计聚合\n",
1361 |     "df['writerId_curcount_gp_qId'] = df['writerId_curdayinv_count'].groupby(data['qId']).transform(np.mean)\n",
1362 |     "df['qId_curcount_gp_writerId'] = df['qId_curdayinv_count'].groupby(data['writerId']).transform(np.mean)"
1363 |    ]
1364 |   },
1365 |   {
1366 |    "cell_type": "code",
1367 |    "execution_count": null,
1368 |    "metadata": {
1369 |     "collapsed": true
1370 |    },
1371 |    "outputs": [],
1372 |    "source": [
1373 |     "df = memoryOptimization(df,np.float32)\n",
1374 |     "with open(datapath+'df/df_1.1.pkl','wb') as f:\n",
1375 |     "    pickle.dump(df,f)\n",
1376 |     "del df"
1377 |    ]
1378 |   },
1379 |   {
1380 |    "cell_type": "markdown",
1381 |    "metadata": {},
1382 |    "source": [
1383 |     "### 2、目标编码特征\n",
1384 |     "  因为回答记录文件的记录时长是两个月，所以该部分利用回答记录文件统计历史的回答计数，利用训练集的标签统计历史点击率特征；  \n",
1385 |     "  构造该部分特征时注意数据时间穿越（泄露）问题；"
1386 |    ]
1387 |   },
1388 |   {
1389 |    "cell_type": "code",
1390 |    "execution_count": null,
1391 |    "metadata": {
1392 |     "collapsed": true
1393 |    },
1394 |    "outputs": [],
1395 |    "source": [
1396 |     "df = data[[]]"
1397 |    ]
1398 |   },
1399 |   {
1400 |    "cell_type": "code",
1401 |    "execution_count": null,
1402 |    "metadata": {
1403 |     "collapsed": true
1404 |    },
1405 |    "outputs": [],
1406 |    "source": [
1407 |     "data_answer = pd.read_csv(datapath+'data/data_answer.csv')\n",
1408 |     "# data_answer = data_answer[['qId','writerId','answerday','answerhour']]\n",
1409 |     "data_question = pd.read_csv(datapath+'data/data_question.csv')\n",
1410 |     "data_writer = pd.read_csv(datapath+'data/data_writer.csv')\n",
1411 |     "data_answer = pd.merge(data_answer,data_question,on = 'qId',how = 'left')\n",
1412 |     "data_answer = pd.merge(data_answer,data_writer,on = 'writerId',how = 'left')\n",
1413 |     "\n",
1414 |     "data_answer = data_answer.rename(columns = {'answerhour':'invitehour','answerday':'inviteday'})\n",
1415 |     "#为了便于构造特征，将问题回答记录中的回答时间名称改为邀请时间\n",
1416 |     "data_answer['inviteweekday'] = getweekday(data_answer['inviteday'])\n",
1417 |     "data_answer['createweekday'] = getweekday(data_answer['createday'])\n",
1418 |     "data_answer['label'] = 1\n",
1419 |     "del data_writer\n",
1420 |     "del data_question\n",
1421 |     "gc.collect()"
1422 |    ]
1423 |   },
1424 |   {
1425 |    "cell_type": "markdown",
1426 |    "metadata": {},
1427 |    "source": [
1428 |     "1）历史第七天用户和问题的目标编码特征，滑窗统计特征，包括回答数量以及邀请的接受率，id类特征构造的特征结果特别稀疏"
1429 |    ]
1430 |   },
1431 |   {
1432 |    "cell_type": "code",
1433 |    "execution_count": 12,
1434 |    "metadata": {
1435 |     "collapsed": true
1436 |    },
1437 |    "outputs": [],
1438 |    "source": [
1439 |     "def slidewindow(dataf,data_dic,dayfea,func):\n",
1440 |     "    dataf = data.copy()\n",
1441 |     "    dataf['day_lastweek'] = dataf[dayfea[0]]-7\n",
1442 |     "    dic = data_dic.groupby(dayfea)['label'].agg(func).reset_index()\n",
1443 |     "    dic = dic.rename(columns = {dayfea[0]:'day_lastweek','label':'count'})\n",
1444 |     "    dataf = pd.merge(dataf,dic,on = ['day_lastweek']+dayfea[1:],how = 'left')\n",
1445 |     "    dataf.loc[(~dataf[datafea[1]].isna())&(dataf['count'].isna()),'count'] = 0\n",
1446 |     "    return dataf['count'].values\n",
1447 |     "\n",
1448 |     "for fea in ['qId','writerId']+['activity','category_A','category_D']:\n",
1449 |     "    for dayfea in ['inviteday','createday']:\n",
1450 |     "        str_week = dayfea.split('day')[0]+'weekday'\n",
1451 |     "        df['%s_%slastweek2label_count' % (fea,dayfea)] = slidewindow(data,data_answer,[dayfea,fea],func = np.sum)\n",
1452 |     "        df['%s_%slastweek2label_rate' % (fea,dayfea)] = slidewindow(data,data[data['label'] != -1],[dayfea,fea],func = np.mean)"
1453 |    ]
1454 |   },
1455 |   {
1456 |    "cell_type": "markdown",
1457 |    "metadata": {},
1458 |    "source": [
1459 |     "2）历史目标编码特征，滑窗统计特征，labelcount选取时间窗口为3周，labelctr选取为整个历史训练集"
1460 |    ]
1461 |   },
1462 |   {
1463 |    "cell_type": "code",
1464 |    "execution_count": null,
1465 |    "metadata": {
1466 |     "collapsed": true
1467 |    },
1468 |    "outputs": [],
1469 |    "source": [
1470 |     "def get_crossfeas_tar(data,data_asw,fea1,fea2):\n",
1471 |     "    dataf = pd.concat([data[[fea1,fea2]],data_asw[[fea1,fea2]]],axis = 0).reset_index(drop = True)\n",
1472 |     "    bool_s = (~dataf[fea1].isna())&(~dataf[fea2].isna())\n",
1473 |     "    dataf['cross'] = np.nan\n",
1474 |     "    dataf.loc[bool_s,'cross'] = dataf.loc[bool_s,fea1].apply(str)+'_'+dataf.loc[bool_s,fea2].apply(str)\n",
1475 |     "    cross = dataf['cross'].values\n",
1476 |     "    c1 = cross[:data.shape[0]]\n",
1477 |     "    c2 = cross[data.shape[0]:]\n",
1478 |     "    return c1,c2\n",
1479 |     "\n",
1480 |     "def targetencoder(dataf,data_asw,df,feas):\n",
1481 |     "    dicfea = feas[0]\n",
1482 |     "    if len(feas)>1:\n",
1483 |     "        fea = dicfea\n",
1484 |     "        for i in feas[1:]:\n",
1485 |     "            fea = fea+'_'+i\n",
1486 |     "        dataf[fea],data_asw[fea] = get_crossfeas_tar(dataf,data_asw,feas[0],feas[1])\n",
1487 |     "    else:\n",
1488 |     "        fea = dicfea\n",
1489 |     "\n",
1490 |     "#     dataf = dataf.copy()\n",
1491 |     "    gps_asw = data_asw.groupby('inviteday')\n",
1492 |     "    dic_asw = {}\n",
1493 |     "    for gp_id in iter(data_asw['inviteday'].unique()):\n",
1494 |     "        gp = gps_asw.get_group(gp_id)\n",
1495 |     "        dic_asw[gp_id] = gp.shape[0]/gp[dicfea].nunique()\n",
1496 |     "    dic_asw = pd.Series(dic_asw)\n",
1497 |     "    dic_asw = (dic_asw.mean()/dic_asw).round(3).to_dict()\n",
1498 |     "\n",
1499 |     "    data_asw = data_asw[[fea,'inviteday']].groupby([fea,'inviteday']).size().reset_index()\n",
1500 |     "    values_0 = []\n",
1501 |     "    for row in iter(data_asw[['inviteday',0]].values):\n",
1502 |     "        values_0.append(dic_asw[row[0]]*row[1])\n",
1503 |     "    data_asw[0] = values_0\n",
1504 |     "    data_asw = data_asw.rename(columns = {0:'size'})\n",
1505 |     "    \n",
1506 |     "    gps_inv = dataf.groupby('inviteday')\n",
1507 |     "    dic_inv = {}\n",
1508 |     "    for gp_id in iter(dataf['inviteday'].unique()):\n",
1509 |     "        gp = gps_inv.get_group(gp_id)\n",
1510 |     "        dic_inv[gp_id] = gp.shape[0]/gp[dicfea].nunique()\n",
1511 |     "    dic_inv = pd.Series(dic_inv)\n",
1512 |     "    dic_inv = (dic_inv.mean()/dic_inv).round(3).to_dict()\n",
1513 |     "    \n",
1514 |     "    data_gps = dataf[[fea,'inviteday','label']].groupby([fea,\n",
1515 |     "                'inviteday']).agg(['sum','count'])['label'].reset_index()\n",
1516 |     "    values_sum = []\n",
1517 |     "    values_count = []\n",
1518 |     "    for row in iter(data_gps[['inviteday','sum','count']].values):\n",
1519 |     "        values_sum.append(dic_inv[row[0]]*row[1])\n",
1520 |     "        values_count.append(dic_inv[row[0]]*row[2])\n",
1521 |     "    data_gps['sum'] = values_sum\n",
1522 |     "    data_gps['count'] = values_count\n",
1523 |     "    \n",
1524 |     "    dataf['timegroup'] = (dataf['inviteday']/3).apply(int)\n",
1525 |     "    data_gps['timegroup'] = (data_gps['inviteday']/3).apply(int)\n",
1526 |     "    data_train = dataf[dataf['label'] != -1]\n",
1527 |     "    data_test = dataf[dataf['label'] == -1]  \n",
1528 |     "    \n",
1529 |     "    df[fea+'_label_count'] = np.nan#np.zeros(dataf.shape[0])\n",
1530 |     "    daylen_asw = 21\n",
1531 |     "    for day in iter(data_train['inviteday'].unique()):\n",
1532 |     "        df.loc[dataf['inviteday'] == day,fea+'_label_count'] = get_label_count(data_asw[(data_asw['inviteday']<day)&(data_asw['inviteday']>=day-daylen_asw)],dataf[dataf['inviteday'] == day],fea)\n",
1533 |     "    df.loc[data_test.index,fea+'_label_count'] = get_label_count(data_asw[(data_asw['inviteday']<t0_eval)&(data_asw['inviteday']>=t0_eval-daylen_asw)],data_test,fea)\n",
1534 |     "    df.loc[(~dataf[fea].isna())&(df[fea+'_label_count'].isna()),fea+'_label_count'] = 0\n",
1535 |     "    \n",
1536 |     "    df[fea+'_label_ctr'] = np.nan#np.zeros(dataf.shape[0])\n",
1537 |     "    for gp in iter(data_train['timegroup'].unique()):\n",
1538 |     "        df.loc[(dataf['timegroup'] == gp)&(dataf['label'] != -1),fea+'_label_ctr'] = get_label_ctr(data_gps[data_gps['timegroup'] < gp],data_train[data_train['timegroup'] == gp],fea)\n",
1539 |     "    df.loc[data_test.index,fea+'_label_ctr'] = get_label_ctr(data_gps[data_gps['inviteday']<t0_eval],data_test,fea)\n",
1540 |     "    df.loc[(~dataf[fea].isna())&(df[fea+'_label_ctr'].isna()),fea+'_label_ctr'] = 0\n",
1541 |     "\n",
1542 |     "    if len(feas)>1:\n",
1543 |     "        del dataf[fea]\n",
1544 |     "        del data_asw[fea]\n",
1545 |     "    del dataf['timegroup']\n",
1546 |     "    del data_train\n",
1547 |     "    del data_test\n",
1548 |     "    gc.collect()\n",
1549 |     "    \n",
1550 |     "    return df\n",
1551 |     "\n",
1552 |     "def get_label_count(df_train,df_test,fea):\n",
1553 |     "    df_train = df_train[[fea,'size']].groupby(fea).sum()['size'].reset_index()\n",
1554 |     "    new_fea_name = fea+'_label_count'\n",
1555 |     "    df_train = df_train[[fea,'size']].rename(columns = {'size':new_fea_name})\n",
1556 |     "    df_test = df_test.merge(df_train,on = fea,how = 'left')\n",
1557 |     "    \n",
1558 |     "    return df_test[new_fea_name].values\n",
1559 |     "\n",
1560 |     "def get_label_ctr(df_train,df_test,fea):\n",
1561 |     "    df_train = df_train[[fea,'sum','count']].groupby(fea).sum()[['sum','count']].reset_index()\n",
1562 |     "    new_fea_name = fea+'_label_ctr'\n",
1563 |     "    df_train[new_fea_name] = (df_train['sum'] + 1) / (df_train['count'] + 1)\n",
1564 |     "    df_train = df_train[[fea,new_fea_name]]\n",
1565 |     "    df_test = df_test.merge(df_train,on = fea,how = 'left')\n",
1566 |     "    \n",
1567 |     "    return df_test[new_fea_name].values   "
1568 |    ]
1569 |   },
1570 |   {
1571 |    "cell_type": "code",
1572 |    "execution_count": null,
1573 |    "metadata": {
1574 |     "collapsed": true
1575 |    },
1576 |    "outputs": [],
1577 |    "source": [
1578 |     "for fea in ['qId','createhour','invitehour','mostliketheme','writerId','yanzhi','category_C','category_D','activity']:\n",
1579 |     "    df = targetencoder(data,data_answer,df,[fea])\n",
1580 |     "    print(fea+' is ok')"
1581 |    ]
1582 |   },
1583 |   {
1584 |    "cell_type": "markdown",
1585 |    "metadata": {},
1586 |    "source": [
1587 |     "3）多标签类别特征的目标编码  \n",
1588 |     "对多个tag的统计结果进行排序，取topk个作为k列特征，这样处理的目的在于过滤标签，许多标签是冗余的"
1589 |    ]
1590 |   },
1591 |   {
1592 |    "cell_type": "code",
1593 |    "execution_count": null,
1594 |    "metadata": {
1595 |     "collapsed": true
1596 |    },
1597 |    "outputs": [],
1598 |    "source": [
1599 |     "def targetencoder_multi(dataf,data_asw,df,fea):\n",
1600 |     "    dataf['timegroup'] = (dataf['inviteday']/3).apply(int)\n",
1601 |     "    data_train = dataf[dataf['label'] != -1]\n",
1602 |     "    data_test = dataf[dataf['label'] == -1]\n",
1603 |     "    daylen_asw = 21\n",
1604 |     "    for i in range(3):\n",
1605 |     "        df[fea+'_label_count%s' %i] = np.nan#np.zeros(data.shape[0])\n",
1606 |     "    for day in iter(data_train['inviteday'].unique()):\n",
1607 |     "        df.loc[dataf['inviteday'] == day,[fea+'_label_count'+str(i) for i in range(3)]] = get_multi_label_count(data_asw[(data_asw['inviteday']<day)&(data_asw['inviteday']>=day-daylen_asw)],dataf[dataf['inviteday'] == day],fea)#\n",
1608 |     "    df.loc[data_test.index,[fea+'_label_count'+str(i) for i in range(3)]] = get_multi_label_count(data_asw[(data_asw['inviteday']<t0_eval)&(data_asw['inviteday']>=t0_eval-daylen_asw)],data_test,fea)#\n",
1609 |     "\n",
1610 |     "    for i in range(3):\n",
1611 |     "        df[fea+'_label_ctr%s' %i] = np.nan#np.zeros(data.shape[0])\n",
1612 |     "    for gp in iter(data_train['timegroup'].unique()):\n",
1613 |     "        try:\n",
1614 |     "            df.loc[(dataf['timegroup'] == gp)&(dataf['label'] != -1),[fea+'_label_ctr'+str(i) for i in range(3)]] = get_multi_label_ctr(data_train[data_train['timegroup'] < gp],data_train[data_train['timegroup'] == gp],fea)\n",
1615 |     "        except:\n",
1616 |     "            pass\n",
1617 |     "    df.loc[data_test.index,[fea+'_label_ctr'+str(i) for i in range(3)]] = get_multi_label_ctr(data_train,data_test,fea)\n",
1618 |     "    del dataf['timegroup']\n",
1619 |     "    \n",
1620 |     "    return df\n",
1621 |     "\n",
1622 |     "def get_multi_label_count(df_train,df_test,fea):\n",
1623 |     "    countall = {}\n",
1624 |     "    for row in iter(df_train[fea].values):\n",
1625 |     "        for theme in iter(row.strip().split(',')):\n",
1626 |     "            try:\n",
1627 |     "                countall[theme] = countall[theme] + 1\n",
1628 |     "            except:\n",
1629 |     "                countall[theme] = 1\n",
1630 |     "\n",
1631 |     "    result_count = []\n",
1632 |     "    for row in iter(df_test[fea].values):\n",
1633 |     "        row_count = []\n",
1634 |     "        for theme in iter(row.strip().split(',')):\n",
1635 |     "            try:\n",
1636 |     "                row_count.append(countall[theme])\n",
1637 |     "            except:\n",
1638 |     "                pass\n",
1639 |     "        row_count.sort(reverse = True)       \n",
1640 |     "        result_count.append(row_count[:3])\n",
1641 |     "        \n",
1642 |     "    result_count = pd.DataFrame(result_count)\n",
1643 |     "    for i in range(3):\n",
1644 |     "        if i not in result_count.columns:\n",
1645 |     "            result_count[i] = np.nan\n",
1646 |     "    return result_count.values\n",
1647 |     "\n",
1648 |     "def get_multi_label_ctr(df_train,df_test,fea):\n",
1649 |     "    count1 = {}\n",
1650 |     "    countall = {}\n",
1651 |     "    for row in iter(df_train[[fea,'label']].values):\n",
1652 |     "        for theme in iter(row[0].strip().split(',')):\n",
1653 |     "            try:\n",
1654 |     "                countall[theme] = countall[theme] + 1\n",
1655 |     "            except:\n",
1656 |     "                countall[theme] = 1\n",
1657 |     "            if row[1] == 1:\n",
1658 |     "                try:\n",
1659 |     "                    count1[theme] = count1[theme] + 1\n",
1660 |     "                except:\n",
1661 |     "                    count1[theme] = 1\n",
1662 |     "\n",
1663 |     "    result_ctr = []\n",
1664 |     "    for row in iter(df_test[fea].values):\n",
1665 |     "        row_ctr = []\n",
1666 |     "        for theme in iter(row.strip().split(',')):\n",
1667 |     "            try:\n",
1668 |     "                row_ctr.append((count1[theme]+1)/(countall[theme]+1))\n",
1669 |     "            except:\n",
1670 |     "                pass\n",
1671 |     "        row_ctr.sort(reverse = True)\n",
1672 |     "        result_ctr.append(row_ctr[:3])\n",
1673 |     "        \n",
1674 |     "    result_ctr = pd.DataFrame(result_ctr)\n",
1675 |     "    for i in range(3):\n",
1676 |     "        if i not in result_ctr.columns:\n",
1677 |     "            result_ctr[i] = np.nan\n",
1678 |     "    return result_ctr.values "
1679 |    ]
1680 |   },
1681 |   {
1682 |    "cell_type": "code",
1683 |    "execution_count": null,
1684 |    "metadata": {
1685 |     "collapsed": true,
1686 |     "scrolled": true
1687 |    },
1688 |    "outputs": [],
1689 |    "source": [
1690 |     "for fea in ['themeId','attentionthemes']:\n",
1691 |     "    df = targetencoder_multi(data,data_answer,df,fea)\n",
1692 |     "    print(fea+' is ok')"
1693 |    ]
1694 |   },
1695 |   {
1696 |    "cell_type": "markdown",
1697 |    "metadata": {},
1698 |    "source": [
1699 |     "对问题的标题利用tdidf过滤的结果作为问题的多标签类别特征，进行多标签目标编码："
1700 |    ]
1701 |   },
1702 |   {
1703 |    "cell_type": "code",
1704 |    "execution_count": null,
1705 |    "metadata": {
1706 |     "collapsed": true
1707 |    },
1708 |    "outputs": [],
1709 |    "source": [
1710 |     "data_question = pd.read_csv(datapath+'data_q_title_tfidf.csv')\n",
1711 |     "data_question = data_question[['问题id','title_topk','text_topk']].rename(columns = {'问题id':'qId','title_topk':'title_words_tfidf0','text_topk':'title_words_tfidf1'})\n",
1712 |     "\n",
1713 |     "data = data.merge(data_question[['qId','title_words_tfidf1']],on = 'qId',how = 'left')\n",
1714 |     "data_answer = data_answer.merge(data_question[['qId','title_words_tfidf1']],on = 'qId',how = 'left')\n",
1715 |     "del data_question\n",
1716 |     "\n",
1717 |     "df = targetencoder_multi(data,data_answer,df,'title_words_tfidf1')\n",
1718 |     "del data['title_words_tfidf1']\n",
1719 |     "del data_answer['title_words_tfidf1']\n",
1720 |     "\n",
1721 |     "df['title_words_tfidf1_label_count_mean'] = df[['title_words_tfidf1_label_count0','title_words_tfidf1_label_count1','title_words_tfidf1_label_count2']].mean(axis = 1)\n",
1722 |     "df['title_words_tfidf1_label_ctr_mean'] = df[['title_words_tfidf1_label_ctr0','title_words_tfidf1_label_ctr1','title_words_tfidf1_label_ctr2']].mean(axis = 1)\n",
1723 |     "\n",
1724 |     "cols = ['title_words_tfidf1_label_ctr0','title_words_tfidf1_label_ctr1', 'title_words_tfidf1_label_ctr2','title_words_tfidf1_label_ctr_mean']\n",
1725 |     "df[cols] = df[cols]*1000"
1726 |    ]
1727 |   },
1728 |   {
1729 |    "cell_type": "markdown",
1730 |    "metadata": {},
1731 |    "source": [
1732 |     "4）分组统计特征，用户或问题id的目标编码特征关于问题或用户id的平均值 "
1733 |    ]
1734 |   },
1735 |   {
1736 |    "cell_type": "code",
1737 |    "execution_count": null,
1738 |    "metadata": {
1739 |     "collapsed": true
1740 |    },
1741 |    "outputs": [],
1742 |    "source": [
1743 |     "df['writerId_label_count_gp_qId'] = df['writerId_label_count'].groupby(data['qId']).transform(np.mean)\n",
1744 |     "df['qId_label_count_gp_writerId'] = df['qId_label_count'].groupby(data['writerId']).transform(np.mean)\n",
1745 |     "\n",
1746 |     "df['writerId_label_ctr_gp_qId'] = df['writerId_label_ctr'].groupby(data['qId']).transform(np.mean)\n",
1747 |     "df['qId_label_ctr_gp_writerId'] = df['qId_label_ctr'].groupby(data['writerId']).transform(np.mean)"
1748 |    ]
1749 |   },
1750 |   {
1751 |    "cell_type": "code",
1752 |    "execution_count": null,
1753 |    "metadata": {
1754 |     "collapsed": true
1755 |    },
1756 |    "outputs": [],
1757 |    "source": [
1758 |     "df = memoryOptimization(df,np.float32)\n",
1759 |     "with open(datapath+'df/df_1.2.pkl','wb') as f:\n",
1760 |     "    pickle.dump(df,f)\n",
1761 |     "del df\n",
1762 |     "del data_answer\n",
1763 |     "gc.collect()"
1764 |    ]
1765 |   },
1766 |   {
1767 |    "cell_type": "markdown",
1768 |    "metadata": {},
1769 |    "source": [
1770 |     "### 3、其它特征"
1771 |    ]
1772 |   },
1773 |   {
1774 |    "cell_type": "code",
1775 |    "execution_count": null,
1776 |    "metadata": {
1777 |     "collapsed": true
1778 |    },
1779 |    "outputs": [],
1780 |    "source": [
1781 |     "df = data[[]]"
1782 |    ]
1783 |   },
1784 |   {
1785 |    "cell_type": "markdown",
1786 |    "metadata": {},
1787 |    "source": [
1788 |     "1）邀请间隔时长：构造每条样本的问题id最近一次发出邀请到现在的时间间隔以及用户id最近一次被邀请到现在的时间间隔（单位小时）"
1789 |    ]
1790 |   },
1791 |   {
1792 |    "cell_type": "code",
1793 |    "execution_count": 13,
1794 |    "metadata": {
1795 |     "collapsed": true
1796 |    },
1797 |    "outputs": [],
1798 |    "source": [
1799 |     "def get_hourlenfromlastinv(dataf,df,fea):\n",
1800 |     "    result = []\n",
1801 |     "    last_invite = {}\n",
1802 |     "    for row in iter(dataf[[fea,'inviteallhour']].sort_values(by = 'inviteallhour',ascending = True).values):\n",
1803 |     "        try:\n",
1804 |     "            result.append(last_invite[row[0]])\n",
1805 |     "        except:\n",
1806 |     "            result.append(-1)\n",
1807 |     "        last_invite[row[0]] = row[1]\n",
1808 |     "\n",
1809 |     "    df1 = dataf[[]]\n",
1810 |     "    df1['seq'] = dataf.sort_values(by = 'inviteallhour').index\n",
1811 |     "    df1['lastinvitehour_'+fea] = result\n",
1812 |     "    df1 = df1.sort_values(by = 'seq')\n",
1813 |     "\n",
1814 |     "    df1 = df1.reset_index(drop = True)\n",
1815 |     "    del df1['seq']\n",
1816 |     "    df = pd.concat([df,df1],axis = 1)\n",
1817 |     "    del df1\n",
1818 |     "    gc.collect()\n",
1819 |     "\n",
1820 |     "    df.loc[df['lastinvitehour_'+fea] == -1,'lastinvitehour_'+fea] = None\n",
1821 |     "    df['hourlenfromlastinvite_'+fea] = dataf['inviteallhour'] - df['lastinvitehour_'+fea]\n",
1822 |     "    del df['lastinvitehour_'+fea]\n",
1823 |     "    \n",
1824 |     "    return df\n",
1825 |     "\n",
1826 |     "df = get_hourlenfromlastinv(data,df,'qId')\n",
1827 |     "df = get_hourlenfromlastinv(data,df,'writerId')"
1828 |    ]
1829 |   },
1830 |   {
1831 |    "cell_type": "markdown",
1832 |    "metadata": {},
1833 |    "source": [
1834 |     "2）时效性：问题从创建当前邀请的存在时长，反应问题的时效性"
1835 |    ]
1836 |   },
1837 |   {
1838 |    "cell_type": "code",
1839 |    "execution_count": null,
1840 |    "metadata": {
1841 |     "collapsed": true
1842 |    },
1843 |    "outputs": [],
1844 |    "source": [
1845 |     "df['q_life'] = data['inviteday'] - data['createday']"
1846 |    ]
1847 |   },
1848 |   {
1849 |    "cell_type": "markdown",
1850 |    "metadata": {},
1851 |    "source": [
1852 |     "3）活跃度：问题或用户近期的活跃度，用id近期有活动的天数表示，分为邀请活跃度以及回答活跃度，对于问题来说是问题近期是否有被邀请或者被回答，对于用户来说代表了用户近期是否更愿意回答或接收邀请"
1853 |    ]
1854 |   },
1855 |   {
1856 |    "cell_type": "code",
1857 |    "execution_count": null,
1858 |    "metadata": {
1859 |     "collapsed": true
1860 |    },
1861 |    "outputs": [],
1862 |    "source": [
1863 |     "def get_activeday(dataf,data_asw,df,fea):\n",
1864 |     "    df[fea+'_activeday_inv'] = np.nan\n",
1865 |     "    for day in iter(dataf['inviteday'].unique()):\n",
1866 |     "        df.loc[dataf['inviteday'] == day,fea+'_activeday_inv'] = get_daynum(dataf[dataf['inviteday']<day],dataf[dataf['inviteday'] == day],fea)\n",
1867 |     "    \n",
1868 |     "    df[fea+'_activeday_asw'] = np.nan\n",
1869 |     "    for day in iter(dataf.loc[dataf['label'] != -1,'inviteday'].unique()):\n",
1870 |     "        df.loc[dataf['inviteday'] == day,fea+'_activeday_asw'] = get_daynum(data_asw[data_asw['inviteday']<day],dataf[dataf['inviteday'] == day],fea)\n",
1871 |     "    df.loc[dataf['label'] == -1,fea+'_activeday_asw'] = get_daynum(data_asw[data_asw['inviteday']<t0_eval],dataf[dataf['label'] == -1],fea)\n",
1872 |     "    \n",
1873 |     "    return df\n",
1874 |     "        \n",
1875 |     "def get_daynum(data_train,data_test,fea):\n",
1876 |     "    data_train = data_train[data_train[fea].isin(data_test[fea].unique())]\n",
1877 |     "    data_train = data_train.groupby([fea,'inviteday']).size().reset_index()[[fea,'inviteday']].groupby(fea).size().reset_index()\n",
1878 |     "    new_fea_name = fea+'_activeday_inv'\n",
1879 |     "    data_train = data_train.rename(columns = {0:new_fea_name})\n",
1880 |     "    data_test = data_test.merge(data_train,on = fea,how = 'left')\n",
1881 |     "    \n",
1882 |     "    return data_test[new_fea_name].values\n",
1883 |     "\n",
1884 |     "#活跃天数\n",
1885 |     "for fea in ['qId','writerId']:\n",
1886 |     "    df = get_activeday(data,data_answer,df,fea)\n",
1887 |     "    print(fea+' is ok')"
1888 |    ]
1889 |   },
1890 |   {
1891 |    "cell_type": "code",
1892 |    "execution_count": null,
1893 |    "metadata": {
1894 |     "collapsed": true
1895 |    },
1896 |    "outputs": [],
1897 |    "source": [
1898 |     "df = memoryOptimization(df,np.float32)\n",
1899 |     "with open(datapath+'df/df_1.3.pkl','wb') as f:\n",
1900 |     "    pickle.dump(df,f)\n",
1901 |     "del df"
1902 |    ]
1903 |   },
1904 |   {
1905 |    "cell_type": "markdown",
1906 |    "metadata": {},
1907 |    "source": [
1908 |     "## 二、交叉特征\n",
1909 |     "\n",
1910 |     "该部分主要用到协同过滤的思想，关键在于用户跟问题的交叉组合"
1911 |    ]
1912 |   },
1913 |   {
1914 |    "cell_type": "markdown",
1915 |    "metadata": {},
1916 |    "source": [
1917 |     "### 1、计数类特征"
1918 |    ]
1919 |   },
1920 |   {
1921 |    "cell_type": "code",
1922 |    "execution_count": null,
1923 |    "metadata": {
1924 |     "collapsed": true
1925 |    },
1926 |    "outputs": [],
1927 |    "source": [
1928 |     "df = data[[]]"
1929 |    ]
1930 |   },
1931 |   {
1932 |    "cell_type": "markdown",
1933 |    "metadata": {},
1934 |    "source": [
1935 |     "1)id跟类别特征交叉计数"
1936 |    ]
1937 |   },
1938 |   {
1939 |    "cell_type": "code",
1940 |    "execution_count": null,
1941 |    "metadata": {
1942 |     "collapsed": true
1943 |    },
1944 |    "outputs": [],
1945 |    "source": [
1946 |     "features=['sex','activity','bool_D','category_C','mostliketheme','invitehour','bool_A','bool_B','bool_C','bool_E',\n",
1947 |     "          'yanzhi','category_A','category_B','category_D','category_E']\n",
1948 |     "n = 7\n",
1949 |     "for fea in features:\n",
1950 |     "    df['qId'+'_'+fea+'_last%sday_count' %n] = lastndayinvite(data[['qId',fea,'inviteday']],n,['qId',fea],use_weight = False)\n",
1951 |     "    print(fea+' is ok')\n",
1952 |     "\n",
1953 |     "features=['invitehour','createday','createhour','createweekday']\n",
1954 |     "n = 7\n",
1955 |     "for fea in features:\n",
1956 |     "    df['writerId'+'_'+fea+'_last%sday_count' %n] = lastndayinvite(data[['writerId',fea,'inviteday']],n,['writerId',fea],use_weight = False)\n",
1957 |     "    print(fea+' is ok')"
1958 |    ]
1959 |   },
1960 |   {
1961 |    "cell_type": "markdown",
1962 |    "metadata": {},
1963 |    "source": [
1964 |     "2）交叉计数分组统计特征"
1965 |    ]
1966 |   },
1967 |   {
1968 |    "cell_type": "code",
1969 |    "execution_count": null,
1970 |    "metadata": {
1971 |     "collapsed": true
1972 |    },
1973 |    "outputs": [],
1974 |    "source": [
1975 |     "features1 = ['qId_sex_last7day_count', 'qId_activity_last7day_count',\n",
1976 |     "       'qId_bool_D_last7day_count', 'qId_category_C_last7day_count',\n",
1977 |     "       'qId_mostliketheme_last7day_count', 'qId_invitehour_last7day_count',\n",
1978 |     "       'qId_bool_A_last7day_count', 'qId_bool_B_last7day_count',\n",
1979 |     "       'qId_bool_C_last7day_count', 'qId_bool_E_last7day_count',\n",
1980 |     "       'qId_yanzhi_last7day_count', 'qId_category_A_last7day_count',\n",
1981 |     "       'qId_category_B_last7day_count', 'qId_category_D_last7day_count',\n",
1982 |     "       'qId_category_E_last7day_count']\n",
1983 |     "features2 = ['writerId_invitehour_last7day_count','writerId_createday_last7day_count',\n",
1984 |     "             'writerId_createhour_last7day_count','writerId_createweekday_last7day_count']\n",
1985 |     "\n",
1986 |     "for fea in features1:\n",
1987 |     "    df[fea+'_gp_writerId'] = df[fea].groupby(data['writerId']).transform(np.mean)\n",
1988 |     "for fea in features2:\n",
1989 |     "    df[fea+'_gp_qId'] = df[fea].groupby(data['qId']).transform(np.mean)"
1990 |    ]
1991 |   },
1992 |   {
1993 |    "cell_type": "code",
1994 |    "execution_count": null,
1995 |    "metadata": {
1996 |     "collapsed": true
1997 |    },
1998 |    "outputs": [],
1999 |    "source": [
2000 |     "df = memoryOptimization(df,np.float32)\n",
2001 |     "with open(datapath+'df/df_2.1.pkl','wb') as f:\n",
2002 |     "    pickle.dump(df,f)\n",
2003 |     "del df"
2004 |    ]
2005 |   },
2006 |   {
2007 |    "cell_type": "markdown",
2008 |    "metadata": {},
2009 |    "source": [
2010 |     "### 2、目标编码"
2011 |    ]
2012 |   },
2013 |   {
2014 |    "cell_type": "code",
2015 |    "execution_count": null,
2016 |    "metadata": {
2017 |     "collapsed": true
2018 |    },
2019 |    "outputs": [],
2020 |    "source": [
2021 |     "df = data[[]]"
2022 |    ]
2023 |   },
2024 |   {
2025 |    "cell_type": "code",
2026 |    "execution_count": null,
2027 |    "metadata": {
2028 |     "collapsed": true
2029 |    },
2030 |    "outputs": [],
2031 |    "source": [
2032 |     "data_answer = pd.read_csv(datapath+'data/data_answer.csv')\n",
2033 |     "# data_answer = data_answer[['qId','writerId','answerday','answerhour']]\n",
2034 |     "data_question = pd.read_csv(datapath+'data/data_question.csv')\n",
2035 |     "data_writer = pd.read_csv(datapath+'data/data_writer.csv')\n",
2036 |     "data_answer = pd.merge(data_answer,data_question,on = 'qId',how = 'left')\n",
2037 |     "data_answer = pd.merge(data_answer,data_writer,on = 'writerId',how = 'left')\n",
2038 |     "\n",
2039 |     "data_answer = data_answer.rename(columns = {'answerhour':'invitehour','answerday':'inviteday'})\n",
2040 |     "#为了便于构造特征，将问题回答记录中的回答时间名称改为邀请时间\n",
2041 |     "data_answer['inviteweekday'] = getweekday(data_answer['inviteday'])\n",
2042 |     "data_answer['createweekday'] = getweekday(data_answer['createday'])\n",
2043 |     "del data_writer\n",
2044 |     "del data_question\n",
2045 |     "gc.collect()"
2046 |    ]
2047 |   },
2048 |   {
2049 |    "cell_type": "markdown",
2050 |    "metadata": {},
2051 |    "source": [
2052 |     "1）Id类与类别特征交叉的历史目标编码"
2053 |    ]
2054 |   },
2055 |   {
2056 |    "cell_type": "code",
2057 |    "execution_count": null,
2058 |    "metadata": {
2059 |     "collapsed": true
2060 |    },
2061 |    "outputs": [],
2062 |    "source": [
2063 |     "features = ['bool_B','bool_C','bool_D','bool_E','invitehour','inviteweekday']\n",
2064 |     "#'qId_sex','qId_activity','qId_bool_A','qId_yanzhi','qId_category_A','qId_category_B','qId_category_C','qId_category_D','qId_category_E',\n",
2065 |     "#'qId_mostliketheme','writerId_createday',\n",
2066 |     "for fea in features:\n",
2067 |     "    df = targetencoder(data,data_answer,df,['qId',fea])\n",
2068 |     "    print(fea+' is ok')\n",
2069 |     "\n",
2070 |     "features = ['invitehour','inviteweekday','createhour','createweekday']\n",
2071 |     "#'qId_sex','qId_activity','qId_bool_A','qId_yanzhi','qId_category_A','qId_category_B','qId_category_C','qId_category_D','qId_category_E',\n",
2072 |     "#'qId_mostliketheme','writerId_createday',\n",
2073 |     "for fea in features:\n",
2074 |     "    df = targetencoder(data,data_answer,df,['writerId',fea])\n",
2075 |     "    print(fea+' is ok')"
2076 |    ]
2077 |   },
2078 |   {
2079 |    "cell_type": "markdown",
2080 |    "metadata": {},
2081 |    "source": [
2082 |     "2）Id类与多标签类别特征交叉的历史目标编码，将count/ctr结果从大到小排序取topk，目的也是为了一定程度上的过滤"
2083 |    ]
2084 |   },
2085 |   {
2086 |    "cell_type": "code",
2087 |    "execution_count": 64,
2088 |    "metadata": {
2089 |     "collapsed": true
2090 |    },
2091 |    "outputs": [],
2092 |    "source": [
2093 |     "def lastndaylabelcount_multi(dataf,data_asw,idfea,fea):  \n",
2094 |     "    gps = data_asw.groupby(['inviteday'])\n",
2095 |     "    dic = {}\n",
2096 |     "    for gp_id in iter(data_asw['inviteday'].unique()):\n",
2097 |     "        gp = gps.get_group(gp_id)\n",
2098 |     "        dic[gp_id] = gp.shape[0]#/gp[idfea].nunique()\n",
2099 |     "    dic = pd.Series(dic)\n",
2100 |     "    dic = (dic.mean()/dic).round(3).to_dict()\n",
2101 |     "    \n",
2102 |     "    result = dataf[[]]\n",
2103 |     "    result['val0'] = np.nan\n",
2104 |     "    result['val1'] = np.nan\n",
2105 |     "    result['val2'] = np.nan\n",
2106 |     "    result['val3'] = np.nan\n",
2107 |     "    result['val4'] = np.nan\n",
2108 |     "    daylen_asw = 21\n",
2109 |     "    for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n",
2110 |     "        result[dataf['inviteday'] == day] = get_multi_count(data_asw[(data_asw['inviteday']<day)&(data_asw['inviteday']>=day-daylen_asw)],dataf[dataf['inviteday'] == day],idfea,fea,dic)           \n",
2111 |     "    result[dataf['label'] == -1] = get_multi_count(data_asw[(data_asw['inviteday']<t0_eval)&(data_asw['inviteday']>=day-daylen_asw)],dataf[dataf['label'] == -1],idfea,fea,dic)\n",
2112 |     "    \n",
2113 |     "    result.columns = [idfea+'_'+fea+'_lastnday_labelcount' +str(i) for i in range(5)]\n",
2114 |     "    \n",
2115 |     "    return result\n",
2116 |     "\n",
2117 |     "def get_multi_count(df_train,df_test,idfea,fea,dic):\n",
2118 |     "    \n",
2119 |     "    day_max = df_test['inviteday'].min()\n",
2120 |     "    countall = {}\n",
2121 |     "    for row in iter(df_train[[idfea,fea,'inviteday']].values):\n",
2122 |     "        if row[1] == '-1':\n",
2123 |     "            continue\n",
2124 |     "        t = (1.5-0.3*np.floor((day_max-row[2])/3))#*dic[row[2]]#1#\n",
2125 |     "        if t<0.5:\n",
2126 |     "            t = 0.6\n",
2127 |     "        for theme in iter(row[1].strip().split(',')):\n",
2128 |     "            theme = row[0]+'_'+theme\n",
2129 |     "            try:\n",
2130 |     "                countall[theme] = countall[theme] + t\n",
2131 |     "            except:\n",
2132 |     "                countall[theme] = t\n",
2133 |     "    \n",
2134 |     "    result_count = []\n",
2135 |     "    for row in iter(df_test[[idfea,fea]].values):\n",
2136 |     "        row_count = []\n",
2137 |     "        if row[1] == '-1':\n",
2138 |     "            result_count.append([np.nan])\n",
2139 |     "            continue\n",
2140 |     "        for theme in iter(row[1].strip().split(',')):\n",
2141 |     "            theme = row[0]+'_'+theme\n",
2142 |     "            try:\n",
2143 |     "                row_count.append(countall[theme])\n",
2144 |     "            except:\n",
2145 |     "                row_count.append(0)\n",
2146 |     "        row_count.sort(reverse = True)     \n",
2147 |     "        result_count.append(row_count[:5])\n",
2148 |     "        \n",
2149 |     "    result_count = pd.DataFrame(result_count)\n",
2150 |     "    for i in range(5):\n",
2151 |     "        if i not in result_count.columns:\n",
2152 |     "            result_count[i] = np.nan\n",
2153 |     "    del countall\n",
2154 |     "    return result_count.values\n",
2155 |     "\n",
2156 |     "def lastndaylabelctr_multi(dataf,data_asw,idfea,fea):   \n",
2157 |     "\n",
2158 |     "    result = dataf[[]]\n",
2159 |     "    result['val0'] = np.nan\n",
2160 |     "    result['val1'] = np.nan\n",
2161 |     "    result['val2'] = np.nan\n",
2162 |     "    result['val3'] = np.nan\n",
2163 |     "    result['val4'] = np.nan\n",
2164 |     "\n",
2165 |     "    for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n",
2166 |     "        result[dataf['inviteday'] == day] = get_multi_ctr(data_asw[(data_asw['inviteday']<day)],dataf[dataf['inviteday'] == day],idfea,fea)\n",
2167 |     "    result[dataf['label'] == -1] = get_multi_ctr(data_asw[(data_asw['inviteday']<t0_eval)],dataf[dataf['label'] == -1],idfea,fea)\n",
2168 |     "    \n",
2169 |     "    result.columns = [idfea+'_'+fea+'_lastnday_labelctr' +str(i) for i in range(5)]\n",
2170 |     "    return result\n",
2171 |     "\n",
2172 |     "def get_multi_ctr(df_train,df_test,idfea,fea):\n",
2173 |     "    count1 = {}\n",
2174 |     "    countall = {}\n",
2175 |     "    day_max = df_test['inviteday'].min()\n",
2176 |     "    for row in iter(df_train[[idfea,fea,'inviteday','label']].values):\n",
2177 |     "        if row[1] == '-1':\n",
2178 |     "            continue\n",
2179 |     "        t = 1#dic[row[2]]#1.5-0.3*np.floor((day_max-row[2])/3)\n",
2180 |     "#         if t<0.5:\n",
2181 |     "#             t = 0.6\n",
2182 |     "        for theme in iter(row[1].strip().split(',')):\n",
2183 |     "            theme = row[0]+'_'+theme\n",
2184 |     "            try:\n",
2185 |     "                countall[theme] = countall[theme] + t\n",
2186 |     "            except:\n",
2187 |     "                countall[theme] = t\n",
2188 |     "            if row[3] == 1:\n",
2189 |     "                try:\n",
2190 |     "                    count1[theme] = count1[theme] + t\n",
2191 |     "                except:\n",
2192 |     "                    count1[theme] = t\n",
2193 |     "\n",
2194 |     "    result_ctr = []\n",
2195 |     "    for row in iter(df_test[[idfea,fea]].values):\n",
2196 |     "        row_ctr = []\n",
2197 |     "        if fea == '-1':\n",
2198 |     "            result_ctr.append([np.nan])\n",
2199 |     "            continue\n",
2200 |     "        \n",
2201 |     "        for theme in iter(row[1].strip().split(',')):\n",
2202 |     "            theme = row[0]+'_'+theme\n",
2203 |     "            try:\n",
2204 |     "                row_ctr.append((count1[theme]+0.1)/(countall[theme]+0.1))\n",
2205 |     "            except:\n",
2206 |     "                row_ctr.append(0)\n",
2207 |     "        row_ctr.sort(reverse = True)\n",
2208 |     "        result_ctr.append(row_ctr[:5])\n",
2209 |     "        \n",
2210 |     "    result_ctr = pd.DataFrame(result_ctr)\n",
2211 |     "    for i in range(5):\n",
2212 |     "        if i not in result_ctr.columns:\n",
2213 |     "            result_ctr[i] = np.nan\n",
2214 |     "    return result_ctr.values    \n",
2215 |     "\n",
2216 |     "\n",
2217 |     "\n",
2218 |     "df_this = data[[]]\n",
2219 |     "df_this = lastndaylabelcount_multi(data,data_answer,'writerId','themeId')\n",
2220 |     "df = pd.concat([df,df_this],axis = 1)\n",
2221 |     "df['writerId_themeId_lastnday_labelcount_mean'] = df[['writerId_themeId_lastnday_labelcount0', 'writerId_themeId_lastnday_labelcount1',\n",
2222 |     "                                           'writerId_themeId_lastnday_labelcount2', 'writerId_themeId_lastnday_labelcount3',\n",
2223 |     "                                           'writerId_themeId_lastnday_labelcount4']].mean(axis = 1)\n",
2224 |     "df['writerId_themeId_lastnday_labelcount_sum'] = df[['writerId_themeId_lastnday_labelcount0', 'writerId_themeId_lastnday_labelcount1',\n",
2225 |     "                                           'writerId_themeId_lastnday_labelcount2', 'writerId_themeId_lastnday_labelcount3',\n",
2226 |     "                                           'writerId_themeId_lastnday_labelcount4']].sum(axis = 1)\n",
2227 |     "\n",
2228 |     "df_this = data[[]]\n",
2229 |     "df_this = lastndaylabelctr_multi(data,data,'writerId','themeId')\n",
2230 |     "df = pd.concat([df,df_this],axis = 1)\n",
2231 |     "df['writerId_themeId_lastnday_labelctr_mean'] = df[['writerId_themeId_lastnday_labelctr0', 'writerId_themeId_lastnday_labelctr1',\n",
2232 |     "                                           'writerId_themeId_lastnday_labelctr2', 'writerId_themeId_lastnday_labelctr3',\n",
2233 |     "                                           'writerId_themeId_lastnday_labelctr4']].mean(axis = 1)"
2234 |    ]
2235 |   },
2236 |   {
2237 |    "cell_type": "markdown",
2238 |    "metadata": {},
2239 |    "source": [
2240 |     "对于问题标题的切词序列，采用过滤高频词的方式过滤后作为多标签类别特征进行交叉目标编码"
2241 |    ]
2242 |   },
2243 |   {
2244 |    "cell_type": "code",
2245 |    "execution_count": 70,
2246 |    "metadata": {
2247 |     "collapsed": true
2248 |    },
2249 |    "outputs": [],
2250 |    "source": [
2251 |     "def remove_lowfreq_word(data_q,del_words):\n",
2252 |     "    titles = []\n",
2253 |     "    for words in iter(data_q['title_words'].values):\n",
2254 |     "        if words == '-1':\n",
2255 |     "            titles.append('-1')\n",
2256 |     "            continue\n",
2257 |     "        words = words.split(',')\n",
2258 |     "        for word in iter(words):\n",
2259 |     "            if word in del_words:\n",
2260 |     "                words.remove(word)\n",
2261 |     "        words = ','.join(words)\n",
2262 |     "        titles.append(words)\n",
2263 |     "    return titles\n",
2264 |     "\n",
2265 |     "data_question = pd.read_csv(datapath+'data/data_question.csv')\n",
2266 |     "word_num = {}\n",
2267 |     "for words in iter(data_question['title_words'].values):\n",
2268 |     "    if words == '-1':\n",
2269 |     "        continue\n",
2270 |     "    words = words.split(',')\n",
2271 |     "    for word in iter(words):\n",
2272 |     "        try:\n",
2273 |     "            word_num[word] += 1\n",
2274 |     "        except:\n",
2275 |     "            word_num[word] = 1\n",
2276 |     "        \n",
2277 |     "word_num = pd.Series(word_num)\n",
2278 |     "# word_top100 = list(word_num.sort_values(ascending = False)[:100].index)\n",
2279 |     "# word_top50 = list(word_num.sort_values(ascending = False)[:50].index)\n",
2280 |     "word_top150 = list(word_num.sort_values(ascending = False)[:150].index)\n",
2281 |     "# word_top200 = list(word_num.sort_values(ascending = False)[:200].index)\n",
2282 |     "\n",
2283 |     "# data_question['title_words_deltop50'] = remove_lowfreq_word(data_question[['title_words']],word_top50)\n",
2284 |     "# data_question['title_words_deltop100'] = remove_lowfreq_word(data_question[['title_words']],word_top100)\n",
2285 |     "data_question['title_words_deltop150'] = remove_lowfreq_word(data_question[['title_words']],word_top150)\n",
2286 |     "# data_question['title_words_deltop200'] = remove_lowfreq_word(data_question[['title_words']],word_top200)\n",
2287 |     "\n",
2288 |     "data_question = data_question[['qId','title_words_deltop150']]\n",
2289 |     "data_question.loc[data_question['title_words_deltop150'] == '','title_words_deltop150'] = '-1'\n",
2290 |     "data = data.merge(data_question[['qId','title_words_deltop150']],on = 'qId',how = 'left')\n",
2291 |     "data_answer = data_answer.merge(data_question[['qId','title_words_deltop150']],on = 'qId',how = 'left')\n",
2292 |     "del data_question\n",
2293 |     "\n",
2294 |     "\n",
2295 |     "df_this = data[[]]\n",
2296 |     "df_this = lastndaylabelcount_multi(data,data_answer,'writerId','title_words_deltop150')\n",
2297 |     "df = pd.concat([df,df_this],axis = 1)\n",
2298 |     "\n",
2299 |     "df_this = data[[]]\n",
2300 |     "df_this = lastndaylabelctr_multi(data,data,'writerId','title_words_deltop150')\n",
2301 |     "df = pd.concat([df,df_this],axis = 1)\n",
2302 |     "del data['title_words_deltop150']\n",
2303 |     "del data_answer['title_words_deltop150']\n",
2304 |     "\n",
2305 |     "cols = ['writerId_title_words_deltop150_lastnday_labelctr0',\n",
2306 |     "       'writerId_title_words_deltop150_lastnday_labelctr1',\n",
2307 |     "       'writerId_title_words_deltop150_lastnday_labelctr2',\n",
2308 |     "       'writerId_title_words_deltop150_lastnday_labelctr3',\n",
2309 |     "       'writerId_title_words_deltop150_lastnday_labelctr4']\n",
2310 |     "df[cols] = df[cols]*1000"
2311 |    ]
2312 |   },
2313 |   {
2314 |    "cell_type": "code",
2315 |    "execution_count": null,
2316 |    "metadata": {
2317 |     "collapsed": true
2318 |    },
2319 |    "outputs": [],
2320 |    "source": [
2321 |     "df = memoryOptimization(df,np.float32)\n",
2322 |     "with open(datapath+'df/df_2.2.pkl','wb') as f:\n",
2323 |     "    pickle.dump(df,f)\n",
2324 |     "del df"
2325 |    ]
2326 |   },
2327 |   {
2328 |    "cell_type": "markdown",
2329 |    "metadata": {},
2330 |    "source": [
2331 |     "### 3、问题聚类&目标编码"
2332 |    ]
2333 |   },
2334 |   {
2335 |    "cell_type": "markdown",
2336 |    "metadata": {},
2337 |    "source": [
2338 |     "对问题绑定话题的embedding加和取平均后得到问题的embedding，根据问题的embedding对问题进行聚类，得到问题的单标签话题类别特征"
2339 |    ]
2340 |   },
2341 |   {
2342 |    "cell_type": "code",
2343 |    "execution_count": null,
2344 |    "metadata": {
2345 |     "collapsed": true
2346 |    },
2347 |    "outputs": [],
2348 |    "source": [
2349 |     "t2v = {}\n",
2350 |     "with open(datapath+'data/topic_vectors_64d.txt') as f:\n",
2351 |     "    for line in f.readlines():\n",
2352 |     "        line = line.strip().split('\\t')\n",
2353 |     "        t2v[int(line[0][1:])] = np.array(list(map(float, line[1].strip().split(' '))))\n",
2354 |     "q_info = pd.read_csv(datapath+'data_question.csv')\n",
2355 |     "\n",
2356 |     "q2vec = {}\n",
2357 |     "for ind,row in q_info.iterrows():\n",
2358 |     "    t = row['topic'].split(',')\n",
2359 |     "    vec = np.zeros(64)\n",
2360 |     "    n = 0\n",
2361 |     "    for i in t:\n",
2362 |     "        i = int(i[1:])\n",
2363 |     "        if i in t2v:\n",
2364 |     "            vec += t2v[i]\n",
2365 |     "            n += 1\n",
2366 |     "    if n == 0:\n",
2367 |     "        q2vec[row['qid']] = vec\n",
2368 |     "    else:\n",
2369 |     "        q2vec[row['qid']] = vec/n\n",
2370 |     "\n",
2371 |     "# 使用kmean聚类\n",
2372 |     "clusters = 300 # 类别个数\n",
2373 |     "q2class = {}\n",
2374 |     "vectors_list = np.array([q2vec[qid] for qid in q2vec])\n",
2375 |     "kmean_model = KMeans(random_state = 2019, n_clusters=clusters)\n",
2376 |     "kmean_model.fit(vectors_list)\n",
2377 |     "q = list(q2vec.keys())\n",
2378 |     "cla = kmean_model.predict(vectors_list.reshape(-1,64))\n",
2379 |     "class2vec = kmean_model.cluster_centers_  # clusters*300 代表每个簇的中心\n",
2380 |     "\n",
2381 |     "qc_df = pd.DataFrame({'qid':q, 'class':list(cla)})\n",
2382 |     "qc_df.rename(columns = {'qid':'qId'}, inplace=True)\n",
2383 |     "qc_df['qId'] = qc_df['qId'].astype(str)\n",
2384 |     "qc_df['qId'] = 'Q'+qc_df['qId']\n",
2385 |     "qc_df.rename(columns={'class':'q_class_300'},inplace=True)\n",
2386 |     "\n",
2387 |     "data_answer = pd.merge(data_answer, qc_df, on='qId', how='left')\n",
2388 |     "data = pd.merge(data, qc_df, on='qId', how='left')"
2389 |    ]
2390 |   },
2391 |   {
2392 |    "cell_type": "code",
2393 |    "execution_count": null,
2394 |    "metadata": {
2395 |     "collapsed": true
2396 |    },
2397 |    "outputs": [],
2398 |    "source": [
2399 |     "df = data[[]]"
2400 |    ]
2401 |   },
2402 |   {
2403 |    "cell_type": "markdown",
2404 |    "metadata": {},
2405 |    "source": [
2406 |     "对聚类得到的问题单标签类别特征进行目标编码"
2407 |    ]
2408 |   },
2409 |   {
2410 |    "cell_type": "code",
2411 |    "execution_count": null,
2412 |    "metadata": {
2413 |     "collapsed": true
2414 |    },
2415 |    "outputs": [],
2416 |    "source": [
2417 |     "for fea in ['q_class_300']: #'q_class_100','q_class_300'\n",
2418 |     "    df = targetencoder(data[['writerId',fea,'inviteday','label']],data_answer,df,['writerId',fea])\n",
2419 |     "    del data_answer['writerId_'+fea]\n",
2420 |     "    print(fea+' is ok')\n",
2421 |     "for fea1 in ['q_class_300']:#'q_class_100','q_class_300'\n",
2422 |     "    for fea2 in ['bool_A','bool_B','bool_C','bool_E','category_D',]: #'sex','activity','bool_D','yanzhi','category_A','category_B','category_C','category_E','mostliketheme'\n",
2423 |     "        df = targetencoder(data[[fea1,fea2,'inviteday','label']],data_answer,df,[fea1,fea2])\n",
2424 |     "        del data_answer[fea1+'_'+fea2]\n",
2425 |     "        print(fea1+'_'+fea2+' is ok')"
2426 |    ]
2427 |   },
2428 |   {
2429 |    "cell_type": "code",
2430 |    "execution_count": null,
2431 |    "metadata": {
2432 |     "collapsed": true
2433 |    },
2434 |    "outputs": [],
2435 |    "source": [
2436 |     "df = memoryOptimization(df,np.float32)\n",
2437 |     "with open(datapath+'df/df_2.3.pkl','wb') as f:\n",
2438 |     "    pickle.dump(df,f)\n",
2439 |     "del df"
2440 |    ]
2441 |   },
2442 |   {
2443 |    "cell_type": "markdown",
2444 |    "metadata": {},
2445 |    "source": [
2446 |     "### 4、相似性"
2447 |    ]
2448 |   },
2449 |   {
2450 |    "cell_type": "code",
2451 |    "execution_count": null,
2452 |    "metadata": {
2453 |     "collapsed": true
2454 |    },
2455 |    "outputs": [],
2456 |    "source": [
2457 |     "df = data[[]]"
2458 |    ]
2459 |   },
2460 |   {
2461 |    "cell_type": "markdown",
2462 |    "metadata": {},
2463 |    "source": [
2464 |     "1)用户行为历史相似度，分别计算了当前待判断问题与用户历史回答问题的话题余弦相似度、标题余弦相似度以及历史邀请负样本话题的余弦相似度"
2465 |    ]
2466 |   },
2467 |   {
2468 |    "cell_type": "code",
2469 |    "execution_count": null,
2470 |    "metadata": {
2471 |     "collapsed": true
2472 |    },
2473 |    "outputs": [],
2474 |    "source": [
2475 |     "def cos_sim(vector_a, vector_b):\n",
2476 |     "\n",
2477 |     "    vector_a = np.mat(vector_a)\n",
2478 |     "    vector_b = np.mat(vector_b)\n",
2479 |     "    num = float(vector_a * vector_b.T)\n",
2480 |     "    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)\n",
2481 |     "    cos = num / denom\n",
2482 |     "    sim = 0.5 + 0.5 * cos\n",
2483 |     "    return sim\n",
2484 |     "\n",
2485 |     "def writerlastndayperformance_itemCF(dataf,data_asw,df,n,q2themes,q2words,dic_themes,dic_words):  \n",
2486 |     "    \n",
2487 |     "    scorefea = 'simhis_base_theme_theme_itemcf'\n",
2488 |     "    df[scorefea] = np.nan\n",
2489 |     "    for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n",
2490 |     "        df.loc[dataf['inviteday'] == day,scorefea] = get_sim_itemCF(data_asw[(data_asw['inviteday']<day)&(data_asw['inviteday']>=day-n)],dataf[dataf['inviteday'] == day],day,n,'themeId','themeId',q2themes,{},dic_themes,{})           \n",
2491 |     "    df.loc[dataf['label'] == -1,scorefea] = get_sim_itemCF(data_asw[(data_asw['inviteday']<t0_eval)&(data_asw['inviteday']>=t0_eval-n)],dataf[dataf['label'] == -1],t0_eval,n,'themeId','themeId',q2themes,{},dic_themes,{})\n",
2492 |     "    return df\n",
2493 |     "\n",
2494 |     "def writerlastndayperformance_itemCF_pool(dataf,data_asw,data_train_noclick,df,q2themes,q2words,dic_themes,dic_words):  \n",
2495 |     "    \n",
2496 |     "    #######sim_theme######\n",
2497 |     "    \n",
2498 |     "    scorefea = 'simhis_base_theme_theme_itemcf'\n",
2499 |     "    df[scorefea] = np.nan\n",
2500 |     "    n = 30\n",
2501 |     "\n",
2502 |     "    pool = Pool(2)\n",
2503 |     "    result = {}\n",
2504 |     "    result[t0_eval] = pool.apply_async(\n",
2505 |     "        func = get_sim_itemCF,\n",
2506 |     "        args = (data_asw.loc[(data_asw['inviteday']<t0_eval)&(data_asw['inviteday']>=t0_eval-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['label'] == -1,['writerId','qId']],t0_eval,n,'themeId','themeId',q2themes.copy(),{},dic_themes.copy(),{},False,)\n",
2507 |     "    )\n",
2508 |     "    for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n",
2509 |     "        result[day] = pool.apply_async(\n",
2510 |     "            func = get_sim_itemCF,\n",
2511 |     "            args = (data_asw.loc[(data_asw['inviteday']<day)&(data_asw['inviteday']>=day-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['inviteday'] == day,['writerId','qId']],day,n,'themeId','themeId',q2themes.copy(),{},dic_themes.copy(),{},False,)\n",
2512 |     "        )\n",
2513 |     "    pool.close()\n",
2514 |     "    pool.join()\n",
2515 |     "    for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n",
2516 |     "        df.loc[dataf['inviteday'] == day,scorefea] = result[day].get()        \n",
2517 |     "    df.loc[dataf['label'] == -1,scorefea] = result[t0_eval].get()\n",
2518 |     "    with open(datapath+'newfea_copy/sim_theme.pkl','wb') as f:\n",
2519 |     "        pickle.dump(df,f)\n",
2520 |     "\n",
2521 |     "    #######sim_title######\n",
2522 |     "    \n",
2523 |     "    scorefea = 'simhis_base_title_title_itemcf'\n",
2524 |     "    df[scorefea] = np.nan\n",
2525 |     "    n = 30\n",
2526 |     "\n",
2527 |     "    pool = Pool(2)\n",
2528 |     "    result = {}\n",
2529 |     "    result[t0_eval] = pool.apply_async(\n",
2530 |     "        func = get_sim_itemCF,\n",
2531 |     "        args = (data_asw.loc[(data_asw['inviteday']<t0_eval)&(data_asw['inviteday']>=t0_eval-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['label'] == -1,['writerId','qId']],t0_eval,n,'title_words_tfidf1','title_words_tfidf1',q2words.copy(),{},dic_words.copy(),{},False,)\n",
2532 |     "    )\n",
2533 |     "    for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n",
2534 |     "        result[day] = pool.apply_async(\n",
2535 |     "            func = get_sim_itemCF,\n",
2536 |     "            args = (data_asw.loc[(data_asw['inviteday']<day)&(data_asw['inviteday']>=day-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['inviteday'] == day,['writerId','qId']],day,n,'title_words_tfidf1','title_words_tfidf1',q2words.copy(),{},dic_words.copy(),{},False,)\n",
2537 |     "        )\n",
2538 |     "    \n",
2539 |     "    pool.close()\n",
2540 |     "    pool.join()\n",
2541 |     "    for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n",
2542 |     "        df.loc[dataf['inviteday'] == day,scorefea] = result[day].get()        \n",
2543 |     "    df.loc[dataf['label'] == -1,scorefea] = result[t0_eval].get()\n",
2544 |     "    with open(datapath+'newfea_copy/sim_title.pkl','wb') as f:\n",
2545 |     "        pickle.dump(df,f)\n",
2546 |     "        \n",
2547 |     "    ######sim_noclick########\n",
2548 |     "    \n",
2549 |     "    scorefea = 'simhis_base_theme_theme_noclick'\n",
2550 |     "    df[scorefea] = np.nan\n",
2551 |     "    n = 15\n",
2552 |     "\n",
2553 |     "    data_asw = data_train_noclick[data_train_noclick['label'] == 0].reset_index(drop = True)\n",
2554 |     "    pool = Pool(2)\n",
2555 |     "    result = {}\n",
2556 |     "    result[t0_eval] = pool.apply_async(\n",
2557 |     "        func = get_sim_itemCF,\n",
2558 |     "        args = (data_asw.loc[(data_asw['inviteday']<t0_eval)&(data_asw['inviteday']>=t0_eval-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['label'] == -1,['writerId','qId']].copy(),t0_eval,n,'themeId','themeId',q2themes.copy(),{},dic_themes.copy(),{},True,)\n",
2559 |     "    )\n",
2560 |     "    for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n",
2561 |     "        result[day] = pool.apply_async(\n",
2562 |     "            func = get_sim_itemCF,\n",
2563 |     "            args = (data_asw.loc[(data_asw['inviteday']<day)&(data_asw['inviteday']>=day-n),['writerId','inviteday','invitehour','qId']].copy(),dataf.loc[dataf['inviteday'] == day,['writerId','qId']].copy(),day,n,'themeId','themeId',q2themes.copy(),{},dic_themes.copy(),{},True,)\n",
2564 |     "        )\n",
2565 |     "    pool.close()\n",
2566 |     "    pool.join()\n",
2567 |     "    for day in iter(dataf.loc[dataf['label'] != -1 ,'inviteday'].unique()):\n",
2568 |     "        df.loc[dataf['inviteday'] == day,scorefea] = result[day].get()        \n",
2569 |     "    df.loc[dataf['label'] == -1,scorefea] = result[t0_eval].get()\n",
2570 |     "    \n",
2571 |     "    return df\n",
2572 |     "\n",
2573 |     "\n",
2574 |     "def get_sim_itemCF(data_train,data_test,day,n,hisfea,curfea,q2hisfea,q2curfea,dic_his,dic_cur,noclick):\n",
2575 |     "    #day:待计算数据邀请日\n",
2576 |     "    #n:统计时长\n",
2577 |     "    #hisfea:历史聚合特征（theme/word）\n",
2578 |     "    #curfea:当前比较特征(theme/word)\n",
2579 |     "    #q2hisfea:问题到hisfea的dic\n",
2580 |     "    #q2curfea：问题到curfea的dic\n",
2581 |     "    #dic_cur:当前训练接的theme/word的emb\n",
2582 |     "    #dic_his:历史回答的theme/word的emb\n",
2583 |     "    if curfea == hisfea:\n",
2584 |     "        dic_cur = dic_his\n",
2585 |     "        q2curfea = q2hisfea\n",
2586 |     "    if noclick:\n",
2587 |     "        pathfold = hisfea+'_noclick'\n",
2588 |     "    else:\n",
2589 |     "        pathfold = hisfea\n",
2590 |     "    \n",
2591 |     "    if os.path.exists(datapath+'dic_q2hisfea/%s/his_%s_%s.pkl' %(pathfold,day,n)) and os.path.exists(datapath+'dic_q2hisfea/%s/his_delts_%s_%s.pkl' %(pathfold,day,n)):\n",
2592 |     "        with open(datapath+'dic_q2hisfea/%s/his_%s_%s.pkl' %(pathfold,day,n),'rb') as f:\n",
2593 |     "            dic_t = pickle.load(f)\n",
2594 |     "        with open(datapath+'dic_q2hisfea/%s/his_delts_%s_%s.pkl' %(pathfold,day,n),'rb') as f:\n",
2595 |     "            dic_delt = pickle.load(f)\n",
2596 |     "    else:\n",
2597 |     "        dic_t = {}\n",
2598 |     "        dic_delt = {}\n",
2599 |     "        data_train = data_train[data_train['writerId'].isin(data_test['writerId'].unique())].groupby('writerId')\n",
2600 |     "        for writerid in iter(data_train.size().index):\n",
2601 |     "            dic_t[writerid] = []#可能为kong\n",
2602 |     "            dic_delt[writerid] = []\n",
2603 |     "            gp = data_train.get_group(writerid)\n",
2604 |     "            gp = gp.sort_values(by = ['inviteday','invitehour'],ascending = False)#.iloc[:10,:]\n",
2605 |     "            for row in iter(gp[['inviteday','qId']].values):\n",
2606 |     "                themes = q2hisfea[row[1]]\n",
2607 |     "                if themes[0] == '-1':\n",
2608 |     "                    continue\n",
2609 |     "                dic_t[writerid].append(themes)\n",
2610 |     "                dic_delt[writerid].append(day-row[0])\n",
2611 |     "        with open(datapath+'dic_q2hisfea/%s/his_%s_%s.pkl' %(pathfold,day,n),'wb') as f:\n",
2612 |     "            pickle.dump(dic_t,f)\n",
2613 |     "        with open(datapath+'dic_q2hisfea/%s/his_delts_%s_%s.pkl' %(pathfold,day,n),'wb') as f:\n",
2614 |     "            pickle.dump(dic_delt,f)\n",
2615 |     "    \n",
2616 |     "    result = []\n",
2617 |     "    for row in iter(data_test[['writerId','qId']].values):\n",
2618 |     "        try:\n",
2619 |     "            list_t = dic_t[row[0]]\n",
2620 |     "#             list_delt = dic_delt[row[0]]\n",
2621 |     "        except:\n",
2622 |     "            result.append(np.nan)#无历史记录\n",
2623 |     "            continue\n",
2624 |     "        curthemes = q2curfea[row[1]]\n",
2625 |     "        if curthemes[0] == '-1':#当前问题t缺失\n",
2626 |     "            result.append(np.nan)\n",
2627 |     "            continue\n",
2628 |     "        \n",
2629 |     "        if len(list_t) == 0:#有历史记录，但历史记录的问题的t都是-1\n",
2630 |     "            result.append(np.nan)\n",
2631 |     "            continue\n",
2632 |     "        \n",
2633 |     "        sim_between_his_cur = 0\n",
2634 |     "        for index,histhemes in iter(enumerate(list_t)):#循环历史问题\n",
2635 |     "#             deltday = list_delt[index]\n",
2636 |     "#             weight = \n",
2637 |     "            sims_between_q = []\n",
2638 |     "            for histheme in iter(histhemes):#循环问题的t\n",
2639 |     "                for curtheme in iter(curthemes):#循环当前问题的t\n",
2640 |     "                    sims_between_q.append(cos_sim(dic_his[histheme],dic_cur[curtheme]))\n",
2641 |     "            q_sim = np.sort(sims_between_q)[-3:].mean()\n",
2642 |     "            if q_sim > 0.7:#simhold,相当于取与该问题相似的问题的topk与历史问题取交集\n",
2643 |     "                sim_between_his_cur += q_sim#*weight,关于间隔时间的权\n",
2644 |     "        result.append(sim_between_his_cur)\n",
2645 |     "    \n",
2646 |     "    return result\n",
2647 |     "\n",
2648 |     "#话题64维emb索引字典\n",
2649 |     "theme_vec = pd.read_csv(datapath+'data/theme_vector.csv')\n",
2650 |     "cols = theme_vec.columns.tolist()\n",
2651 |     "cols.remove('themeId')\n",
2652 |     "cols = ['themeId']+cols\n",
2653 |     "theme_vec = theme_vec[cols]\n",
2654 |     "dic_themes = {}\n",
2655 |     "for row in iter(theme_vec.values):\n",
2656 |     "    dic_themes[row[0]] = row[1:]\n",
2657 |     "\n",
2658 |     "#问题绑定话题的索引字典\n",
2659 |     "data_question = pd.read_csv(datapath+'data/data_question.csv')[['qId','themeId']]\n",
2660 |     "q2themes = {}\n",
2661 |     "for row in iter(data_question[['qId','themeId']].values):\n",
2662 |     "    q2themes[row[0]] = row[1].split(',')\n",
2663 |     "del data_question\n",
2664 |     "\n",
2665 |     "#切词64维emb索引字典\n",
2666 |     "word_vec = pd.read_csv(datapath+'data/word_vector.csv')\n",
2667 |     "cols = word_vec.columns.tolist()\n",
2668 |     "cols.remove('wordId')\n",
2669 |     "cols = ['wordId']+cols\n",
2670 |     "word_vec = word_vec[cols]\n",
2671 |     "dic_words = {}\n",
2672 |     "for row in iter(word_vec.values):\n",
2673 |     "    dic_words[row[0]] = row[1:]\n",
2674 |     "\n",
2675 |     "#问题标题&描述切词过滤后的索引字典\n",
2676 |     "data_question = pd.read_csv(datapath+'data_q_title_tfidf.csv')\n",
2677 |     "data_question = data_question[['问题id','title_topk','text_topk']].rename(columns = {'问题id':'qId',\n",
2678 |     "                                                                    'title_topk':'title_words_tfidf0','text_topk':'title_words_tfidf1'})\n",
2679 |     "q2words = {}\n",
2680 |     "for row in iter(data_question[['qId','title_words_tfidf1']].values):\n",
2681 |     "    q2words[row[0]] = row[1].split(',')\n",
2682 |     "del data_question\n",
2683 |     "\n",
2684 |     "df = writerlastndayperformance_itemCF_pool(data,data_answer,data,df,q2themes,q2words,dic_themes,dic_words)\n",
2685 |     "\n",
2686 |     "del data_answer\n",
2687 |     "gc.collect()"
2688 |    ]
2689 |   },
2690 |   {
2691 |    "cell_type": "markdown",
2692 |    "metadata": {},
2693 |    "source": [
2694 |     "2)用户关注/感兴趣话题与当前问题绑定话题的相似度"
2695 |    ]
2696 |   },
2697 |   {
2698 |    "cell_type": "code",
2699 |    "execution_count": null,
2700 |    "metadata": {
2701 |     "collapsed": true
2702 |    },
2703 |    "outputs": [],
2704 |    "source": [
2705 |     "#用户与当前问题的theme相似度\n",
2706 |     "def get_user_theme_emb(data_writer):\n",
2707 |     "    \n",
2708 |     "    user_topic_fea_64 = []\n",
2709 |     "    for index,row in data_writer.iterrows():\n",
2710 |     "        if row['attentionthemes'] == '-1' and row['likethemes'] == '-1':\n",
2711 |     "            user_topic_fea_64.append(np.zeros(64))\n",
2712 |     "        if row['attentionthemes'] != '-1':\n",
2713 |     "            ft = row['attentionthemes'].strip().split(',')\n",
2714 |     "            temp = np.zeros(64)\n",
2715 |     "            for t in iter(ft):\n",
2716 |     "                temp = temp + t2v[t]\n",
2717 |     "            temp = temp/len(ft)\n",
2718 |     "            user_topic_fea_64.append(temp)\n",
2719 |     "        if row['attentionthemes'] == '-1' and row['likethemes'] != '-1':\n",
2720 |     "            it = row['likethemes'].strip().split(',')\n",
2721 |     "            temp = np.zeros(64)\n",
2722 |     "    #         w = []\n",
2723 |     "    #         for t in it:\n",
2724 |     "    #             w.append(float(t.split(':')[1]))\n",
2725 |     "    #         for t in it:\n",
2726 |     "    #             wei = float(t.split(':')[1]) / sum(w)\n",
2727 |     "    #             temp += wei*t2v_norm[int(t.split(':')[0][1:])]\n",
2728 |     "            for t in iter(it):\n",
2729 |     "                temp = temp + t2v[t.split(':')[0]]\n",
2730 |     "            temp = temp/len(it)\n",
2731 |     "            user_topic_fea_64.append(temp)\n",
2732 |     "    user_topic_fea_64 = pd.DataFrame(user_topic_fea_64,columns = ['writer_theme_emb%i' %i for i in range(64)])\n",
2733 |     "    return user_topic_fea_64\n",
2734 |     "\n",
2735 |     "def get_question_theme_emb(data_q):\n",
2736 |     "    result = []\n",
2737 |     "    for themes in iter(data_q['themeId']):\n",
2738 |     "        if themes == '-1':\n",
2739 |     "            result.append(np.zeros(64))\n",
2740 |     "            continue\n",
2741 |     "        themes = themes.split(',')\n",
2742 |     "        emb = np.zeros(64)\n",
2743 |     "        for theme in iter(themes):\n",
2744 |     "            emb = emb + t2v[theme]\n",
2745 |     "        emb = emb/len(themes)\n",
2746 |     "        result.append(emb)\n",
2747 |     "    result = pd.DataFrame(result,columns = ['q_theme_emb%i' %i for i in range(64)])\n",
2748 |     "    return result\n",
2749 |     "\n",
2750 |     "def cos_sim(vector_a, vector_b):\n",
2751 |     "\n",
2752 |     "    vector_a = np.mat(vector_a)\n",
2753 |     "    vector_b = np.mat(vector_b)\n",
2754 |     "    num = float(vector_a * vector_b.T)\n",
2755 |     "    denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)\n",
2756 |     "    cos = num / denom\n",
2757 |     "    sim = 0.5 + 0.5 * cos\n",
2758 |     "    return sim\n",
2759 |     "\n",
2760 |     "def get_sim_theme_between_q_writer(data):\n",
2761 |     "    result = []\n",
2762 |     "    for row in iter(data.values):\n",
2763 |     "        result.append(cos_sim(row[:64],row[64:]))\n",
2764 |     "    \n",
2765 |     "    return pd.Series(result)\n",
2766 |     "\n",
2767 |     "theme_vec = pd.read_csv(datapath+'data/theme_vector.csv')\n",
2768 |     "cols = theme_vec.columns.tolist()\n",
2769 |     "cols.remove('themeId')\n",
2770 |     "cols = ['themeId']+cols\n",
2771 |     "theme_vec = theme_vec[cols]\n",
2772 |     "t2v = {}\n",
2773 |     "for row in iter(theme_vec.values):\n",
2774 |     "    t2v[row[0]] = row[1:]\n",
2775 |     "\n",
2776 |     "data_writer = pd.read_csv(datapath+'data/data_writer.csv')\n",
2777 |     "user_topic_fea_64 = parallelize_dataframe(data_writer,get_user_theme_emb).reset_index(drop = True)\n",
2778 |     "user_topic_fea_64['writerId'] = data_writer['writerId']\n",
2779 |     "data = data.merge(user_topic_fea_64,on = 'writerId',how = 'left')\n",
2780 |     "\n",
2781 |     "data_question = pd.read_csv(datapath+'data/data_question.csv')\n",
2782 |     "q_topic_fea_64 = parallelize_dataframe(data_question,get_question_theme_emb).reset_index(drop = True)\n",
2783 |     "q_topic_fea_64['qId'] = data_question['qId']\n",
2784 |     "data = data.merge(q_topic_fea_64,on = 'qId',how = 'left')\n",
2785 |     "\n",
2786 |     "cols = ['writer_theme_emb%i' %i for i in range(64)]+['q_theme_emb%i' %i for i in range(64)]\n",
2787 |     "df['sim_theme_between_q_writer'] = parallelize_dataframe(data[cols],get_sim_theme_between_q_writer).values\n",
2788 |     "data = data.drop(cols,axis = 1)\n",
2789 |     "del data_question\n",
2790 |     "del data_writer\n",
2791 |     "del theme_vec\n",
2792 |     "del t2v\n",
2793 |     "del user_topic_fea_64\n",
2794 |     "del q_topic_fea_64\n",
2795 |     "gc.collect()"
2796 |    ]
2797 |   },
2798 |   {
2799 |    "cell_type": "markdown",
2800 |    "metadata": {},
2801 |    "source": [
2802 |     "3）用户关注/感兴趣话题与当前问题绑定话题的重合统计"
2803 |    ]
2804 |   },
2805 |   {
2806 |    "cell_type": "code",
2807 |    "execution_count": null,
2808 |    "metadata": {
2809 |     "collapsed": true
2810 |    },
2811 |    "outputs": [],
2812 |    "source": [
2813 |     "def samethemenum_atten(row):\n",
2814 |     "    row = row.values\n",
2815 |     "    if row[0] == '-1' or row[1] == '-1':\n",
2816 |     "        return np.nan\n",
2817 |     "    row0 = row[0].strip().split(',')\n",
2818 |     "    row1 = row[1].strip().split(',')\n",
2819 |     "    num = 0\n",
2820 |     "    for theme in iter(row0):\n",
2821 |     "        if theme in row1:\n",
2822 |     "            num = num+1\n",
2823 |     "    return num\n",
2824 |     "def samethemenum_like(row):\n",
2825 |     "    row = row.values\n",
2826 |     "    if row[0] == '-1' or row[1] == '-1':\n",
2827 |     "        return [np.nan,np.nan]\n",
2828 |     "    row0 = row[0].strip().split(',')\n",
2829 |     "    row1 = row[1].strip().split(',')\n",
2830 |     "    num = 0\n",
2831 |     "    quannum = 0\n",
2832 |     "    for theme in iter(row1):\n",
2833 |     "        theme = theme.strip().split(':')\n",
2834 |     "        if theme[0] in row0:\n",
2835 |     "            num = num+1\n",
2836 |     "            quannum = quannum+float(theme[1])\n",
2837 |     "\n",
2838 |     "    return [num,quannum]\n",
2839 |     "\n",
2840 |     "def p1(dataf):\n",
2841 |     "    return dataf.apply(samethemenum_like,axis = 1)\n",
2842 |     "df['samenum_like'] = np.nan\n",
2843 |     "df['sameqnum_like'] = np.nan\n",
2844 |     "df[['samenum_like','sameqnum_like']] = parallelize_dataframe(data[['themeId','likethemes']],p1).values\n",
2845 |     "\n",
2846 |     "def p2(dataf):\n",
2847 |     "    return dataf.apply(samethemenum_atten,axis = 1)\n",
2848 |     "df['samenum_atten'] = parallelize_dataframe(data[['themeId','attentionthemes']],p2).values"
2849 |    ]
2850 |   },
2851 |   {
2852 |    "cell_type": "code",
2853 |    "execution_count": null,
2854 |    "metadata": {
2855 |     "collapsed": true
2856 |    },
2857 |    "outputs": [],
2858 |    "source": [
2859 |     "df = memoryOptimization(df,np.float32)\n",
2860 |     "with open(datapath+'df/df_2.4.pkl','wb') as f:\n",
2861 |     "    pickle.dump(df,f)\n",
2862 |     "del df"
2863 |    ]
2864 |   },
2865 |   {
2866 |    "cell_type": "markdown",
2867 |    "metadata": {},
2868 |    "source": [
2869 |     "### 三、用户历史反馈统计特征"
2870 |    ]
2871 |   },
2872 |   {
2873 |    "cell_type": "code",
2874 |    "execution_count": null,
2875 |    "metadata": {
2876 |     "collapsed": true
2877 |    },
2878 |    "outputs": [],
2879 |    "source": [
2880 |     "df = data[[]]"
2881 |    ]
2882 |   },
2883 |   {
2884 |    "cell_type": "code",
2885 |    "execution_count": null,
2886 |    "metadata": {
2887 |     "collapsed": true
2888 |    },
2889 |    "outputs": [],
2890 |    "source": [
2891 |     "def ans_quality(dataf,data_asw,df,feas,id_fea):   \n",
2892 |     "    data_train = dataf[dataf['label'] != -1]\n",
2893 |     "    data_test = dataf[dataf['label'] == -1]  \n",
2894 |     "    if (len(id_fea) > 1):\n",
2895 |     "        for fea in feas:\n",
2896 |     "            df[id_fea[1]+'_'+fea+'_sum'] = np.nan\n",
2897 |     "            df[id_fea[1]+'_'+fea+'_mean'] = np.nan\n",
2898 |     "            df[id_fea[1]+'_'+fea+'_max'] = np.nan\n",
2899 |     "            daylen_asw = 21\n",
2900 |     "            for day in iter(data_train['inviteday'].unique()):\n",
2901 |     "                df.loc[dataf['inviteday'] == day,[id_fea[1]+'_'+fea+'_sum',id_fea[1]+'_'+fea+'_mean',id_fea[1]+'_'+fea+'_max']] = \\\n",
2902 |     "                get_values(data_asw[(data_asw['inviteday']<day)],dataf[dataf['inviteday'] == day],fea,id_fea)#&(data_asw['inviteday']>=day-daylen_asw)\n",
2903 |     "#             df[[id_fea[1]+'_'+fea+'_sum',id_fea[1]+'_'+fea+'_mean',id_fea[1]+'_'+fea+'_max']] = \\\n",
2904 |     "            df.loc[data_test.index,[id_fea[1]+'_'+fea+'_sum',id_fea[1]+'_'+fea+'_mean',id_fea[1]+'_'+fea+'_max']] = \\\n",
2905 |     "            get_values(data_asw[(data_asw['inviteday']<t0_eval)],data_test,fea,id_fea)#&(data_asw['inviteday']>=t0_eval-daylen_asw)\n",
2906 |     "    #         df.loc[(~dataf[fea].isna())&(df[fea+'_label_count'].isna()),fea+'_label_count'] = 0\n",
2907 |     "    else:\n",
2908 |     "        for fea in feas:\n",
2909 |     "            df[fea+'_sum'] = np.nan\n",
2910 |     "            df[fea+'_mean'] = np.nan\n",
2911 |     "            df[fea+'_max'] = np.nan\n",
2912 |     "            daylen_asw = 21\n",
2913 |     "            for day in iter(data_train['inviteday'].unique()):\n",
2914 |     "                df.loc[dataf['inviteday'] == day,[fea+'_sum',fea+'_mean',fea+'_max']] = \\\n",
2915 |     "                get_values(data_asw[(data_asw['inviteday']<day)],dataf[dataf['inviteday'] == day],fea,id_fea)#&(data_asw['inviteday']>=day-daylen_asw)\n",
2916 |     "#             df[[fea+'_sum',fea+'_mean',fea+'_max']] = \\\n",
2917 |     "            df.loc[data_test.index,[fea+'_sum',fea+'_mean',fea+'_max']] = \\\n",
2918 |     "            get_values(data_asw[(data_asw['inviteday']<t0_eval)],data_test,fea,id_fea)#&(data_asw['inviteday']>=t0_eval-daylen_asw)\n",
2919 |     "    #         df.loc[(~dataf[fea].isna())&(df[fea+'_label_count'].isna()),fea+'_label_count'] = 0\n",
2920 |     "    del data_train\n",
2921 |     "    del data_test\n",
2922 |     "    gc.collect()\n",
2923 |     "    \n",
2924 |     "    return df\n",
2925 |     "def get_values(df_train,df_test,fea,id_fea):\n",
2926 |     "    df_train = df_train[[fea]+id_fea].groupby(id_fea)[fea].agg(['sum','mean','max']).reset_index()\n",
2927 |     "    new_fea_name = id_fea+[fea+'_sum',fea+'_mean',fea+'_max']\n",
2928 |     "    df_train.columns=new_fea_name\n",
2929 |     "    df_test = df_test.merge(df_train,on = id_fea, how = 'left')\n",
2930 |     "    return df_test[[fea+'_sum',fea+'_mean',fea+'_max']].values\n",
2931 |     "\n",
2932 |     "# 用户回答质量统计\n",
2933 |     "ans_quality_cols = ['collectnum', 'commentnum', 'good_bool','picture_bool', 'recommend_bool','yuanzhuo_bool', 'video_bool', 'unhelpnum','wordnum','unlikenum', 'cancellikenum','jubaonum', \\\n",
2934 |     " '3qnum', 'likenum']\n",
2935 |     "id_fea = ['writerId']#writerId\n",
2936 |     "df = ans_quality(data[id_fea+['inviteday','label']], data_answer,df, ans_quality_cols, id_fea)"
2937 |    ]
2938 |   },
2939 |   {
2940 |    "cell_type": "code",
2941 |    "execution_count": null,
2942 |    "metadata": {
2943 |     "collapsed": true
2944 |    },
2945 |    "outputs": [],
2946 |    "source": [
2947 |     "df = memoryOptimization(df,np.float32)\n",
2948 |     "with open(datapath+'df/df_3.pkl','wb') as f:\n",
2949 |     "    pickle.dump(df,f)\n",
2950 |     "del df"
2951 |    ]
2952 |   },
2953 |   {
2954 |    "cell_type": "markdown",
2955 |    "metadata": {},
2956 |    "source": [
2957 |     "# .NN特征构建"
2958 |    ]
2959 |   },
2960 |   {
2961 |    "cell_type": "code",
2962 |    "execution_count": null,
2963 |    "metadata": {
2964 |     "collapsed": true
2965 |    },
2966 |    "outputs": [],
2967 |    "source": [
2968 |     "df = data[[]]"
2969 |    ]
2970 |   },
2971 |   {
2972 |    "cell_type": "markdown",
2973 |    "metadata": {},
2974 |    "source": [
2975 |     "### 1、加载问题关键字\n",
2976 |     "\n",
2977 |     "问题关键字构建方式：利用tf-idf过滤，对于只有标题的取最重要的1/3，对于有描述的取总的2/3"
2978 |    ]
2979 |   },
2980 |   {
2981 |    "cell_type": "code",
2982 |    "execution_count": null,
2983 |    "metadata": {
2984 |     "collapsed": true
2985 |    },
2986 |    "outputs": [],
2987 |    "source": [
2988 |     "data_question = pd.read_csv('data/data_q_title_tfidf.csv')\n",
2989 |     "data_question.columns = ['index', 'qId', 'title_words', 'describe_words']\n",
2990 |     "df['all_words'] = data[['qId']].merge(data_question[['describe_words', 'qId']],on = 'qId',how = 'left')['describe_words'].values\n",
2991 |     "del data_question\n",
2992 |     "gc.collect()"
2993 |    ]
2994 |   },
2995 |   {
2996 |    "cell_type": "markdown",
2997 |    "metadata": {},
2998 |    "source": [
2999 |     "### 2、基于用户历史回答信息得到的用户感兴趣的关键字 / 主题信息\n",
3000 |     "利用提供的answer文件，利用当前用户的历史回答信息（注意时序），得到用户感兴趣的关键字、主题信息，并按照出现频率为不同的主题、关键字设置权重，出现频率越高代表用户对这个主题的关注越高，设置的权重越高"
3001 |    ]
3002 |   },
3003 |   {
3004 |    "cell_type": "code",
3005 |    "execution_count": null,
3006 |    "metadata": {
3007 |     "collapsed": true
3008 |    },
3009 |    "outputs": [],
3010 |    "source": [
3011 |     "def getHistInfo(data, data_ans, info_type, dayn=30):\n",
3012 |     "\n",
3013 |     "    def getUserhistTopic_all(dataf, data_ans, info_type, dayn=30, testflag=False):\n",
3014 |     "        rs = dataf[[]]\n",
3015 |     "        \n",
3016 |     "        rs[info_type[0]] = np.nan  \n",
3017 |     "        result = {}\n",
3018 |     "        for day in tqdm.tqdm_notebook(dataf.loc[dataf['label']!=-1, 'inviteday'].unique()):\n",
3019 |     "            rs.loc[dataf['inviteday']==day, info_type[0]] = getlastndayHisttopic(data_ans.loc[(data_ans['answerday']<day)&(data_ans['answerday']>=day-dayn)],\n",
3020 |     "                                                                                 dataf.loc[dataf['inviteday']==day], info_type[1])  \n",
3021 |     "\n",
3022 |     "        rs.loc[dataf['label']==-1, info_type[0]] = getlastndayHisttopic(data_ans.loc[(data_ans['answerday']<3868)&(data_ans['answerday']>=3868-dayn)],\n",
3023 |     "                                                 dataf[dataf['label']==-1], info_type[1])\n",
3024 |     "        return rs\n",
3025 |     "\n",
3026 |     "    #得到过去历史信息  \n",
3027 |     "    def getlastndayHisttopic(data_ans, dataf, fea): \n",
3028 |     "        gps = data_ans[['writerId', fea]].groupby('writerId')\n",
3029 |     "        rs = []\n",
3030 |     "        temp_dic = {}\n",
3031 |     "        for uid in dataf['writerId'].values:\n",
3032 |     "            topics = {}\n",
3033 |     "            try:\n",
3034 |     "                if uid in temp_dic:\n",
3035 |     "                    rs.append(temp_dic[uid])    \n",
3036 |     "                else:           \n",
3037 |     "                    for i in gps.get_group(uid)[fea]:\n",
3038 |     "                        tps = i.split(',')\n",
3039 |     "                        for tp in tps:\n",
3040 |     "                            if tp not in topics:\n",
3041 |     "                                topics[tp] = 1\n",
3042 |     "                            else:\n",
3043 |     "                                topics[tp] += 1  \n",
3044 |     "\n",
3045 |     "                    topics = pd.Series(topics)\n",
3046 |     "                    topics =(topics/topics.mean()).round(3).to_dict()\n",
3047 |     "                    topics = sorted(topics.items(),key=lambda x: x[1], reverse=True)[:100]\n",
3048 |     "                    temp = []\n",
3049 |     "                    for t in topics:\n",
3050 |     "                        temp.append(t[0]+ ':' +str(t[1]))\n",
3051 |     "                    rs.append(','.join(temp)) \n",
3052 |     "                    temp_dic[uid] = ','.join(temp)\n",
3053 |     "            except:\n",
3054 |     "                rs.append('-1')\n",
3055 |     "                temp_dic[uid] = '-1'\n",
3056 |     "        return rs\n",
3057 |     "    \n",
3058 |     "    type_dic = {'topics':['hist_user_themes', 'themeId'], 'words':['hist_user_unlike_themes', 'words_list']}\n",
3059 |     "    user_topic = getUserhistTopic_all(data, data_ans, type_dic[info_type], dayn=dayn)\n",
3060 |     "    return user_topic\n",
3061 |     "\n",
3062 |     "data_ans = pd.read_csv(datapath+'data/data_answer.csv')\n",
3063 |     "data_question = pd.read_csv(datapath+'data/data_question.csv')\n",
3064 |     "data_ans = data_ans.merge(data_question[['qId', 'themeId']],on = 'qId',how = 'left')\n",
3065 |     "# 加载word\n",
3066 |     "data_question = pd.read_csv(datapath+'data/data_q_title_tfidf.csv')\n",
3067 |     "data_question.columns = ['index', 'qId', 'title_words', 'describe_words']\n",
3068 |     "data_ans = data_ans.merge(data_question[['describe_words', 'qId']],on = 'qId',how = 'left').rename(columns = {'describe_words':'words_list'})\n",
3069 |     "\n",
3070 |     "# data_test 表示要处理的数据\n",
3071 |     "df['user_topics'] = getHistInfo(data, data_ans, 'topics')\n",
3072 |     "df['user_words'] = getHistInfo(data, data_ans, 'words')"
3073 |    ]
3074 |   },
3075 |   {
3076 |    "cell_type": "markdown",
3077 |    "metadata": {},
3078 |    "source": [
3079 |     "### 3、基于用户历史未回答信息得到的用户未点击问题的关键字 / 主题信息\n",
3080 |     "利用当前用户的历史未回答的信息（注意时序），得到用户讨厌的关键字、主题信息，并按照出现频率为不同的主题、关键字设置权重，出现频率越高代表用户对这个主题的讨厌度越高，设置的权重越高"
3081 |    ]
3082 |   },
3083 |   {
3084 |    "cell_type": "code",
3085 |    "execution_count": null,
3086 |    "metadata": {
3087 |     "collapsed": true
3088 |    },
3089 |    "outputs": [],
3090 |    "source": [
3091 |     "def getUnlikeInfo(data, data_ans, info_type, dayn=7): # data[data['label']==0]\n",
3092 |     " \n",
3093 |     "    def getUserhistTopic_F_all(dataf, data_ans, info_type, dayn=7):\n",
3094 |     "        rs = dataf[[]]\n",
3095 |     "        rs[info_type[0]] = np.nan  \n",
3096 |     "        result = {}\n",
3097 |     "\n",
3098 |     "        for day in tqdm.tqdm_notebook(dataf.loc[dataf['label']!=-1, 'inviteday'].unique()):\n",
3099 |     "            if day>3838:\n",
3100 |     "                rs.loc[dataf['inviteday']==day, info_type[0]] = getlastndayHisttopic(data_ans.loc[(data_ans['inviteday']<day)&(data_ans['inviteday']>=day-dayn)],\n",
3101 |     "                                                                                     dataf.loc[dataf['inviteday']==day], info_type[0])\n",
3102 |     "\n",
3103 |     "        rs.loc[dataf['label']==-1, info_type[0]] = getlastndayHisttopic(data_ans.loc[(data_ans['inviteday']<3868)&(data_ans['inviteday']>=3868-dayn)],\n",
3104 |     "                                             dataf[dataf['label']==-1], info_type[1])\n",
3105 |     "        return rs\n",
3106 |     "\n",
3107 |     "    #得到过去历史信息       \n",
3108 |     "    def getlastndayHisttopic(data_ans, dataf, fea): \n",
3109 |     "        gps = data_ans[['writerId', fea]].groupby('writerId')\n",
3110 |     "        rs = []\n",
3111 |     "        temp_dic = {}\n",
3112 |     "        for uid in dataf['writerId'].values:\n",
3113 |     "            topics = {}\n",
3114 |     "            try:\n",
3115 |     "                if uid in temp_dic:\n",
3116 |     "                    rs.append(temp_dic[uid])    \n",
3117 |     "                else:           \n",
3118 |     "                    for i in gps.get_group(uid)[fea]:\n",
3119 |     "                        tps = i.split(',')\n",
3120 |     "                        for tp in tps:\n",
3121 |     "                            if tp not in topics:\n",
3122 |     "                                topics[tp] = 1\n",
3123 |     "                            else:\n",
3124 |     "                                topics[tp] += 1  \n",
3125 |     "\n",
3126 |     "                    topics = pd.Series(topics)\n",
3127 |     "                    topics =(topics/topics.mean()).round(3).to_dict()\n",
3128 |     "                    topics = sorted(topics.items(),key=lambda x: x[1], reverse=True)[:100]\n",
3129 |     "                    temp = []\n",
3130 |     "                    for t in topics:\n",
3131 |     "                        temp.append(t[0]+ ':' +str(t[1]))\n",
3132 |     "                    rs.append(','.join(temp)) \n",
3133 |     "                    temp_dic[uid] = ','.join(temp)\n",
3134 |     "\n",
3135 |     "            except:\n",
3136 |     "                #print('error')\n",
3137 |     "                rs.append('-1')\n",
3138 |     "        return rs\n",
3139 |     "    \n",
3140 |     "    type_dic = {'topics':['hist_user_unlike_themes', 'themeId'], 'words':['hist_user_unlike_themes', 'words_list']}\n",
3141 |     "    user_unlike_topic = getUserhistTopic_F_all(data, data_ans, type_dic[info_type], dayn=dayn)\n",
3142 |     "    return user_unlike_topic\n",
3143 |     "\n",
3144 |     "unlikeData = data[data['label']==0] ## 用户未回答（label=0）的数据\n",
3145 |     "df['user_unlike_topic'] = getUnlikeTopics(data, unlikeData, 'topics')\n",
3146 |     "df['user_unlike_word'] = getUnlikeWord(data, unlikeData, 'words')"
3147 |    ]
3148 |   },
3149 |   {
3150 |    "cell_type": "code",
3151 |    "execution_count": null,
3152 |    "metadata": {
3153 |     "collapsed": true
3154 |    },
3155 |    "outputs": [],
3156 |    "source": [
3157 |     "df = memoryOptimization(df,np.float32)\n",
3158 |     "with open(datapath+'df/df_nn.pkl','wb') as f:\n",
3159 |     "    pickle.dump(df,f)\n",
3160 |     "del df"
3161 |    ]
3162 |   },
3163 |   {
3164 |    "cell_type": "markdown",
3165 |    "metadata": {},
3166 |    "source": [
3167 |     "### 4、deepWalk\n",
3168 |     "\n",
3169 |     "经过分析，邀请本身就蕴含有丰富的信息，这里利用deepwalk得到社交网络信息，提取问题跟用户邀请的信息，将用户和问题放在同一空间  \n",
3170 |     "（本部分代码参考19年腾讯赛冠军队伍开源）"
3171 |    ]
3172 |   },
3173 |   {
3174 |    "cell_type": "code",
3175 |    "execution_count": null,
3176 |    "metadata": {
3177 |     "collapsed": true
3178 |    },
3179 |    "outputs": [],
3180 |    "source": [
3181 |     "def get_sentences(dataf,present,target):\n",
3182 |     "    sentences=[]\n",
3183 |     "    dic={}\n",
3184 |     "    day=0\n",
3185 |     "\n",
3186 |     "    for item in iter(dataf[['inviteday',present,target]].values):\n",
3187 |     "        if day!=item[0]:\n",
3188 |     "            for key in iter(dic):\n",
3189 |     "                sentences.append(dic[key])\n",
3190 |     "            dic={}\n",
3191 |     "            day=item[0]\n",
3192 |     "        try:\n",
3193 |     "            dic[item[1]].append(str(item[2]))\n",
3194 |     "        except:\n",
3195 |     "            dic[item[1]]=[str(item[2])]\n",
3196 |     "    for key in iter(dic):\n",
3197 |     "        sentences.append(dic[key]) \n",
3198 |     "    random.shuffle(sentences)\n",
3199 |     "    return sentences\n",
3200 |     "\n",
3201 |     "def get_w2v(data_w2v, model,fea,Type, flag):\n",
3202 |     "    dic = {'qId':'item','writerId':'user'}\n",
3203 |     "    values = data_w2v[fea].unique()\n",
3204 |     "    w2v=[]\n",
3205 |     "    for v in iter(values):  \n",
3206 |     "        try:\n",
3207 |     "            a=[str(v)]\n",
3208 |     "            if flag:\n",
3209 |     "                v = dic[fea]+'_'+str(v)\n",
3210 |     "            a.extend(model[str(v)])\n",
3211 |     "            w2v.append(a)\n",
3212 |     "        except:\n",
3213 |     "            pass\n",
3214 |     "    return pd.DataFrame(w2v,columns = [fea]+[fea+'_'+Type+str(i) for i in range(32)])\n",
3215 |     "\n",
3216 |     "def get_sentences_deepwalk(log,f1,f2):\n",
3217 |     "    #构建图\n",
3218 |     "    dic={}\n",
3219 |     "    for item in iter(log[[f1,f2]].values):\n",
3220 |     "#         try:\n",
3221 |     "#             str(item[1])\n",
3222 |     "#             str(item[0])\n",
3223 |     "#         except:\n",
3224 |     "#             continue\n",
3225 |     "        try:\n",
3226 |     "            dic['item_'+str((item[1]))].add('user_'+str((item[0])))\n",
3227 |     "        except:\n",
3228 |     "            dic['item_'+str((item[1]))]=set(['user_'+str((item[0]))])\n",
3229 |     "        try:\n",
3230 |     "            dic['user_'+str((item[0]))].add('item_'+str((item[1])))\n",
3231 |     "        except:\n",
3232 |     "            dic['user_'+str((item[0]))]=set(['item_'+str((item[1]))])\n",
3233 |     "    dic_cont={}\n",
3234 |     "    for key in iter(dic):\n",
3235 |     "        dic[key]=list(dic[key])\n",
3236 |     "        dic_cont[key]=len(dic[key])\n",
3237 |     "    print(\"creating\")     \n",
3238 |     "    #构建路径\n",
3239 |     "    path_length=10        \n",
3240 |     "    sentences=[]\n",
3241 |     "    length=[]\n",
3242 |     "    for key in iter(dic):\n",
3243 |     "        sentence=[key]\n",
3244 |     "        while len(sentence)!=path_length:\n",
3245 |     "            key=dic[sentence[-1]][random.randint(0,dic_cont[sentence[-1]]-1)]\n",
3246 |     "            if len(sentence)>=2 and key == sentence[-2]:\n",
3247 |     "                break\n",
3248 |     "            else:\n",
3249 |     "                sentence.append(key)\n",
3250 |     "        sentences.append(sentence)\n",
3251 |     "        length.append(len(sentence))\n",
3252 |     "        if len(sentences)%100000==0:\n",
3253 |     "            print(len(sentences))\n",
3254 |     "    print(np.mean(length))\n",
3255 |     "    print(len(sentences))\n",
3256 |     "    random.shuffle(sentences)\n",
3257 |     "    return sentences\n",
3258 |     "\n",
3259 |     "model_deepwalk_sentence = get_sentences_deepwalk(data.sort_values(by='inviteday'), 'writerId', 'qId',)\n",
3260 |     "model_deepwalk = word2vec.Word2Vec(model_deepwalk_sentence,size = 32,window = 4,min_count = 1,sg = 1,workers = 15,iter = 20)\n",
3261 |     "print('model ok')\n",
3262 |     "deepw_mId = get_w2v(data, model_deepwalk,'writerId','dw',flag = True)\n",
3263 |     "deepw_qId = get_w2v(data, model_deepwalk,'qId','dw',flag = True)\n",
3264 |     "deepw_mId.head()\n",
3265 |     "\n",
3266 |     "with open(datapath+'data/m_dwdf.pkl', 'wb') as f:\n",
3267 |     "    pickle.dump(deepw_mId, f)\n",
3268 |     "with open(datapath+'data/q_dwdf.pkl', 'wb') as f:\n",
3269 |     "    pickle.dump(deepw_qId, f)"
3270 |    ]
3271 |   },
3272 |   {
3273 |    "cell_type": "markdown",
3274 |    "metadata": {},
3275 |    "source": [
3276 |     "### 5、类别映射字典\n",
3277 |     "将字符串类型的特征转化为数字"
3278 |    ]
3279 |   },
3280 |   {
3281 |    "cell_type": "code",
3282 |    "execution_count": null,
3283 |    "metadata": {
3284 |     "collapsed": true
3285 |    },
3286 |    "outputs": [],
3287 |    "source": [
3288 |     "single_features = ['invitehour', 'createhour', 'inviteweekday', 'createweekday', 'sex', 'activity', 'bool_A',\n",
3289 |     "       'bool_B', 'bool_C', 'bool_D', 'bool_E', 'category_E', 'category_A', 'category_B', 'category_C', 'category_D']\n",
3290 |     "dic = {}\n",
3291 |     "temp = data[data['label']!=-1]\n",
3292 |     "for s in tqdm.tqdm_notebook(single_features):\n",
3293 |     "    dic[s] = {}\n",
3294 |     "    dic[s]['unk'] = 0\n",
3295 |     "    for i in temp[s].unique():\n",
3296 |     "        dic[s][str(i)] = len(dic[s])\n",
3297 |     "            \n",
3298 |     "print('singlefeatures ok')\n",
3299 |     "with open(datapath+'/data/dic_all.pkl', 'wb') as f:\n",
3300 |     "    pickle.dump(dic, f)"
3301 |    ]
3302 |   },
3303 |   {
3304 |    "cell_type": "markdown",
3305 |    "metadata": {},
3306 |    "source": [
3307 |     "### 6、相关文件\n",
3308 |     "- word_weight_64.pkl 利用提供的word embedding 得到的嵌入矩阵\n",
3309 |     "- word2index：word映射index文件\n",
3310 |     "- topic_weight_64：利用提供的topic embedding 得到的嵌入矩阵\n",
3311 |     "- topic2index：topic映射index的文件"
3312 |    ]
3313 |   },
3314 |   {
3315 |    "cell_type": "code",
3316 |    "execution_count": null,
3317 |    "metadata": {
3318 |     "collapsed": true
3319 |    },
3320 |    "outputs": [],
3321 |    "source": [
3322 |     "#### word_weight_64， word2index\n",
3323 |     "text = codecs.open(datapath+'data/word_vectors_64d.txt').readlines()\n",
3324 |     "text = [i.replace('\\t',' ') for i in text]\n",
3325 |     "wf = codecs.open(datapath+'data/word_vectors_suntp.txt', 'w')\n",
3326 |     "for i in text:\n",
3327 |     "    wf.write(i)\n",
3328 |     "\n",
3329 |     "glove_file = dpath(datapath+'data/word_vectors_suntp.txt', )\n",
3330 |     "tmp_file = get_tmpfile(datapath+\"data/word_vectors_suntp_w2v.txt\")\n",
3331 |     "glove2word2vec(glove_file, tmp_file)\n",
3332 |     "vectors = Vectors(name='word_vectors_suntp_w2v.txt', cache=datapath+'/data')\n",
3333 |     "\n",
3334 |     "topic2index = dict()\n",
3335 |     "topic2index['unk'] = 0\n",
3336 |     "topic2index['pad'] = 1\n",
3337 |     "t = vectors.stoi\n",
3338 |     "for i in t:\n",
3339 |     "    topic2index[i] = t[i]+2\n",
3340 |     "    \n",
3341 |     "a = torch.Tensor(2, 64).uniform_(-1,1)\n",
3342 |     "weight = torch.cat([a,vectors.vectors], dim=0)\n",
3343 |     "print(weight.size())\n",
3344 |     "with open(datapath+'data/word_weight_64.pkl', 'wb') as f:\n",
3345 |     "    pickle.dump(weight, f)\n",
3346 |     "    \n",
3347 |     "with open(datapath+'data/word2index.pkl', 'wb') as f:\n",
3348 |     "    pickle.dump(topic2index, f)\n",
3349 |     "    \n",
3350 |     "###########topic_weight_64.pkl， topic2index\n",
3351 |     "\n",
3352 |     "text = codecs.open(datapath+'data/topic_vectors_64d.txt').readlines()\n",
3353 |     "text = [i.replace('\\t',' ') for i in text]\n",
3354 |     "wf = codecs.open(datapath+'data/topic_vectors_64d_sun.txt', 'w')\n",
3355 |     "for i in text:\n",
3356 |     "    wf.write(i)\n",
3357 |     "\n",
3358 |     "glove_file = dpath(datapath+'data/topic_vectors_64d_sun.txt')\n",
3359 |     "tmp_file = get_tmpfile(datapath+\"data/topic_vectors_64d_sun_w2v.txt\")\n",
3360 |     "glove2word2vec(glove_file, tmp_file)\n",
3361 |     "vectors = Vectors(name='topic_vectors_64d_sun_w2v.txt', cache=datapath+'/data')\n",
3362 |     "\n",
3363 |     "topic2index = dict()\n",
3364 |     "topic2index['unk'] = 0\n",
3365 |     "topic2index['pad'] = 1\n",
3366 |     "t = vectors.stoi\n",
3367 |     "for i in t:\n",
3368 |     "    topic2index[i] = t[i]+2\n",
3369 |     "    \n",
3370 |     "a = torch.Tensor(2, 64).uniform_(-1,1)\n",
3371 |     "weight = torch.cat([a,vectors.vectors], dim=0)\n",
3372 |     "with open(datapath+'data/topic_weight_64.pkl', 'wb') as f:\n",
3373 |     "    pickle.dump(weight, f)\n",
3374 |     "    \n",
3375 |     "with open(datapath+'data/topic2index.pkl', 'wb') as f:\n",
3376 |     "    pickle.dump(topic2index, f)"
3377 |    ]
3378 |   },
3379 |   {
3380 |    "cell_type": "markdown",
3381 |    "metadata": {},
3382 |    "source": [
3383 |     "# .模型训练（树模型&NN）"
3384 |    ]
3385 |   },
3386 |   {
3387 |    "cell_type": "markdown",
3388 |    "metadata": {},
3389 |    "source": [
3390 |     "## 1、稠密特征拼接"
3391 |    ]
3392 |   },
3393 |   {
3394 |    "cell_type": "code",
3395 |    "execution_count": null,
3396 |    "metadata": {
3397 |     "collapsed": true
3398 |    },
3399 |    "outputs": [],
3400 |    "source": [
3401 |     "#labelEncoder\n",
3402 |     "labelfeatures = ['sex','activity','bool_A','bool_B','bool_C','bool_D','bool_E','category_E']\n",
3403 |     "data[labelfeatures] = data[labelfeatures].fillna('-1')\n",
3404 |     "for feature in labelfeatures:\n",
3405 |     "    le = LabelEncoder()\n",
3406 |     "    try:\n",
3407 |     "        data[feature] = le.fit_transform(data[feature].apply(int))\n",
3408 |     "    except:\n",
3409 |     "        data[feature] = le.fit_transform(data[feature])\n",
3410 |     "\n",
3411 |     "#拼接df\n",
3412 |     "for feagp in ['1.1','1.2','1.3','2.1','2.2','2.3','2.4','3']:\n",
3413 |     "    with open(datapath+'data/df_%s.pkl' %feagp,'rb') as f:\n",
3414 |     "        df = pickle.load(f)\n",
3415 |     "    data = pd.concat([data.df],axis = 1)\n",
3416 |     "    print(feagp+' is ok')\n",
3417 |     "    del df\n",
3418 |     "\n",
3419 |     "del data['themeId']\n",
3420 |     "del data['attentionthemes']\n",
3421 |     "del data['likethemes']\n",
3422 |     "data = memoryOptimization(data,np.float32)"
3423 |    ]
3424 |   },
3425 |   {
3426 |    "cell_type": "markdown",
3427 |    "metadata": {},
3428 |    "source": [
3429 |     "## 2、树模型训练"
3430 |    ]
3431 |   },
3432 |   {
3433 |    "cell_type": "code",
3434 |    "execution_count": null,
3435 |    "metadata": {
3436 |     "collapsed": true
3437 |    },
3438 |    "outputs": [],
3439 |    "source": [
3440 |     "train = data[data['label'] != -1]\n",
3441 |     "# train = down_sample(train,train,rate = 3)\n",
3442 |     "test = data[data['label'] == -1].reset_index(drop = True)\n",
3443 |     "train_x = train.drop(['label'],axis = 1)\n",
3444 |     "train_y = train['label']\n",
3445 |     "test_x = test.drop(['label'],axis = 1)\n",
3446 |     "del train\n",
3447 |     "del test\n",
3448 |     "gc.collect()\n",
3449 |     "\n",
3450 |     "mfeas = ['category_A','category_B','category_D','category_C','mostliketheme']\n",
3451 |     "catefeas = ['sex','activity','bool_A','bool_B','bool_C','bool_D','bool_E','category_E']\n",
3452 |     "\n",
3453 |     "#lgb\n",
3454 |     "pre_test,pre_train,score,fea_imp,iternum = lgb_train_pre1(train_x.drop(['qId','writerId'],axis = 1),train_y,\n",
3455 |     "                    test_x.drop(['qId','writerId'],axis = 1),catefeas,dropfeas = list(set(['inviteweekday','createweekday']+mfeas)&set(list(train_x.columns))),one = False,save_model=True)\n",
3456 |     "\n",
3457 |     "pre_train.to_csv(datapath+'data/lgb_0.1.csv', index=False)\n",
3458 |     "sub_sample = pd.read_csv(datapath+'invite_info_evaluate_2_0926.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime'])\n",
3459 |     "sub_sample['label'] = pre_test\n",
3460 |     "sub_sample.to_csv(datapath+'data/lgb_0.1.txt',sep = '\\t',header = False,index = False)\n",
3461 |     "\n",
3462 |     "#cat\n",
3463 |     "pre_test,pre_train,score,fea_imp,iternum = cat_train_pre1(train_x.drop(['qId','writerId'],axis = 1),train_y,\n",
3464 |     "                    test_x.drop(['qId','writerId'],axis = 1),catefeas,dropfeas = list(set(['inviteweekday','createweekday']+mfeas)&set(list(train_x.columns))),one = False,save_model=True)\n",
3465 |     "pre_train.to_csv(datapath+'data/cat_0.1.csv', index=False)\n",
3466 |     "sub_sample = pd.read_csv(datapath+'invite_info_evaluate_2_0926.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime'])\n",
3467 |     "sub_sample['label'] = pre_test\n",
3468 |     "sub_sample.to_csv(datapath+'data/cat_0.1.txt',sep = '\\t',header = False,index = False)\n",
3469 |     "\n",
3470 |     "#xgb\n",
3471 |     "pre_test,pre_train,score,fea_imp,iternum = xgb_train_pre1(train_x.drop(['qId','writerId'],axis = 1),train_y,\n",
3472 |     "                    test_x.drop(['qId','writerId'],axis = 1),dropfeas = list(set(['inviteweekday','createweekday']+mfeas+catefeas)&set(list(train_x.columns))),one = False,save_model=True)\n",
3473 |     "pre_train.to_csv(datapath+'data/xgb_0.1.csv', index=False)\n",
3474 |     "sub_sample = pd.read_csv(datapath+'invite_info_evaluate_2_0926.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime'])\n",
3475 |     "sub_sample['label'] = pre_test\n",
3476 |     "sub_sample.to_csv(datapath+'data/xgb_0.1.txt',sep = '\\t',header = False,index = False)"
3477 |    ]
3478 |   },
3479 |   {
3480 |    "cell_type": "markdown",
3481 |    "metadata": {},
3482 |    "source": [
3483 |     "## 3、nn部分特征拼接"
3484 |    ]
3485 |   },
3486 |   {
3487 |    "cell_type": "markdown",
3488 |    "metadata": {},
3489 |    "source": [
3490 |     "除了上面用到的稠密特征外，nn部分还用到了其它特征"
3491 |    ]
3492 |   },
3493 |   {
3494 |    "cell_type": "code",
3495 |    "execution_count": null,
3496 |    "metadata": {
3497 |     "collapsed": true
3498 |    },
3499 |    "outputs": [],
3500 |    "source": [
3501 |     "#nn\n",
3502 |     "with open(datapath+'data/df_nn.pkl','rb') as f:\n",
3503 |     "    df = pickle.load(f)\n",
3504 |     "data = pd.concat([data.df],axis = 1)\n",
3505 |     "del df\n",
3506 |     "\n",
3507 |     "with open(datapath+'data/m_dwdf.pkl', 'rb') as f:\n",
3508 |     "    m_dwdf = pickle.load(f)\n",
3509 |     "with open(datapath+'data/q_dwdf.pkl', 'rb') as f:\n",
3510 |     "    q_dwdf = pickle.load(f)   \n",
3511 |     "data = pd.merge(data, m_dwdf, on='writerId', how='left')\n",
3512 |     "data = pd.merge(data, q_dwdf, on='qId', how='left')\n",
3513 |     "\n",
3514 |     "del m_dwdf\n",
3515 |     "del q_dwdf\n",
3516 |     "gc.collect()\n",
3517 |     "data = memoryOptimization(data,np.float32)"
3518 |    ]
3519 |   },
3520 |   {
3521 |    "cell_type": "markdown",
3522 |    "metadata": {},
3523 |    "source": [
3524 |     "### 特征处理\n",
3525 |     "- 1、类别特征处理， 转化成数值，方便进行embedding\n",
3526 |     "- 2、数值特征归一化  \n",
3527 |     "- 3、deepwalk 信息拼接"
3528 |    ]
3529 |   },
3530 |   {
3531 |    "cell_type": "code",
3532 |    "execution_count": null,
3533 |    "metadata": {
3534 |     "collapsed": true
3535 |    },
3536 |    "outputs": [],
3537 |    "source": [
3538 |     "#类别特征处理， 转化成数值，方便进行embedding\n",
3539 |     "with open(datapath+'data/dic_all.pkl', 'rb') as f:\n",
3540 |     "    dic = pickle.load(f)\n",
3541 |     "\n",
3542 |     "single_features = ['invitehour', 'createhour', 'inviteweekday', 'createweekday', 'sex', 'activity', 'bool_A',\n",
3543 |     "       'bool_B', 'bool_C', 'bool_D', 'bool_E', 'category_E', 'category_A', 'category_B', 'category_C', 'category_D']\n",
3544 |     "for i in single_features:\n",
3545 |     "    data[i] = data[i].apply(lambda x: dic[i][str(x)] if str(x) in dic[i] else 0)\n",
3546 |     "\n",
3547 |     "#数值特征归一化  \n",
3548 |     "new_dense = ['qId_bool_B_last7day_count', 'writerId_title_words_deltop150_lastnday_labelcount1', 'hourlenfromlastinvite_qId', 'writerId_createhour_last7day_count_gp_qId', 'collect_sum', 'thanks_max', 'qId_mostliketheme_last7day_count', 'qId_bool_C_label_count', 'thumbs_up_max', 'writerId_createweekday_label_ctr', 'qId_count', 'title_words_tfidf1_label_count1', 'writerId_themeId_lastnday_labelcount3', 'mostliketheme_curdayinv_count', 'themeId_label_ctr1', 'writerId_title_words_deltop150_lastnday_labelctr0', 'writerId_themeId_lastnday_labelctr_mean', 'isgood_sum', 'qId_label_count_gp_writerId', 'qId_invitehour_last7day_count_gp_writerId', 'nohelp_mean', 'activity_curdayinv_count', 'writerId_last3invnum2', 'qId_count_gp_writerId', 'writerId_title_words_deltop150_lastnday_labelctr2', 'qId_sex_last7day_count', 'createhour_curdayinv_count', 'qId_label_ctr_gp_writerId', 'q_class_300_bool_B_label_count', 'category_E_curdayinv_count', 'writerId_count', 'qId_inviteweekday_label_ctr', 'qId_last3+1invnum_mean', 'qId_category_B_last7day_count_gp_writerId', 'qId_bool_D_label_ctr', 'q_class_300_category_D_label_count', 'themeId_label_count0', 'q_class_300_bool_C_label_count', 'themeId_label_ctr2', 'writerId_themeId_lastnday_labelcount4', 'qId_invitehour_label_ctr', 'category_A_createdaylastweek2label_count', 'writerId_inviteweekday_label_count', 'qId_invitedaylastweek2label_count', 'category_A_invitedaylastweek2label_rate', 'title_words_tfidf1_label_count2', 'attentionthemes_label_ctr1', 'writerId_themeId_lastnday_labelctr0', 'writerId_title_words_deltop150_lastnday_labelctr4', 'comment_sum', 'quzan_max', 'qId_invitehour_last7day_count', 'writerId_createday_last7day_count', 'qId_activeday_inv', 'qId_bool_B_last7day_count_gp_writerId', 'attentionthemes_label_count1', 'qId_last7day_count', 'writerId_createdaylastweek2label_count', 'qId_mostliketheme_last7day_count_gp_writerId', 'category_D_label_ctr', 'themeId_label_count2', 'writerId_title_words_deltop150_lastnday_labelcount2', 'qId_category_B_last7day_count', 'writerId_invitehour_label_count', 'qId_bool_E_last7day_count_gp_writerId', 'opposition_max', 'writerId_title_words_deltop150_lastnday_labelctr3', 'istabel_sum', 'q_class_300_bool_E_label_ctr', 'istabel_mean', 'mostliketheme_count', 'writerId_q_class_300_label_count', 'writerId_themeId_lastnday_labelctr3', 'createday_curdayinv_count', 'qId_last3+1invnum_std', 'invitehour_count', 'qId_last7count_gp_writerId', 'category_D_createdaylastweek2label_rate', 'quzan_sum', 'writerId_themeId_lastnday_labelcount1', 'writerId_createhour_last7day_count', 'writerId_createhour_label_count', 'writerId_last3invnum1', 'category_C_last7day_count', 'writerId_curcount_gp_qId', 'qId_bool_D_label_count', 'qId_bool_D_last7day_count', 'sex_last7day_count', 'activity_createdaylastweek2label_rate', 'writerId_createdaylastweek2label_rate', 'yanzhi_count', 'activity_createdaylastweek2label_count', 'qId_activeday_asw', 'category_C_label_ctr', 'writerId_last3+1invnum_std', 'writerId_themeId_lastnday_labelctr1', 'category_A_invitedaylastweek2label_count', 'thumbs_up_mean', 'qId_label_count', 'writerId_label_count', 'writerId_createday_last7day_count_gp_qId', 'thumbs_up_sum', 'report_mean', 'qId_bool_A_last7day_count_gp_writerId', 'writerId_createweekday_last7day_count_gp_qId', 'qId_bool_B_label_ctr', 'samenum_like', 'isvideo_max', 'simhis_base_theme_theme_itemcf', 'writerId_invitehour_label_ctr', 'title_words_tfidf1_label_ctr2', 'writerId_q_class_300_label_ctr', 'qId_last3invnum2', 'writerId_curdayinv_count', 'writerId_themeId_lastnday_labelcount0', 'qId_activity_last7day_count_gp_writerId', 'qId_bool_C_last7day_count', 'title_words_tfidf1_label_ctr0', 'writerId_invitedaylastweek2label_count', 'writerId_title_words_deltop150_lastnday_labelcount4', 'writerId_themeId_lastnday_labelcount_sum', 'opposition_sum', 'simhis_base_theme_theme_noclick', 'writerId_themeId_lastnday_labelcount2', 'qId_category_E_last7day_count_gp_writerId', 'writerId_last7count_gp_qId', 'writerId_createweekday_last7day_count', 'qId_bool_C_label_ctr', 'q_class_300_bool_B_label_ctr', 'title_words_tfidf1_label_count0', 'qId_category_C_last7day_count_gp_writerId', 'qId_category_E_last7day_count', 'topic_sim', 'mostliketheme_label_ctr', 'isvideo_sum', 'q_class_300_category_D_label_ctr', 'writerId_themeId_lastnday_labelctr2', 'comment_max', 'quzan_mean', 'qId_bool_E_label_ctr', 'createweekday_curdayinv_count', 'hourlenfromlastinvite_writerId', 'qId_bool_A_last7day_count', 'createhour_label_ctr', 'attentionthemes_label_count2', 'isrec_mean', 'writerId_last7day_count', 'length_max', 'qId_category_D_last7day_count_gp_writerId', 'writerId_label_ctr_gp_qId', 'writerId_count_gp_qId', 'qId_curdayinv_count', 'writerId_inviteweekday_label_ctr', 'writerId_activeday_asw', 'writerId_themeId_lastnday_labelcount_mean', 'writerId_activeday_inv', 'writerId_invitehour_last7day_count_gp_qId', 'category_D_invitedaylastweek2label_count', 'collect_max', 'attentionthemes_label_ctr2', 'q_class_300_bool_A_label_ctr', 'report_sum', 'createhour_count', 'title_words_tfidf1_label_ctr_mean', 'writerId_invitedaylastweek2label_rate', 'istabel_max', 'bool_D_curdayinv_count', 'yanzhi_label_ctr', 'isimage_sum', 'qId_invitedaylastweek2label_rate', 'category_A_createdaylastweek2label_rate', 'comment_mean', 'q_life', 'length_mean', 'activity_label_ctr', 'qId_invitehour_label_count', 'createhour_label_count', 'category_D_createdaylastweek2label_count', 'qId_yanzhi_last7day_count', 'themeId_label_count1', 'isimage_max', 'writerId_label_count_gp_qId', 'yanzhi_curdayinv_count', 'writerId_createweekday_label_count', 'title_words_tfidf1_label_count_mean', 'writerId_title_words_deltop150_lastnday_labelcount3', 'invitehour_label_count', 'mostliketheme_label_count', 'activity_last7day_count', 'qId_last3invnum1', 'nohelp_sum', 'createday_count', 'qId_yanzhi_last7day_count_gp_writerId', 'qId_inviteweekday_label_count', 'writerId_label_ctr', 'writerId_createhour_label_ctr', 'qId_bool_B_label_count', 'length_sum', 'qId_category_D_last7day_count', 'q_class_300_bool_E_label_count', 'q_class_300_bool_A_label_count', 'qId_category_C_last7day_count', 'writerId_title_words_deltop150_lastnday_labelcount0', 'writerId_last3+1invnum_mean', 'isrec_sum', 'thanks_sum', 'invitehour_curdayinv_count', 'writerId_last3invnum0', 'yanzhi_label_count', 'qId_sex_last7day_count_gp_writerId', 'qId_last3invnum0', 'samenum_atten', 'themeId_label_ctr0', 'simhis_base_title_title_itemcf', 'qId_bool_E_last7day_count', 'category_D_invitedaylastweek2label_rate', 'writerId_title_words_deltop150_lastnday_labelctr1', 'collect_mean', 'attentionthemes_label_ctr0', 'qId_label_ctr', 'attentionthemes_label_count0', 'q_class_300_bool_C_label_ctr', 'isimage_mean', 'qId_category_A_last7day_count', 'yanzhi_d_last7day_count', 'nohelp_max', 'qId_bool_E_label_count', 'invitehour_label_ctr', 'writerId_invitehour_last7day_count', 'qId_bool_D_last7day_count_gp_writerId', 'qId_activity_last7day_count', 'isgood_max', 'qId_category_A_last7day_count_gp_writerId', 'thanks_mean', 'report_max', 'isgood_mean', 'category_C_curdayinv_count', 'isvideo_mean', 'opposition_mean', 'qId_curcount_gp_writerId', 'isrec_max', 'title_words_tfidf1_label_ctr1', 'qId_bool_C_last7day_count_gp_writerId', 'writerId_themeId_lastnday_labelctr4']\n",
3549 |     "num_dic = {}\n",
3550 |     "for fea in tqdm.tqdm_notebook(new_dense):\n",
3551 |     "    try:\n",
3552 |     "        scaler_val = data[fea][~data[fea].isnull()].values\n",
3553 |     "        scaler = StandardScaler().fit(scaler_val.reshape((len(scaler_val), 1)))\n",
3554 |     "        num_dic[fea] = scaler\n",
3555 |     "        data[fea].fillna(scaler.mean_[0], inplace=True)\n",
3556 |     "        data[fea] = scaler.transform(data[fea].values.reshape((len(data), 1))).reshape((len(data),)).tolist()\n",
3557 |     "    except:\n",
3558 |     "        print(fea)\n",
3559 |     "del scaler_val, scaler\n",
3560 |     "gc.collect()"
3561 |    ]
3562 |   },
3563 |   {
3564 |    "cell_type": "markdown",
3565 |    "metadata": {},
3566 |    "source": [
3567 |     "## 4、nn 训练"
3568 |    ]
3569 |   },
3570 |   {
3571 |    "cell_type": "markdown",
3572 |    "metadata": {},
3573 |    "source": [
3574 |     "### Model"
3575 |    ]
3576 |   },
3577 |   {
3578 |    "cell_type": "code",
3579 |    "execution_count": null,
3580 |    "metadata": {
3581 |     "collapsed": true
3582 |    },
3583 |    "outputs": [],
3584 |    "source": [
3585 |     "import torch\n",
3586 |     "import torch.nn as nn\n",
3587 |     "import pandas as pd\n",
3588 |     "import random\n",
3589 |     "import pickle\n",
3590 |     "import torch.nn.functional as F\n",
3591 |     "import tqdm\n",
3592 |     "from sklearn.metrics import roc_auc_score\n",
3593 |     "from optimizer import Lookahead\n",
3594 |     "from optimizer import RAdam\n",
3595 |     "import numpy as np\n",
3596 |     "import os\n",
3597 |     "import torch.utils.data as Data\n",
3598 |     "import gc\n",
3599 |     "from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler\n",
3600 |     "from sklearn.model_selection import train_test_split, StratifiedKFold\n",
3601 |     "import codecs"
3602 |    ]
3603 |   },
3604 |   {
3605 |    "cell_type": "code",
3606 |    "execution_count": null,
3607 |    "metadata": {
3608 |     "collapsed": true
3609 |    },
3610 |    "outputs": [],
3611 |    "source": [
3612 |     "#mask generate\n",
3613 |     "def sequence_mask(embed, feature):\n",
3614 |     "    \n",
3615 |     "    mask = (feature!=1).unsqueeze(-1).expand_as(embed).float()\n",
3616 |     "    return embed*mask\n",
3617 |     "\n",
3618 |     "class TextCNN(nn.Module):\n",
3619 |     "    def __init__(self, args):\n",
3620 |     "        super(TextCNN, self).__init__()\n",
3621 |     "        self.args = args\n",
3622 |     "\n",
3623 |     "        chanel_num = 1\n",
3624 |     "        filter_num = args['filter_num']\n",
3625 |     "        filter_sizes = args['filter_sizes']\n",
3626 |     "\n",
3627 |     "        vocabulary_size = args['vocabulary_size']\n",
3628 |     "        embedding_dimension = args['embedding_dim']\n",
3629 |     "        self.embedding = nn.Embedding(vocabulary_size, embedding_dimension)\n",
3630 |     "        #self.embedding = self.embedding.from_pretrained(args.vectors, freeze=False)\n",
3631 |     "        self.embedding.weight.data.copy_(args['pretrained_weight'])\n",
3632 |     "            \n",
3633 |     "        self.convs = nn.ModuleList([nn.Conv2d(1, filter_num, (size, embedding_dimension)) for size in filter_sizes])\n",
3634 |     "        self.dropout = nn.Dropout(args['dropout'])\n",
3635 |     "        #self.fc = nn.Linear(len(filter_sizes)*filter_num, class_num)\n",
3636 |     "        self.feature_num = len(filter_sizes)*filter_num*4\n",
3637 |     "    def forward(self, x, y):\n",
3638 |     "\n",
3639 |     "        x = self.embedding(x)\n",
3640 |     "        x = x.unsqueeze(1)\n",
3641 |     "        y = self.embedding(y)\n",
3642 |     "        y = y.unsqueeze(1)\n",
3643 |     "        \n",
3644 |     "        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]\n",
3645 |     "        x = [F.max_pool1d(item, item.size(2)).squeeze(2) for item in x]\n",
3646 |     "        x = torch.cat(x, 1)\n",
3647 |     "        y = [F.relu(conv(y)).squeeze(3) for conv in self.convs]\n",
3648 |     "        y = [F.max_pool1d(item, item.size(2)).squeeze(2) for item in y]\n",
3649 |     "        y = torch.cat(y, 1)\n",
3650 |     "        rs = torch.cat([x, y, torch.abs(x-y), x*y], 1)\n",
3651 |     "        return rs\n",
3652 |     "\n",
3653 |     "class xDeepFM(nn.Module):\n",
3654 |     "    \n",
3655 |     "    def __init__(self, params):\n",
3656 |     "        super(xDeepFM, self).__init__()\n",
3657 |     "        self.device = params['device']\n",
3658 |     "        #self.mlp_input_dim = params['field_size'] * params['embedding_size']\n",
3659 |     "        self.k = params['k'] ##\n",
3660 |     "        self.dic = params['dic']  # 字典\n",
3661 |     "        self.single_features = params['single_features'] #\n",
3662 |     "        self.muti_features = params['muti_features']#\n",
3663 |     "        self.num_features = params['num_features']\n",
3664 |     "        self.cross_features = params['cross_features']\n",
3665 |     "        self.topic_features = params['topic_features']\n",
3666 |     "        self.other_features = params['other_features']\n",
3667 |     "        self.l2 = params['l2']\n",
3668 |     "        self.norm = params['normNum']\n",
3669 |     "        self.usetext = params['usetext']\n",
3670 |     "        self.usecin = params['usecin']\n",
3671 |     "        self.usetopic = params['usetopic']\n",
3672 |     "        self.useword = params['useword']\n",
3673 |     "        self.word_features = params['word_features']\n",
3674 |     "        #textcnn\n",
3675 |     "        if params['usetext']:\n",
3676 |     "            self.textcnn = TextCNN(params['textargs'])\n",
3677 |     "            self.textlen = self.textcnn.feature_num\n",
3678 |     "  \n",
3679 |     "        #mem\n",
3680 |     "        self.usemem = params['usemem']\n",
3681 |     "        if self.usemem:\n",
3682 |     "            mem_dim = 0 #64*2\n",
3683 |     "            \n",
3684 |     "            self.linear1 = nn.Linear(128, 64)\n",
3685 |     "            self.sig = nn.Linear(128+64,64)\n",
3686 |     "            self.mem1 = nn.Embedding(len(self.dic['writerId']), 64)\n",
3687 |     "            self.mem0 = nn.Embedding(len(self.dic['writerId']), 64)\n",
3688 |     "        else:\n",
3689 |     "            mem_dim = 0\n",
3690 |     "    \n",
3691 |     "        first_orders = nn.ModuleDict()\n",
3692 |     "        second_orders = nn.ModuleDict()\n",
3693 |     "        ## feature -index\n",
3694 |     "        feature_index = {}\n",
3695 |     "        for s in self.single_features+self.num_features + self.other_features:\n",
3696 |     "            feature_index[s] = [len(feature_index), len(feature_index)+1]\n",
3697 |     "\n",
3698 |     "        if 'writerId' in self.single_features:\n",
3699 |     "            self.single_features.remove('writerId') \n",
3700 |     "            \n",
3701 |     "        temp_index = 0\n",
3702 |     "        if self.usetopic:\n",
3703 |     "            temp_index = len(feature_index)+100\n",
3704 |     "            feature_index['attentionthemes'] = [len(feature_index), temp_index]             \n",
3705 |     "            feature_index['themeId'] = [temp_index, temp_index+13] \n",
3706 |     "            feature_index['likethemes'] = [temp_index+13, temp_index+13+10]\n",
3707 |     "            feature_index['likethemes_att'] = [temp_index+13+10, temp_index+13+10+10]\n",
3708 |     "            #hist_user_themes\n",
3709 |     "            feature_index['hist_user_themes'] = [temp_index+13+10+10, temp_index+13+10+10+10]\n",
3710 |     "            feature_index['hist_user_themes_att'] = [temp_index+13+10+10+10, temp_index+13+10+10+10+10]      \n",
3711 |     "        if self.usetext:\n",
3712 |     "            feature_index['m_interest_topic'] = feature_index['likethemes']\n",
3713 |     "            feature_index['q_topic'] = feature_index['themeId']    \n",
3714 |     "        if self.useword:\n",
3715 |     "            if temp_index>0:\n",
3716 |     "                feature_index['hist_user_words'] = [temp_index+13+10+10+10+10, temp_index+13+10+10+10+10 + 20 ] # 20\n",
3717 |     "                feature_index['hist_user_words_att'] = [temp_index+13+10+10+10+10 + 20, temp_index+13+10+10+10+10 + 20 + 20 ]\n",
3718 |     "                feature_index['all_words'] = [temp_index+13+10+10+10+10 + 20 + 20, temp_index+13+10+10+10+10 + 20 + 20 + 10 ] # 20              \n",
3719 |     "        feature_index['hist_user_unlike_themes'] = [temp_index+13+10+10+10+10 + 20 + 20 + 10, temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 ] # 20\n",
3720 |     "        feature_index['hist_user_unlike_themes_att'] = [temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10, temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 + 10 ] # 20\n",
3721 |     "        feature_index['hist_user_unlike_words'] = [temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 + 10, temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 + 10 + 20 ] # 20\n",
3722 |     "        feature_index['hist_user_unlike_words_att'] = [temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 + 10 + 20, temp_index+13+10+10+10+10 + 20 + 20 + 10 + 10 + 10 + 20 +20 ] # 20\n",
3723 |     "        \n",
3724 |     "        self.feature_index = feature_index  ## index\n",
3725 |     "        \n",
3726 |     "        for s in self.single_features:\n",
3727 |     "            first_orders[s] = nn.Embedding(len(self.dic[s]), 1)\n",
3728 |     "            nn.init.normal_(first_orders[s].weight, mean=0, std=0.0001)\n",
3729 |     "            second_orders[s] = nn.Embedding(len(self.dic[s]), self.k)\n",
3730 |     "            nn.init.normal_(second_orders[s].weight, mean=0, std=0.0001)\n",
3731 |     "        \n",
3732 |     "        for s in self.muti_features:\n",
3733 |     "            first_orders[s] = nn.Embedding(len(self.dic[s])+2, 1)\n",
3734 |     "            nn.init.normal_(first_orders[s].weight, mean=0, std=0.0001)\n",
3735 |     "            second_orders[s] = nn.Embedding(len(self.dic[s])+2, self.k)\n",
3736 |     "            nn.init.normal_(second_orders[s].weight, mean=0, std=0.0001)\n",
3737 |     "            \n",
3738 |     "        self.first_orders = first_orders.to(self.device)\n",
3739 |     "        self.second_orders = second_orders.to(self.device)\n",
3740 |     "       \n",
3741 |     "        self.norm_num = nn.ModuleDict()\n",
3742 |     "        for s in self.num_features:\n",
3743 |     "            self.norm_num[s] = nn.BatchNorm1d(1)\n",
3744 |     "            \n",
3745 |     "        ######################################################dnn\n",
3746 |     "        self.p = params['p'] # drop_out\n",
3747 |     "        self.layers = params['layers']\n",
3748 |     "        self.input_dim = (len(self.single_features)+ len(self.muti_features) + len(self.topic_features) + len(self.word_features)) * self.k + len(self.num_features) + mem_dim #*self.k #* self.k #+ 2* self.k\n",
3749 |     "        self.deep_layers = nn.Sequential()\n",
3750 |     "        net_dims = [self.input_dim]+self.layers\n",
3751 |     "        for i in range(len(self.layers)):\n",
3752 |     "            self.deep_layers.add_module('fc%d' % (i+1), nn.Linear(net_dims[i], net_dims[i+1]))\n",
3753 |     "            self.deep_layers.add_module('bn%d' % (i+1), nn.BatchNorm1d(net_dims[i+1]))\n",
3754 |     "            self.deep_layers.add_module('relu%d' % (i+1), nn.ReLU()) \n",
3755 |     "            self.deep_layers.add_module('dropout%d' % (i+1), nn.Dropout(self.p))       \n",
3756 |     "        for name, tensor in self.deep_layers.named_parameters():\n",
3757 |     "            if 'weight' in name:\n",
3758 |     "                nn.init.normal_(tensor, mean=0, std=0.0001)\n",
3759 |     "        self.deep_layers = self.deep_layers.to(self.device)\n",
3760 |     "          \n",
3761 |     "        ## topic \n",
3762 |     "        if params['usetopic']:\n",
3763 |     "            self.topic_weight = nn.Embedding(params['textargs']['vocabulary_size'], 64)\n",
3764 |     "            self.topic_weight.weight.data.copy_(params['textargs']['pretrained_weight'])\n",
3765 |     "            self.topic_linear = nn.Sequential(nn.Linear(64, self.k), nn.ReLU())\n",
3766 |     "        \n",
3767 |     "        #word\n",
3768 |     "        if self.useword:\n",
3769 |     "            temp_weight = params['word2index']#word_weight\n",
3770 |     "            self.word_weight = nn.Embedding(len(params['word2index']), 64)\n",
3771 |     "            self.word_weight.weight.data.copy_(params['word_weight'])\n",
3772 |     "            self.word_linear = nn.Sequential(nn.Linear(64, self.k), nn.ReLU())\n",
3773 |     "        \n",
3774 |     "        #############################################################cin      \n",
3775 |     "        self.num_field = len(self.cross_features)+ len(self.muti_features) + len(self.topic_features) + len(self.word_features) #+ len(self.num_features)#+ 2\n",
3776 |     "        self.conv1ds = nn.ModuleList()\n",
3777 |     "        self.cin_layers = params['cin_layers']\n",
3778 |     "        cin_layers_dims = [self.num_field]+self.cin_layers\n",
3779 |     "        self.split_half = params['split_half']\n",
3780 |     "        self.hidden_dims_split_half = [self.num_field]\n",
3781 |     "        prev_dim = 0\n",
3782 |     "        for i in range(len(self.cin_layers)):\n",
3783 |     "            self.conv1ds.append(nn.Conv1d(cin_layers_dims[0]*self.hidden_dims_split_half[-1], cin_layers_dims[i+1], 1))\n",
3784 |     "            if self.split_half and i != len(self.cin_layers)-1:\n",
3785 |     "                self.hidden_dims_split_half.append(cin_layers_dims[i+1] // 2)\n",
3786 |     "                prev_dim += cin_layers_dims[i+1] // 2\n",
3787 |     "            else:\n",
3788 |     "                self.hidden_dims_split_half.append(cin_layers_dims[i+1])\n",
3789 |     "                prev_dim += cin_layers_dims[i+1]\n",
3790 |     "        self.conv1ds = self.conv1ds.to(self.device)\n",
3791 |     "              \n",
3792 |     "        if self.usetext:\n",
3793 |     "            textlen = self.textlen\n",
3794 |     "        else:\n",
3795 |     "            textlen = 0\n",
3796 |     "\n",
3797 |     "        self.output = nn.Sequential(nn.Linear(prev_dim+self.layers[-1]+ textlen + len(self.other_features), 512), # no-linear\n",
3798 |     "                                     nn.BatchNorm1d(512),\n",
3799 |     "                                     nn.ReLU(),\n",
3800 |     "                                     nn.Dropout(self.p),\n",
3801 |     "                                     nn.Linear(512, 256),\n",
3802 |     "                                     nn.BatchNorm1d(256),\n",
3803 |     "                                     nn.ReLU(),\n",
3804 |     "                                     nn.Dropout(self.p),\n",
3805 |     "                                     nn.Linear(256,64),\n",
3806 |     "                                     nn.BatchNorm1d(64),\n",
3807 |     "                                     nn.ReLU(),\n",
3808 |     "                                     nn.Dropout(self.p))\n",
3809 |     "        self.end = nn.Sequential(nn.Linear(64, 1),)       \n",
3810 |     "        \n",
3811 |     "    def forward(self, input_x):\n",
3812 |     "        embed1 = {}\n",
3813 |     "        embed2 = {}\n",
3814 |     "        norm_num = {}\n",
3815 |     "        for s in self.single_features:             \n",
3816 |     "            embed1[s] = self.first_orders[s](input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()).squeeze(-1) #B * 1 \n",
3817 |     "            embed2[s] = self.second_orders[s](input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()) #b*1*k\n",
3818 |     "            \n",
3819 |     "        if self.usetopic:\n",
3820 |     "            topics = {}\n",
3821 |     "            for s in self.topic_features:\n",
3822 |     "                if s not in ['hist_user_themes', 'likethemes', 'hist_user_unlike_themes']:#'likethemes', \n",
3823 |     "                    temp = (torch.sum(sequence_mask(self.topic_weight(input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()),\n",
3824 |     "                                                    input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()), dim=1)/(torch.sum((input_x[:,self.feature_index[s][0]:self.feature_index[s][1]]!=1).float()+1e-10, dim=-1).unsqueeze(-1)))#.unsqueeze(1)# b * 1 * k\n",
3825 |     "                    topics[s] = self.topic_linear(temp).unsqueeze(1)\n",
3826 |     "                else:             \n",
3827 |     "                    s_att = s+'_att'\n",
3828 |     "                    s_value = self.topic_weight(input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long())\n",
3829 |     "                    s_att_val = input_x[:,self.feature_index[s_att][0]:self.feature_index[s_att][1]].unsqueeze(-1).expand_as(s_value).float()\n",
3830 |     "                    temp = torch.sum(s_value * s_att_val, dim=1)/(torch.sum((input_x[:,self.feature_index[s_att][0]:self.feature_index[s_att][1]]).float()+1e-10, dim=-1).unsqueeze(-1))                          \n",
3831 |     "                    temp = temp +1e-12\n",
3832 |     "                    topics[s] = self.topic_linear(temp).unsqueeze(1)\n",
3833 |     "        \n",
3834 |     "        if self.useword:\n",
3835 |     "            words = {}\n",
3836 |     "            for s in self.word_features:\n",
3837 |     "                if s in ['all_words']:     \n",
3838 |     "                    temp = (torch.sum(sequence_mask(self.word_weight(input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()),\n",
3839 |     "                                                            input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long()), dim=1)/(torch.sum((input_x[:,self.feature_index[s][0]:self.feature_index[s][1]]!=1).float()+1e-10, dim=-1).unsqueeze(-1)))#.unsqueeze(1)# b * 1 * k\n",
3840 |     "                    words[s] = self.word_linear(temp).unsqueeze(1)\n",
3841 |     "                else:\n",
3842 |     "                    s_att = s+'_att'\n",
3843 |     "                    s_value = self.word_weight(input_x[:,self.feature_index[s][0]:self.feature_index[s][1]].long())\n",
3844 |     "                    s_att_val = input_x[:,self.feature_index[s_att][0]:self.feature_index[s_att][1]].unsqueeze(-1).expand_as(s_value).float()\n",
3845 |     "                    temp = torch.sum(s_value * s_att_val, dim=1)/(torch.sum((input_x[:,self.feature_index[s_att][0]:self.feature_index[s_att][1]]).float()+1e-10, dim=-1).unsqueeze(-1))                          \n",
3846 |     "                    temp = temp +1e-12\n",
3847 |     "                    words[s] = self.word_linear(temp).unsqueeze(1)\n",
3848 |     "        ##other\n",
3849 |     "        others_num = []\n",
3850 |     "        for s in self.other_features:\n",
3851 |     "            others_num.append(input_x[:,self.feature_index[s][0]:self.feature_index[s][1]])\n",
3852 |     "        others_num = torch.cat(others_num, dim=1)\n",
3853 |     "                    \n",
3854 |     "        if self.norm:   \n",
3855 |     "            for s in self.num_features:\n",
3856 |     "                norm_num[s] = self.norm_num[s](input_x[:,self.feature_index[s][0]:self.feature_index[s][1]])\n",
3857 |     "        else:\n",
3858 |     "            for s in self.num_features:\n",
3859 |     "                norm_num[s] = input_x[:,self.feature_index[s][0]:self.feature_index[s][1]]\n",
3860 |     "                \n",
3861 |     "        \n",
3862 |     "        ## mem\n",
3863 |     "        if self.usemem:\n",
3864 |     "            mem1 = self.mem1(input_x[:,self.feature_index['writerId'][0]:self.feature_index['writerId'][1]].long()).squeeze(1)\n",
3865 |     "            mem0 = self.mem0(input_x[:,self.feature_index['writerId'][0]:self.feature_index['writerId'][1]].long()).squeeze(1)\n",
3866 |     "        \n",
3867 |     "        # ###########################以下是MLP\n",
3868 |     "        y = []\n",
3869 |     "        input_size = 0\n",
3870 |     "        for s in embed2:\n",
3871 |     "            y.append(embed2[s])\n",
3872 |     "        #topic\n",
3873 |     "        if self.usetopic:\n",
3874 |     "            for s in topics:\n",
3875 |     "                y.append(topics[s])          \n",
3876 |     "        #words\n",
3877 |     "        if self.useword:\n",
3878 |     "            for s in words:\n",
3879 |     "                y.append(words[s])\n",
3880 |     "        \n",
3881 |     "        y = torch.cat(y,1)\n",
3882 |     "        input_size += len(embed2)* self.k\n",
3883 |     "        if self.usetopic:\n",
3884 |     "            if self.useword:\n",
3885 |     "                y = torch.reshape(y, [-1, (len(embed2)+len(topics)+len(words)) * self.k])\n",
3886 |     "            else:\n",
3887 |     "                y = torch.reshape(y, [-1, (len(embed2)+len(topics)) * self.k])\n",
3888 |     "        else:\n",
3889 |     "            y = torch.reshape(y, [-1, len(embed2)* self.k])\n",
3890 |     "          \n",
3891 |     "        temp = []\n",
3892 |     "        temp.append(y)        \n",
3893 |     "            \n",
3894 |     "        for s in self.num_features:\n",
3895 |     "            temp.append(norm_num[s])             \n",
3896 |     "        x = torch.cat(temp, -1)\n",
3897 |     "        \n",
3898 |     "        ## dnn_logits\n",
3899 |     "        dnn_logits = self.deep_layers(x) \n",
3900 |     "        # ##########################################################CIN \n",
3901 |     "        x = []\n",
3902 |     "        for s in self.muti_features+self.cross_features:\n",
3903 |     "            x.append(embed2[s])\n",
3904 |     "            \n",
3905 |     "        if self.usetopic:\n",
3906 |     "            for s in self.topic_features:\n",
3907 |     "                x.append(topics[s])\n",
3908 |     "                \n",
3909 |     "        #word\n",
3910 |     "        if self.useword:\n",
3911 |     "            for s in self.word_features:\n",
3912 |     "                x.append(words[s])\n",
3913 |     "        \n",
3914 |     "        x0 = torch.cat(x, 1)\n",
3915 |     "        res = []\n",
3916 |     "        x_list = [x0]\n",
3917 |     "        for k in range(1, len(self.cin_layers)+1):\n",
3918 |     "            z_k = torch.einsum('bhd,bmd->bhmd', x_list[-1], x_list[0])\n",
3919 |     "            z_k = z_k.reshape(x0.shape[0], x_list[-1].shape[1] * x0.shape[1], x0.shape[2])\n",
3920 |     "            x_k = self.conv1ds[k-1](z_k)\n",
3921 |     "            x_k = torch.relu(x_k)\n",
3922 |     "            if self.split_half and k != len(self.cin_layers):\n",
3923 |     "                next_hidden, hi = torch.split(x_k, x_k.shape[1] // 2, 1)\n",
3924 |     "            else:\n",
3925 |     "                next_hidden, hi = x_k, x_k\n",
3926 |     "\n",
3927 |     "            x_list.append(next_hidden)\n",
3928 |     "            res.append(hi)\n",
3929 |     "\n",
3930 |     "        res = torch.cat(res, dim=1)\n",
3931 |     "        res = torch.sum(res, dim=2)\n",
3932 |     "\n",
3933 |     "        if self.usetext:\n",
3934 |     "            textinfo = self.textcnn(input_x[:,self.feature_index['q_topic'][0]:self.feature_index['q_topic'][1]].long(),\n",
3935 |     "                            input_x[:,self.feature_index['m_interest_topic'][0]:self.feature_index['m_interest_topic'][1]].long())      \n",
3936 |     "            allput = torch.cat([dnn_logits, res, textinfo], dim=1)\n",
3937 |     "        else:  \n",
3938 |     "            allput = torch.cat([dnn_logits, res, others_num], dim=1)   \n",
3939 |     "        score = self.output(allput)\n",
3940 |     "        \n",
3941 |     "        if self.usemem:\n",
3942 |     "            with torch.no_grad():\n",
3943 |     "                mem_n = score.detach()        \n",
3944 |     "            m_info = F.tanh(self.linear1(torch.cat([mem0, mem1], dim=-1)))          \n",
3945 |     "            return self.end(score), mem_n, mem0, mem1   \n",
3946 |     "        else:\n",
3947 |     "            return self.end(score)"
3948 |    ]
3949 |   },
3950 |   {
3951 |    "cell_type": "markdown",
3952 |    "metadata": {},
3953 |    "source": [
3954 |     "### util\n",
3955 |     "主要对word，topic进行处理，将word，topic映射到对应的数值，方便进行embedding"
3956 |    ]
3957 |   },
3958 |   {
3959 |    "cell_type": "code",
3960 |    "execution_count": null,
3961 |    "metadata": {
3962 |     "collapsed": true
3963 |    },
3964 |    "outputs": [],
3965 |    "source": [
3966 |     "## pad 1  加入topics信息\n",
3967 |     "def deal_text(textdata, params, maxlen=10):\n",
3968 |     "    temp = []\n",
3969 |     "    topic2ix = params['textargs']['topic2index']\n",
3970 |     "    \n",
3971 |     "    for text in textdata:\n",
3972 |     "        tps = str(text).split(',')\n",
3973 |     "        if '-1' in text:\n",
3974 |     "            rs = [1]* maxlen\n",
3975 |     "        else:\n",
3976 |     "            rs = list(map(lambda x: topic2ix[x],tps))\n",
3977 |     "            if len(rs)<=maxlen:\n",
3978 |     "                rs += [1]*(maxlen-len(rs))\n",
3979 |     "            else:\n",
3980 |     "                rs = rs[:maxlen]\n",
3981 |     "        temp.append(rs)\n",
3982 |     "        \n",
3983 |     "    return np.array(temp)\n",
3984 |     "\n",
3985 |     "def deal_text2(textdata, params):\n",
3986 |     "    temp = []\n",
3987 |     "    topic2ix = params['textargs']['topic2index']\n",
3988 |     "    weight = params['textargs']['pretrained_weight']\n",
3989 |     "    for text in textdata:\n",
3990 |     "        tps = str(text).split(',')\n",
3991 |     "        if '-1' in text:\n",
3992 |     "            rs = np.array([0]*64)\n",
3993 |     "        else:\n",
3994 |     "            rs = list(map(lambda x: np.array(weight[topic2ix[x]]), tps))\n",
3995 |     "            rs = np.mean(np.array(rs),axis=0)\n",
3996 |     "        temp.append(rs)\n",
3997 |     "        \n",
3998 |     "        \n",
3999 |     "    return np.array(temp)\n",
4000 |     "\n",
4001 |     "# 处理带权重的topics\n",
4002 |     "def deal_text3(textdata, params, maxlen=10):\n",
4003 |     "    temp = []\n",
4004 |     "    topic2ix = params['textargs']['topic2index']\n",
4005 |     "    temp_att = []\n",
4006 |     "    ix = 0\n",
4007 |     "\n",
4008 |     "    for text in textdata:\n",
4009 |     "        tps = str(text).split(',')\n",
4010 |     "        ix += 1\n",
4011 |     "        if '-1' in text:\n",
4012 |     "            rs = [1] * maxlen\n",
4013 |     "            rs_att = [0.0] * maxlen\n",
4014 |     "        else:\n",
4015 |     "            tps = list(filter(lambda x:'Infinity' not in x , tps))\n",
4016 |     "            rs = list(map(lambda x: topic2ix[x.split(':')[0]],tps))  \n",
4017 |     "            rs_att = list(map(lambda x: float(x.split(':')[1]),tps))\n",
4018 |     "            \n",
4019 |     "            if len(rs) <= maxlen:\n",
4020 |     "                rs += [1]*(maxlen-len(rs))\n",
4021 |     "                rs_att += [0.0] * (maxlen-len(rs_att))\n",
4022 |     "            else:\n",
4023 |     "                rs = rs[:maxlen]\n",
4024 |     "                rs_att = rs_att[:maxlen]\n",
4025 |     "                \n",
4026 |     "        temp.append(rs)\n",
4027 |     "        temp_att.append(rs_att)\n",
4028 |     "        \n",
4029 |     "    temp = np.array(temp)\n",
4030 |     "    temp_att = np.array(temp_att)\n",
4031 |     "    \n",
4032 |     "    return np.concatenate([temp,temp_att], axis=-1)\n",
4033 |     "\n",
4034 |     "# 权重word\n",
4035 |     "def deal_word(textdata, params, maxlen=10):\n",
4036 |     "    temp = []\n",
4037 |     "    topic2ix = params['word2index']\n",
4038 |     "    temp_att = []\n",
4039 |     "    ix = 0\n",
4040 |     "\n",
4041 |     "    for text in textdata:\n",
4042 |     "        tps = str(text).split(',')\n",
4043 |     "        ix += 1\n",
4044 |     "        if '-1' in text:\n",
4045 |     "            rs = [1] * maxlen\n",
4046 |     "            rs_att = [0.0] * maxlen\n",
4047 |     "        else:\n",
4048 |     "            tps = list(filter(lambda x:'Infinity' not in x , tps))\n",
4049 |     "            rs = list(map(lambda x: topic2ix[x.split(':')[0]],tps))  \n",
4050 |     "            rs_att = list(map(lambda x: float(x.split(':')[1]),tps))\n",
4051 |     "            \n",
4052 |     "            if len(rs) <= maxlen:\n",
4053 |     "                rs += [1]*(maxlen-len(rs))\n",
4054 |     "                rs_att += [0.0] * (maxlen-len(rs_att))\n",
4055 |     "            else:\n",
4056 |     "                rs = rs[:maxlen]\n",
4057 |     "                rs_att = rs_att[:maxlen]\n",
4058 |     "                \n",
4059 |     "        temp.append(rs)\n",
4060 |     "        temp_att.append(rs_att)\n",
4061 |     "        \n",
4062 |     "    temp = np.array(temp)\n",
4063 |     "    temp_att = np.array(temp_att)\n",
4064 |     "    \n",
4065 |     "    return np.concatenate([temp,temp_att], axis=-1)\n",
4066 |     "\n",
4067 |     "# 非权重\n",
4068 |     "def deal_word2(textdata, params, maxlen=10):\n",
4069 |     "    temp = []\n",
4070 |     "    topic2ix = params['word2index']\n",
4071 |     "    \n",
4072 |     "    for text in textdata:\n",
4073 |     "        tps = str(text).split(',')\n",
4074 |     "        if '-1' in text:\n",
4075 |     "            rs = [1]* maxlen\n",
4076 |     "        else:\n",
4077 |     "            rs = list(map(lambda x: topic2ix[x],tps))\n",
4078 |     "            if len(rs)<=maxlen:\n",
4079 |     "                rs += [1]*(maxlen-len(rs))\n",
4080 |     "            else:\n",
4081 |     "                rs = rs[:maxlen]\n",
4082 |     "        temp.append(rs)\n",
4083 |     "        \n",
4084 |     "    return np.array(temp) \n",
4085 |     "\n",
4086 |     "import os\n",
4087 |     "def setup_seed(seed):\n",
4088 |     "    random.seed(seed)\n",
4089 |     "    os.environ['PYTHONHASHSEED'] = str(seed)\n",
4090 |     "    np.random.seed(seed)\n",
4091 |     "    torch.manual_seed(seed)\n",
4092 |     "    torch.cuda.manual_seed(seed)\n",
4093 |     "    torch.backends.cudnn.deterministic = True"
4094 |    ]
4095 |   },
4096 |   {
4097 |    "cell_type": "markdown",
4098 |    "metadata": {},
4099 |    "source": [
4100 |     "### train test\n",
4101 |     "本部分主要是模型的训练（使用Lookahead+Adam 优化器），验证，预测"
4102 |    ]
4103 |   },
4104 |   {
4105 |    "cell_type": "code",
4106 |    "execution_count": null,
4107 |    "metadata": {
4108 |     "collapsed": true
4109 |    },
4110 |    "outputs": [],
4111 |    "source": [
4112 |     "def eval(model, devloader, params):\n",
4113 |     "    preds = []    \n",
4114 |     "    print('eval')\n",
4115 |     "    model.eval()\n",
4116 |     "    trues = []\n",
4117 |     "    for x,y in devloader:\n",
4118 |     "        with torch.no_grad():\n",
4119 |     "            x = x.to(params['device']).float()\n",
4120 |     "            if params['usemem']:\n",
4121 |     "                score, _, _, _ = model(x)\n",
4122 |     "            else:\n",
4123 |     "                score = model(x)\n",
4124 |     "            preds+=score.cpu().reshape(-1).tolist()\n",
4125 |     "            trues+=y.cpu().reshape(-1).tolist()\n",
4126 |     "    auc = roc_auc_score(trues, preds)\n",
4127 |     "    print('auc: ', auc)\n",
4128 |     "    return auc\n",
4129 |     "\n",
4130 |     "temp_x = 0\n",
4131 |     "def train(params, trainset, devset, foldix=0):\n",
4132 |     "    x = []\n",
4133 |     "    for i in params['single_features']+ params['num_features']+ params['other_features']:\n",
4134 |     "        x.append(np.expand_dims(trainset[i], axis=1))\n",
4135 |     "    if params['usetext']:\n",
4136 |     "        x.append(deal_text(trainset['attentionthemes'], params, maxlen=10))\n",
4137 |     "        x.append(deal_text(trainset['themeId'], params, maxlen=10))\n",
4138 |     "    if params['usetopic']:\n",
4139 |     "        x.append(deal_text(trainset['attentionthemes'], params, maxlen=100))  \n",
4140 |     "        x.append(deal_text(trainset['themeId'], params, maxlen=13))\n",
4141 |     "        x.append(deal_text3(trainset['likethemes'], params, maxlen=10))#w_topics1\n",
4142 |     "        #hist_user_themes_att\n",
4143 |     "        x.append(deal_text3(trainset['hist_user_themes'], params, maxlen=10))\n",
4144 |     "    if params['useword']:\n",
4145 |     "        x.append(deal_word(trainset['hist_user_words'], params, maxlen=20))\n",
4146 |     "        x.append(deal_word2(trainset['all_words'], params, maxlen=10))\n",
4147 |     "\n",
4148 |     "    #hist_user_unlike_themes\n",
4149 |     "    x.append(deal_text3(trainset['hist_user_unlike_themes'], params, maxlen=10))\n",
4150 |     "    #hist_user_unlike_words\n",
4151 |     "    x.append(deal_word(trainset['hist_user_unlike_words'], params, maxlen=20))\n",
4152 |     "    \n",
4153 |     "    train_tensor_data = Data.TensorDataset(torch.from_numpy(np.concatenate(x, axis=-1)),\n",
4154 |     "                torch.from_numpy(np.expand_dims(trainset['label'], axis=1)))   \n",
4155 |     "\n",
4156 |     "    trainloader = Data.DataLoader(\n",
4157 |     "        dataset=train_tensor_data,\n",
4158 |     "        batch_size=params['batch_size'],\n",
4159 |     "        shuffle=not params['usemem'],\n",
4160 |     "        num_workers=0,\n",
4161 |     "    )\n",
4162 |     "    del trainset\n",
4163 |     "    gc.collect()\n",
4164 |     "    print('train load ok')\n",
4165 |     "    x_val = []    \n",
4166 |     "    for i in params['single_features']+ params['num_features']+ params['other_features']:\n",
4167 |     "        x_val.append(np.expand_dims(devset[i], axis=1))\n",
4168 |     "    if params['usetext']:\n",
4169 |     "        x_val.append(deal_text(devset['attentionthemes'], params, maxlen=10))\n",
4170 |     "        x_val.append(deal_text(devset['themeId'], params, maxlen=10))  \n",
4171 |     "    \n",
4172 |     "    if params['usetopic']:\n",
4173 |     "        x_val.append(deal_text(devset['attentionthemes'], params, maxlen=100))  \n",
4174 |     "        x_val.append(deal_text(devset['themeId'], params, maxlen=13))\n",
4175 |     "        x_val.append(deal_text3(devset['likethemes'], params, maxlen=10))\n",
4176 |     "        x_val.append(deal_text3(devset['hist_user_themes'], params, maxlen=10))\n",
4177 |     "    if params['useword']:#\n",
4178 |     "        x_val.append(deal_word(devset['hist_user_words'], params, maxlen=20))\n",
4179 |     "        x_val.append(deal_word2(devset['all_words'], params, maxlen=10))\n",
4180 |     "\n",
4181 |     "    x_val.append(deal_text3(devset['hist_user_unlike_themes'], params, maxlen=10))\n",
4182 |     "    #hist_user_unlike_words\n",
4183 |     "    x_val.append(deal_word(devset['hist_user_unlike_words'], params, maxlen=20))\n",
4184 |     "    \n",
4185 |     "    dev_tensor_data = Data.TensorDataset(torch.from_numpy(np.concatenate(x_val, axis=-1)),\n",
4186 |     "            torch.from_numpy(np.expand_dims(devset['label'], axis=1)))   \n",
4187 |     "\n",
4188 |     "    devloader = Data.DataLoader(\n",
4189 |     "        dataset=dev_tensor_data,\n",
4190 |     "        batch_size=params['batch_size'],\n",
4191 |     "        shuffle=False,\n",
4192 |     "        num_workers=0,\n",
4193 |     "    )\n",
4194 |     "    print('dev loader ok')\n",
4195 |     "    model = xDeepFM(params)\n",
4196 |     "    model.to(params['device'])\n",
4197 |     "    base_optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'], weight_decay=params['l2'])\n",
4198 |     "    optimizer = Lookahead(base_optimizer=base_optimizer,k=5,alpha=0.5)\n",
4199 |     "    best_auc = 0\n",
4200 |     "    criterion = nn.MSELoss() ## mem\n",
4201 |     "    maxauc = 0\n",
4202 |     "    for epoch in tqdm.tqdm_notebook(range(params['epochs'])):\n",
4203 |     "        all_loss = 0\n",
4204 |     "        i=0     \n",
4205 |     "        all_loss_mem = 0\n",
4206 |     "        for x, y in tqdm.tqdm_notebook(trainloader):\n",
4207 |     "            x = x.to(params['device']).float()\n",
4208 |     "            y = y.to(params['device']).float() \n",
4209 |     "            if params['usemem']:\n",
4210 |     "                score, mem_n, mem0, mem1 = model(x)\n",
4211 |     "                loss = F.binary_cross_entropy_with_logits(score, y)\n",
4212 |     "                mem = (1-y.expand_as(mem_n)) * mem0 + y.expand_as(mem_n) * mem1\n",
4213 |     "                memloss = criterion(mem, mem_n)\n",
4214 |     "                loss = loss + memloss\n",
4215 |     "                all_loss_mem += memloss.detach().cpu().item()\n",
4216 |     "            else:\n",
4217 |     "                score = model(x) \n",
4218 |     "                loss = F.binary_cross_entropy_with_logits(score, y)\n",
4219 |     "                \n",
4220 |     "            optimizer.zero_grad()\n",
4221 |     "            loss.backward()\n",
4222 |     "            i+=1\n",
4223 |     "            optimizer.step()\n",
4224 |     "            all_loss += loss.detach().cpu().item()\n",
4225 |     "            if i % params['num_display_steps']==0:\n",
4226 |     "                if params['usemem']:\n",
4227 |     "                    #print('mem_loss:', all_loss_mem/params['num_display_steps'])\n",
4228 |     "                    all_loss_mem = 0\n",
4229 |     "                all_loss = 0\n",
4230 |     "        auc = eval(model, devloader, params)\n",
4231 |     "        model.train()\n",
4232 |     "        if auc > maxauc:\n",
4233 |     "            maxauc = auc\n",
4234 |     "        torch.save(model.state_dict(), datapath+'data/'+'_auc_'+ str(auc)+'_'+str(foldix)+'.pth')\n",
4235 |     "        print(datapath+'data/'+'_auc_'+ str(auc)+'_'+str(foldix)+'.pth saved!')\n",
4236 |     "\n",
4237 |     "    return datapath+'data/'+'_auc_'+ str(maxauc)+'_'+str(foldix)+'.pth'\n",
4238 |     "\n",
4239 |     "def test(params, testset, testpath, onefold=False):\n",
4240 |     "    model = xDeepFM(params)\n",
4241 |     "    model.load_state_dict(torch.load(testpath))\n",
4242 |     "    model.to(params['device'])\n",
4243 |     "    \n",
4244 |     "    x = []\n",
4245 |     "    for i in params['single_features']+ params['num_features'] + params['other_features']:\n",
4246 |     "        x.append(np.expand_dims(testset[i], axis=1))\n",
4247 |     "    if params['usetext']:\n",
4248 |     "        x.append(deal_text(testset['attentionthemes'], params, maxlen=10))\n",
4249 |     "        x.append(deal_text(testset['themeId'], params, maxlen=10)) \n",
4250 |     "\n",
4251 |     "    if params['usetopic']:\n",
4252 |     "        x.append(deal_text(testset['attentionthemes'], params, maxlen=100))  \n",
4253 |     "        x.append(deal_text(testset['themeId'], params, maxlen=13))\n",
4254 |     "        x.append(deal_text3(testset['likethemes'], params, maxlen=10))#w_topics1\n",
4255 |     "        x.append(deal_text3(testset['hist_user_themes'], params, maxlen=10))\n",
4256 |     "\n",
4257 |     "    if params['useword']:\n",
4258 |     "        x.append(deal_word(testset['hist_user_words'], params, maxlen=20))\n",
4259 |     "        x.append(deal_word2(testset['all_words'], params, maxlen=10))\n",
4260 |     "    #hist_user_unlike_themes\n",
4261 |     "    x.append(deal_text3(testset['hist_user_unlike_themes'], params, maxlen=10))\n",
4262 |     "    #hist_user_unlike_words\n",
4263 |     "    x.append(deal_word(testset['hist_user_unlike_words'], params, maxlen=20))\n",
4264 |     "    \n",
4265 |     "    test_tensor_data = Data.TensorDataset(torch.from_numpy(np.concatenate(x, axis=-1)),\n",
4266 |     "                torch.from_numpy(np.expand_dims(testset['label'], axis=1)))   \n",
4267 |     "\n",
4268 |     "    testloader = Data.DataLoader(\n",
4269 |     "        dataset=test_tensor_data,\n",
4270 |     "        batch_size=params['batch_size'],\n",
4271 |     "        shuffle=False,\n",
4272 |     "        num_workers=0,\n",
4273 |     "    )\n",
4274 |     "    preds = []\n",
4275 |     "    model.eval()\n",
4276 |     "    for x, y in testloader:\n",
4277 |     "        x = x.to(params['device']).float()\n",
4278 |     "        score = model(x)\n",
4279 |     "        score = torch.sigmoid(score)\n",
4280 |     "        preds+=score.cpu().reshape(-1).tolist()\n",
4281 |     "    if  onefold:    \n",
4282 |     "        testdata = pd.read_csv(datapath+'data/m_q_invite_test.csv') \n",
4283 |     "        testdata['label'] = pd.Series(preds)\n",
4284 |     "        testdata[['q_id', 'm_id', 'invite_time', 'label']].to_csv(datapath+'submit.txt', index = None, sep = '\\t', header=None)\n",
4285 |     "        print('test finished!')\n",
4286 |     "    return preds"
4287 |    ]
4288 |   },
4289 |   {
4290 |    "cell_type": "markdown",
4291 |    "metadata": {},
4292 |    "source": [
4293 |     "### Fold\n",
4294 |     "本部分包括随机5折、按时间顺序5折、5折的测试，对比随机5折和按时间顺序5折效果，发现随机5折的效果更好"
4295 |    ]
4296 |   },
4297 |   {
4298 |    "cell_type": "code",
4299 |    "execution_count": null,
4300 |    "metadata": {
4301 |     "collapsed": true
4302 |    },
4303 |    "outputs": [],
4304 |    "source": [
4305 |     "def train_fold(trainset, params, fold=5):\n",
4306 |     "    train_y = trainset['label']\n",
4307 |     "    train_x = trainset\n",
4308 |     "    rs_list = []\n",
4309 |     "    pre_train = pd.Series(np.zeros(len(train_y)))\n",
4310 |     "    \n",
4311 |     "    kf = StratifiedKFold(n_splits = fold,shuffle = True,random_state = 2019)\n",
4312 |     "    for ix,(train_index,eval_index) in enumerate(kf.split(train_x,train_y)):\n",
4313 |     "        dtrain_x = train_x.loc[train_index,:]\n",
4314 |     "        deval_x = train_x.loc[eval_index,:]\n",
4315 |     "        rs = train(params, dtrain_x, deval_x, ix)\n",
4316 |     "        pre_train[eval_index] = test(params, deval_x, rs) \n",
4317 |     "        rs_list.append(rs)\n",
4318 |     "    \n",
4319 |     "    train_pre = trainset[['qId', 'writerId', 'inviteday', 'label', 'invitehour']]\n",
4320 |     "    train_pre['pre_label'] = pre_train\n",
4321 |     "    with open(datapath + 'train-fold-3zhou.pkl', 'wb') as f:\n",
4322 |     "        pickle.dump(train_pre, f, protocol=4)\n",
4323 |     "    \n",
4324 |     "    file = codecs.open(datapath + 'fold_list_3zhou.txt', 'w')\n",
4325 |     "    file.write(','.join(rs_list))\n",
4326 |     "    print('train -fold end!')\n",
4327 |     "\n",
4328 |     "def train_fold_time(trainset, params, fold=5):\n",
4329 |     "    train_x = trainset\n",
4330 |     "    rs_list = []\n",
4331 |     "    pre_train = pd.Series(np.zeros(len(train_x)))\n",
4332 |     "    \n",
4333 |     "    start = train_x['inviteday'].min()\n",
4334 |     "    block_len = np.ceil(float(3868-start)/fold)\n",
4335 |     "    for i in range(5): \n",
4336 |     "        bool_eval = (train_x['inviteday']>=start)&(train_x['inviteday']<start+block_len)\n",
4337 |     "        bool_train = ~bool_eval\n",
4338 |     "        train_index = train_x[bool_train].index\n",
4339 |     "        eval_index = train_x[bool_eval].index\n",
4340 |     "        start = start+block_len\n",
4341 |     "        ###\n",
4342 |     "        dtrain_x = train_x.loc[train_index,:]\n",
4343 |     "        deval_x = train_x.loc[eval_index,:]\n",
4344 |     "        print(str(deval_x['inviteday'].min())+':'+str(deval_x['inviteday'].max()))\n",
4345 |     "        rs = train(params, dtrain_x, deval_x, i)\n",
4346 |     "        pre_train[eval_index] = test(params, deval_x, rs) \n",
4347 |     "        rs_list.append(rs)\n",
4348 |     "    \n",
4349 |     "    train_pre = trainset[['qId', 'writerId', 'inviteday', 'label', 'invitehour']]\n",
4350 |     "    train_pre['pre_label'] = pre_train\n",
4351 |     "    with open(datapath + 'train-fold-time.pkl', 'wb') as f:\n",
4352 |     "        pickle.dump(train_pre, f, protocol=4)\n",
4353 |     "    \n",
4354 |     "    file = codecs.open(datapath + 'fold_list-time.txt', 'w')\n",
4355 |     "    file.write(','.join(rs_list))\n",
4356 |     "    print('train -fold end!')    \n",
4357 |     "\n",
4358 |     "\n",
4359 |     "def test_fold(testset, params, modelList=None): # modelist= './fold_list.txt'\n",
4360 |     "    if modelList==None and os.path.exists(datapath + 'fold_list.txt'):\n",
4361 |     "        file = codecs.open(datapath+'fold_list.txt', 'r')\n",
4362 |     "        modelList = file.read().split(',')\n",
4363 |     "        print('加载 fold-list end!')\n",
4364 |     "    else:\n",
4365 |     "        file = codecs.open(modelList, 'r')\n",
4366 |     "        modelList = file.read().split(',')\n",
4367 |     "        print('加载 fold-list end!')\n",
4368 |     "        \n",
4369 |     "    pre_test = []\n",
4370 |     "    for i in modelList:\n",
4371 |     "        pre = test(params, testset, i) \n",
4372 |     "        pre_test.append(pre)\n",
4373 |     "        print(pre[:30])\n",
4374 |     "    pre_test = np.array(pre_test)\n",
4375 |     "    pre_test = np.mean(pre_test,axis = 0)\n",
4376 |     "    sub_sample = pd.read_csv(datapath + 'data/data_invite_test.csv').drop(['inviteday','invitehour'],axis = 1)\n",
4377 |     "    sub_sample['label'] = pre_test\n",
4378 |     "    sub_sample.to_csv(datapath+'submit-test-3zhou.txt',sep = '\\t',header = False,index = False)\n",
4379 |     "    print('test-fold finished!')"
4380 |    ]
4381 |   },
4382 |   {
4383 |    "cell_type": "markdown",
4384 |    "metadata": {},
4385 |    "source": [
4386 |     "### 模型参数设置\n",
4387 |     "- 1、加载相关文件：类别特征的字典， topic的权重， topic转化字典， word的权重， word转化字典\n",
4388 |     "- 2、特征类别：单一特征，多值特征，交叉特征， topic特征， word特征， 其他特征（deepWalk）\n",
4389 |     "- 3、模型结构：比赛中尝试对模型结构进行改动，包括添加memory结构，想要得到用户历史信息；利用textcnn结构获取topic的特征；默认参数是达到最好结果的参数。"
4390 |    ]
4391 |   },
4392 |   {
4393 |    "cell_type": "code",
4394 |    "execution_count": null,
4395 |    "metadata": {
4396 |     "collapsed": true
4397 |    },
4398 |    "outputs": [],
4399 |    "source": [
4400 |     "# 禁用topic时，注意feature=[] , usetopic=False\n",
4401 |     "setup_seed(1024)\n",
4402 |     "with open(datapath+'data/dic_all.pkl', 'rb') as f:\n",
4403 |     "    dic = pickle.load(f)\n",
4404 |     "with open(datapath+'data/topic_weight_64.pkl', 'rb') as f:\n",
4405 |     "    weight = pickle.load(f)\n",
4406 |     "with open(datapath+'data/topic2index.pkl', 'rb') as f:\n",
4407 |     "    topic2index = pickle.load(f)   \n",
4408 |     "    \n",
4409 |     "with open(datapath+'data/word_weight_64.pkl', 'rb') as f:\n",
4410 |     "    word_weight = pickle.load(f)\n",
4411 |     "with open(datapath+'data/word2index.pkl', 'rb') as f:\n",
4412 |     "    word2index = pickle.load(f) \n",
4413 |     "    \n",
4414 |     "    \n",
4415 |     "textargs = {\n",
4416 |     "    'filter_num': 64,\n",
4417 |     "    'filter_sizes':[2,3,4],\n",
4418 |     "    'vocabulary_size':weight.size()[0],\n",
4419 |     "    'embedding_dim':64,\n",
4420 |     "    'dropout':0,\n",
4421 |     "    'topic2index':topic2index,\n",
4422 |     "    'pretrained_weight':weight,\n",
4423 |     "}\n",
4424 |     "\n",
4425 |     "params = {\n",
4426 |     "    'k':8, #embedding dims 8\n",
4427 |     "    'batch_size':8000,\n",
4428 |     "    'lr':0.0005,\n",
4429 |     "    'l2':0.00001,\n",
4430 |     "    'device': torch.device('cuda:1' if torch.cuda.is_available() else \"cpu\"),\n",
4431 |     "    'p' : 0.5,\n",
4432 |     "    'single_features':['invitehour', 'createhour', 'inviteweekday', 'createweekday', 'sex', 'activity', 'bool_A',\n",
4433 |     "       'bool_B', 'bool_C', 'bool_D', 'bool_E', 'category_E', 'category_A', 'category_B', 'category_C', 'category_D'],\n",
4434 |     "    'cross_features':['invitehour', 'createhour', 'inviteweekday', 'createweekday', 'sex', 'activity', 'bool_A',\n",
4435 |     "       'bool_B', 'bool_C', 'bool_D', 'bool_E', 'category_E', 'category_A', 'category_B','category_C', 'category_D'],\n",
4436 |     "    'muti_features':[],\n",
4437 |     "    'topic_features':['attentionthemes', 'themeId', 'likethemes', 'hist_user_themes'], #hist_user_unlike_themes 加上这个特征效果稍稍变差，但是在模型融合的时候有帮助 \n",
4438 |     "    'other_features': ['qId_dw'+str(i) for i in range(32)] + ['writerId_dw'+str(i) for i in range(32)] ,\n",
4439 |     "    'word_features':['hist_user_words', 'all_words'],#hist_user_unlike_words 加上这个特征效果稍稍变差，但是在模型融合的时候有帮助 \n",
4440 |     "    'layers':[1024, 512], # dnn\n",
4441 |     "    'cin_layers':[128, 128], ### cin\n",
4442 |     "    'num_features': new_dense,#list(set(numPart) - set(['sameqnum_like'])) ,#+ ['qId_dw'+str(i) for i in range(32)], writerId_dw28\n",
4443 |     "    'num_display_steps':100,\n",
4444 |     "    'num_eval_steps':1000,\n",
4445 |     "    'epochs':50,\n",
4446 |     "    'split_half':True,\n",
4447 |     "    'normNum':False,\n",
4448 |     "    'textargs':textargs,\n",
4449 |     "    'usetext':False,\n",
4450 |     "    'usecin':True,\n",
4451 |     "    'usemem':False,\n",
4452 |     "    'usetopic':True,\n",
4453 |     "    'useword':True,\n",
4454 |     "    'word2index':word2index,\n",
4455 |     "    'word_weight':word_weight,\n",
4456 |     "}\n",
4457 |     "\n",
4458 |     "\n",
4459 |     "params['dic'] = dic"
4460 |    ]
4461 |   },
4462 |   {
4463 |    "cell_type": "markdown",
4464 |    "metadata": {},
4465 |    "source": [
4466 |     "### 训练"
4467 |    ]
4468 |   },
4469 |   {
4470 |    "cell_type": "code",
4471 |    "execution_count": null,
4472 |    "metadata": {
4473 |     "collapsed": true
4474 |    },
4475 |    "outputs": [],
4476 |    "source": [
4477 |     "train_fold(trainset, params)  \n",
4478 |     "test_fold(testset, params, datapath+'fold_list_3zhou.txt')"
4479 |    ]
4480 |   },
4481 |   {
4482 |    "cell_type": "markdown",
4483 |    "metadata": {},
4484 |    "source": [
4485 |     "# .Stacking"
4486 |    ]
4487 |   },
4488 |   {
4489 |    "cell_type": "code",
4490 |    "execution_count": null,
4491 |    "metadata": {
4492 |     "collapsed": true
4493 |    },
4494 |    "outputs": [],
4495 |    "source": [
4496 |     "# load test\n",
4497 |     "test = pd.read_csv(datapath+'data/pre1_lgb_0.1.csv') # lgb_0.1_nonew.txt\n",
4498 |     "test = test[['lgb_pre1']]\n",
4499 |     "\n",
4500 |     "temp = pd.read_csv(datapath+'data/pre1_cat_0.1.csv') # cat3.txt\n",
4501 |     "temp = temp[['cat_pre1']]\n",
4502 |     "test = pd.concat([test,temp], axis=1)\n",
4503 |     "\n",
4504 |     "temp = pd.read_csv(datapath+'data/xgb_0.1.csv') # cat3.txt\n",
4505 |     "temp = temp[['xgb_pre1']]\n",
4506 |     "test = pd.concat([test,temp], axis=1)\n",
4507 |     "\n",
4508 |     "temp = pd.read_csv(datapath+'data/submit-test-3zhou.txt', sep = '\\t', header = None) \n",
4509 |     "temp.columns = ['qid','uid','inv_time','nn_pre1']\n",
4510 |     "temp = temp[['nn_pre1']]\n",
4511 |     "test = pd.concat([test,temp], axis=1)\n",
4512 |     "\n",
4513 |     "train_y = pd.read_csv(datapath+'train_y.csv', header = None)\n",
4514 |     "train_y.columns = ['label']\n",
4515 |     "train_y = train_y[['label']]\n",
4516 |     "\n",
4517 |     "# stacking\n",
4518 |     "usecols = list(train.columns)\n",
4519 |     "pre_train = pd.Series(np.zeros(len(train_y)))\n",
4520 |     "pre_test = []\n",
4521 |     "kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)\n",
4522 |     "for train_index, eval_index in kf.split(train, train_y['label']):\n",
4523 |     "    params_lgbc = {\n",
4524 |     "        'boosting_type': 'gbdt',\n",
4525 |     "        'objective': 'binary',  \n",
4526 |     "        'num_leaves': 10,\n",
4527 |     "        'learning_rate': 0.3,\n",
4528 |     "        'feature_fraction': 0.8,\n",
4529 |     "        'bagging_fraction': 0.8,\n",
4530 |     "        'bagging_freq': 1,\n",
4531 |     "        'min_sum_hessian_in_leaf': 20,\n",
4532 |     "        'num_threads': cpu_count() - 1,\n",
4533 |     "        'seed': 7,\n",
4534 |     "        'n_estimators': 10000,  \n",
4535 |     "        'max_depth': 5,\n",
4536 |     "        'subsample': 0.9,\n",
4537 |     "        'subsample_freq': 2,\n",
4538 |     "        'reg_alpha': 0,\n",
4539 |     "        'reg_lambda': 2\n",
4540 |     "    }\n",
4541 |     "    dtrain_x = train.loc[train_index, usecols]\n",
4542 |     "    deval_x = train.loc[eval_index, usecols]\n",
4543 |     "    dtrain_y = train_y.loc[train_index, 'label']\n",
4544 |     "    deval_y = train_y.loc[eval_index, 'label']\n",
4545 |     "    lgbc = lgb.LGBMClassifier(random_state=2019, **params_lgbc)\n",
4546 |     "    lgbc.fit(dtrain_x, dtrain_y, eval_set=[(deval_x, deval_y)], eval_names=['eval'], eval_metric='auc',\n",
4547 |     "             early_stopping_rounds=10, verbose=10)\n",
4548 |     "    pre_train[eval_index] = lgbc.predict_proba(deval_x)[:, 1]\n",
4549 |     "    pre_test.append(list(lgbc.predict_proba(test.loc[:, usecols].values)[:, 1]))\n",
4550 |     "\n",
4551 |     "pre_test = np.array(pre_test)\n",
4552 |     "pre_test = np.mean(pre_test, axis=0)\n",
4553 |     "score = roc_auc_score(train_y, pre_train)\n",
4554 |     "\n",
4555 |     "sub_sample = pd.read_csv(datapath+'invite_info_evaluate_2_0926.txt',sep = '\\t',header = None,names = ['qId','writerId','invitetime'])\n",
4556 |     "sub_sample['label'] = pre_test\n",
4557 |     "\n",
4558 |     "sub_sample.to_csv(datapath+'data/sub.txt',sep = '\\t',header = False,index = False)"
4559 |    ]
4560 |   }
4561 |  ],
4562 |  "metadata": {
4563 |   "anaconda-cloud": {},
4564 |   "kernelspec": {
4565 |    "display_name": "Python 3",
4566 |    "language": "python",
4567 |    "name": "python3"
4568 |   },
4569 |   "language_info": {
4570 |    "codemirror_mode": {
4571 |     "name": "ipython",
4572 |     "version": 3
4573 |    },
4574 |    "file_extension": ".py",
4575 |    "mimetype": "text/x-python",
4576 |    "name": "python",
4577 |    "nbconvert_exporter": "python",
4578 |    "pygments_lexer": "ipython3",
4579 |    "version": "3.7.0"
4580 |   }
4581 |  },
4582 |  "nbformat": 4,
4583 |  "nbformat_minor": 2
4584 | }
4585 | 


--------------------------------------------------------------------------------