├── README.md
├── utils.py
└── rnn.py

/README.md:
--------------------------------------------------------------------------------
# KuaiShou2018-RANK13-RNN
## A quick-and-dirty RNN: a single model scores a bit over 0.91 and is mainly useful in ensembles. Inputs are 14 daily 0/1 sequence features plus two type features (register_type, device_type).
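## Usage
Data paths are hard-coded in `utils.py` (`/mnt/datasets/fusai/...`). With those files in place, `python rnn.py` first runs an offline check (train on days 1-23, AUC scored against the day 24-30 truth), then retrains on days 1-30 and writes `submit.txt` (user_id, predicted probability, no header).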
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Raw competition logs; numeric column indices are renamed to meaningful names.
register = pd.read_csv('/mnt/datasets/fusai/user_register_log.txt', sep='\t', header=None,
                       dtype={0: np.int32, 1: np.int8, 2: np.int16, 3: np.int16}
                       ).rename(columns={0: 'user_id', 1: 'day', 2: 'register_type', 3: 'device_type'})
activity = pd.read_csv('/mnt/datasets/fusai/user_activity_log.txt', sep='\t', header=None,
                       dtype={0: np.int32, 1: np.int8, 2: np.int8, 3: np.int32, 4: np.int32, 5: np.int8}
                       ).rename(columns={0: 'user_id', 1: 'day', 2: 'page', 3: 'video_id', 4: 'author_id', 5: 'action_type'})
launch = pd.read_csv('/mnt/datasets/fusai/app_launch_log.txt', sep='\t', header=None,
                     dtype={0: np.int32, 1: np.int8}).rename(columns={0: 'user_id', 1: 'day'})
video = pd.read_csv('/mnt/datasets/fusai/video_create_log.txt', sep='\t', header=None,
                    dtype={0: np.int32, 1: np.int8}).rename(columns={0: 'user_id', 1: 'day'})

def gen_truth(start_date, span=7):
    # Users active (launch / video create / activity) in [start_date, start_date+span),
    # restricted to users already registered before the window (保证都是已注册用户).
    # NOTE: the body of this function was corrupted in the dump; the version below
    # is reconstructed from how gen_truth is used elsewhere in this repo.
    end_date = start_date + span
    basic = register[register.day < start_date][['user_id']]
    l = launch[(launch.day >= start_date) & (launch.day < end_date)][['user_id']]
    v = video[(video.day >= start_date) & (video.day < end_date)][['user_id']]
    a = activity[(activity.day >= start_date) & (activity.day < end_date)][['user_id']]
    truth = pd.concat([l, v, a]).drop_duplicates()
    truth = truth.merge(basic, 'inner', 'user_id')
    truth['label'] = 1
    return truth

def gen_label(start, end):
    # Per-day labels: the label on day i is 1 if the user is active in days i+1..i+7.
    # NOTE: the first half of this function was corrupted in the dump; the loop
    # below is reconstructed to mirror gen_day_seq and to end in the surviving tail.
    max_len = end - start + 1
    data = register[['user_id']].copy()
    for i in range(start, end + 1):
        sub = register[register.day <= i][['user_id']].copy()
        truth = gen_truth(i + 1).rename(columns={'label': 'day%d' % i})
        sub = sub.merge(truth, 'left', 'user_id')
        sub = sub.fillna(0)
        data = data.merge(sub, 'left', 'user_id')
    data = data.fillna(-1)
    del data['user_id']
    data = data.values
    label_seq = []
    label_length = []
    for t in data:
        tt = list(t[t != -1])   # labels from the registration day onward
        l = len(tt)
        if l > 0:
            label_seq.append(tt + (max_len - l) * [0])
            label_length.append(l)
    return np.array(label_seq), label_length

def get_table(table):
    if table == 'launch':
        return launch
    elif table == 'reg':
        return register
    elif table == 'video':
        return video
    elif table == 'act':
        return activity

def gen_day_seq(start, end, table, type_columns=None, type_value=None):
    # For each user, build a 0/1 sequence over days start..end marking whether the
    # given event occurred that day. Days before registration are stripped, so
    # position 0 of each sequence is the registration day; the tail is padded
    # with -1 up to max_len. Users registered after `end` are dropped.
    max_len = end - start + 1
    data = register[['user_id']].copy()
    for i in range(start, end + 1):
        sub = register[register.day <= i][['user_id']].copy()
        t = get_table(table)
        t = t[t.day == i]
        if table == 'act':
            t = t[t[type_columns] == type_value]
        t = t[['user_id']].drop_duplicates()
        t['day%d' % i] = 1
        sub = sub.merge(t, 'left', 'user_id')
        sub = sub.fillna(0)
        data = data.merge(sub, 'left', 'user_id')
    data = data.fillna(-1)
    del data['user_id']
    data = data.values
    seq = []
    for t in data:
        tt = list(t[t != -1])
        l = len(tt)
        if l > 0:
            seq.append(tt + (max_len - l) * [-1])
    return np.array(seq)
--------------------------------------------------------------------------------
/rnn.py:
--------------------------------------------------------------------------------
import time
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from utils import *

class my_model():
    def __init__(self, num_feat, time_stage, epoch=2, batch_size=64, learning_rate=0.001,
                 random_seed=1011, hidden_size=[50, 50], num_layers=2):
        self.num_feat = num_feat
        self.time_stage = time_stage
        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.random_seed = random_seed
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self._init_graph()

    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.set_random_seed(self.random_seed)
            initializer = tf.random_uniform_initializer(-0.1, 0.1)

            # inputs: feat_seq is [batch, time_stage, num_feat]; label_seq holds the
            # per-day 0/1 targets; seq_length is the number of real (unpadded) days.
            self.feat_seq = tf.placeholder(tf.float32, [None, self.time_stage, self.num_feat], name='feat_seq')
            self.label_seq = tf.placeholder(tf.int32, [None, self.time_stage], name='label_seq')
            self.register_type = tf.placeholder(tf.int32, (None,), name='register_type')
            self.device_type = tf.placeholder(tf.int32, (None,), name='device_type')
            self.seq_length = tf.placeholder(tf.int32, (None,), name='seq_length')
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            # single-layer LSTM; a stacked variant was tried and left commented out
            cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size[-1], state_is_tuple=True, initializer=initializer)
            def get_lstm_cell(rnn_size):
                return tf.contrib.rnn.LSTMCell(rnn_size, initializer=initializer)
            # multi_cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(self.hidden_size[i]) for i in range(self.num_layers)])
            output, state = tf.nn.dynamic_rnn(cell, self.feat_seq, dtype=tf.float32, sequence_length=self.seq_length)
            # output: [batch, time_stage, hidden_size[-1]]

            # scalar embeddings for the two categorical features, tiled over time
            regType_emb = tf.gather(tf.Variable(tf.truncated_normal(shape=[12, 1], mean=0.0, stddev=0.0001)), self.register_type)
            regType_emb = tf.tile(regType_emb, [1, self.time_stage])
            regType_emb = tf.expand_dims(regType_emb, -1)

            devType_emb = tf.gather(tf.Variable(tf.truncated_normal(shape=[2000, 1], mean=0.0, stddev=0.0001)), self.device_type)
            devType_emb = tf.tile(devType_emb, [1, self.time_stage])
            devType_emb = tf.expand_dims(devType_emb, -1)

            output = tf.concat([output, regType_emb], axis=-1)
            output = tf.concat([output, devType_emb], axis=-1)

            output = tf.reshape(output, [-1, self.hidden_size[-1] + 2])

            w2 = tf.Variable(tf.random_uniform([self.hidden_size[-1] + 2, 2], -0.1, 0.1))
            b2 = tf.Variable(tf.random_uniform([2], -0.1, 0.1))
            logits = tf.matmul(output, w2) + b2
            logits = tf.reshape(logits, [-1, self.time_stage, 2])

            # ignore the last 7 days of each sequence in the loss: their 7-day
            # future window extends past the observed data
            masks = tf.sequence_mask(self.seq_length - 7, self.time_stage - 7, dtype=tf.float32, name='masks')
            paddings = tf.constant([[0, 0], [0, 7]])
            masks = tf.pad(masks, paddings)
            loss = tf.contrib.seq2seq.sequence_loss(logits, self.label_seq, masks)
            self.loss = tf.reduce_sum(loss)
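            # Worked example of the mask, for time_stage=30: a user registered on
            # day 21 has seq_length=10, so tf.sequence_mask(3, 23) yields
            # [1,1,1] + [0]*20, and the right-padding extends it to length 30.
            # Only the first 3 positions, whose 7-day future window lies fully
            # inside the observed 30 days, contribute to the loss.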
            # prediction: softmax taken at each user's last observed day
            batch_range = tf.range(tf.shape(logits)[0])
            ind = self.seq_length - 1
            indices = tf.stack([batch_range, ind], axis=1)
            logits = tf.gather_nd(logits, indices)
            self.out = tf.nn.softmax(logits)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                                                    epsilon=1e-8).minimize(self.loss)
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

    def _init_session(self):
        config = tf.ConfigProto()
        return tf.Session(config=config)

    def get_batch(self, feat_seq, label_seq, seq_length, register_type, device_type, batch_size, index):
        start = index * batch_size
        end = min((index + 1) * batch_size, len(feat_seq))
        return feat_seq[start:end], label_seq[start:end], seq_length[start:end], \
               register_type[start:end], device_type[start:end]

    def fit_on_batch(self, feat_seq, label_seq, seq_length, register_type, device_type):
        feed_dict = {self.feat_seq: feat_seq,
                     self.label_seq: label_seq,
                     self.seq_length: seq_length,
                     self.register_type: register_type,
                     self.device_type: device_type,
                     self.train_phase: True}
        loss, opt = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
        return loss

    def fit(self, feat_seq, label_seq, seq_length, register_type, device_type):
        for epoch in range(self.epoch):
            total_loss = 0.0
            total_size = 0.0
            batch_begin_time = time.time()
            # ceil so the final partial batch is not dropped
            total_batch = int(np.ceil(len(feat_seq) / self.batch_size))
            for i in range(total_batch):
                offset = i * self.batch_size
                end = min((i + 1) * self.batch_size, len(feat_seq))
                _feat_seq, _label_seq, _seq_length, _register_type, _device_type \
                    = self.get_batch(feat_seq, label_seq, seq_length, register_type, device_type, self.batch_size, i)
                batch_loss = self.fit_on_batch(_feat_seq, _label_seq, _seq_length, _register_type, _device_type)
                total_loss += batch_loss * (end - offset)
                total_size += end - offset
                if i % 100 == 99:
                    print('[%d, %5d] loss: %.6f time: %.1f s' %
                          (epoch + 1, i + 1, total_loss / total_size, time.time() - batch_begin_time))
                    total_loss = 0.0
                    total_size = 0.0
                    batch_begin_time = time.time()

    def predict(self, feat_seq, seq_length, register_type, device_type, y=[]):
        # dummy all-zero labels when no ground truth is supplied; the reported
        # loss is then meaningless, but the softmax outputs are unaffected
        if len(y) == 0:
            label_seq = np.zeros([feat_seq.shape[0], feat_seq.shape[1]])
        else:
            label_seq = y
        batch_index = 0
        batch_size = 4096
        _feat_seq, _label_seq, _seq_length, _register_type, _device_type \
            = self.get_batch(feat_seq, label_seq, seq_length, register_type, device_type, batch_size, batch_index)
        y_pred = None
        total_loss = 0.0
        total_size = 0.0
        while len(_seq_length) > 0:
            num_batch = len(_seq_length)
            feed_dict = {self.feat_seq: _feat_seq,
                         self.label_seq: _label_seq,
                         self.seq_length: _seq_length,
                         self.register_type: _register_type,
                         self.device_type: _device_type,
                         self.train_phase: False}
            batch_out, batch_loss = self.sess.run((self.out, self.loss), feed_dict=feed_dict)
            total_loss += batch_loss * num_batch
            total_size += num_batch
            if batch_index == 0:
                y_pred = np.reshape(batch_out, (num_batch, 2))
            else:
                y_pred = np.concatenate((y_pred, np.reshape(batch_out, (num_batch, 2))))
            batch_index += 1
            _feat_seq, _label_seq, _seq_length, _register_type, _device_type \
                = self.get_batch(feat_seq, label_seq, seq_length, register_type, device_type, batch_size, batch_index)
        print("valid loss is %.6f" % (total_loss / total_size))
        print("predict end")
        return y_pred
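# ---------------------------------------------------------------------------
# The 14 input channels built below are per-user, per-day 0/1 sequences aligned
# to each user's registration day: app launch, video creation, registration
# day, six activity action_type values (0-5), and five page values (0-4).
# register_type and device_type enter the model separately through the scalar
# embeddings defined in _init_graph.
# ---------------------------------------------------------------------------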
# offline evaluation: train on days 1-23, score against the day 24-30 truth
offline_label_seq, offline_seq_length = gen_label(1, 23)
offline_launch_seq = gen_day_seq(1, 23, 'launch')
offline_video_seq = gen_day_seq(1, 23, 'video')
offline_reg_seq = gen_day_seq(1, 23, 'reg')
offline_act0_seq = gen_day_seq(1, 23, 'act', 'action_type', 0)
offline_act1_seq = gen_day_seq(1, 23, 'act', 'action_type', 1)
offline_act2_seq = gen_day_seq(1, 23, 'act', 'action_type', 2)
offline_act3_seq = gen_day_seq(1, 23, 'act', 'action_type', 3)
offline_act4_seq = gen_day_seq(1, 23, 'act', 'action_type', 4)
offline_act5_seq = gen_day_seq(1, 23, 'act', 'action_type', 5)
offline_page0_seq = gen_day_seq(1, 23, 'act', 'page', 0)
offline_page1_seq = gen_day_seq(1, 23, 'act', 'page', 1)
offline_page2_seq = gen_day_seq(1, 23, 'act', 'page', 2)
offline_page3_seq = gen_day_seq(1, 23, 'act', 'page', 3)
offline_page4_seq = gen_day_seq(1, 23, 'act', 'page', 4)

# stack the 14 (n_users, 23) matrices into an (n_users, 23, 14) tensor
offline_data = np.stack([offline_launch_seq, offline_video_seq, offline_reg_seq,
                         offline_act0_seq, offline_act1_seq, offline_act2_seq,
                         offline_act3_seq, offline_act4_seq, offline_act5_seq,
                         offline_page0_seq, offline_page1_seq, offline_page2_seq,
                         offline_page3_seq, offline_page4_seq], axis=-1)

sub = register[register.day <= 23].copy()
truth = gen_truth(24)
# clip rare device ids into the last embedding bucket
sub['device_type'] = np.where(sub['device_type'] < 1999, sub['device_type'], 1999)
offline_register_type = sub['register_type'].values
offline_device_type = sub['device_type'].values
sub = sub[['user_id']].copy()
sub = sub.merge(truth, 'left', 'user_id')
sub = sub.fillna(0)

tf.reset_default_graph()
model = my_model(num_feat=14, time_stage=23, epoch=35, batch_size=512, learning_rate=0.001, num_layers=2)
# NOTE: the fit/predict calls were missing here in the dump; reconstructed by
# analogy with the online section below.
model.fit(offline_data, offline_label_seq, offline_seq_length, offline_register_type, offline_device_type)
pre = model.predict(offline_data, offline_seq_length, offline_register_type, offline_device_type)
sub['pre'] = pre[:, 1:2].reshape([-1])
print(roc_auc_score(sub['label'], sub['pre']))
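# Shape sanity check for the offline run, with n = number of users registered
# by day 23 (users registered later have empty sequences and are dropped):
#   offline_data.shape      -> (n, 23, 14)
#   offline_label_seq.shape -> (n, 23)
#   pre.shape               -> (n, 2)   # softmax over {inactive, active}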
# online submission: same pipeline over all 30 days; labels inside the last
# 7 days cannot be fully observed, so gen_label zero-pads them and the loss
# mask drops them, while predictions are read at each user's last day
train_label_seq, train_label_length = gen_label(1, 30)
train_launch_seq = gen_day_seq(1, 30, 'launch')
train_video_seq = gen_day_seq(1, 30, 'video')
train_reg_seq = gen_day_seq(1, 30, 'reg')
train_act0_seq = gen_day_seq(1, 30, 'act', 'action_type', 0)
train_act1_seq = gen_day_seq(1, 30, 'act', 'action_type', 1)
train_act2_seq = gen_day_seq(1, 30, 'act', 'action_type', 2)
train_act3_seq = gen_day_seq(1, 30, 'act', 'action_type', 3)
train_act4_seq = gen_day_seq(1, 30, 'act', 'action_type', 4)
train_act5_seq = gen_day_seq(1, 30, 'act', 'action_type', 5)
train_page0_seq = gen_day_seq(1, 30, 'act', 'page', 0)
train_page1_seq = gen_day_seq(1, 30, 'act', 'page', 1)
train_page2_seq = gen_day_seq(1, 30, 'act', 'page', 2)
train_page3_seq = gen_day_seq(1, 30, 'act', 'page', 3)
train_page4_seq = gen_day_seq(1, 30, 'act', 'page', 4)

train_data = np.stack([train_launch_seq, train_video_seq, train_reg_seq,
                       train_act0_seq, train_act1_seq, train_act2_seq,
                       train_act3_seq, train_act4_seq, train_act5_seq,
                       train_page0_seq, train_page1_seq, train_page2_seq,
                       train_page3_seq, train_page4_seq], axis=-1)

res = register[register.day <= 30].copy()
res['device_type'] = np.where(res['device_type'] < 1999, res['device_type'], 1999)

register_type = res['register_type'].values
device_type = res['device_type'].values
res = res[['user_id']].copy()

tf.reset_default_graph()
model = my_model(num_feat=14, time_stage=30, epoch=30, batch_size=512, learning_rate=0.001, num_layers=2)
model.fit(train_data, train_label_seq, train_label_length, register_type, device_type)
pre = model.predict(train_data, train_label_length, register_type, device_type)
res['pre'] = pre[:, 1:2].reshape([-1])
# submission format: user_id,probability with no header row
res.to_csv('submit.txt', index=False, header=False)

--------------------------------------------------------------------------------