├── README.md
├── utils.py
└── rnn.py

/README.md:
--------------------------------------------------------------------------------
# KuaiShou2018-RANK13-RNN
## A quick-and-dirty RNN: a single model scores a bit over 0.91 and is mainly useful in ensembles. Inputs are 14 daily 0/1 sequence features plus two type features (register_type, device_type).
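## Usage
Data paths are hard-coded in `utils.py` (`/mnt/datasets/fusai/...`). With those files in place, `python rnn.py` first runs an offline check (train on days 1-23, AUC scored against the day 24-30 truth), then retrains on days 1-30 and writes `submit.txt` (user_id, predicted probability, no header).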
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd

# Raw competition logs; numeric column indices are renamed to meaningful names.
register = pd.read_csv('/mnt/datasets/fusai/user_register_log.txt', sep='\t', header=None,
                       dtype={0: np.int32, 1: np.int8, 2: np.int16, 3: np.int16}
                       ).rename(columns={0: 'user_id', 1: 'day', 2: 'register_type', 3: 'device_type'})
activity = pd.read_csv('/mnt/datasets/fusai/user_activity_log.txt', sep='\t', header=None,
                       dtype={0: np.int32, 1: np.int8, 2: np.int8, 3: np.int32, 4: np.int32, 5: np.int8}
                       ).rename(columns={0: 'user_id', 1: 'day', 2: 'page', 3: 'video_id', 4: 'author_id', 5: 'action_type'})
launch = pd.read_csv('/mnt/datasets/fusai/app_launch_log.txt', sep='\t', header=None,
                     dtype={0: np.int32, 1: np.int8}).rename(columns={0: 'user_id', 1: 'day'})
video = pd.read_csv('/mnt/datasets/fusai/video_create_log.txt', sep='\t', header=None,
                    dtype={0: np.int32, 1: np.int8}).rename(columns={0: 'user_id', 1: 'day'})

def gen_truth(start_date, span=7):
    # Users active (launch / video create / activity) in [start_date, start_date+span),
    # restricted to users already registered before the window (保证都是已注册用户).
    # NOTE: the body of this function was corrupted in the dump; the version below
    # is reconstructed from how gen_truth is used elsewhere in this repo.
    end_date = start_date + span
    basic = register[register.day < start_date][['user_id']]
    l = launch[(launch.day >= start_date) & (launch.day < end_date)][['user_id']]
    v = video[(video.day >= start_date) & (video.day < end_date)][['user_id']]
    a = activity[(activity.day >= start_date) & (activity.day < end_date)][['user_id']]
    truth = pd.concat([l, v, a]).drop_duplicates()
    truth = truth.merge(basic, 'inner', 'user_id')
    truth['label'] = 1
    return truth

def gen_label(start, end):
    # Per-day labels: the label on day i is 1 if the user is active in days i+1..i+7.
    # NOTE: the first half of this function was corrupted in the dump; the loop
    # below is reconstructed to mirror gen_day_seq and to end in the surviving tail.
    max_len = end - start + 1
    data = register[['user_id']].copy()
    for i in range(start, end + 1):
        sub = register[register.day <= i][['user_id']].copy()
        truth = gen_truth(i + 1).rename(columns={'label': 'day%d' % i})
        sub = sub.merge(truth, 'left', 'user_id')
        sub = sub.fillna(0)
        data = data.merge(sub, 'left', 'user_id')
    data = data.fillna(-1)
    del data['user_id']
    data = data.values
    label_seq = []
    label_length = []
    for t in data:
        tt = list(t[t != -1])   # labels from the registration day onward
        l = len(tt)
        if l > 0:
            label_seq.append(tt + (max_len - l) * [0])
            label_length.append(l)
    return np.array(label_seq), label_length

def get_table(table):
    if table == 'launch':
        return launch
    elif table == 'reg':
        return register
    elif table == 'video':
        return video
    elif table == 'act':
        return activity

def gen_day_seq(start, end, table, type_columns=None, type_value=None):
    # For each user, build a 0/1 sequence over days start..end marking whether the
    # given event occurred that day. Days before registration are stripped, so
    # position 0 of each sequence is the registration day; the tail is padded
    # with -1 up to max_len. Users registered after `end` are dropped.
    max_len = end - start + 1
    data = register[['user_id']].copy()
    for i in range(start, end + 1):
        sub = register[register.day <= i][['user_id']].copy()
        t = get_table(table)
        t = t[t.day == i]
        if table == 'act':
            t = t[t[type_columns] == type_value]
        t = t[['user_id']].drop_duplicates()
        t['day%d' % i] = 1
        sub = sub.merge(t, 'left', 'user_id')
        sub = sub.fillna(0)
        data = data.merge(sub, 'left', 'user_id')
    data = data.fillna(-1)
    del data['user_id']
    data = data.values
    seq = []
    for t in data:
        tt = list(t[t != -1])
        l = len(tt)
        if l > 0:
            seq.append(tt + (max_len - l) * [-1])
    return np.array(seq)
--------------------------------------------------------------------------------
/rnn.py:
--------------------------------------------------------------------------------
import time
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from utils import *

class my_model():
    def __init__(self, num_feat, time_stage, epoch=2, batch_size=64, learning_rate=0.001,
                 random_seed=1011, hidden_size=[50, 50], num_layers=2):
        self.num_feat = num_feat
        self.time_stage = time_stage
        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.random_seed = random_seed
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self._init_graph()

    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.set_random_seed(self.random_seed)
            initializer = tf.random_uniform_initializer(-0.1, 0.1)

            # inputs: feat_seq is [batch, time_stage, num_feat]; label_seq holds the
            # per-day 0/1 targets; seq_length is the number of real (unpadded) days.
            self.feat_seq = tf.placeholder(tf.float32, [None, self.time_stage, self.num_feat], name='feat_seq')
            self.label_seq = tf.placeholder(tf.int32, [None, self.time_stage], name='label_seq')
            self.register_type = tf.placeholder(tf.int32, (None,), name='register_type')
            self.device_type = tf.placeholder(tf.int32, (None,), name='device_type')
            self.seq_length = tf.placeholder(tf.int32, (None,), name='seq_length')
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            # single-layer LSTM; a stacked variant was tried and left commented out
            cell = tf.nn.rnn_cell.LSTMCell(self.hidden_size[-1], state_is_tuple=True, initializer=initializer)
            def get_lstm_cell(rnn_size):
                return tf.contrib.rnn.LSTMCell(rnn_size, initializer=initializer)
            # multi_cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(self.hidden_size[i]) for i in range(self.num_layers)])
            output, state = tf.nn.dynamic_rnn(cell, self.feat_seq, dtype=tf.float32, sequence_length=self.seq_length)
            # output: [batch, time_stage, hidden_size[-1]]

            # scalar embeddings for the two categorical features, tiled over time
            regType_emb = tf.gather(tf.Variable(tf.truncated_normal(shape=[12, 1], mean=0.0, stddev=0.0001)), self.register_type)
            regType_emb = tf.tile(regType_emb, [1, self.time_stage])
            regType_emb = tf.expand_dims(regType_emb, -1)

            devType_emb = tf.gather(tf.Variable(tf.truncated_normal(shape=[2000, 1], mean=0.0, stddev=0.0001)), self.device_type)
            devType_emb = tf.tile(devType_emb, [1, self.time_stage])
            devType_emb = tf.expand_dims(devType_emb, -1)

            output = tf.concat([output, regType_emb], axis=-1)
            output = tf.concat([output, devType_emb], axis=-1)

            output = tf.reshape(output, [-1, self.hidden_size[-1] + 2])

            w2 = tf.Variable(tf.random_uniform([self.hidden_size[-1] + 2, 2], -0.1, 0.1))
            b2 = tf.Variable(tf.random_uniform([2], -0.1, 0.1))
            logits = tf.matmul(output, w2) + b2
            logits = tf.reshape(logits, [-1, self.time_stage, 2])

            # ignore the last 7 days of each sequence in the loss: their 7-day
            # future window extends past the observed data
            masks = tf.sequence_mask(self.seq_length - 7, self.time_stage - 7, dtype=tf.float32, name='masks')
            paddings = tf.constant([[0, 0], [0, 7]])
            masks = tf.pad(masks, paddings)
            loss = tf.contrib.seq2seq.sequence_loss(logits, self.label_seq, masks)
            self.loss = tf.reduce_sum(loss)
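            # Worked example of the mask, for time_stage=30: a user registered on
            # day 21 has seq_length=10, so tf.sequence_mask(3, 23) yields
            # [1,1,1] + [0]*20, and the right-padding extends it to length 30.
            # Only the first 3 positions, whose 7-day future window lies fully
            # inside the observed 30 days, contribute to the loss.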
            # prediction: softmax taken at each user's last observed day
            batch_range = tf.range(tf.shape(logits)[0])
            ind = self.seq_length - 1
            indices = tf.stack([batch_range, ind], axis=1)
            logits = tf.gather_nd(logits, indices)
            self.out = tf.nn.softmax(logits)

            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                                                    epsilon=1e-8).minimize(self.loss)
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

    def _init_session(self):
        config = tf.ConfigProto()
        return tf.Session(config=config)

    def get_batch(self, feat_seq, label_seq, seq_length, register_type, device_type, batch_size, index):
        start = index * batch_size
        end = min((index + 1) * batch_size, len(feat_seq))
        return feat_seq[start:end], label_seq[start:end], seq_length[start:end], \
               register_type[start:end], device_type[start:end]

    def fit_on_batch(self, feat_seq, label_seq, seq_length, register_type, device_type):
        feed_dict = {self.feat_seq: feat_seq,
                     self.label_seq: label_seq,
                     self.seq_length: seq_length,
                     self.register_type: register_type,
                     self.device_type: device_type,
                     self.train_phase: True}
        loss, opt = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
        return loss

    def fit(self, feat_seq, label_seq, seq_length, register_type, device_type):
        for epoch in range(self.epoch):
            total_loss = 0.0
            total_size = 0.0
            batch_begin_time = time.time()
            # ceil so the final partial batch is not dropped
            total_batch = int(np.ceil(len(feat_seq) / self.batch_size))
            for i in range(total_batch):
                offset = i * self.batch_size
                end = min((i + 1) * self.batch_size, len(feat_seq))
                _feat_seq, _label_seq, _seq_length, _register_type, _device_type \
                    = self.get_batch(feat_seq, label_seq, seq_length, register_type, device_type, self.batch_size, i)
                batch_loss = self.fit_on_batch(_feat_seq, _label_seq, _seq_length, _register_type, _device_type)
                total_loss += batch_loss * (end - offset)
                total_size += end - offset
                if i % 100 == 99:
                    print('[%d, %5d] loss: %.6f time: %.1f s' %
                          (epoch + 1, i + 1, total_loss / total_size, time.time() - batch_begin_time))
                    total_loss = 0.0
                    total_size = 0.0
                    batch_begin_time = time.time()

    def predict(self, feat_seq, seq_length, register_type, device_type, y=[]):
        # dummy all-zero labels when no ground truth is supplied; the reported
        # loss is then meaningless, but the softmax outputs are unaffected
        if len(y) == 0:
            label_seq = np.zeros([feat_seq.shape[0], feat_seq.shape[1]])
        else:
            label_seq = y
        batch_index = 0
        batch_size = 4096
        _feat_seq, _label_seq, _seq_length, _register_type, _device_type \
            = self.get_batch(feat_seq, label_seq, seq_length, register_type, device_type, batch_size, batch_index)
        y_pred = None
        total_loss = 0.0
        total_size = 0.0
        while len(_seq_length) > 0:
            num_batch = len(_seq_length)
            feed_dict = {self.feat_seq: _feat_seq,
                         self.label_seq: _label_seq,
                         self.seq_length: _seq_length,
                         self.register_type: _register_type,
                         self.device_type: _device_type,
                         self.train_phase: False}
            batch_out, batch_loss = self.sess.run((self.out, self.loss), feed_dict=feed_dict)
            total_loss += batch_loss * num_batch
            total_size += num_batch
            if batch_index == 0:
                y_pred = np.reshape(batch_out, (num_batch, 2))
            else:
                y_pred = np.concatenate((y_pred, np.reshape(batch_out, (num_batch, 2))))
            batch_index += 1
            _feat_seq, _label_seq, _seq_length, _register_type, _device_type \
                = self.get_batch(feat_seq, label_seq, seq_length, register_type, device_type, batch_size, batch_index)
        print("valid loss is %.6f" % (total_loss / total_size))
        print("predict end")
        return y_pred
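# ---------------------------------------------------------------------------
# The 14 input channels built below are per-user, per-day 0/1 sequences aligned
# to each user's registration day: app launch, video creation, registration
# day, six activity action_type values (0-5), and five page values (0-4).
# register_type and device_type enter the model separately through the scalar
# embeddings defined in _init_graph.
# ---------------------------------------------------------------------------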
# offline evaluation: train on days 1-23, score against the day 24-30 truth
offline_label_seq, offline_seq_length = gen_label(1, 23)
offline_launch_seq = gen_day_seq(1, 23, 'launch')
offline_video_seq = gen_day_seq(1, 23, 'video')
offline_reg_seq = gen_day_seq(1, 23, 'reg')
offline_act0_seq = gen_day_seq(1, 23, 'act', 'action_type', 0)
offline_act1_seq = gen_day_seq(1, 23, 'act', 'action_type', 1)
offline_act2_seq = gen_day_seq(1, 23, 'act', 'action_type', 2)
offline_act3_seq = gen_day_seq(1, 23, 'act', 'action_type', 3)
offline_act4_seq = gen_day_seq(1, 23, 'act', 'action_type', 4)
offline_act5_seq = gen_day_seq(1, 23, 'act', 'action_type', 5)
offline_page0_seq = gen_day_seq(1, 23, 'act', 'page', 0)
offline_page1_seq = gen_day_seq(1, 23, 'act', 'page', 1)
offline_page2_seq = gen_day_seq(1, 23, 'act', 'page', 2)
offline_page3_seq = gen_day_seq(1, 23, 'act', 'page', 3)
offline_page4_seq = gen_day_seq(1, 23, 'act', 'page', 4)

# stack the 14 (n_users, 23) matrices into an (n_users, 23, 14) tensor
offline_data = np.stack([offline_launch_seq, offline_video_seq, offline_reg_seq,
                         offline_act0_seq, offline_act1_seq, offline_act2_seq,
                         offline_act3_seq, offline_act4_seq, offline_act5_seq,
                         offline_page0_seq, offline_page1_seq, offline_page2_seq,
                         offline_page3_seq, offline_page4_seq], axis=-1)

sub = register[register.day <= 23].copy()
truth = gen_truth(24)
# clip rare device ids into the last embedding bucket
sub['device_type'] = np.where(sub['device_type'] < 1999, sub['device_type'], 1999)
offline_register_type = sub['register_type'].values
offline_device_type = sub['device_type'].values
sub = sub[['user_id']].copy()
sub = sub.merge(truth, 'left', 'user_id')
sub = sub.fillna(0)

tf.reset_default_graph()
model = my_model(num_feat=14, time_stage=23, epoch=35, batch_size=512, learning_rate=0.001, num_layers=2)
# NOTE: the fit/predict calls were missing here in the dump; reconstructed by
# analogy with the online section below.
model.fit(offline_data, offline_label_seq, offline_seq_length, offline_register_type, offline_device_type)
pre = model.predict(offline_data, offline_seq_length, offline_register_type, offline_device_type)
sub['pre'] = pre[:, 1:2].reshape([-1])
print(roc_auc_score(sub['label'], sub['pre']))
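# Shape sanity check for the offline run, with n = number of users registered
# by day 23 (users registered later have empty sequences and are dropped):
#   offline_data.shape      -> (n, 23, 14)
#   offline_label_seq.shape -> (n, 23)
#   pre.shape               -> (n, 2)   # softmax over {inactive, active}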
# online submission: same pipeline over all 30 days; labels inside the last
# 7 days cannot be fully observed, so gen_label zero-pads them and the loss
# mask drops them, while predictions are read at each user's last day
train_label_seq, train_label_length = gen_label(1, 30)
train_launch_seq = gen_day_seq(1, 30, 'launch')
train_video_seq = gen_day_seq(1, 30, 'video')
train_reg_seq = gen_day_seq(1, 30, 'reg')
train_act0_seq = gen_day_seq(1, 30, 'act', 'action_type', 0)
train_act1_seq = gen_day_seq(1, 30, 'act', 'action_type', 1)
train_act2_seq = gen_day_seq(1, 30, 'act', 'action_type', 2)
train_act3_seq = gen_day_seq(1, 30, 'act', 'action_type', 3)
train_act4_seq = gen_day_seq(1, 30, 'act', 'action_type', 4)
train_act5_seq = gen_day_seq(1, 30, 'act', 'action_type', 5)
train_page0_seq = gen_day_seq(1, 30, 'act', 'page', 0)
train_page1_seq = gen_day_seq(1, 30, 'act', 'page', 1)
train_page2_seq = gen_day_seq(1, 30, 'act', 'page', 2)
train_page3_seq = gen_day_seq(1, 30, 'act', 'page', 3)
train_page4_seq = gen_day_seq(1, 30, 'act', 'page', 4)

train_data = np.stack([train_launch_seq, train_video_seq, train_reg_seq,
                       train_act0_seq, train_act1_seq, train_act2_seq,
                       train_act3_seq, train_act4_seq, train_act5_seq,
                       train_page0_seq, train_page1_seq, train_page2_seq,
                       train_page3_seq, train_page4_seq], axis=-1)

res = register[register.day <= 30].copy()
res['device_type'] = np.where(res['device_type'] < 1999, res['device_type'], 1999)

register_type = res['register_type'].values
device_type = res['device_type'].values
res = res[['user_id']].copy()

tf.reset_default_graph()
model = my_model(num_feat=14, time_stage=30, epoch=30, batch_size=512, learning_rate=0.001, num_layers=2)
model.fit(train_data, train_label_seq, train_label_length, register_type, device_type)
pre = model.predict(train_data, train_label_length, register_type, device_type)
res['pre'] = pre[:, 1:2].reshape([-1])
# submission format: user_id,probability with no header row
res.to_csv('submit.txt', index=False, header=False)

--------------------------------------------------------------------------------