├── pic1.png
├── p_ed_rnn
│   ├── best_model.pt
│   ├── p5_edrnn_predict.py
│   └── p5_edrnn_train.py
├── data_process
│   ├── 1_label_split.py
│   ├── 5_sequence_match.py
│   ├── 2_device_info_fusion.py
│   ├── 3_behavior_process.py
│   └── 4_behavior_sequence.py
└── README.md

/pic1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Q-Qing/huawei_digix2021_track1/HEAD/pic1.png
--------------------------------------------------------------------------------
/p_ed_rnn/best_model.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Q-Qing/huawei_digix2021_track1/HEAD/p_ed_rnn/best_model.pt
--------------------------------------------------------------------------------
/data_process/1_label_split.py:
--------------------------------------------------------------------------------
1 | """
2 | Split the raw device_active.csv: expand each user's '#'-separated list of active days into 60 per-day flags.
3 | """
4 | import pandas as pd
5 | 
6 | df_active = pd.read_csv('2021_1_data/1_device_active.csv', sep="|")
7 | print(df_active)
8 | active_matrix = []
9 | for index, row in df_active.iterrows():
10 |     active_flag = [0]*60
11 |     active_days = row[1].split('#')
12 |     for day in active_days:
13 |         active_flag[int(day)-1] = 1
14 |     active_matrix.append(active_flag)
15 | 
16 | column_name = list(range(1, 61))
17 | df = pd.DataFrame(active_matrix, columns=column_name)
18 | df.insert(0, 'device_id', df_active['device_id'])
19 | print(df)
20 | df.to_csv('processed_data/device_active.csv', index=False)
21 | 
--------------------------------------------------------------------------------
/data_process/5_sequence_match.py:
--------------------------------------------------------------------------------
1 | """
2 | Reorder page_action_sequence so that it follows the device_id order of df_active_user.
3 | """
4 | import pandas as pd
5 | import numpy as np
6 | 
7 | 
8 | result = np.load('processed_data/page_action_sequence.npy')
9 | id_list = np.load('processed_data/behavior_ids.npy')
10 | 
11 | df_active = pd.read_csv('processed_data/device_active_with_info.csv')
12 | 
13 | matched_result = np.zeros((2000000, 60, 300, 2), dtype='uint8')
14 | for i in range(len(id_list)):
15 |     id = id_list[i]
16 |     id_index = df_active.index[df_active['device_id']==id].tolist()[0]
17 |     matched_result[id_index, :, :, :] = result[i, :, :, :]
18 |     if i % 10000 == 0:
19 |         print(i)
20 | 
21 | np.save('processed_data/matched_page_action_sequence.npy', matched_result)
22 | 
23 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Huawei DIGIX 2021 Track 1: First-Place Solution
2 | 
3 | ### 1. Task
4 | The competition provides 60 days of behavior data for 2 million anonymous users of a music app: whether each user was active on a given day, the songs they listened to, the pages they opened and the actions they performed each day, as well as basic data about the users and the songs. The goal is to predict every user's retention over the following 30 days.
5 | 
6 | ### 2. Approach
7 | We know the users' past behavior and must predict their future behavior, so the task is essentially a multi-step time-series forecasting problem. We solve it with an RNN encoder-decoder: the encoder learns a representation of the historical behavior, and the decoder generates the future predictions from that representation.
8 | 
9 | ### 3. Solution
10 | * **Data processing**
11 | 
12 | The raw data is first split into three categories:
13 | 1. Basic user information (age, gender, city, device, VIP status, number of followed topics)
14 | 2. Daily static data (whether the user was active that day, how many songs were played that day, the channel through which the app was first opened that day)
15 | 3. Daily behavior sequences (the sequences of pages opened, actions performed and songs listened to that day)
16 | 
17 | The three categories differ in dimensionality: basic information is a 2-D array [users × features], daily static data is a 3-D array [users × days × features], and behavior sequences form a 4-D array [users × days × sequence length × features]. Missing values in all three categories are filled with a constant, namely a value that never occurs in the data (-1).
18 | 
19 | * **Model framework**
20 | 
21 | The basic structure of the model is shown in the figure below:
22 | 
23 | ![avatar](pic1.png)
24 | 
25 | 
26 | First, the behavior-sequence data *S_t* is passed through an RNN, and the hidden state of the RNN at the last time step, *c_t*, is used as the representation of the sequence data. *c_t* and the static data *x_t* are then fed together into parallel encoder-decoders, and an attention mechanism weights the outputs of these encoder-decoders to obtain the weighted decoder hidden state *w_ht*. Finally, the basic user information *u* is concatenated and the result is fed into a multi-layer perceptron to predict retention. A simplified, self-contained sketch of this forward pass (using toy tensors with the three shapes described above) is given right after this README.
27 | 
28 | 
29 | 
30 | 
31 | 
--------------------------------------------------------------------------------
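To make the model description above concrete, here is a heavily simplified PyTorch sketch of that forward pass. It is only an illustration, not the competition model: the class name and hidden sizes are made up for this sketch, the discrete page/action/channel ids are fed as raw floats instead of being embedded, every GRU has a single layer, and the sliding-window training scheme is omitted. The faithful implementation is the Parallel_ED_RNN class in p_ed_rnn/p5_edrnn_train.py and p_ed_rnn/p5_edrnn_predict.py further below.

```python
import torch
import torch.nn as nn


class TinyParallelEDSketch(nn.Module):
    """Toy-sized illustration of the pipeline in pic1.png: per-day sequence RNN (c_t),
    parallel GRU encoder-decoders, attention over their outputs (w_ht), then an MLP
    on [w_ht, u]. Not the repo's implementation (see Parallel_ED_RNN below)."""

    def __init__(self, static_dim=3, seq_feat_dim=2, user_dim=6,
                 seq_hidden=4, ed_hidden=8, parallel_num=2):
        super().__init__()
        self.seq_rnn = nn.GRU(seq_feat_dim, seq_hidden, batch_first=True)
        self.encoders = nn.ModuleList(
            [nn.GRU(static_dim + seq_hidden, ed_hidden, batch_first=True)
             for _ in range(parallel_num)])
        self.decoders = nn.ModuleList(
            [nn.GRUCell(ed_hidden, ed_hidden) for _ in range(parallel_num)])
        self.attn = nn.Linear(ed_hidden, 1)
        self.mlp = nn.Sequential(nn.Linear(ed_hidden + user_dim, 16),
                                 nn.ReLU(), nn.Linear(16, 1))

    def forward(self, static_x, seq_x, user_u, target_len):
        # seq_x: [B, days, steps, feat] -> one representation c_t per day
        # (last hidden state of the sequence RNN).
        b, d, s, f = seq_x.shape
        _, c_t = self.seq_rnn(seq_x.reshape(b * d, s, f))
        c_t = c_t[-1].reshape(b, d, -1)                    # [B, days, seq_hidden]
        x = torch.cat([static_x, c_t], dim=-1)             # encoder input: x_t and c_t

        branch_outs = []
        for enc, dec in zip(self.encoders, self.decoders):
            e_out, _ = enc(x)                              # [B, days, ed_hidden]
            dec_in = e_out[:, -1, :]                       # last encoder output feeds the decoder
            state = torch.zeros_like(dec_in)
            steps = []
            for _ in range(target_len):                    # unroll the decoder target_len steps
                state = dec(dec_in, state)
                steps.append(state)
            branch_outs.append(torch.stack(steps, dim=1))  # [B, target_len, ed_hidden]
        stacked = torch.stack(branch_outs, dim=2)          # [B, target_len, P, ed_hidden]

        # attention over the parallel encoder-decoders -> weighted decoder state w_ht
        w = torch.softmax(self.attn(stacked).squeeze(-1), dim=-1)
        w_ht = (w.unsqueeze(-1) * stacked).sum(dim=2)      # [B, target_len, ed_hidden]

        u = user_u.unsqueeze(1).expand(-1, target_len, -1)  # broadcast user info over time
        return torch.sigmoid(self.mlp(torch.cat([w_ht, u], dim=-1)))  # [B, target_len, 1]


if __name__ == "__main__":
    model = TinyParallelEDSketch()
    retention = model(torch.randn(2, 30, 3),     # daily static data [users, days, 3]
                      torch.randn(2, 30, 5, 2),  # behavior sequences [users, days, steps, 2]
                      torch.randn(2, 6),         # basic user information [users, features]
                      target_len=30)
    print(retention.shape)                       # torch.Size([2, 30, 1])
```

The toy tensors in the `__main__` block mirror the three data categories from the data-processing section: daily static data [users × days × 3], behavior sequences [users × days × steps × 2], and basic user information [users × features].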
/data_process/2_device_info_fusion.py:
--------------------------------------------------------------------------------
1 | """
2 | Process the user profile information and merge it into device_active.csv.
3 | """
4 | 
5 | import pandas as pd
6 | 
7 | # data processing
8 | df_userinfo = pd.read_csv('2021_1_data/2_user_info.csv', sep="|")
9 | print(df_userinfo)
10 | # count missing values
11 | print(df_userinfo.isna().sum())
12 | # value distribution of every column
13 | for i in range(6):
14 |     print(df_userinfo.iloc[:, i+1].value_counts())
15 | 
16 | # process missing values:
17 | # gender -1, age -1, is_vip -1, topics 0 (the topic count is extracted below)
18 | df_fill = df_userinfo.fillna({'gender':-1, 'age':-1, "is_vip": -1, "topics":0})
19 | print(df_fill)
20 | 
21 | for index, row in df_fill.iterrows():
22 |     if row['topics'] != 0:
23 |         topic_num = len(row['topics'].split('#'))
24 |         # print(row['topics'], topic_num)
25 |         df_fill.loc[index, 'topics'] = topic_num
26 | 
27 | print(df_fill['topics'].value_counts())
28 | df_fill.to_csv('processed_data/user_info1.csv', index=False)
29 | 
30 | # data fusion: merge the profile columns into the activity table
31 | df_info = pd.read_csv('processed_data/user_info1.csv')
32 | df_device = pd.read_csv('processed_data/device_active.csv')
33 | 
34 | df = df_device.merge(df_info, how='left', on='device_id')
35 | print(df)
36 | df.to_csv('processed_data/device_active_with_info.csv', index=False)
37 | 
38 | 
--------------------------------------------------------------------------------
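A side note on 2_device_info_fusion.py above: the per-row iterrows loop that turns the '#'-separated topics string into a topic count is straightforward but slow at two million rows. The snippet below is a hedged sketch of a vectorized equivalent, written under the assumption that non-missing topics values are '#'-separated strings (as the script implies); it is a suggestion, not code from the repository.

```python
import pandas as pd

# Toy frame standing in for df_fill after the fillna step in 2_device_info_fusion.py;
# 0 marks a topics value that was originally missing.
df_fill = pd.DataFrame({'topics': ['a#b#c', 0, 'x', 0]})

# Count topics for the string rows in one vectorized pass instead of iterrows.
is_str = df_fill['topics'].apply(lambda t: isinstance(t, str))
df_fill.loc[is_str, 'topics'] = df_fill.loc[is_str, 'topics'].str.count('#') + 1
print(df_fill['topics'].tolist())  # [3, 0, 1, 0]
```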
/data_process/3_behavior_process.py:
--------------------------------------------------------------------------------
1 | """
2 | Build per-day behavior statistics for the 2,000,000 users: daily active flag, number of songs played, first-open channel.
3 | mixed_behavior.npy: 2,000,000 * 60 * 3
4 | """
5 | import pandas as pd
6 | import numpy as np
7 | 
8 | pd.set_option('display.max_columns', None)
9 | pd.set_option('display.width', 4000)
10 | 
11 | # extract the per-day counts
12 | df = pd.read_csv('2021_1_data/4_user_behavior.csv', sep="|")
13 | print(df)
14 | print(df['channel'].value_counts())
15 | 
16 | 
17 | def num_extract(strs):
18 |     # if not np.isnan(strs):
19 |     num = len(str(strs).split('#'))
20 |     # else:
21 |     #     num = 0
22 |     return num
23 | 
24 | total_nums = []
25 | for index, row in df.iterrows():
26 |     # print(row[2])
27 |     num_pages = num_extract(row[2])
28 |     num_music = num_extract(row[3])
29 |     num_actions = num_extract(row[4])
30 |     total_nums.append([num_pages, num_music, num_actions])
31 | 
32 | df_num = pd.DataFrame(total_nums, columns=['num_pages', 'num_music', 'num_actions'])
33 | df_num.insert(0, 'device_id', df['device_id'])
34 | df_num.insert(1, 'day', df['day'])
35 | df_num.insert(5, 'channel', df['channel'])
36 | print(df_num)
37 | df_num.to_csv('processed_data/behavior_num.csv', index=False)
38 | 
39 | 
40 | # check whether the counts agree (the page, song and action counts turn out to be identical)
41 | df_num = pd.read_csv('processed_data/behavior_num.csv')
42 | same_num = 0
43 | for index, row in df_num.iterrows():
44 |     if row[2] == row[3] and row[3] == row[4]:
45 |         same_num += 1
46 | 
47 | print(df_num)
48 | print(same_num)
49 | 
50 | 
51 | # merge the active flags with the behavior statistics into a 3-D tensor (users * days * features = 2,000,000 * 60 * 3)
52 | df_num = pd.read_csv('processed_data/behavior_num.csv')
53 | print(df_num['channel'].value_counts())
54 | df_num = df_num.fillna({'channel': -1})
55 | print(df_num['channel'].value_counts())
56 | channel_to_ix = {channel: i for i, channel in enumerate(df_num['channel'].unique())}
57 | print(channel_to_ix)
58 | channel_idx = [channel_to_ix[d] for d in df_num['channel']]
59 | # after the +1 shift, 0 means the app was not opened that day and 1 means the channel value was missing
60 | df_num['channel_idx'] = np.array(channel_idx) + 1
61 | print(df_num)
62 | 
63 | result_tensor = []
64 | df_active = pd.read_csv('processed_data/device_active_with_info.csv')
65 | for index, row in df_active.iterrows():
66 |     device_id = row['device_id']
67 |     # the 60 daily activity flags
68 |     active_days = row[1:61].values
69 |     user_behavior = np.zeros((60, 3))
70 |     user_behavior[:, 0] = active_days
71 |     df_behavior = df_num.loc[df_num['device_id'] == device_id]
72 |     if df_behavior.empty:
73 |         result_tensor.append(user_behavior.tolist())
74 |     else:
75 |         for index_b, row_b in df_behavior.iterrows():
76 |             day = int(row_b['day'])
77 |             user_behavior[day-1, 1] = row_b['num_pages']
78 |             user_behavior[day-1, 2] = row_b['channel_idx']
79 |         result_tensor.append(user_behavior.tolist())
80 | result = np.array(result_tensor)
81 | np.save('processed_data/mixed_behavior.npy', result)
82 | 
83 | 
84 | 
--------------------------------------------------------------------------------
/data_process/4_behavior_sequence.py:
--------------------------------------------------------------------------------
1 | """
2 | Assemble the per-day behavior sequences,
3 | i.e. the pages opened and the songs listened to each day.
4 | """
5 | import pandas as pd
6 | import numpy as np
7 | 
8 | 
9 | pd.set_option('display.max_columns', None)
10 | pd.set_option('display.width', 4000)
11 | 
12 | 
13 | def replace_by_dict(d, arrays):
14 | 
15 |     replace_list = [d[i] for i in arrays]
16 |     return replace_list
17 | 
18 | 
19 | def update_seq(row_b, seq):
20 |     day = int(row_b['day'])
21 |     pages = str(row_b['pages']).replace('null', '-1')
22 |     pages = pages.replace('nan', '-1').split('#')
23 |     pages = list(map(int, pages))
24 |     music_ids = str(row_b['music_ids']).replace('null', '-1')
25 |     music_ids = music_ids.replace('nan', '-1').split('#')
26 |     music_ids = list(map(int, music_ids))
27 |     actions = str(row_b['actions']).replace('null', '-1')
28 |     actions = actions.replace('nan', '-1').split('#')
29 |     actions = list(map(int, actions))
30 |     if len(pages) <= 300:
31 |         seq_len = len(pages)
32 |     else:
33 |         seq_len = 300
34 |     seq[day - 1, 0:seq_len, 0] = pages[0:seq_len]
35 |     seq[day - 1, 0:seq_len, 1] = actions[0:seq_len]
36 |     seq[day - 1, 0:seq_len, 2] = music_ids[0:seq_len]
37 |     return seq
38 | 
39 | 
40 | def update_page_action_seq(row_b, seq, page_dict):
41 |     day = int(row_b['day'])
42 |     pages = str(row_b['pages']).replace('null', '-1')
43 |     pages = pages.replace('nan', '-1').split('#')
44 |     pages = list(map(int, pages))
45 |     pages = replace_by_dict(page_dict, pages)
46 |     actions = list(map(int, str(row_b['actions']).split('#')))
47 | 
48 |     if len(pages) <= 300:
49 |         seq_len = len(pages)
50 |     else:
51 |         seq_len = 300
52 |     seq[day - 1, 0:seq_len, 0] = pages[0:seq_len]
53 |     seq[day - 1, 0:seq_len, 1] = actions[0:seq_len]
54 |     return seq
55 | 
56 | 
57 | df = pd.read_csv('2021_1_data/4_user_behavior.csv', sep="|")
58 | df = df.sort_values(by=['device_id'], ignore_index=True)
59 | print(df)
60 | print(df.dtypes)
61 | uni_pages = [-1, 7, 9, 13, 16, 17, 18, 19, 20, 21, 22, 26, 28, 29, 30, 32, 34, 36, 38, 40, 44, 45, 47]
62 | pages_to_ix = {page: i+1 for i, page in enumerate(uni_pages)}
63 | print(pages_to_ix)
64 | 
65 | result_tensor = []
66 | last_device = None
67 | id_list = 
[] 68 | for index, row in df.iterrows(): 69 | device_id = row['device_id'] 70 | if last_device is None: 71 | print(index, device_id) 72 | user_behavior_seq = np.zeros((60, 300, 2), dtype='uint8') 73 | user_behavior_seq = update_page_action_seq(row, user_behavior_seq, pages_to_ix) 74 | last_device = device_id 75 | continue 76 | if device_id != last_device: 77 | print(index, device_id) 78 | id_list.append(last_device) 79 | result_tensor.append(user_behavior_seq) 80 | # if result_tensor is None: 81 | # result_tensor = user_behavior_seq 82 | # else: 83 | # result_tensor = np.append(result_tensor, user_behavior_seq, 0) 84 | user_behavior_seq = np.zeros((60, 300, 2), dtype='uint8') 85 | user_behavior_seq = update_page_action_seq(row, user_behavior_seq, pages_to_ix) 86 | last_device = device_id 87 | else: 88 | user_behavior_seq = update_page_action_seq(row, user_behavior_seq, pages_to_ix) 89 | # last_device = device_id 90 | 91 | result = np.array(result_tensor) 92 | np.save('processed_data/behavior_ids.npy', np.array(id_list)) 93 | np.save('processed_data/page_action_sequence.npy', result) 94 | print('save npy') 95 | 96 | -------------------------------------------------------------------------------- /p_ed_rnn/p5_edrnn_predict.py: -------------------------------------------------------------------------------- 1 | """ 2 | 并行编码器解码器 3 | 基于GRU构建的多个编码器和解码器,其中解码器的输入是编码器得到隐藏状态 4 | 每个编码器的输入包括用户每天行为的统计情况和每天行为序列的隐藏状态 5 | """ 6 | 7 | import torch 8 | import torch.nn as nn 9 | import os 10 | import time 11 | from torch.autograd import Variable 12 | from sklearn.metrics import roc_auc_score 13 | import numpy as np 14 | import pandas as pd 15 | from torch.utils.data import TensorDataset, DataLoader 16 | 17 | 18 | class Gru_Encoder(nn.Module): 19 | """ encoder time series """ 20 | 21 | def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False): 22 | super(Gru_Encoder, self).__init__() 23 | self.input_size = input_size 24 | self.hidden_size = hidden_size 25 | self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=0, bidirectional=bidirectional) 26 | 27 | def forward(self, e_input, h0): 28 | # output: batch_size * L * hidden_size 29 | # hn: 1 * batch_size * hidden_size 30 | output, hn = self.gru(e_input, h0) 31 | return output, hn 32 | 33 | 34 | class Gru_Decoder(nn.Module): 35 | """ decoder, input is hidden state of encoder """ 36 | 37 | def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False): 38 | super(Gru_Decoder, self).__init__() 39 | self.input_size = input_size 40 | self.hidden_size = hidden_size 41 | self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=0, bidirectional=bidirectional) 42 | 43 | def forward(self, d_input, h0): 44 | # output: batch_size * L * hidden_size 45 | # hn: 1 * batch_size * hidden_size 46 | output, hn = self.gru(d_input, h0) 47 | return output, hn 48 | 49 | 50 | class Rnn_Sequence(nn.Module): 51 | 52 | def __init__(self, seq_input, seq_hidden, seq_rnn_layer, bi, seq_num_embedding, seq_embedding_dim, 53 | use_gpu, device): 54 | super(Rnn_Sequence, self).__init__() 55 | self.seq_input = seq_input 56 | self.seq_hidden = seq_hidden 57 | self.seq_emb = nn.ModuleList() 58 | self.num_layers = seq_rnn_layer 59 | # self.seq_num_embedding = seq_num_embedding 60 | total_embdim = 0 61 | for i in range(len(seq_num_embedding)): 62 | self.seq_emb.append(nn.Embedding(seq_num_embedding[i], seq_embedding_dim[i])) 63 | total_embdim += seq_embedding_dim[i] - 1 64 | 65 | self.rnn = 
nn.GRU(seq_input+total_embdim, seq_hidden, num_layers=seq_rnn_layer, batch_first=True, 66 | dropout=0, bidirectional=bi) 67 | self.use_gpu = use_gpu 68 | self.device = device 69 | 70 | def forward(self, input): 71 | batch_size = input.shape[0] 72 | days = input.shape[1] 73 | # input: batch_size * days * 300 * 2 74 | embed_input = None 75 | for i, seq_embed in enumerate(self.seq_emb): 76 | embed_seq = seq_embed(input[:, :, :, i].long()) 77 | if embed_input is None: 78 | embed_input = embed_seq 79 | else: 80 | embed_input = torch.cat((embed_input, embed_seq), 3) 81 | seq_len = embed_input.shape[2] 82 | input_size = embed_input.shape[3] 83 | seq_h0 = self.init_hidden(batch_size*days, self.seq_hidden) 84 | # embed_input: batch_size * days * 300 * 2embed_size 85 | rnn_out, rnn_h = self.rnn(embed_input.view(-1, seq_len, input_size), seq_h0) 86 | # return: batch_size * days * hidden_size 87 | return rnn_out[:, -1, :].view(batch_size, days, -1) 88 | 89 | def init_hidden(self, batch_size, hidden_size): 90 | if self.use_gpu and self.device == 0: 91 | h0 = torch.zeros(self.num_layers, batch_size, hidden_size).cuda() 92 | elif self.use_gpu and self.device == 1: 93 | h0 = torch.zeros(self.num_layers, batch_size, hidden_size).cuda(device=torch.device('cuda:1')) 94 | else: 95 | h0 = torch.zeros(self.num_layers, batch_size, hidden_size) 96 | return h0 97 | 98 | 99 | class Gru_Encoder_Decoder(nn.Module): 100 | 101 | def __init__(self, e_input_size, e_hidden_size, d_input_size, d_hidden_size, use_gpu, device, 102 | num_layers=1, bidirectional=False): 103 | super(Gru_Encoder_Decoder, self).__init__() 104 | self.use_gpu = use_gpu 105 | self.device = device 106 | self.num_layers = num_layers 107 | self.e_hidden_size = e_hidden_size 108 | self.d_hidden_size = d_hidden_size 109 | self.encoder = Gru_Encoder(e_input_size, e_hidden_size, num_layers, bidirectional) 110 | self.decoder = Gru_Decoder(d_input_size, d_hidden_size, num_layers, bidirectional) 111 | 112 | def forward(self, input, target_len): 113 | """ 114 | :param input: input data (batch_size * days * e_input_size) 115 | :param target_len: time steps of output 116 | :return: 117 | """ 118 | 119 | batch_size = input.size(0) 120 | final_output = self.init_output(batch_size, target_len) 121 | e_ho = self.init_hidden(batch_size, self.e_hidden_size) 122 | d_h0 = self.init_hidden(batch_size, self.d_hidden_size) 123 | # e_output: batch_size * time_steps * e_hidden_size 124 | e_output, e_hn = self.encoder(input, e_ho) 125 | for i in range(target_len): 126 | # decoder_input: batch_size * 1 * e_hidden_size 127 | decoder_input = torch.unsqueeze(e_output[:, -1, :], 1) 128 | d_output, d_hn = self.decoder(decoder_input, d_h0) 129 | d_h0 = d_hn 130 | 131 | final_output[:, i, :] = d_output[:, -1, :] 132 | # final_output: batch_size * target_len * d_hidden_size 133 | return final_output 134 | 135 | def init_output(self, batch_size, target_len): 136 | if self.use_gpu and self.device == 0: 137 | output = torch.zeros(batch_size, target_len, self.d_hidden_size).cuda() 138 | elif self.use_gpu and self.device == 1: 139 | output = torch.zeros(batch_size, target_len, self.d_hidden_size).cuda(device=torch.device('cuda:1')) 140 | else: 141 | output = torch.zeros(batch_size, target_len, self.d_hidden_size) 142 | return output 143 | 144 | def init_hidden(self, batch_size, hidden_size): 145 | if self.use_gpu and self.device == 0: 146 | h0 = torch.zeros(self.num_layers, batch_size, hidden_size).cuda() 147 | elif self.use_gpu and self.device == 1: 148 | h0 = torch.zeros(self.num_layers, 
batch_size, hidden_size).cuda(device=torch.device('cuda:1')) 149 | else: 150 | h0 = torch.zeros(self.num_layers, batch_size, hidden_size) 151 | return h0 152 | 153 | 154 | class Parallel_ED_RNN(nn.Module): 155 | 156 | def __init__(self, 157 | input_size, user_info_size, e_hidden_size, d_hidden_size, parallel_num, 158 | mlp_hidden_size, mlp_layer, dp, 159 | beha_num_embedding, beha_embedding_dim, num_embeddings, embedding_dim, 160 | seq_input, seq_hidden, seq_rnn_layer, seq_bi, seq_num_emb, seq_emb_dim, 161 | device, use_gpu, gru_layer=1): 162 | super(Parallel_ED_RNN, self).__init__() 163 | 164 | self.seq_hidden = seq_hidden 165 | self.rnn_seq = Rnn_Sequence(seq_input, seq_hidden, seq_rnn_layer, seq_bi, seq_num_emb, seq_emb_dim, 166 | use_gpu, device) 167 | 168 | self.beha_embedding_dim = beha_embedding_dim 169 | self.beha_emb = nn.Embedding(num_embeddings=beha_num_embedding, embedding_dim=beha_embedding_dim) 170 | 171 | self.e_input_size = input_size + beha_embedding_dim + seq_hidden - 1 172 | self.d_input_size = e_hidden_size 173 | self.e_hidden_size = e_hidden_size 174 | self.d_hidden_size = d_hidden_size 175 | self.gru_eds = nn.ModuleList() 176 | self.parallel_num = parallel_num 177 | for i in range(parallel_num): 178 | self.gru_eds.append(Gru_Encoder_Decoder(self.e_input_size, self.e_hidden_size, 179 | self.d_input_size, self.d_hidden_size, 180 | use_gpu=use_gpu, device=device)) 181 | self.atten_linear = nn.Linear(d_hidden_size, 1) 182 | self.softmax = nn.Softmax(dim=-1) 183 | 184 | self.emb = nn.ModuleList() 185 | total_embdim = 0 186 | for i in range(len(num_embeddings)): 187 | self.emb.append(nn.Embedding(num_embeddings=num_embeddings[i], embedding_dim=embedding_dim[i])) 188 | total_embdim += embedding_dim[i] - 1 189 | 190 | self.userinfo_size = user_info_size 191 | self.mlp_hidden_size = mlp_hidden_size 192 | self.mlp_layer = mlp_layer 193 | self.num_layers = gru_layer 194 | if mlp_layer > 0: 195 | self.mlp_model = nn.Sequential( 196 | nn.Linear(self.d_hidden_size + self.userinfo_size + total_embdim, self.mlp_hidden_size), 197 | nn.ReLU(), nn.Dropout(dp)) 198 | if self.mlp_layer > 1: 199 | for i in range(self.mlp_layer - 1): 200 | self.mlp_model.add_module("linear{}".format(i), nn.Linear(self.mlp_hidden_size, mlp_hidden_size)) 201 | self.mlp_model.add_module("active{}".format(i), nn.ReLU()) 202 | self.mlp_model.add_module("dropout{}".format(i), nn.Dropout(dp)) 203 | self.fc = nn.Linear(self.mlp_hidden_size, 1) 204 | else: 205 | self.fc = nn.Linear(self.d_hidden_size + self.userinfo_size + total_embdim, 1) 206 | 207 | self.device = device 208 | self.use_gpu = use_gpu 209 | 210 | def forward(self, input, seq_input, user_info, target_len): 211 | """ 212 | :param input: input data (batch_size * days * 3), 3 dimensions are active, music number, channel 213 | :param seq_input: behaviour sequence data (batch_size * days * 100 * features) features include page and action 214 | :param user_info: basic information about user (age, gender, vip, topics, device, city) 215 | :param target_len: time steps of output 216 | :return: 217 | """ 218 | # 行为序列转化为隐藏状态 219 | # seq_out : batch_size * days * seq_hidden 220 | seq_out = self.rnn_seq(seq_input) 221 | 222 | # 对input data中的离散变量做embedding 223 | # channel: batch_size * time_steps 224 | channel = input[:, :, -1] 225 | # emb_channel: batch_size * time_steps * embedding_dims 226 | emb_channel = self.beha_emb(channel.long()) 227 | 228 | # 合并序列的隐藏状态和input data,作为ED的最终输入 229 | # final_input: batch_size * time_steps * embedding_dims+2 230 | final_input = 
torch.cat((input[:, :, 0:2], emb_channel, seq_out), 2) 231 | 232 | # gru_eds_output : batch_size * target_len * parallel_num * d_hidden_size 233 | gru_eds_output = None 234 | for i, gru_ed in enumerate(self.gru_eds): 235 | # ed_output : batch_size * target_len * 1 * d_hidden_size 236 | ed_output = torch.unsqueeze(gru_ed(final_input, target_len), 2) 237 | if gru_eds_output is None: 238 | gru_eds_output = ed_output 239 | else: 240 | gru_eds_output = torch.cat((gru_eds_output, ed_output), 2) 241 | 242 | # linear attention 243 | # weights: batch_size * target_len * parallel_num 244 | batch_size = input.shape[0] 245 | weights = self.softmax(self.atten_linear(gru_eds_output).view(batch_size, target_len, self.parallel_num)) 246 | # weighted_sum : batch_size * target_len * d_hidden_size 247 | weighted_sum = \ 248 | torch.matmul(torch.unsqueeze(weights, 2), gru_eds_output).view(batch_size, target_len, self.d_hidden_size) 249 | 250 | # 个人信息嵌入并合并 251 | # continue_info : batch_size * info_dim 252 | continue_info = user_info[:, 0:4] 253 | for i, embed in enumerate(self.emb): 254 | embedding_info = embed(user_info[:, 4 + i].long()) 255 | continue_info = torch.cat((continue_info, embedding_info), 1) 256 | 257 | final_output = None 258 | for i in range(target_len): 259 | # 合并continue_info与weighted_sum 260 | merge_input = torch.cat((weighted_sum[:, i, :], continue_info), 1) 261 | if self.mlp_layer > 0: 262 | fc_input = self.mlp_model(merge_input) 263 | model_output = torch.sigmoid(self.fc(fc_input)) 264 | else: 265 | model_output = torch.sigmoid(self.fc(merge_input)) 266 | # model_output: batch_size * 1 267 | if final_output is None: 268 | final_output = model_output 269 | else: 270 | final_output = torch.cat((final_output, model_output), 1) 271 | 272 | # final_output: batch_size * target_len * 1 273 | return torch.unsqueeze(final_output, -1) 274 | 275 | def train_model(self, train_dataloader, num_epochs, path, name, 276 | learning_rate_decay=0, learning_rate=0.01, a=False, start_epoch=1): 277 | 278 | text_path = os.path.join(path, name + '.txt') 279 | model_path = os.path.join(path, name + '.pt') 280 | if a: 281 | f = open(text_path, 'a+') 282 | f.write('Reload trained model\r\n') 283 | else: 284 | f = open(text_path, 'w+') 285 | f.write('Model Structure\r\n') 286 | f.write(str(self) + '\r\n') 287 | f.close() 288 | print('Model Structure: ', self) 289 | print('Start Training ... 
') 290 | if self.use_gpu and self.device == 0: 291 | print("Let's use GPU 0!") 292 | self.cuda() 293 | 294 | if self.use_gpu and self.device == 1: 295 | print("Let's use GPU 1!") 296 | self.cuda(device=torch.device('cuda:1')) 297 | 298 | criterion = nn.BCELoss() 299 | 300 | for epoch in range(num_epochs): 301 | model_path = os.path.join(path, name +'epoch'+ str(epoch+start_epoch) + '.pt') 302 | f = open(text_path, 'a+') 303 | optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) 304 | if learning_rate_decay != 0: 305 | if epoch % learning_rate_decay == 0: 306 | learning_rate = learning_rate / 2 307 | optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) 308 | f.write('at epoch {} learning_rate is updated to {}\r\n'.format(epoch, learning_rate)) 309 | print('at epoch {} learning_rate is updated to {}'.format(epoch, learning_rate)) 310 | 311 | losses, aucs = [], [] 312 | self.train() 313 | pre_time = time.time() 314 | for train_data, test_data, info_data, beha_seq in train_dataloader: 315 | if self.use_gpu and self.device == 0: 316 | train_data, test_data, info_data, beha_seq = Variable(train_data.cuda()), Variable(test_data.cuda()), \ 317 | Variable(info_data.cuda()), Variable(beha_seq.cuda()) 318 | if self.use_gpu and self.device == 1: 319 | cuda1 = torch.device('cuda:1') 320 | train_data, test_data, info_data, beha_seq = Variable(train_data.cuda(device=cuda1)), Variable( 321 | test_data.cuda(device=cuda1)), Variable(info_data.cuda(device=cuda1)), \ 322 | Variable(beha_seq.cuda(device=cuda1)) 323 | if not self.use_gpu: 324 | train_data, test_data, info_data, beha_seq = Variable(train_data), Variable(test_data), \ 325 | Variable(info_data), Variable(beha_seq) 326 | optimizer.zero_grad() 327 | 328 | # predict 30 days 329 | train_data_30 = train_data.clone() 330 | beha_seq_30 = beha_seq[:, 0:30, :, :] 331 | pred_30 = self(train_data_30, beha_seq_30, info_data, 30) 332 | selected_pred_30, selected_label_30 = self.select_specific_days([0,1,2,6,13,29], pred_30, test_data) 333 | 334 | # predict 14 days 335 | # days_shift_14 = list(range(1, 17)) 336 | # final_pred_14, final_label_14 = \ 337 | # self.sliding_predict(days_shift_14, train_data, test_data, info_data, [0, 1, 2, 6, 13], 14) 338 | 339 | # predict 7 days 340 | # days_shift_7 = list(range(17, 24)) 341 | # final_pred_7, final_label_7 = \ 342 | # self.sliding_predict(days_shift_7, train_data, test_data, info_data, [0, 1, 2, 6], 7) 343 | 344 | # predict 3 days 345 | days_shift = list(range(1, 28)) 346 | # days_shift = [3,6,9,12,15,18,21,24,27] 347 | select_days = [0, 1, 2] 348 | final_pred_3, final_label_3 = \ 349 | self.sliding_predict(days_shift, train_data, test_data, beha_seq, info_data, select_days, 3) 350 | 351 | # predict 1 days 352 | days_shift_1 = list(range(28, 29)) 353 | final_pred_1, final_label_1 = \ 354 | self.sliding_predict(days_shift_1, train_data, test_data, beha_seq, info_data, [0], 1) 355 | 356 | final_label_all = torch.cat((selected_label_30, final_label_3, final_label_1), 0) 357 | final_pred_all = torch.cat((selected_pred_30, final_pred_3, final_pred_1), 0) 358 | loss = criterion(final_pred_all, final_label_all) 359 | # loss = criterion(selected_pred_30, selected_label_30) 360 | losses.append(loss.item()) 361 | loss.backward() 362 | optimizer.step() 363 | # train_auc = roc_auc_score(selected_label_30.tolist(), selected_pred_30.tolist()) 364 | train_auc = roc_auc_score(final_label_all.tolist(), final_pred_all.tolist()) 365 | aucs.append(train_auc) 366 | 367 | train_loss = np.mean(losses) 368 | 
ave_auc = np.mean(aucs) 369 | 370 | a = "Epoch: {} Train loss: {:.6f}, Train auc:{:.6f}, Time is {:.2f} \r\n".format(epoch, train_loss, ave_auc, 371 | time.time() - pre_time) 372 | print(a) 373 | f.write(a) 374 | f.close() 375 | torch.save(self.state_dict(), model_path) 376 | 377 | final_test_pred = self.predict_model(train_dataloader) 378 | return final_test_pred 379 | 380 | def predict_model(self, test_dataloader): 381 | 382 | with torch.no_grad(): 383 | self.eval() 384 | pre_time = time.time() 385 | final_test_pred = None 386 | for train_data, test_data, info_data, beha_seq in test_dataloader: 387 | if self.use_gpu and self.device == 0: 388 | train_data, test_data, info_data, beha_seq = Variable(train_data.cuda()), Variable( 389 | test_data.cuda()), Variable(info_data.cuda()), Variable(beha_seq.cuda()) 390 | if self.use_gpu and self.device == 1: 391 | cuda1 = torch.device('cuda:1') 392 | train_data, test_data, info_data, beha_seq = Variable(train_data.cuda(device=cuda1)), \ 393 | Variable(test_data.cuda(device=cuda1)), \ 394 | Variable(info_data.cuda(device=cuda1)), \ 395 | Variable(beha_seq.cuda(device=cuda1)) 396 | if not self.use_gpu: 397 | train_data, test_data, info_data, beha_seq = Variable(train_data), Variable(test_data), \ 398 | Variable(info_data), Variable(beha_seq) 399 | 400 | # batch_size * 60 * 1 401 | batch_size = train_data.shape[0] 402 | input_data = torch.cat((train_data, test_data), 1) 403 | test_pred = self(input_data, beha_seq, info_data, 30) 404 | test_output = torch.zeros(batch_size, 6) 405 | 406 | select_days = [0, 1, 2, 6, 13, 29] 407 | for i in range(6): 408 | test_output[:, i] = torch.squeeze(test_pred)[:, select_days[i]] 409 | if final_test_pred is None: 410 | final_test_pred = test_output 411 | else: 412 | final_test_pred = torch.cat((final_test_pred, test_output), 0) 413 | print("predict time is {:.2f}".format(time.time() - pre_time)) 414 | return final_test_pred 415 | 416 | def sliding_predict(self, days_shift, train_x, test_x, bahavior_seq, info_data, days, tar_len): 417 | """ 418 | spliding on train dataset to predict target days 419 | :param days_shift: a list which means the end indexs of train_data 420 | :param train_x: 421 | :param test_x: 422 | :param bahavior_seq: batch_size * 60 * 300 * 2 423 | :param info_data: 424 | :param days: 425 | :param tar_len: output length 426 | :return: 427 | """ 428 | final_pred, final_label = None, None 429 | for day_shift in days_shift: 430 | slid_train_x = torch.cat((train_x, test_x[:, 0:day_shift, :]), 1) 431 | slid_test_x = test_x[:, day_shift:, :].clone() 432 | slid_beha_seq = bahavior_seq[:, 0:30+day_shift, :, :] 433 | pred = self(slid_train_x, slid_beha_seq, info_data, tar_len) 434 | select_pred, select_label = self.select_specific_days(days, pred, slid_test_x) 435 | if final_pred is None and final_label is None: 436 | final_pred = select_pred 437 | final_label = select_label 438 | else: 439 | final_label = torch.cat((final_label, select_label), 0) 440 | final_pred = torch.cat((final_pred, select_pred), 0) 441 | return final_pred, final_label 442 | 443 | def select_specific_days(self, days, pred_x, label_x): 444 | """ 445 | 从未来x天中选择特定的几天 446 | :param days: 447 | :param pred_x: 448 | :param label_x: 449 | :return: 450 | """ 451 | # shape of pred_x is batch_size * x * 2 452 | select_pred, select_label = None, None 453 | for j in days: 454 | if select_label is None and select_pred is None: 455 | select_pred = pred_x[:, j, 0] 456 | select_label = label_x[:, j, 0] 457 | else: 458 | select_label = 
torch.cat((select_label, label_x[:, j, 0]), 0) 459 | select_pred = torch.cat((select_pred, pred_x[:, j, 0]), 0) 460 | # shape of returned select_pred is len(days)batch_size 461 | return select_pred, select_label 462 | 463 | 464 | def prepare_data(path, behavior_path, seq_path, batch_size=100): 465 | """ 466 | 467 | :param path: 468 | :param days:1,2,3,7,14,30 469 | :param batch_size: 470 | :return: 471 | """ 472 | df = pd.read_csv(path) 473 | 474 | user_info = df.loc[:, ['gender', 'age', 'is_vip', 'topics']].values 475 | num_embedding = [df['device'].nunique(), df['city'].nunique()] 476 | device_to_ix = {device: i for i, device in enumerate(df['device'].unique())} 477 | device_idx = [device_to_ix[d] for d in df['device']] 478 | # print(device_to_ix) 479 | # print(df['device']) 480 | # print(device_idx) 481 | city_to_ix = {city: i for i, city in enumerate(df['city'].unique())} 482 | city_idx = [city_to_ix[d] for d in df['city']] 483 | user_info = np.column_stack((user_info, device_idx)) 484 | user_info = np.column_stack((user_info, city_idx)) 485 | print(num_embedding) 486 | user_info = user_info.astype('float32') 487 | 488 | # load train_data and test_data 489 | data = np.load(behavior_path) 490 | data = data.astype('float32') 491 | channel_nums = int(np.max(data[:, :, -1])) + 1 492 | print(channel_nums) 493 | train_data = data[:, 0:30, :] 494 | test_data = data[:, 30:, :] 495 | 496 | # bahavior sequence data 497 | seq_data = np.load(seq_path) 498 | 499 | train_dataset = TensorDataset(torch.from_numpy(train_data), torch.from_numpy(test_data), 500 | torch.from_numpy(user_info), torch.from_numpy(seq_data[:, :, 0:100, :])) 501 | train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size) 502 | # test_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size) 503 | 504 | return train_dataloader, num_embedding, channel_nums 505 | 506 | 507 | if __name__ == "__main__": 508 | 509 | data_path = "../../data/final_processed_data/equal_select_device_active_with_info.csv" 510 | behavior_path = "../../data/final_processed_data/equal_select_mixed_behavior.npy" 511 | seq_path = "../../data/final_processed_data/equal_select_matched_p_a_seq100.npy" 512 | 513 | df = pd.read_csv(data_path) 514 | 515 | train_dataloader, num_emb, channel_nums = prepare_data(data_path, behavior_path, seq_path, batch_size=500) 516 | structure = '5parallel_p_a100' 517 | name = 'seq2h31aug' 518 | model = Parallel_ED_RNN(input_size=3, user_info_size=6, e_hidden_size=64, d_hidden_size=64, parallel_num=5, 519 | mlp_hidden_size=32, mlp_layer=1, dp=0.2, 520 | num_embeddings=num_emb, embedding_dim=[4, 4], beha_num_embedding=channel_nums, beha_embedding_dim=4, 521 | seq_input=2, seq_hidden=2, seq_rnn_layer=1, seq_bi=False, seq_num_emb=[24, 9], seq_emb_dim=[4, 4], 522 | device=0, use_gpu=True) 523 | model_path = 'best_model.pt' 524 | model.load_state_dict(torch.load(model_path, map_location='cpu')) 525 | # test_pred = model.train_model(train_dataloader, num_epochs=8, path='results', name=name + structure, 526 | # learning_rate_decay=4) 527 | # test_pred = model.train_model(train_dataloader, num_epochs=1, path='results', name='seq2h31aug5eopch2'+structure, 528 | # learning_rate_decay=5, learning_rate=0.005, a=True) 529 | model.cuda() 530 | test_pred = model.predict_model(train_dataloader) 531 | print(test_pred) 532 | df2 = pd.DataFrame(test_pred.tolist(), 533 | columns=['label_1d', 'label_2d', 'label_3d', 'label_7d', 'label_14d', 'label_30d']) 534 | df2.insert(0, 'device_id', df['device_id']) 535 | 
df2 = df2.round(3) 536 | df2.to_csv('submission.csv', index=False) 537 | -------------------------------------------------------------------------------- /p_ed_rnn/p5_edrnn_train.py: -------------------------------------------------------------------------------- 1 | """ 2 | 并行编码器解码器 3 | 基于GRU构建的多个编码器和解码器,其中解码器的输入是编码器得到隐藏状态 4 | 每个编码器的输入包括用户每天行为的统计情况和每天行为序列的隐藏状态 5 | """ 6 | 7 | import torch 8 | import torch.nn as nn 9 | import os 10 | import time 11 | from torch.autograd import Variable 12 | from sklearn.metrics import roc_auc_score 13 | import numpy as np 14 | import pandas as pd 15 | from torch.utils.data import TensorDataset, DataLoader 16 | 17 | 18 | class Gru_Encoder(nn.Module): 19 | """ encoder time series """ 20 | 21 | def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False): 22 | super(Gru_Encoder, self).__init__() 23 | self.input_size = input_size 24 | self.hidden_size = hidden_size 25 | self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=0, bidirectional=bidirectional) 26 | 27 | def forward(self, e_input, h0): 28 | # output: batch_size * L * hidden_size 29 | # hn: 1 * batch_size * hidden_size 30 | output, hn = self.gru(e_input, h0) 31 | return output, hn 32 | 33 | 34 | class Gru_Decoder(nn.Module): 35 | """ decoder, input is hidden state of encoder """ 36 | 37 | def __init__(self, input_size, hidden_size, num_layers=1, bidirectional=False): 38 | super(Gru_Decoder, self).__init__() 39 | self.input_size = input_size 40 | self.hidden_size = hidden_size 41 | self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=0, bidirectional=bidirectional) 42 | 43 | def forward(self, d_input, h0): 44 | # output: batch_size * L * hidden_size 45 | # hn: 1 * batch_size * hidden_size 46 | output, hn = self.gru(d_input, h0) 47 | return output, hn 48 | 49 | 50 | class Rnn_Sequence(nn.Module): 51 | 52 | def __init__(self, seq_input, seq_hidden, seq_rnn_layer, bi, seq_num_embedding, seq_embedding_dim, 53 | use_gpu, device): 54 | super(Rnn_Sequence, self).__init__() 55 | self.seq_input = seq_input 56 | self.seq_hidden = seq_hidden 57 | self.seq_emb = nn.ModuleList() 58 | self.num_layers = seq_rnn_layer 59 | # self.seq_num_embedding = seq_num_embedding 60 | total_embdim = 0 61 | for i in range(len(seq_num_embedding)): 62 | self.seq_emb.append(nn.Embedding(seq_num_embedding[i], seq_embedding_dim[i])) 63 | total_embdim += seq_embedding_dim[i] - 1 64 | 65 | self.rnn = nn.GRU(seq_input+total_embdim, seq_hidden, num_layers=seq_rnn_layer, batch_first=True, 66 | dropout=0, bidirectional=bi) 67 | self.use_gpu = use_gpu 68 | self.device = device 69 | 70 | def forward(self, input): 71 | batch_size = input.shape[0] 72 | days = input.shape[1] 73 | # input: batch_size * days * 300 * 2 74 | embed_input = None 75 | for i, seq_embed in enumerate(self.seq_emb): 76 | embed_seq = seq_embed(input[:, :, :, i].long()) 77 | if embed_input is None: 78 | embed_input = embed_seq 79 | else: 80 | embed_input = torch.cat((embed_input, embed_seq), 3) 81 | seq_len = embed_input.shape[2] 82 | input_size = embed_input.shape[3] 83 | seq_h0 = self.init_hidden(batch_size*days, self.seq_hidden) 84 | # embed_input: batch_size * days * 300 * 2embed_size 85 | rnn_out, rnn_h = self.rnn(embed_input.view(-1, seq_len, input_size), seq_h0) 86 | # return: batch_size * days * hidden_size 87 | return rnn_out[:, -1, :].view(batch_size, days, -1) 88 | 89 | def init_hidden(self, batch_size, hidden_size): 90 | if self.use_gpu and self.device == 0: 91 | h0 = 
torch.zeros(self.num_layers, batch_size, hidden_size).cuda() 92 | elif self.use_gpu and self.device == 1: 93 | h0 = torch.zeros(self.num_layers, batch_size, hidden_size).cuda(device=torch.device('cuda:1')) 94 | else: 95 | h0 = torch.zeros(self.num_layers, batch_size, hidden_size) 96 | return h0 97 | 98 | 99 | class Gru_Encoder_Decoder(nn.Module): 100 | 101 | def __init__(self, e_input_size, e_hidden_size, d_input_size, d_hidden_size, use_gpu, device, 102 | num_layers=1, bidirectional=False): 103 | super(Gru_Encoder_Decoder, self).__init__() 104 | self.use_gpu = use_gpu 105 | self.device = device 106 | self.num_layers = num_layers 107 | self.e_hidden_size = e_hidden_size 108 | self.d_hidden_size = d_hidden_size 109 | self.encoder = Gru_Encoder(e_input_size, e_hidden_size, num_layers, bidirectional) 110 | self.decoder = Gru_Decoder(d_input_size, d_hidden_size, num_layers, bidirectional) 111 | 112 | def forward(self, input, target_len): 113 | """ 114 | :param input: input data (batch_size * days * e_input_size) 115 | :param target_len: time steps of output 116 | :return: 117 | """ 118 | 119 | batch_size = input.size(0) 120 | final_output = self.init_output(batch_size, target_len) 121 | e_ho = self.init_hidden(batch_size, self.e_hidden_size) 122 | d_h0 = self.init_hidden(batch_size, self.d_hidden_size) 123 | # e_output: batch_size * time_steps * e_hidden_size 124 | e_output, e_hn = self.encoder(input, e_ho) 125 | for i in range(target_len): 126 | # decoder_input: batch_size * 1 * e_hidden_size 127 | decoder_input = torch.unsqueeze(e_output[:, -1, :], 1) 128 | d_output, d_hn = self.decoder(decoder_input, d_h0) 129 | d_h0 = d_hn 130 | 131 | final_output[:, i, :] = d_output[:, -1, :] 132 | # final_output: batch_size * target_len * d_hidden_size 133 | return final_output 134 | 135 | def init_output(self, batch_size, target_len): 136 | if self.use_gpu and self.device == 0: 137 | output = torch.zeros(batch_size, target_len, self.d_hidden_size).cuda() 138 | elif self.use_gpu and self.device == 1: 139 | output = torch.zeros(batch_size, target_len, self.d_hidden_size).cuda(device=torch.device('cuda:1')) 140 | else: 141 | output = torch.zeros(batch_size, target_len, self.d_hidden_size) 142 | return output 143 | 144 | def init_hidden(self, batch_size, hidden_size): 145 | if self.use_gpu and self.device == 0: 146 | h0 = torch.zeros(self.num_layers, batch_size, hidden_size).cuda() 147 | elif self.use_gpu and self.device == 1: 148 | h0 = torch.zeros(self.num_layers, batch_size, hidden_size).cuda(device=torch.device('cuda:1')) 149 | else: 150 | h0 = torch.zeros(self.num_layers, batch_size, hidden_size) 151 | return h0 152 | 153 | 154 | class Parallel_ED_RNN(nn.Module): 155 | 156 | def __init__(self, 157 | input_size, user_info_size, e_hidden_size, d_hidden_size, parallel_num, 158 | mlp_hidden_size, mlp_layer, dp, 159 | beha_num_embedding, beha_embedding_dim, num_embeddings, embedding_dim, 160 | seq_input, seq_hidden, seq_rnn_layer, seq_bi, seq_num_emb, seq_emb_dim, 161 | device, use_gpu, gru_layer=1): 162 | super(Parallel_ED_RNN, self).__init__() 163 | 164 | self.seq_hidden = seq_hidden 165 | self.rnn_seq = Rnn_Sequence(seq_input, seq_hidden, seq_rnn_layer, seq_bi, seq_num_emb, seq_emb_dim, 166 | use_gpu, device) 167 | 168 | self.beha_embedding_dim = beha_embedding_dim 169 | self.beha_emb = nn.Embedding(num_embeddings=beha_num_embedding, embedding_dim=beha_embedding_dim) 170 | 171 | self.e_input_size = input_size + beha_embedding_dim + seq_hidden - 1 172 | self.d_input_size = e_hidden_size 173 | 
self.e_hidden_size = e_hidden_size 174 | self.d_hidden_size = d_hidden_size 175 | self.gru_eds = nn.ModuleList() 176 | self.parallel_num = parallel_num 177 | for i in range(parallel_num): 178 | self.gru_eds.append(Gru_Encoder_Decoder(self.e_input_size, self.e_hidden_size, 179 | self.d_input_size, self.d_hidden_size, 180 | use_gpu=use_gpu, device=device)) 181 | self.atten_linear = nn.Linear(d_hidden_size, 1) 182 | self.softmax = nn.Softmax(dim=-1) 183 | 184 | self.emb = nn.ModuleList() 185 | total_embdim = 0 186 | for i in range(len(num_embeddings)): 187 | self.emb.append(nn.Embedding(num_embeddings=num_embeddings[i], embedding_dim=embedding_dim[i])) 188 | total_embdim += embedding_dim[i] - 1 189 | 190 | self.userinfo_size = user_info_size 191 | self.mlp_hidden_size = mlp_hidden_size 192 | self.mlp_layer = mlp_layer 193 | self.num_layers = gru_layer 194 | if mlp_layer > 0: 195 | self.mlp_model = nn.Sequential( 196 | nn.Linear(self.d_hidden_size + self.userinfo_size + total_embdim, self.mlp_hidden_size), 197 | nn.ReLU(), nn.Dropout(dp)) 198 | if self.mlp_layer > 1: 199 | for i in range(self.mlp_layer - 1): 200 | self.mlp_model.add_module("linear{}".format(i), nn.Linear(self.mlp_hidden_size, mlp_hidden_size)) 201 | self.mlp_model.add_module("active{}".format(i), nn.ReLU()) 202 | self.mlp_model.add_module("dropout{}".format(i), nn.Dropout(dp)) 203 | self.fc = nn.Linear(self.mlp_hidden_size, 1) 204 | else: 205 | self.fc = nn.Linear(self.d_hidden_size + self.userinfo_size + total_embdim, 1) 206 | 207 | self.device = device 208 | self.use_gpu = use_gpu 209 | 210 | def forward(self, input, seq_input, user_info, target_len): 211 | """ 212 | :param input: input data (batch_size * days * 3), 3 dimensions are active, music number, channel 213 | :param seq_input: behaviour sequence data (batch_size * days * 100 * features) features include page and action 214 | :param user_info: basic information about user (age, gender, vip, topics, device, city) 215 | :param target_len: time steps of output 216 | :return: 217 | """ 218 | # 行为序列转化为隐藏状态 219 | # seq_out : batch_size * days * seq_hidden 220 | seq_out = self.rnn_seq(seq_input) 221 | 222 | # 对input data中的离散变量做embedding 223 | # channel: batch_size * time_steps 224 | channel = input[:, :, -1] 225 | # emb_channel: batch_size * time_steps * embedding_dims 226 | emb_channel = self.beha_emb(channel.long()) 227 | 228 | # 合并序列的隐藏状态和input data,作为ED的最终输入 229 | # final_input: batch_size * time_steps * embedding_dims+2 230 | final_input = torch.cat((input[:, :, 0:2], emb_channel, seq_out), 2) 231 | 232 | # gru_eds_output : batch_size * target_len * parallel_num * d_hidden_size 233 | gru_eds_output = None 234 | for i, gru_ed in enumerate(self.gru_eds): 235 | # ed_output : batch_size * target_len * 1 * d_hidden_size 236 | ed_output = torch.unsqueeze(gru_ed(final_input, target_len), 2) 237 | if gru_eds_output is None: 238 | gru_eds_output = ed_output 239 | else: 240 | gru_eds_output = torch.cat((gru_eds_output, ed_output), 2) 241 | 242 | # linear attention 243 | # weights: batch_size * target_len * parallel_num 244 | batch_size = input.shape[0] 245 | weights = self.softmax(self.atten_linear(gru_eds_output).view(batch_size, target_len, self.parallel_num)) 246 | # weighted_sum : batch_size * target_len * d_hidden_size 247 | weighted_sum = \ 248 | torch.matmul(torch.unsqueeze(weights, 2), gru_eds_output).view(batch_size, target_len, self.d_hidden_size) 249 | 250 | # 个人信息嵌入并合并 251 | # continue_info : batch_size * info_dim 252 | continue_info = user_info[:, 0:4] 253 | for i, 
embed in enumerate(self.emb): 254 | embedding_info = embed(user_info[:, 4 + i].long()) 255 | continue_info = torch.cat((continue_info, embedding_info), 1) 256 | 257 | final_output = None 258 | for i in range(target_len): 259 | # 合并continue_info与weighted_sum 260 | merge_input = torch.cat((weighted_sum[:, i, :], continue_info), 1) 261 | if self.mlp_layer > 0: 262 | fc_input = self.mlp_model(merge_input) 263 | model_output = torch.sigmoid(self.fc(fc_input)) 264 | else: 265 | model_output = torch.sigmoid(self.fc(merge_input)) 266 | # model_output: batch_size * 1 267 | if final_output is None: 268 | final_output = model_output 269 | else: 270 | final_output = torch.cat((final_output, model_output), 1) 271 | 272 | # final_output: batch_size * target_len * 1 273 | return torch.unsqueeze(final_output, -1) 274 | 275 | def train_model(self, train_dataloader, num_epochs, path, name, 276 | learning_rate_decay=0, learning_rate=0.01, a=False, start_epoch=1): 277 | 278 | text_path = os.path.join(path, name + '.txt') 279 | model_path = os.path.join(path, name + '.pt') 280 | if a: 281 | f = open(text_path, 'a+') 282 | f.write('Reload trained model\r\n') 283 | else: 284 | f = open(text_path, 'w+') 285 | f.write('Model Structure\r\n') 286 | f.write(str(self) + '\r\n') 287 | f.close() 288 | print('Model Structure: ', self) 289 | print('Start Training ... ') 290 | if self.use_gpu and self.device == 0: 291 | print("Let's use GPU 0!") 292 | self.cuda() 293 | 294 | if self.use_gpu and self.device == 1: 295 | print("Let's use GPU 1!") 296 | self.cuda(device=torch.device('cuda:1')) 297 | 298 | criterion = nn.BCELoss() 299 | 300 | for epoch in range(num_epochs): 301 | model_path = os.path.join(path, name +'epoch'+ str(epoch+start_epoch) + '.pt') 302 | f = open(text_path, 'a+') 303 | optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) 304 | if learning_rate_decay != 0: 305 | if epoch % learning_rate_decay == 0: 306 | learning_rate = learning_rate / 2 307 | optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate) 308 | f.write('at epoch {} learning_rate is updated to {}\r\n'.format(epoch, learning_rate)) 309 | print('at epoch {} learning_rate is updated to {}'.format(epoch, learning_rate)) 310 | 311 | losses, aucs = [], [] 312 | self.train() 313 | pre_time = time.time() 314 | for train_data, test_data, info_data, beha_seq in train_dataloader: 315 | if self.use_gpu and self.device == 0: 316 | train_data, test_data, info_data, beha_seq = Variable(train_data.cuda()), Variable(test_data.cuda()), \ 317 | Variable(info_data.cuda()), Variable(beha_seq.cuda()) 318 | if self.use_gpu and self.device == 1: 319 | cuda1 = torch.device('cuda:1') 320 | train_data, test_data, info_data, beha_seq = Variable(train_data.cuda(device=cuda1)), Variable( 321 | test_data.cuda(device=cuda1)), Variable(info_data.cuda(device=cuda1)), \ 322 | Variable(beha_seq.cuda(device=cuda1)) 323 | if not self.use_gpu: 324 | train_data, test_data, info_data, beha_seq = Variable(train_data), Variable(test_data), \ 325 | Variable(info_data), Variable(beha_seq) 326 | optimizer.zero_grad() 327 | 328 | # predict 30 days 329 | train_data_30 = train_data.clone() 330 | beha_seq_30 = beha_seq[:, 0:30, :, :] 331 | pred_30 = self(train_data_30, beha_seq_30, info_data, 30) 332 | selected_pred_30, selected_label_30 = self.select_specific_days([0,1,2,6,13,29], pred_30, test_data) 333 | 334 | # predict 14 days 335 | # days_shift_14 = list(range(1, 17)) 336 | # final_pred_14, final_label_14 = \ 337 | # self.sliding_predict(days_shift_14, train_data, 
test_data, info_data, [0, 1, 2, 6, 13], 14) 338 | 339 | # predict 7 days 340 | # days_shift_7 = list(range(17, 24)) 341 | # final_pred_7, final_label_7 = \ 342 | # self.sliding_predict(days_shift_7, train_data, test_data, info_data, [0, 1, 2, 6], 7) 343 | 344 | # predict 3 days 345 | days_shift = list(range(1, 28)) 346 | # days_shift = [3,6,9,12,15,18,21,24,27] 347 | select_days = [0, 1, 2] 348 | final_pred_3, final_label_3 = \ 349 | self.sliding_predict(days_shift, train_data, test_data, beha_seq, info_data, select_days, 3) 350 | 351 | # predict 1 days 352 | days_shift_1 = list(range(28, 29)) 353 | final_pred_1, final_label_1 = \ 354 | self.sliding_predict(days_shift_1, train_data, test_data, beha_seq, info_data, [0], 1) 355 | 356 | final_label_all = torch.cat((selected_label_30, final_label_3, final_label_1), 0) 357 | final_pred_all = torch.cat((selected_pred_30, final_pred_3, final_pred_1), 0) 358 | loss = criterion(final_pred_all, final_label_all) 359 | # loss = criterion(selected_pred_30, selected_label_30) 360 | losses.append(loss.item()) 361 | loss.backward() 362 | optimizer.step() 363 | # train_auc = roc_auc_score(selected_label_30.tolist(), selected_pred_30.tolist()) 364 | train_auc = roc_auc_score(final_label_all.tolist(), final_pred_all.tolist()) 365 | aucs.append(train_auc) 366 | 367 | train_loss = np.mean(losses) 368 | ave_auc = np.mean(aucs) 369 | 370 | a = "Epoch: {} Train loss: {:.6f}, Train auc:{:.6f}, Time is {:.2f} \r\n".format(epoch, train_loss, ave_auc, 371 | time.time() - pre_time) 372 | print(a) 373 | f.write(a) 374 | f.close() 375 | torch.save(self.state_dict(), model_path) 376 | 377 | final_test_pred = self.predict_model(train_dataloader) 378 | return final_test_pred 379 | 380 | def predict_model(self, test_dataloader): 381 | 382 | with torch.no_grad(): 383 | self.eval() 384 | pre_time = time.time() 385 | final_test_pred = None 386 | for train_data, test_data, info_data, beha_seq in test_dataloader: 387 | if self.use_gpu and self.device == 0: 388 | train_data, test_data, info_data, beha_seq = Variable(train_data.cuda()), Variable( 389 | test_data.cuda()), Variable(info_data.cuda()), Variable(beha_seq.cuda()) 390 | if self.use_gpu and self.device == 1: 391 | cuda1 = torch.device('cuda:1') 392 | train_data, test_data, info_data, beha_seq = Variable(train_data.cuda(device=cuda1)), \ 393 | Variable(test_data.cuda(device=cuda1)), \ 394 | Variable(info_data.cuda(device=cuda1)), \ 395 | Variable(beha_seq.cuda(device=cuda1)) 396 | if not self.use_gpu: 397 | train_data, test_data, info_data, beha_seq = Variable(train_data), Variable(test_data), \ 398 | Variable(info_data), Variable(beha_seq) 399 | 400 | # batch_size * 60 * 1 401 | batch_size = train_data.shape[0] 402 | input_data = torch.cat((train_data, test_data), 1) 403 | test_pred = self(input_data, beha_seq, info_data, 30) 404 | test_output = torch.zeros(batch_size, 6) 405 | 406 | select_days = [0, 1, 2, 6, 13, 29] 407 | for i in range(6): 408 | test_output[:, i] = torch.squeeze(test_pred)[:, select_days[i]] 409 | if final_test_pred is None: 410 | final_test_pred = test_output 411 | else: 412 | final_test_pred = torch.cat((final_test_pred, test_output), 0) 413 | print("predict time is {:.2f}".format(time.time() - pre_time)) 414 | return final_test_pred 415 | 416 | def sliding_predict(self, days_shift, train_x, test_x, bahavior_seq, info_data, days, tar_len): 417 | """ 418 | spliding on train dataset to predict target days 419 | :param days_shift: a list which means the end indexs of train_data 420 | :param train_x: 421 
| :param test_x: 422 | :param bahavior_seq: batch_size * 60 * 300 * 2 423 | :param info_data: 424 | :param days: 425 | :param tar_len: output length 426 | :return: 427 | """ 428 | final_pred, final_label = None, None 429 | for day_shift in days_shift: 430 | slid_train_x = torch.cat((train_x, test_x[:, 0:day_shift, :]), 1) 431 | slid_test_x = test_x[:, day_shift:, :].clone() 432 | slid_beha_seq = bahavior_seq[:, 0:30+day_shift, :, :] 433 | pred = self(slid_train_x, slid_beha_seq, info_data, tar_len) 434 | select_pred, select_label = self.select_specific_days(days, pred, slid_test_x) 435 | if final_pred is None and final_label is None: 436 | final_pred = select_pred 437 | final_label = select_label 438 | else: 439 | final_label = torch.cat((final_label, select_label), 0) 440 | final_pred = torch.cat((final_pred, select_pred), 0) 441 | return final_pred, final_label 442 | 443 | def select_specific_days(self, days, pred_x, label_x): 444 | """ 445 | 从未来x天中选择特定的几天 446 | :param days: 447 | :param pred_x: 448 | :param label_x: 449 | :return: 450 | """ 451 | # shape of pred_x is batch_size * x * 2 452 | select_pred, select_label = None, None 453 | for j in days: 454 | if select_label is None and select_pred is None: 455 | select_pred = pred_x[:, j, 0] 456 | select_label = label_x[:, j, 0] 457 | else: 458 | select_label = torch.cat((select_label, label_x[:, j, 0]), 0) 459 | select_pred = torch.cat((select_pred, pred_x[:, j, 0]), 0) 460 | # shape of returned select_pred is len(days)batch_size 461 | return select_pred, select_label 462 | 463 | 464 | def prepare_data(path, behavior_path, seq_path, batch_size=100): 465 | """ 466 | 467 | :param path: 468 | :param days:1,2,3,7,14,30 469 | :param batch_size: 470 | :return: 471 | """ 472 | df = pd.read_csv(path) 473 | 474 | user_info = df.loc[:, ['gender', 'age', 'is_vip', 'topics']].values 475 | num_embedding = [df['device'].nunique(), df['city'].nunique()] 476 | device_to_ix = {device: i for i, device in enumerate(df['device'].unique())} 477 | device_idx = [device_to_ix[d] for d in df['device']] 478 | # print(device_to_ix) 479 | # print(df['device']) 480 | # print(device_idx) 481 | city_to_ix = {city: i for i, city in enumerate(df['city'].unique())} 482 | city_idx = [city_to_ix[d] for d in df['city']] 483 | user_info = np.column_stack((user_info, device_idx)) 484 | user_info = np.column_stack((user_info, city_idx)) 485 | print(num_embedding) 486 | user_info = user_info.astype('float32') 487 | 488 | # load train_data and test_data 489 | data = np.load(behavior_path) 490 | data = data.astype('float32') 491 | channel_nums = int(np.max(data[:, :, -1])) + 1 492 | print(channel_nums) 493 | train_data = data[:, 0:30, :] 494 | test_data = data[:, 30:, :] 495 | 496 | # bahavior sequence data 497 | seq_data = np.load(seq_path) 498 | 499 | train_dataset = TensorDataset(torch.from_numpy(train_data), torch.from_numpy(test_data), 500 | torch.from_numpy(user_info), torch.from_numpy(seq_data[:, :, 0:100, :])) 501 | train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size) 502 | # test_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size) 503 | 504 | return train_dataloader, num_embedding, channel_nums 505 | 506 | 507 | if __name__ == "__main__": 508 | 509 | data_path = "../../data/final_processed_data/equal_select_device_active_with_info.csv" 510 | behavior_path = "../../data/final_processed_data/equal_select_mixed_behavior.npy" 511 | seq_path = "../../data/final_processed_data/equal_select_matched_p_a_seq100.npy" 512 | 513 
| df = pd.read_csv(data_path) 514 | 515 | train_dataloader, num_emb, channel_nums = prepare_data(data_path, behavior_path, seq_path, batch_size=500) 516 | structure = '5parallel_p_a100' 517 | name = 'seq2h31aug' 518 | model = Parallel_ED_RNN(input_size=3, user_info_size=6, e_hidden_size=64, d_hidden_size=64, parallel_num=5, 519 | mlp_hidden_size=32, mlp_layer=1, dp=0.2, 520 | num_embeddings=num_emb, embedding_dim=[4, 4], beha_num_embedding=channel_nums, beha_embedding_dim=4, 521 | seq_input=2, seq_hidden=2, seq_rnn_layer=1, seq_bi=False, seq_num_emb=[24, 9], seq_emb_dim=[4, 4], 522 | device=0, use_gpu=True) 523 | # model_path = 'results/seq2h31aug5parallel_p_a100epoch6.pt' 524 | # model.load_state_dict(torch.load(model_path)) 525 | test_pred = model.train_model(train_dataloader, num_epochs=6, path='results', name=name + structure, 526 | learning_rate_decay=4) 527 | # test_pred = model.train_model(train_dataloader, num_epochs=1, path='results', name='seq2h31aug5eopch2'+structure, 528 | # learning_rate_decay=5, learning_rate=0.005, a=True) 529 | # model.cuda(device=torch.device('cuda:1')) 530 | # test_pred = model.predict_model(train_dataloader) 531 | print(test_pred) 532 | df2 = pd.DataFrame(test_pred.tolist(), 533 | columns=['label_1d', 'label_2d', 'label_3d', 'label_7d', 'label_14d', 'label_30d']) 534 | df2.insert(0, 'device_id', df['device_id']) 535 | df2 = df2.round(3) 536 | df2.to_csv('submission_' + name + structure + '_6epoch.csv', index=False) 537 | --------------------------------------------------------------------------------
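The repository ships no top-level driver script. Judging from the numbered file names and the paths inside them, the data_process scripts are meant to be run in order (1_label_split.py → 2_device_info_fusion.py → 3_behavior_process.py → 4_behavior_sequence.py → 5_sequence_match.py), writing their outputs into processed_data/; the model scripts in p_ed_rnn/ then read equal_select_* files from ../../data/final_processed_data/, which suggests an additional subsampling/renaming step that is not included here. The snippet below is a hypothetical sanity check of the shapes the data_process outputs are written to have; it is an assumption for illustration, not part of the repository.

```python
# Hypothetical sanity check (not part of the repo); assumes the five
# data_process scripts have been run from the repository root.
import numpy as np
import pandas as pd

active = pd.read_csv('processed_data/device_active_with_info.csv')
mixed = np.load('processed_data/mixed_behavior.npy')
seq = np.load('processed_data/matched_page_action_sequence.npy')

print(active.shape)  # expected (n_users, 1 + 60 + 6): device_id, 60 daily flags, user-info columns
print(mixed.shape)   # expected (n_users, 60, 3): active flag, per-day event count, channel index
print(seq.shape)     # expected (2000000, 60, 300, 2): page index and action id per step
```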