├── README.md
├── data_generator.py
├── model.py
├── train.py
└── util.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# A Keras implementation of Deep Interest Network
Stars are welcome. More NLP and recommender-system implementations are on the way; you can also follow my Zhihu column (https://zhuanlan.zhihu.com/skydm).

Based on Alibaba's paper Deep Interest Network for Click-Through Rate Prediction (https://arxiv.org/abs/1706.06978).

The implementation draws on the following repositories:
1. https://github.com/zhougr1993/DeepInterestNetwork
2. https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/din

Data download:
Download and preprocess the Amazon Product dataset (http://jmcauley.ucsd.edu/data/amazon/); the preprocessing code can be found in the repositories above.

A Jupyter notebook walking through the implementation is included. Hyperparameter tuning really is a craft:
with the initial setup below, the loss started rising again after a few epochs while accuracy fell.
```
model.compile(optimizer=keras.optimizers.Adam(1e-3), metrics=["accuracy"])
Epoch 1/10
40761/40762 [============================>.] - ETA: 0s - loss: 0.5377

Epoch 2/10
40761/40762 [============================>.] - ETA: 0s - loss: 0.5241

Epoch 5/10
40758/40762 [============================>.] - ETA: 0s - loss: 0.5347

Epoch 6/10
40757/40762 [============================>.] - ETA: 0s - loss: 0.5427

Epoch 7/10
40760/40762 [============================>.] - ETA: 0s - loss: 0.5506

Epoch 9/10
40758/40762 [============================>.] - ETA: 0s - loss: 0.5651
```
After changing the optimizer and learning rate, the loss decreased steadily again.

```
Epoch 1/10
40761/40762 [============================>.] - ETA: 0s - loss: 0.6021

Epoch 2/10
40761/40762 [============================>.] - ETA: 0s - loss: 0.5349

Epoch 3/10
40760/40762 [============================>.] - ETA: 0s - loss: 0.5302

Epoch 4/10
40759/40762 [============================>.] - ETA: 0s - loss: 0.5280

Consider using a TensorFlow optimizer from `tf.train`.
acc: 0.7824, best acc: 0.7824
```
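For reference, the adjustment that made the difference is visible in `model.py`: the final code compiles with plain SGD at a small learning rate instead of Adam. A minimal sketch of the two compile calls, assuming `train_model` is the model returned by `din()`:

```python
# diverging setup: loss climbs again after a few epochs
train_model.compile(optimizer=keras.optimizers.Adam(1e-3), metrics=["accuracy"])

# setup used in model.py: loss falls steadily
train_model.compile(optimizer=keras.optimizers.SGD(1e-3), metrics=["accuracy"])
```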
--------------------------------------------------------------------------------
/data_generator.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Time    : 2019/7/1 12:59
# @Author  : skydm
# @Email   : wzwei1636@163.com
# @File    : data_generator.py
# @Software: PyCharm

import random
import numpy as np
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


class DataInput:
    def __init__(self, file, batch_size):
        self.file = file
        self.batch_size = batch_size
        self.data_set = self.read_file()
        # shuffle the samples
        random.shuffle(self.data_set)

        self.steps = len(self.data_set) // self.batch_size
        if len(self.data_set) % self.batch_size != 0:
            self.steps = self.steps + 1

    def read_file(self):
        '''Read the training set.'''
        res = []
        with open(self.file, "r") as f:
            for line in f:
                line = line.strip().split(";")
                hist = [int(x) for x in line[0].split(" ")]  # item click history
                cate = [int(x) for x in line[1].split(" ")]  # category of each clicked item
                click_next_item = int(line[2])               # candidate item
                click_next_item_cate = int(line[3])          # candidate item's category
                label = float(line[4])                       # clicked or not
                res.append([hist, cate, click_next_item, click_next_item_cate, label])
        return res

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data_set)))
            random.shuffle(idxs)

            hist_item, hist_cat, target_item, target_cate, hist_len, b_label = [], [], [], [], [], []
            for i in idxs:
                item = self.data_set[i][0]
                cate = self.data_set[i][1]
                target_i = self.data_set[i][2]
                target_c = self.data_set[i][3]
                len_ = len(self.data_set[i][0])
                label = float(self.data_set[i][4])

                hist_item.append(item)
                hist_cat.append(cate)
                target_item.append(target_i)
                target_cate.append(target_c)
                hist_len.append(len_)
                b_label.append(label)

                if len(hist_item) == self.batch_size:
                    # pad every history in the batch to the longest one
                    max_len = max(hist_len)
                    hist_item = pad_sequences(hist_item, max_len, padding="post")
                    hist_cat = pad_sequences(hist_cat, max_len, padding="post")

                    yield [np.array(hist_item), np.array(hist_cat), np.array(target_item),
                           np.array(target_cate), np.array(hist_len), np.array(b_label)], None

                    hist_item, hist_cat, target_item, target_cate, hist_len, b_label = [], [], [], [], [], []

            # flush the final partial batch so each epoch really has len(self) steps
            if hist_item:
                max_len = max(hist_len)
                hist_item = pad_sequences(hist_item, max_len, padding="post")
                hist_cat = pad_sequences(hist_cat, max_len, padding="post")
                yield [np.array(hist_item), np.array(hist_cat), np.array(target_item),
                       np.array(target_cate), np.array(hist_len), np.array(b_label)], None


class TestData:
    '''Yields one sample at a time.'''
    def __init__(self, file):
        self.file = file
        self.test_set = self.read_file()

    def read_file(self):
        '''Read the test set.'''
        res = []
        with open(self.file, "r") as f:
            for line in f:
                line = line.strip().split(";")
                hist = [int(x) for x in line[0].split(" ")]  # item click history
                cate = [int(x) for x in line[1].split(" ")]  # category of each clicked item
                click_next_item = int(line[2])
                click_next_item_cate = int(line[3])
                label = float(line[4])
                hist_len = len(hist)
                res.append([np.array([hist]), np.array([cate]), np.array([click_next_item]),
                            np.array([click_next_item_cate]), np.array([hist_len]), label])
        return res
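
# --- Usage sketch (an illustrative addition, not part of the original repo).
# The expected input is one sample per line with semicolon-separated fields:
#   <hist item ids> ; <hist cate ids> ; <target item> ; <target cate> ; <label>
# e.g. "12 7 103;3 3 5;44;3;1" (these ids are made up for illustration).
if __name__ == "__main__":
    train_D = DataInput(file="./paddle_train.txt", batch_size=64)
    (hist_item, hist_cat, target_item, target_cate, hist_len, label), _ = next(iter(train_D))
    print(hist_item.shape)    # (64, longest history in this batch)
    print(target_item.shape)  # (64,)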
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Time    : 2019/7/2 8:43
# @Author  : skydm
# @Email   : wzwei1636@163.com
# @File    : model.py
# @Software: PyCharm

import random
import numpy as np
import tensorflow as tf
import tensorflow.python.keras as keras
import tensorflow.python.keras.backend as K

# fix random seeds for reproducibility
seed = 1234
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)


class Attention(keras.layers.Layer):
    def __init__(self, attention_hidden_units=(80, 40, 1), attention_activation="sigmoid", supports_masking=True):
        super(Attention, self).__init__()
        self.attention_hidden_units = attention_hidden_units
        self.attention_activation = attention_activation
        self.supports_masking = supports_masking

    def build(self, input_shape):
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        '''
        i_emb:    [batch_size, hidden_units]           candidate-item embedding
        hist_emb: [batch_size, max_len, hidden_units]  click-history embeddings
        hist_len: [batch_size]                         true sequence lengths
        '''
        assert len(x) == 3

        i_emb, hist_emb, hist_len = x[0], x[1], x[2]
        hidden_units = K.int_shape(hist_emb)[-1]
        max_len = tf.shape(hist_emb)[1]

        # repeat the candidate embedding along the time axis
        i_emb = tf.tile(i_emb, [1, max_len])  # (batch_size, max_len * hidden_units)
        i_emb = tf.reshape(i_emb, [-1, max_len, hidden_units])  # (batch_size, max_len, hidden_units)
        concat = K.concatenate([i_emb, hist_emb, i_emb - hist_emb, i_emb * hist_emb],
                               axis=2)  # (batch_size, max_len, hidden_units * 4)

        # MLP that scores each history position against the candidate; the last
        # layer is linear. (Building Dense layers inside call is not idiomatic
        # Keras, but works here because the layer is called once at graph time.)
        for i in range(len(self.attention_hidden_units)):
            activation = None if i == len(self.attention_hidden_units) - 1 else self.attention_activation
            outputs = keras.layers.Dense(self.attention_hidden_units[i], activation=activation)(concat)
            concat = outputs

        outputs = tf.reshape(outputs, [-1, 1, max_len])  # (batch_size, 1, max_len)

        if self.supports_masking:
            # mask out the padded positions before the softmax
            mask = tf.sequence_mask(hist_len, max_len)  # (batch_size, 1, max_len)
            padding = tf.ones_like(outputs) * (-1e12)
            outputs = tf.where(mask, outputs, padding)

        # scale the scores, then normalize into attention weights
        outputs = outputs / (hidden_units ** 0.5)
        outputs = K.softmax(outputs)

        # weighted sum over the history
        outputs = tf.matmul(outputs, hist_emb)  # (batch_size, 1, hidden_units)
        outputs = tf.squeeze(outputs, axis=1)  # (batch_size, hidden_units)

        return outputs

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][-1])


def share_weights(hidden_units=63930):
    '''
    Wrap a stack of Dense layers in a Model so they can be reused with shared weights.
    Default input size: 63930 = item_count + cate_count + hidden_units (63001 + 801 + 128).
    '''
    layers_units = (80, 40, 1)
    share_input = keras.layers.Input(shape=(hidden_units,))
    share_layer = share_input
    for i in range(len(layers_units)):
        activation = None if i == len(layers_units) - 1 else "sigmoid"
        share_layer = keras.layers.Dense(layers_units[i], activation=activation)(share_layer)
    out_layer = share_layer
    model = keras.models.Model(share_input, out_layer)
    return model


def din(item_count, cate_count, hidden_units=128):
    '''
    :param item_count: number of items
    :param cate_count: number of categories
    :param hidden_units: hidden size
    :return: (train_model, label_model)
    '''
    target_item = keras.layers.Input(shape=(1,), name='target_item', dtype="int32")  # candidate item
    target_cate = keras.layers.Input(shape=(1,), name='target_cate', dtype="int32")  # candidate item's category
    label = keras.layers.Input(shape=(1,), name='label', dtype="float32")  # clicked or not

    hist_item_seq = keras.layers.Input(shape=(None,), name="hist_item_seq", dtype="int32")  # click history
    hist_cate_seq = keras.layers.Input(shape=(None,), name="hist_cate_seq", dtype="int32")  # categories of the click history

    hist_len = keras.layers.Input(shape=(1,), name='hist_len', dtype="int32")  # true (unpadded) sequence length

    item_emb = keras.layers.Embedding(input_dim=item_count,
                                      output_dim=hidden_units // 2,
                                      embeddings_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=1e-4,
                                                                                             seed=seed))
    cate_emb = keras.layers.Embedding(input_dim=cate_count,
                                      output_dim=hidden_units // 2,
                                      embeddings_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=1e-4,
                                                                                             seed=seed))
    item_b = keras.layers.Embedding(input_dim=item_count, output_dim=1,
                                    embeddings_initializer=keras.initializers.Constant(0.0))

    # per-item bias term
    target_item_bias_emb = item_b(target_item)  # (batch_size, 1, 1)
    target_item_bias_emb = keras.layers.Lambda(lambda x: K.squeeze(x, axis=1))(target_item_bias_emb)

    # candidate embedding: item half concatenated with category half
    target_item_emb = item_emb(target_item)  # (batch_size, 1, hidden_units//2)
    target_cate_emb = cate_emb(target_cate)  # (batch_size, 1, hidden_units//2)
    i_emb = keras.layers.Lambda(lambda x: K.concatenate([x[0], x[1]], axis=-1))(
        [target_item_emb, target_cate_emb])  # (batch_size, 1, hidden_units)
    i_emb = keras.layers.Lambda(lambda x: K.squeeze(x, axis=1))(i_emb)  # (batch_size, hidden_units)

    # history embeddings
    hist_item_emb = item_emb(hist_item_seq)  # (batch_size, max_len, hidden_units//2)
    hist_cate_emb = cate_emb(hist_cate_seq)  # (batch_size, max_len, hidden_units//2)
    hist_emb = keras.layers.Lambda(lambda x: K.concatenate([x[0], x[1]], axis=-1))(
        [hist_item_emb, hist_cate_emb])  # (batch_size, max_len, hidden_units)

    # attention between the click history and the candidate item
    din_attention = Attention()([i_emb, hist_emb, hist_len])  # (batch_size, hidden_units)
    din_attention = keras.layers.Lambda(lambda x: tf.reshape(x, [-1, hidden_units]))(din_attention)

    # keras.layers.BatchNormalization was buggy at the time; this follows the Paddle implementation instead
    din_attention_fc = keras.layers.Dense(63802)(din_attention)  # (batch_size, item_count + cate_count)
    # item_count: 63001, cate_count: 801, hidden_units: 128 -> (batch_size, item_count + cate_count + hidden_units)
    din_item = keras.layers.Lambda(lambda x: K.concatenate([x[0], x[1]], axis=1))([i_emb, din_attention_fc])
    din_item = share_weights()(din_item)  # (batch_size, 1)

    print("logits:", din_item, target_item_bias_emb)
    logits = keras.layers.Add()([din_item, target_item_bias_emb])

    # label_model is used for inference; train_model takes the label as an extra input
    label_model = keras.models.Model(inputs=[hist_item_seq, hist_cate_seq, target_item, target_cate, hist_len],
                                     outputs=[logits])

    train_model = keras.models.Model(inputs=[hist_item_seq, hist_cate_seq, target_item, target_cate, hist_len, label],
                                     outputs=logits)

    # attach the loss directly; the data generator therefore yields no targets
    loss = K.binary_crossentropy(target=label, output=logits, from_logits=True)
    train_model.add_loss(loss)
    train_model.compile(optimizer=keras.optimizers.SGD(1e-3), metrics=["accuracy"])

    return train_model, label_model
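
# --- Smoke test (an illustrative addition, not in the original file). The
# counts match util.py's Amazon Electronics preprocessing: item_count=63001,
# cate_count=801.
if __name__ == "__main__":
    train_model, label_model = din(item_count=63001, cate_count=801, hidden_units=128)
    train_model.summary()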
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Time    : 2019/7/2 8:45
# @Author  : skydm
# @Email   : wzwei1636@163.com
# @File    : train.py
# @Software: PyCharm

from tqdm import tqdm
import numpy as np
from din.model import din
from din.data_generator import DataInput, TestData
from tensorflow.python.keras.callbacks import Callback
import tensorflow.python.keras as keras

# build the datasets
batch_size = 64
train_D = DataInput(file="./paddle_train.txt", batch_size=batch_size)
test_D = TestData(file="./paddle_test.txt")

# build the model
train_model, label_model = din(item_count=63001, cate_count=801, hidden_units=128)

# sigmoid for turning logits into click probabilities
def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

def calc_auc(raw_arr):
    # sort by predicted value, from small to large
    arr = sorted(raw_arr, key=lambda d: d[2])
    auc = 0.0
    fp1, tp1, fp2, tp2 = 0.0, 0.0, 0.0, 0.0
    for record in arr:
        fp2 += record[0]  # noclick
        tp2 += record[1]  # click
        auc += (fp2 - fp1) * (tp2 + tp1)
        fp1, tp1 = fp2, tp2
    # if every sample is a click, or every sample a non-click, discard
    threshold = len(arr) - 1e-3
    if tp2 > threshold or fp2 > threshold:
        return -0.5
    if tp2 * fp2 > 0.0:  # normal AUC
        return (1.0 - auc / (2.0 * tp2 * fp2))
    else:
        return None

# evaluation callback
class Evaluate(Callback):
    '''Evaluate at the end of each epoch and checkpoint the best weights.'''
    def __init__(self):
        super(Evaluate, self).__init__()
        self.acc = []
        self.best_acc = 0.

    def on_epoch_end(self, epoch, logs=None):
        acc = self.evaluate()  # note: evaluate() actually returns the test AUC
        self.acc.append(acc)
        if acc > self.best_acc:
            self.best_acc = acc
            train_model.save_weights("./best_model.weight")
        print('acc: %.4f, best acc: %.4f\n' % (acc, self.best_acc))

    def evaluate(self):
        score = []  # records of [nonclick, click, prediction]
        # sample batch_size * 20 rows from the test set
        np.random.shuffle(test_D.test_set)
        batch_valid_data = test_D.test_set[:(batch_size * 20)]
        for row in tqdm(batch_valid_data):
            label = row[-1]
            logits = label_model.predict(row[:-1])  # (1, 1), e.g. array([[0.3211818]], dtype=float32)
            pred = sigmoid(logits)[0][0]

            if label > 0.5:
                score.append([0, 1, pred])
            else:
                score.append([1, 0, pred])

        # compute AUC
        auc = calc_auc(score)
        print("TEST --> auc: {}".format(auc))
        return auc

# instantiate the callback
evaluator = Evaluate()

# ModelCheckpoint, EarlyStopping and TensorBoard callbacks could be added here as well
# tb_callback = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=True)

train_model.fit_generator(train_D.__iter__(),
                          steps_per_epoch=len(train_D),
                          callbacks=[evaluator],
                          epochs=2)
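
# --- Worked example of calc_auc (an illustrative addition, not in the original
# script). Each record is [nonclick, click, pred]; a correctly ranked pair gives
# a perfect score, an inverted pair gives zero:
assert calc_auc([[1, 0, 0.2], [0, 1, 0.8]]) == 1.0
assert calc_auc([[1, 0, 0.8], [0, 1, 0.2]]) == 0.0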
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Time    : 2019/6/26 11:18
# @Author  : skydm
# @Email   : wzwei1636@163.com
# @File    : util.py
# @Software: PyCharm

import random
import numpy as np
import pandas as pd

def convert_to_df(file_path):
    # each line is a Python dict literal (the raw Amazon files are not strict
    # JSON), hence eval rather than json.loads
    with open(file_path, "r") as f:
        df = {}
        i = 0
        for line in f:
            df[i] = eval(line)
            i = i + 1

    df = pd.DataFrame.from_dict(df, orient='index')
    return df

reviews_df = convert_to_df("./raw_data/reviews_Electronics_5.json")
meta_df = convert_to_df('./raw_data/meta_Electronics.json')

# keep only items that appear in both the reviews and the metadata
meta_df = meta_df[meta_df["asin"].isin(reviews_df["asin"].unique())]
reviews_df = reviews_df[reviews_df["asin"].isin(meta_df["asin"].unique())]
meta_df = meta_df.reset_index(drop=True)

# keep only the needed columns
reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']]
meta_df = meta_df[['asin', 'categories']]

# use the last (finest-grained) category of each item
meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1])

# build a sorted value -> index mapping and remap the column in place
def build_map(df, col_name):
    key = sorted(df[col_name].unique().tolist())
    m = dict(zip(key, range(len(key))))
    df[col_name] = df[col_name].map(lambda x: m[x])
    return m, key

asin_map, asin_key = build_map(meta_df, "asin")
cate_map, cate_key = build_map(meta_df, "categories")
revi_map, revi_key = build_map(reviews_df, "reviewerID")

# sort by asin
meta_df = meta_df.sort_values('asin')
meta_df = meta_df.reset_index(drop=True)

# remap reviews_df's asin column to the new ids
reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x])

# sort reviews by reviewerID and timestamp
reviews_df = reviews_df.sort_values(['reviewerID', 'unixReviewTime'])
reviews_df = reviews_df.reset_index(drop=True)
reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']]

user_count, item_count, cate_count, example_count = len(revi_map), len(asin_map), len(cate_map), reviews_df.shape[0]
# user_count: 192403  item_count: 63001  cate_count: 801  example_count: 1689188

# category id of every item, indexed by item id
cate_list = [meta_df['categories'][i] for i in range(len(asin_map))]
cate_list = np.array(cate_list, dtype=np.int32)

# build the train/test split (item id, category id, user id)
train_set, test_set = [], []
# for each user, walk through their item interaction history
for reviewerID, hist in reviews_df.groupby("reviewerID"):
    # product ids the user bought (hist is a DataFrame)
    pos_list = hist["asin"].tolist()

    def gen_neg():
        # sample a random item the user never interacted with
        neg = pos_list[0]
        while neg in pos_list:
            neg = random.randint(0, item_count - 1)
        return neg

    neg_list = [gen_neg() for _ in range(len(pos_list))]

    for i in range(1, len(pos_list)):
        hist = pos_list[:i]
        if i != len(pos_list) - 1:
            # every prefix but the last becomes a (positive, negative) training pair
            train_set.append((reviewerID, hist, pos_list[i], 1))
            train_set.append((reviewerID, hist, neg_list[i], 0))
        else:
            # the final interaction becomes the test sample
            label = (pos_list[i], neg_list[i])
            test_set.append((reviewerID, hist, label))

# shuffle the data
random.shuffle(train_set)
random.shuffle(test_set)

assert len(test_set) == user_count
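
# --- Sanity check (an illustrative addition, not in the original file). Each
# train_set record is (reviewerID, hist_items, candidate_item, label) and each
# test_set record is (reviewerID, hist_items, (pos_item, neg_item)):
print("train samples: %d, test samples: %d" % (len(train_set), len(test_set)))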
--------------------------------------------------------------------------------