├── .gitignore
├── image
│   ├── saddle_point.png
│   ├── 20191215145026134.png
│   ├── image-20210705202821879.png
│   ├── image-20210708141144900.png
│   ├── image-20210708142307912.png
│   ├── image-20210708142402038.png
│   └── image-20210708170933328.png
├── model
│   ├── __init__.py
│   ├── evaluation.py
│   ├── FocalLoss.py
│   ├── get_data.py
│   ├── lr_cosine.py
│   ├── deepfm.py
│   ├── mmoe.py
│   └── ple.py
└── readme.md

/.gitignore:
--------------------------------------------------------------------------------
__pycache__
data
user_data
--------------------------------------------------------------------------------
/image/saddle_point.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SummerRaining/multi_task-learning/HEAD/image/saddle_point.png
--------------------------------------------------------------------------------
/image/20191215145026134.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SummerRaining/multi_task-learning/HEAD/image/20191215145026134.png
--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Jul  9 21:08:26 2021

@author: tunan
"""
--------------------------------------------------------------------------------
/image/image-20210705202821879.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SummerRaining/multi_task-learning/HEAD/image/image-20210705202821879.png
--------------------------------------------------------------------------------
/image/image-20210708141144900.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SummerRaining/multi_task-learning/HEAD/image/image-20210708141144900.png
--------------------------------------------------------------------------------
/image/image-20210708142307912.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SummerRaining/multi_task-learning/HEAD/image/image-20210708142307912.png
--------------------------------------------------------------------------------
/image/image-20210708142402038.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SummerRaining/multi_task-learning/HEAD/image/image-20210708142402038.png
--------------------------------------------------------------------------------
/image/image-20210708170933328.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SummerRaining/multi_task-learning/HEAD/image/image-20210708170933328.png
--------------------------------------------------------------------------------
/model/evaluation.py:
--------------------------------------------------------------------------------
# coding: utf-8
from collections import defaultdict

import numpy as np
from sklearn.metrics import roc_auc_score


def uAUC(labels, preds, user_id_list):
    """Calculate the user-averaged AUC (uAUC)."""
    user_pred = defaultdict(list)
    user_truth = defaultdict(list)
    for idx, truth in enumerate(labels):
        user_id = user_id_list[idx]
        user_pred[user_id].append(preds[idx])
        user_truth[user_id].append(truth)

    user_flag = defaultdict(lambda: False)
    for user_id in set(user_id_list):
        truths = user_truth[user_id]
        flag = False
        # flag stays False when a user's samples are all positive or all negative,
        # since AUC is undefined in that case
        for i in range(len(truths) - 1):
            if truths[i] != truths[i + 1]:
                flag = True
                break
        user_flag[user_id] = flag

    total_auc = 0.0
    size = 0.0
    for user_id in user_flag:
        if user_flag[user_id]:
            auc = roc_auc_score(np.asarray(user_truth[user_id]), np.asarray(user_pred[user_id]))
            total_auc += auc
            size += 1.0
    user_auc = float(total_auc) / size
    return user_auc


def compute_weighted_score(score_dict, weight_dict):
    '''Compute the weighted uAUC over several actions.
    Input:
        score_dict: dict mapping each action to its uAUC
        weight_dict: dict mapping each action to its weight
    Output:
        score: weighted uAUC, float
    '''
    score = 0.0
    weight_sum = 0.0
    for action in score_dict:
        weight = float(weight_dict[action])
        score += weight * score_dict[action]
        weight_sum += weight
    score /= float(weight_sum)
    score = round(score, 6)
    return score


def evaluate_deepctr(val_labels, val_pred_ans, userid_list):
    eval_dict = {}
    target = ["read_comment", "like", "click_avatar", "forward"]
    for i, action in enumerate(target):
        eval_dict[action] = uAUC(val_labels[i], val_pred_ans[i], userid_list)
    print(eval_dict)
    weight_dict = {"read_comment": 4, "like": 3, "click_avatar": 2, "favorite": 1, "forward": 1,
                   "comment": 1, "follow": 1}
    weight_auc = compute_weighted_score(eval_dict, weight_dict)
    print("Weighted uAUC: ", weight_auc)
    return weight_auc
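

# A minimal usage sketch with hypothetical toy data (not part of the original
# file): uAUC averages per-user AUCs, so it only counts users whose labels
# contain both classes.
if __name__ == '__main__':
    labels = [1, 0, 1, 1]
    preds = [0.9, 0.2, 0.6, 0.8]
    user_ids = ['u1', 'u1', 'u1', 'u2']   # u2 has only positives and is skipped
    print(uAUC(labels, preds, user_ids))  # AUC for u1 alone -> 1.0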
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
## Recommendation Model Notes

### Experiment Summary

I ran experiments with DeepFM, MMOE, PLE, and focal loss with learning-rate annealing, training and evaluating each on the WeChat Big Data Challenge dataset. None of the models had their hyperparameters tuned; they were only run end-to-end, so this comparison says nothing about the relative merits of these models.

| Model | read_comment AUC on the validation set |
| -------------------------------- | --------------------------- |
| DeepFM | 0.9233 |
| MMOE | 0.9260 |
| PLE | 0.9224 |
| DeepFM with focal loss and LR annealing | 0.9307 |

To reproduce the experiments:

1. Download the WeChat Big Data Challenge dataset and place it under the data folder. Competition link: https://algo.weixin.qq.com/.
2. Create a user_data folder, then run get_data.py; it applies feature engineering to the dataset and saves the result with pickle.
3. Run deepfm.py, mmoe.py, ple.py, and FocalLoss.py to obtain the results.



### DeepFM

**FM**. What: the factorization machine, used to capture interaction information between sparse features.

Why: high-cardinality categorical variables are usually one-hot encoded, and adding second-order feature interactions inflates the feature count much further, from n to n(n-1), producing a huge number of sparse features.

How: FM adds second-order features on top of logistic regression. Each feature i learns a latent vector $v_i$, and the interaction between any two features is modeled as $\langle v_i,v_j\rangle x_ix_j$. The output is the linear part plus all second-order interactions, followed by a sigmoid activation:

$$ y_{fm} = w_0 + \sum_{i=1}^{n} w_ix_i + \sum_{i<j} \langle v_i,v_j\rangle x_ix_j $$

$$ output = sigmoid(y_{fm}) $$

Advantages:

1. FM learns nd parameters, where d is the latent dimension, far fewer than the n(n-1) parameters of explicit pairwise interactions.
2. It can learn interaction features that never appear in the training set. For a conventional second-order feature, if $x_ix_j$ is always 0 in training, its weight $w_{ij}$ can never be learned; FM instead computes it as $\langle v_i,v_j\rangle$, so each parameter is trained on more data.
3. The second-order term can be decomposed as 0.5 times ("sum then square" minus "square then sum"), reducing the time complexity from O(n²d) to O(nd); see the sketch below.
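
A minimal numpy sketch of the O(nd) decomposition in item 3 (`V` and `x` are made-up toy values):

```python
import numpy as np

n, d = 6, 4                    # n features, d-dimensional latent vectors
V = np.random.randn(n, d)      # hypothetical latent vectors v_i
x = np.random.randn(n)         # hypothetical feature values

# naive O(n^2 d): sum over all pairs i < j of <v_i, v_j> * x_i * x_j
naive = sum(V[i] @ V[j] * x[i] * x[j]
            for i in range(n) for j in range(i + 1, n))

# FM trick, O(nd): 0.5 * ((sum then square) - (square then sum))
xv = V * x[:, None]
fast = 0.5 * (xv.sum(axis=0) ** 2 - (xv ** 2).sum(axis=0)).sum()

assert np.allclose(naive, fast)
```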

**DeepFM**:

Why: FM applies only a simple linear computation to the first-order information of the input features, so a DNN part is added alongside FM to apply more complex non-linear transformations to the input. (Note: the DNN part does not capture explicit high-order interactions between features.)

How:

1. The linear part and the second-order FM part are the same as in FM.

2. The latent vectors learned by FM are concatenated and passed through several fully connected layers, ending in a single output unit.

3. The linear output, the second-order FM output, and the DNN output are summed, and a sigmoid activation produces the model output, as sketched below.

$$ y_{deepfm} = sigmoid(y_{fm} + y_{dnn}) $$

$$ y_{dnn} = dense(concat([v_1, ..., v_n])) $$
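
A minimal Keras-style sketch of step 3, assuming `linear_part` and `fm_part` are already-computed (batch, 1) tensors (the full implementation lives in model/deepfm.py):

```python
from tensorflow.keras.layers import Activation, Add, Concatenate, Dense

def combine_deepfm(linear_part, fm_part, embeddings, hidden_units=(64, 64)):
    # embeddings: list of (batch, embed_dim) latent vectors, one per feature
    dnn = Concatenate(axis=-1)(embeddings)
    for units in hidden_units:
        dnn = Dense(units, activation='relu')(dnn)
    dnn_out = Dense(1)(dnn)                       # y_dnn
    # y = sigmoid(y_linear + y_fm + y_dnn)
    return Activation('sigmoid')(Add()([linear_part, fm_part, dnn_out]))
```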

### MMOE

Structure in brief:

1. The categorical and continuous inputs are embedded separately and concatenated, then passed through a few fully connected layers to obtain a fixed-dimensional vector x.
2. x is fed through m sub-networks (expert networks) to produce m expert vectors; each sub-network is a fully connected layer of the same size. For each task a gate network is learned: it takes x as input, outputs a softmax probability vector, and that vector is used to compute a weighted average of the m experts (see the sketch below).
3. The weighted average of the experts is the input to the corresponding task tower. Each tower takes that weighted average, applies a fully connected layer down to a one-dimensional output, and performs the classification or regression task.
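
A minimal sketch of the gate-weighted expert mixing in step 2, equivalent in spirit to the matmul inside `Mmoe_layer` in model/mmoe.py:

```python
import tensorflow as tf

def mix_experts(x, expert_layers, gate_layer):
    # expert_layers: list of Dense(expert_dim); gate_layer: Dense(n_expert, softmax)
    experts = tf.stack([e(x) for e in expert_layers], axis=1)  # (bs, n_expert, expert_dim)
    gate = tf.expand_dims(gate_layer(x), axis=-1)              # (bs, n_expert, 1)
    return tf.reduce_sum(experts * gate, axis=1)               # (bs, expert_dim)
```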

### PLE

Structure in brief:

1. The idea behind PLE is to solve the seesaw problem in multi-task training. In multi-task models such as MMOE, the gradient of every task's loss updates all of the parameters. But the features different tasks need can conflict, or even be opposites, so task A's performance may rise while task B's drops.
2. PLE gives each task its own expert module, while all tasks additionally share one module. For example: task A has two experts (e1, e2), task B has three experts (e3, e4, e5), and the two tasks share two experts (e6, e7). Task A's tower input is the weighted average of the four experts (e1, e2, e6, e7), with weights learned by the gate network gate1(x); likewise, task B's tower input is the weighted average of the five experts (e3, e4, e5, e6, e7), with weights from gate2(x). Each task's output is produced by a fully connected layer on top of its tower; the sketch after this list shows the per-task mixing.
3. Advantage: when task A's loss updates the parameters, it only touches A's own module (e1, e2) and the shared module, not B's module. This avoids the conflicts that arise when the two tasks learn simultaneously.
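
A minimal sketch of the per-task mixing in step 2, paraphrasing `PleLayer` in model/ple.py:

```python
import tensorflow as tf

def ple_towers(x, task_experts, shared_experts, gates):
    # task_experts: one list of Dense(expert_dim) layers per task
    # shared_experts: list of Dense(expert_dim); gates: one softmax Dense per task
    shared = [e(x) for e in shared_experts]
    towers = []
    for experts, gate in zip(task_experts, gates):
        own = [e(x) for e in experts]
        stacked = tf.stack(shared + own, axis=1)           # (bs, n_shared + n_own, dim)
        g = tf.expand_dims(gate(x), axis=-1)               # (bs, n_shared + n_own, 1)
        towers.append(tf.reduce_sum(stacked * g, axis=1))  # (bs, dim)
    return towers
```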

### Focal Loss

1. Idea: rebalance the weight between easy and hard samples by shrinking the loss of samples that are already predicted accurately.
2. Formula: $$loss = -(1-p_t)^{\gamma}\log(p_t),\quad p_t=\begin{cases} p, & y_t = 1\\ 1-p, & y_t = 0 \end{cases}$$ When a sample is misclassified, $p_t$ is close to 0, $1-p_t$ is close to 1, the weight is roughly 1, and the loss is almost unaffected. When a sample is classified correctly, $1-p_t$ is close to 0, the weight approaches 0, and the loss of the correctly classified sample is reduced; the snippet below makes this concrete.
3. In object detection, negative samples are extremely numerous and easy to classify, while positive samples are rare and hard. Focal loss therefore automatically down-weights the negative samples.
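
A small numeric check of the weighting (toy probabilities, gamma = 2, no alpha term):

```python
import numpy as np

def focal_term(p, y, gamma=2.0):
    p_t = p if y == 1 else 1.0 - p
    return -(1.0 - p_t) ** gamma * np.log(p_t)

# an easy positive (p = 0.95) contributes far less than a hard one (p = 0.1)
print(focal_term(0.95, 1))  # ~0.00013
print(focal_term(0.10, 1))  # ~1.87
# plain cross-entropy for comparison: -log(0.95) ~ 0.05, -log(0.10) ~ 2.30
```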
--------------------------------------------------------------------------------
/model/FocalLoss.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jul  1 17:30:45 2021

@author: shujie.wang
"""

import tensorflow.keras.backend as K
import tensorflow as tf

def binary_focal_loss(gamma=2, alpha=0.25):
    """
    Binary form of focal loss.

    focal_loss(p_t) = -alpha_t * (1 - p_t)**gamma * log(p_t)
    where p = sigmoid(x), p_t = p or 1 - p depending on if the label is 1 or 0, respectively.
    References:
        https://arxiv.org/pdf/1708.02002.pdf
    Usage:
        model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"], optimizer=adam)
    """
    alpha = tf.constant(alpha, dtype=tf.float32)
    gamma = tf.constant(gamma, dtype=tf.float32)

    def binary_focal_loss_fixed(y_true, y_pred):
        """
        y_true shape must be (None, 1)
        y_pred must be computed after the sigmoid
        """
        y_true = tf.cast(y_true, tf.float32)
        # positives are weighted by (1 - alpha), negatives by alpha
        alpha_t = y_true*(1-alpha) + (K.ones_like(y_true)-y_true)*alpha

        p_t = y_true*y_pred + (K.ones_like(y_true)-y_true)*(K.ones_like(y_true)-y_pred) + K.epsilon()
        focal_loss = - alpha_t * K.pow((K.ones_like(y_true)-p_t),gamma) * K.log(p_t)
        return K.mean(focal_loss)
    return binary_focal_loss_fixed

if __name__ == '__main__':
    import pickle as pkl
    import numpy as np
    import gc
    from deepfm import build_FM
    from tensorflow.keras import optimizers
    from lr_cosine import CosineAnnealing

    target = ["read_comment", "like", "click_avatar", "forward"]
    sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
    varlen_features = ['manual_tag_list','manual_keyword_list']
    dense_features = ['videoplayseconds']
    # 1. load the data
    with open('../user_data/data.pkl','rb') as f:
        train,val,test,encoder = pkl.load(f)
    train_num = len(train)

    # 2. build the input feature settings
    sparse_max_len = {f:len(encoder[f]) + 1 for f in sparse_features}
    varlens_max_len = {f:len(encoder[f]) + 1 for f in varlen_features}
    feature_names = sparse_features+varlen_features+dense_features

    # 3. generate input data for the model: a dict mapping feature name to values
    train_model_input = {name: train[name] if name not in varlen_features else np.stack(train[name]) for name in feature_names}
    val_model_input = {name: val[name] if name not in varlen_features else np.stack(val[name]) for name in feature_names}
    test_model_input = {name: test[name] if name not in varlen_features else np.stack(test[name]) for name in feature_names}

    train_labels = train['read_comment'].values
    val_labels = val['read_comment'].values

    del train,val  # drop what is no longer needed to free memory
    gc.collect()

    model = build_FM(sparse_features,dense_features,sparse_max_len,embed_dim = 16,
                 dnn_hidden_units=(64,64),varlens_cols = varlen_features,varlens_max_len = varlens_max_len,
                 dropout = 0.1,embedding_reg_l2 = 1e-6,dnn_reg_l2 = 0.0)

    loss = binary_focal_loss(gamma=2, alpha=0.1)
    reduce_lr = CosineAnnealing(eta_max = 1,eta_min = 0,
                                num_step_per_epoch=(train_num//10240)+1,lr_list = [1,2,2])
    adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(adam, loss = loss ,metrics = [tf.keras.metrics.AUC()],)

    history = model.fit(train_model_input, train_labels,validation_data = (val_model_input,val_labels),
                        batch_size=10240, epochs=5, verbose=1,callbacks=[reduce_lr],)
--------------------------------------------------------------------------------
/model/get_data.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 15 23:35:55 2021

@author: tunan
"""

import copy
import pickle as pkl

import numpy as np
import pandas as pd
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

def split(x):
    if not isinstance(x,str):
        return []
    key_ans = x.strip().split(';')
    for key in key_ans:
        if key not in key2index:
            # Notice : input value 0 is a special "padding", so we do not use 0 to encode a valid feature for sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))

def preprocess(sample,dense_features):
    '''
    Feature engineering: log-transform the numeric features; shift the id features by +1; fill missing values with 0.
    '''
    sample[dense_features] = sample[dense_features].fillna(0.0)
    sample[dense_features] = np.log(sample[dense_features] + 1.0)

    sample[["authorid", "bgm_song_id", "bgm_singer_id"]] += 1  # 0 is reserved for unknown
    sample[["authorid", "bgm_song_id", "bgm_singer_id"]] = \
        sample[["authorid", "bgm_song_id", "bgm_singer_id"]].fillna(0)
    sample[["authorid", "bgm_song_id", "bgm_singer_id"]] = \
        sample[["authorid", "bgm_song_id", "bgm_singer_id"]].astype(int)
    return sample


if __name__ == "__main__":
    target = ["read_comment", "like", "click_avatar", "forward"]
    sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
    varlen_features = ['manual_tag_list','manual_keyword_list']
    dense_features = ['videoplayseconds']
    data = pd.read_csv('../data/wechat_algo_data1/user_action.csv')
    test = pd.read_csv('../data/wechat_algo_data1/test_a.csv')  # data to predict
    test['date_'] = 15
    data = pd.concat([data,test])

    # 1. merge features into data
    feed = pd.read_csv('../data/wechat_algo_data1/feed_info.csv')  # feed (video) features
    feed = feed[['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id','manual_tag_list','manual_keyword_list']]
    data = data.merge(feed, how='left',on='feedid')  # join authorid, bgm_song_id, etc. onto the behavior data
    data = preprocess(data,dense_features)  # feature engineering
    data = data[dense_features+sparse_features+varlen_features+['date_']+target]

    # 2. encode the variable-length features
    encoder = {}
    global key2index
    for f in ['manual_keyword_list','manual_tag_list']:
        key2index = {}
        f_list = list(map(split, data[f].values))
        f_length = np.array(list(map(len, f_list)))
        max_len = max(f_length)
        print(f'{f} max length is {max_len}')
        # Notice : padding=`post`
        data[f] = list(pad_sequences(f_list, maxlen=max_len, padding='post', ))
        encoder[f] = copy.copy(key2index)

    # 3. encode the sparse features
    for featid in sparse_features:
        print(f"encode {featid} feature id")
        encoder[featid] = {uid:ucode+1 for ucode,uid in enumerate(data[featid].unique())}
        data[featid] = data[featid].apply(lambda x: encoder[featid].get(x,0))

    print('data.shape', data.shape)
    print('data.columns', data.columns.tolist())
    print('unique date_: ', data['date_'].unique())
    data = data.sample(frac = 1.0)

    train = data[data['date_'] < 14].drop(['date_'],axis = 1)
    val = data[data['date_'] == 14].drop(['date_'],axis = 1)  # day-14 samples form the validation set
    test = data[data['date_'] == 15].drop(['date_'],axis = 1)
    with open('../user_data/data.pkl','wb') as f:
        pkl.dump([train,val,test,encoder],f)
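
# A toy illustration of the variable-length encoding above (hypothetical values):
# two 'manual_tag_list' cells "1;5;3" and "5" pass through split(), giving
# key2index = {'1': 1, '5': 2, '3': 3} and sequences [[1, 2, 3], [2]];
# pad_sequences(..., padding='post') then yields [[1, 2, 3], [2, 0, 0]],
# where 0 is the padding id reserved inside split().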
--------------------------------------------------------------------------------
/model/lr_cosine.py:
--------------------------------------------------------------------------------
import keras
from keras import callbacks, layers
import keras.backend as K
import matplotlib.pyplot as plt
import numpy as np

class CosineAnnealing(callbacks.Callback):
    """Cosine annealing with warm restarts, following eq. (5) of
    "Decoupled Weight Decay Regularization".

    # Arguments
        eta_max: float, eta_max in eq(5).
        eta_min: float, eta_min in eq(5).
        num_step_per_epoch: int, number of batches per epoch.
        lr_list: list of ints, restart period lengths in epochs
            (Ti in eq(5) is derived from these; T_cur is tracked internally).
        verbose: 0 or 1.
    """

    def __init__(self, eta_max=1, eta_min=0, num_step_per_epoch = 100,lr_list = [],verbose=0, **kwargs):

        super(CosineAnnealing, self).__init__()

        global lr_log

        self.lr_list = lr_list
        lr_log = []
        self.eta_max = eta_max
        self.eta_min = eta_min
        self.verbose = verbose

        self.iteration = 0
        self.cur_epoch = 0
        self.num_start = 0
        self.total_epoch = lr_list[self.num_start]
        self.num_step_per_epoch = num_step_per_epoch
        self.total_iteration = self.total_epoch*num_step_per_epoch

    def on_train_begin(self, logs=None):
        self.lr = K.get_value(self.model.optimizer.lr)
        # allow training to be split over several fit() calls
        eta_t = self.eta_min + (self.eta_max - self.eta_min) * 0.5 * (1 + np.cos(np.pi * self.iteration / self.total_iteration))
        new_lr = self.lr * eta_t
        K.set_value(self.model.optimizer.lr, new_lr)

    def on_train_end(self, logs=None):
        K.set_value(self.model.optimizer.lr, self.lr)

    def on_epoch_end(self, epoch, logs=None):
        self.cur_epoch += 1
        if self.cur_epoch == self.total_epoch:
            self.cur_epoch = 0
            self.num_start += 1
            self.total_epoch = self.lr_list[min(self.num_start,len(self.lr_list)-1)]

            self.iteration = 0
            self.total_iteration = self.total_epoch*self.num_step_per_epoch

    def on_batch_end(self, epoch, logs=None):
        self.iteration += 1
        logs = logs or {}
        logs['lr'] = K.get_value(self.model.optimizer.lr)

        eta_t = self.eta_min + (self.eta_max - self.eta_min) * 0.5 * (1 + np.cos(np.pi * self.iteration / self.total_iteration))
        new_lr = self.lr * eta_t
        K.set_value(self.model.optimizer.lr, new_lr)
        if self.verbose > 0:
            print('\nEpoch %05d: CosineAnnealing '
                  'learning rate to %s.' % (epoch + 1, new_lr))
        lr_log.append(logs['lr'])

if __name__ == '__main__':
    # build a toy regression dataset
    num_train, num_test = 2000, 100
    num_features = 200

    true_w, true_b = np.ones((num_features, 1)) * 0.01, 0.05

    features = np.random.normal(0, 1, (num_train + num_test, num_features))
    noises = np.random.normal(0, 1, (num_train + num_test, 1)) * 0.01
    labels = np.dot(features, true_w) + true_b + noises

    train_data, test_data = features[:num_train, :], features[num_train:, :]
    train_labels, test_labels = labels[:num_train], labels[num_train:]

    # define the model
    model = keras.models.Sequential([
        layers.Dense(units=128, activation='relu', input_dim=200),
        layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.00)),
        layers.Dense(1)
    ])

    model.summary()
    model.compile(optimizer='adam',loss='mse',metrics=['mse'])
    # eta_max / eta_min: eta decays between them and is multiplied onto the base lr.
    # num_step_per_epoch: how many batches are trained per epoch.
    # lr_list: the restart period lengths; here one period of 2 epochs, then 4, 8, 16, 32 epochs.

    lr_list = [2,4,8,16,32]
    reduce_lr = CosineAnnealing(eta_max=1, eta_min=0, num_step_per_epoch=(2000 // 16), lr_list = lr_list)
    # for e in range(62):
    #     model.fit(train_data, train_labels, batch_size=16, epochs=1, validation_data=(test_data, test_labels), callbacks=[reduce_lr])
    model.fit(train_data, train_labels, batch_size=16, epochs=62, validation_data=(test_data, test_labels), callbacks=[reduce_lr])
    plt.plot(lr_log)
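
# Concretely, with base lr 0.01 and lr_list = [2, 4, 8, 16, 32], the recorded
# lr_log holds five cosine arcs: within each period the lr starts near 0.01,
# follows half a cosine down toward 0, and jumps back up at each restart;
# plt.plot(lr_log) above visualizes exactly this shape.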
--------------------------------------------------------------------------------
/model/deepfm.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jul  1 17:27:12 2021

@author: shujie.wang
"""

import gc
import pickle as pkl

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import optimizers, regularizers
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.python.keras.layers import Layer

class MyMeanPool(Layer):
    """Mean pooling over the unmasked positions of a sequence."""
    def __init__(self, axis, **kwargs):
        super(MyMeanPool, self).__init__(**kwargs)
        self.axis = axis

    def call(self, x, mask):
        mask = tf.expand_dims(tf.cast(mask,tf.float32),axis = -1)
        x = x * mask
        return K.sum(x, axis=self.axis) / (K.sum(mask, axis=self.axis) + 1e-9)

def secondary_fm(W):
    # sum first, then square
    frs_part = Add()(W)
    frs_part = Multiply()([frs_part,frs_part])
    # square first, then sum
    scd_part = Add()([Multiply()([_x,_x]) for _x in W])
    # subtract and multiply by 0.5
    fm_part = Subtract()([frs_part,scd_part])
    fm_part = Lambda(lambda x:K.sum(x,axis = 1,keepdims = True)*0.5)(fm_part)
    return fm_part


def build_FM(sparse_cols,dense_cols,sparse_max_len,embed_dim = 16,
             dnn_hidden_units=(128, 128),varlens_cols = [],varlens_max_len = {},
             dropout = 0,embedding_reg_l2 = 1e-6,dnn_reg_l2 = 0.0):
    '''
    sparse_cols, dense_cols: names of the categorical and continuous features.
    sparse_max_len: dict mapping each categorical feature to its vocabulary size.
    varlens_cols: names of the variable-length categorical features.
    varlens_max_len: dict mapping each variable-length feature to its vocabulary size.
    '''

    # inputs, split into sparse, variable-length, and dense parts
    sparse_inputs = {f:Input([1],name = f) for f in sparse_cols}
    dense_inputs = {f:Input([1],name = f) for f in dense_cols}
    varlens_inputs = {f:Input([None,1],name = f) for f in varlens_cols}

    input_embed = {}
    # embed each categorical feature to k dims to obtain its latent vector v_i
    for f in sparse_cols:
        _input = sparse_inputs[f]
        embedding = Embedding(sparse_max_len[f], embed_dim,
                              embeddings_regularizer=tf.keras.regularizers.l2(embedding_reg_l2))
        input_embed[f] = Flatten()(embedding(_input))  # (bs, k)

    # variable-length (multi-label) categorical features: embed and mean-pool
    for f in varlens_inputs:
        _input = varlens_inputs[f]
        mask = Masking(mask_value = 0).compute_mask(_input)
        embedding = Embedding(varlens_max_len[f], embed_dim,
                              embeddings_regularizer=tf.keras.regularizers.l2(1e-6))
        _embed = Reshape([-1,embed_dim])(embedding(_input))
        out_embed = MyMeanPool(axis=1)(_embed,mask)
        input_embed[f] = out_embed

    # continuous features: project to the embedding dimension
    for f in dense_inputs:
        _input = dense_inputs[f]
        _embed = Dense(embed_dim,use_bias = False,activation = 'linear')(_input)
        input_embed[f] = _embed

    feature_name = sparse_cols+varlens_cols+dense_cols
    fm_embed = [input_embed[f] for f in feature_name]
    fm_part = secondary_fm(fm_embed)

    # concatenate all embeddings into the DNN input
    dnn_feature = Concatenate(axis = -1)(fm_embed)
    for num in dnn_hidden_units:
        dnn_feature = Dropout(dropout)(Dense(num,activation='relu',
                            kernel_regularizer=regularizers.l2(dnn_reg_l2))(dnn_feature))

    dnn_output = Dense(1,activation = 'linear', kernel_regularizer=regularizers.l2(dnn_reg_l2),
                       use_bias = True)(dnn_feature)
    output = Activation('sigmoid')(Add()([fm_part,dnn_output]))
    inputs = [sparse_inputs[f] for f in sparse_inputs]+[varlens_inputs[f] for f in varlens_inputs]\
        +[dense_inputs[f] for f in dense_inputs]
    model = Model(inputs,output)
    return model

if __name__ == '__main__':
    target = ["read_comment", "like", "click_avatar", "forward"]
    sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
    varlen_features = ['manual_tag_list','manual_keyword_list']
    dense_features = ['videoplayseconds']
    # 1. load the data
    with open('../user_data/data.pkl','rb') as f:
        train,val,test,encoder = pkl.load(f)
    train_num = len(train)

    # 2. build the input feature settings
    sparse_max_len = {f:len(encoder[f]) + 1 for f in sparse_features}
    varlens_max_len = {f:len(encoder[f]) + 1 for f in varlen_features}
    feature_names = sparse_features+varlen_features+dense_features

    # 3. generate input data for the model: a dict mapping feature name to values
    train_model_input = {name: train[name] if name not in varlen_features else np.stack(train[name]) for name in feature_names}
    val_model_input = {name: val[name] if name not in varlen_features else np.stack(val[name]) for name in feature_names}
    test_model_input = {name: test[name] if name not in varlen_features else np.stack(test[name]) for name in feature_names}

    train_labels = train['read_comment'].values
    val_labels = val['read_comment'].values

    del train,val  # drop what is no longer needed to free memory
    gc.collect()

    model = build_FM(sparse_features,dense_features,sparse_max_len,embed_dim = 16,
                 dnn_hidden_units=(64,64),varlens_cols = varlen_features,varlens_max_len = varlens_max_len,
                 dropout = 0.1,embedding_reg_l2 = 1e-6,dnn_reg_l2 = 0.0)

    adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(adam, loss = 'binary_crossentropy' ,metrics = [tf.keras.metrics.AUC()],)

    history = model.fit(train_model_input, train_labels,validation_data = (val_model_input,val_labels),
                        batch_size=10240, epochs=4, verbose=1)
--------------------------------------------------------------------------------
/model/mmoe.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jul  1 17:28:27 2021

@author: shujie.wang
"""

import gc
import pickle as pkl

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import optimizers, regularizers
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.python.keras.layers import Layer

class MyMeanPool(Layer):
    """Mean pooling over the unmasked positions of a sequence."""
    def __init__(self, axis, **kwargs):
        super(MyMeanPool, self).__init__(**kwargs)
        self.axis = axis

    def call(self, x, mask):
        mask = tf.expand_dims(tf.cast(mask,tf.float32),axis = -1)
        x = x * mask
        return K.sum(x, axis=self.axis) / (K.sum(mask, axis=self.axis) + 1e-9)

class Mmoe_layer(tf.keras.layers.Layer):
    def __init__(self,expert_dim,n_expert,n_task):
        super(Mmoe_layer, self).__init__()
        self.n_task = n_task
        self.expert_layer = [Dense(expert_dim,activation = 'relu') for i in range(n_expert)]
        self.gate_layers = [Dense(n_expert,activation = 'softmax') for i in range(n_task)]

    def call(self,x):
        # expert networks
        E_net = [expert(x) for expert in self.expert_layer]
        E_net = Concatenate(axis = 1)([e[:,tf.newaxis,:] for e in E_net])  # (bs, n_expert, n_dims)
        # one gate network per task
        gate_net = [gate(x) for gate in self.gate_layers]  # n_task tensors of shape (bs, n_expert)

        # each tower input is the gate-weighted combination of all experts
        towers = []
        for i in range(self.n_task):
            g = tf.expand_dims(gate_net[i],axis = -1)  # (bs, n_expert, 1)
            _tower = tf.matmul(E_net, g,transpose_a=True)
            towers.append(Flatten()(_tower))  # (bs, expert_dim)

        return towers

def build_mmoe(sparse_cols,dense_cols,sparse_max_len,embed_dim,expert_dim,
               varlens_cols,varlens_max_len,n_expert,n_task,target = [],
               dnn_hidden_units = (64,),dnn_reg_l2 = 1e-5,drop_rate = 0.1,
               embedding_reg_l2 = 1e-6):

    # inputs, split into sparse, variable-length, and dense parts
    sparse_inputs = {f:Input([1],name = f) for f in sparse_cols}
    dense_inputs = {f:Input([1],name = f) for f in dense_cols}
    varlens_inputs = {f:Input([None,1],name = f) for f in varlens_cols}

    input_embed = {}
    # embed each categorical feature to k dims
    for f in sparse_cols:
        _input = sparse_inputs[f]
        embedding = Embedding(sparse_max_len[f], embed_dim,
                              embeddings_regularizer=tf.keras.regularizers.l2(embedding_reg_l2))
        input_embed[f] = Flatten()(embedding(_input))  # (bs, k)

    # variable-length (multi-label) categorical features: embed and mean-pool
    for f in varlens_inputs:
        _input = varlens_inputs[f]
        mask = Masking(mask_value = 0).compute_mask(_input)
        embedding = Embedding(varlens_max_len[f], embed_dim,
                              embeddings_regularizer=tf.keras.regularizers.l2(1e-6))
        _embed = Reshape([-1,embed_dim])(embedding(_input))
        out_embed = MyMeanPool(axis=1)(_embed,mask)
        input_embed[f] = out_embed

    input_embed.update(dense_inputs)  # add the continuous features
    input_embed = Concatenate(axis = -1)([input_embed[f] for f in input_embed])
    for num in dnn_hidden_units:
        input_embed = Dropout(drop_rate)(Dense(num,activation = 'relu',
                            kernel_regularizer=regularizers.l2(dnn_reg_l2))(input_embed))

    # MMOE layer
    towers = Mmoe_layer(expert_dim,n_expert,n_task)(input_embed)
    outputs = [Dense(1,activation = 'sigmoid', kernel_regularizer=regularizers.l2(dnn_reg_l2),
                     name = f,use_bias = True)(_t) for _t,f in zip(towers,target)]
    inputs = [sparse_inputs[f] for f in sparse_inputs]+[varlens_inputs[f] for f in varlens_inputs]\
        +[dense_inputs[f] for f in dense_inputs]
    model = Model(inputs,outputs)
    return model

if __name__ == '__main__':
    target = ["read_comment", "like", "click_avatar", "forward"]
    sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
    varlen_features = ['manual_tag_list','manual_keyword_list']
    dense_features = ['videoplayseconds']
    # 1. load the data
    with open('../user_data/data.pkl','rb') as f:
        train,val,test,encoder = pkl.load(f)
    train_num = len(train)

    # 2. build the input feature settings
    sparse_max_len = {f:len(encoder[f]) + 1 for f in sparse_features}
    varlens_max_len = {f:len(encoder[f]) + 1 for f in varlen_features}
    feature_names = sparse_features+varlen_features+dense_features

    # 3. generate input data for the model: a dict mapping feature name to values
    train_model_input = {name: train[name] if name not in varlen_features else np.stack(train[name]) for name in feature_names}
    val_model_input = {name: val[name] if name not in varlen_features else np.stack(val[name]) for name in feature_names}
    test_model_input = {name: test[name] if name not in varlen_features else np.stack(test[name]) for name in feature_names}

    train_labels = [train[y].values for y in target]
    val_labels = [val[y].values for y in target]

    del train,val  # drop what is no longer needed to free memory
    gc.collect()

    # 4. define the model, train, predict, and evaluate
    model = build_mmoe(sparse_features,dense_features,sparse_max_len,embed_dim = 16,expert_dim = 32,
                       n_task = 4,n_expert = 4,varlens_cols = varlen_features,varlens_max_len = varlens_max_len,
                       dnn_hidden_units = (64,64),target = target,dnn_reg_l2 = 1e-5,drop_rate = 0.1)

    adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(adam, loss = 'binary_crossentropy' ,metrics = [tf.keras.metrics.AUC()],)

    history = model.fit(train_model_input, train_labels,validation_data = (val_model_input,val_labels),
                        batch_size=10240, epochs=4, verbose=1)
--------------------------------------------------------------------------------
/model/ple.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Jul  1 17:29:13 2021

@author: shujie.wang
"""

import gc
import pickle as pkl

import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras import optimizers, regularizers
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.python.keras.layers import Layer

class MyMeanPool(Layer):
    """Mean pooling over the unmasked positions of a sequence."""
    def __init__(self, axis, **kwargs):
        super(MyMeanPool, self).__init__(**kwargs)
        self.axis = axis

    def call(self, x, mask):
        mask = tf.expand_dims(tf.cast(mask,tf.float32),axis = -1)
        x = x * mask
        return K.sum(x, axis=self.axis) / (K.sum(mask, axis=self.axis) + 1e-9)

class PleLayer(tf.keras.layers.Layer):
    '''
    n_experts: list, number of experts per task; [2,3] means the first task uses 2 experts and the second uses 3.
    n_expert_share: int, number of experts in the shared part.
    expert_dim: int, output dimension of each expert network.
    n_task: int, number of tasks.
    '''
    def __init__(self,n_task,n_experts,expert_dim,n_expert_share,dnn_reg_l2 = 1e-5):
        super(PleLayer, self).__init__()
        self.n_task = n_task

        # task-specific expert networks plus one shared group
        self.E_layer = []
        for i in range(n_task):
            sub_exp = [Dense(expert_dim,activation = 'relu') for j in range(n_experts[i])]
            self.E_layer.append(sub_exp)

        self.share_layer = [Dense(expert_dim,activation = 'relu') for j in range(n_expert_share)]
        # gate networks
        self.gate_layers = [Dense(n_expert_share+n_experts[i],kernel_regularizer=regularizers.l2(dnn_reg_l2),
                                  activation = 'softmax') for i in range(n_task)]

    def call(self,x):
        # task-specific and shared expert outputs
        E_net = [[expert(x) for expert in sub_expert] for sub_expert in self.E_layer]
        share_net = [expert(x) for expert in self.share_layer]

        # weight the shared and task-specific expert outputs by the gate
        towers = []
        for i in range(self.n_task):
            g = self.gate_layers[i](x)
            g = tf.expand_dims(g,axis = -1)  # (bs, n_expert_share + n_experts[i], 1)
            _e = share_net+E_net[i]
            _e = Concatenate(axis = 1)([expert[:,tf.newaxis,:] for expert in _e])  # (bs, n_expert_share + n_experts[i], expert_dim)
            _tower = tf.matmul(_e, g,transpose_a=True)
            towers.append(Flatten()(_tower))  # (bs, expert_dim)
        return towers

def build_ple(sparse_cols,dense_cols,sparse_max_len,embed_dim,expert_dim = 4,
              varlens_cols = [],varlens_max_len = [],dnn_hidden_units = (64,64),
              n_task = 2,n_experts = [2,2],n_expert_share = 4,dnn_reg_l2 = 1e-6,
              drop_rate = 0.0,embedding_reg_l2 = 1e-6,targets = []):

    # inputs, split into sparse, variable-length, and dense parts
    sparse_inputs = {f:Input([1],name = f) for f in sparse_cols}
    dense_inputs = {f:Input([1],name = f) for f in dense_cols}
    varlens_inputs = {f:Input([None,1],name = f) for f in varlens_cols}

    input_embed = {}
    # embed each categorical feature to k dims
    for f in sparse_cols:
        _input = sparse_inputs[f]
        embedding = Embedding(sparse_max_len[f], embed_dim,
                              embeddings_regularizer=tf.keras.regularizers.l2(embedding_reg_l2))
        input_embed[f] = Flatten()(embedding(_input))  # (bs, k)

    # variable-length (multi-label) categorical features: embed and mean-pool
    for f in varlens_inputs:
        _input = varlens_inputs[f]
        mask = Masking(mask_value = 0).compute_mask(_input)
        embedding = Embedding(varlens_max_len[f], embed_dim,
                              embeddings_regularizer=tf.keras.regularizers.l2(1e-6))
        _embed = Reshape([-1,embed_dim])(embedding(_input))
        out_embed = MyMeanPool(axis=1)(_embed,mask)
        input_embed[f] = out_embed

    input_embed.update(dense_inputs)  # add the continuous features
    input_embed = Concatenate(axis = -1)([input_embed[f] for f in input_embed])

    for num in dnn_hidden_units:
        input_embed = Dropout(drop_rate)(Dense(num,activation = 'relu',
                            kernel_regularizer=regularizers.l2(dnn_reg_l2))(input_embed))
    # PLE layer
    towers = PleLayer(n_task,n_experts,expert_dim,n_expert_share)(input_embed)
    outputs = [Dense(1,activation = 'sigmoid',kernel_regularizer=regularizers.l2(dnn_reg_l2),
                     name = f,use_bias = True)(_t) for f,_t in zip(targets,towers)]
    inputs = [sparse_inputs[f] for f in sparse_inputs]+[varlens_inputs[f] for f in varlens_inputs]\
        +[dense_inputs[f] for f in dense_inputs]
    model = Model(inputs,outputs)
    return model

if __name__ == '__main__':
    target = ["read_comment", "like", "click_avatar", "forward"]
    sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
    varlen_features = ['manual_tag_list','manual_keyword_list']
    dense_features = ['videoplayseconds']
    # 1. load the data
    with open('../user_data/data.pkl','rb') as f:
        train,val,test,encoder = pkl.load(f)
    train_num = len(train)

    # 2. build the input feature settings
    sparse_max_len = {f:len(encoder[f]) + 1 for f in sparse_features}
    varlens_max_len = {f:len(encoder[f]) + 1 for f in varlen_features}
    feature_names = sparse_features+varlen_features+dense_features

    # 3. generate input data for the model: a dict mapping feature name to values
    train_model_input = {name: train[name] if name not in varlen_features else np.stack(train[name]) for name in feature_names}
    val_model_input = {name: val[name] if name not in varlen_features else np.stack(val[name]) for name in feature_names}
    test_model_input = {name: test[name] if name not in varlen_features else np.stack(test[name]) for name in feature_names}

    train_labels = [train[y].values for y in target]
    val_labels = [val[y].values for y in target]

    del train,val  # drop what is no longer needed to free memory
    gc.collect()

    # 4. define the model, train, predict, and evaluate
    model = build_ple(sparse_features,dense_features,sparse_max_len,embed_dim = 16,expert_dim = 32,
                      varlens_cols = varlen_features,varlens_max_len = varlens_max_len,dnn_hidden_units = (64,),
                      n_task = 4,n_experts = [4,4,4,4],n_expert_share = 8,dnn_reg_l2 = 1e-6,
                      drop_rate = 0.1,embedding_reg_l2 = 1e-6,targets = target)

    adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(adam, loss = 'binary_crossentropy' ,metrics = [tf.keras.metrics.AUC()],)

    history = model.fit(train_model_input, train_labels,validation_data = (val_model_input,val_labels),
                        batch_size=10240, epochs=4, verbose=1)
--------------------------------------------------------------------------------