├── CNS-决赛答辩文档.pdf
├── README.md
├── deepfm.py
├── inputs.py
├── packages.txt
├── train_fm.py
├── 代码说明.txt
├── 奖杯.jpg
└── 证书.jpg

/CNS-决赛答辩文档.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouc16020021031/video-click-TOP1/b11346cb9334c8e3268f2cf54739fe5b4c80a88d/CNS-决赛答辩文档.pdf
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
1 | ### Video Click Prediction Competition - TOP1 Solution
2 | 
3 | [Video Click Prediction Competition](https://www.turingtopia.com/competitionnew/detail/e4880352b6ef4f9f8f28e8f98498dbc4/dataset)
4 | 
5 | Team CNS:
6 | 
7 | 沈琢乔, Ocean University of China, fourth-year undergraduate
8 | 
9 | 朱锐, YOHO, algorithm engineer
10 | 
11 | 
12 | (trophy photo: see 奖杯.jpg)
13 | 
--------------------------------------------------------------------------------

/deepfm.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | """
3 | Author:
4 |     Weichen Shen,wcshen1994@163.com
5 | 
6 | Reference:
7 |     [1] Guo H, Tang R, Ye Y, et al. DeepFM: a factorization-machine based neural network for CTR prediction[J]. arXiv preprint arXiv:1703.04247, 2017. (https://arxiv.org/abs/1703.04247)
8 | 
9 | """
10 | 
11 | import tensorflow as tf
12 | from inputs import build_input_features, input_from_feature_columns, get_linear_logit, combined_dnn_input
13 | from deepctr.layers import FM, DNN, PredictionLayer
14 | from deepctr.layers.utils import concat_fun
15 | from tensorflow.python.keras import Input
16 | from tensorflow.python.keras import backend as K
17 | from tensorflow.python.keras.layers import Lambda, Concatenate, Embedding, LSTM, Permute, Dense, multiply
18 | 
19 | 
20 | def attention_3d_block(inputs, seq_len=21):
21 |     # Learn a softmax weighting over the time axis and rescale the LSTM outputs with it.
22 |     a = Permute((2, 1))(inputs)
23 |     a = Dense(seq_len, activation='softmax')(a)
24 |     a_probs = Permute((2, 1), name='attention_vec')(a)
25 |     output_attention_mul = multiply([inputs, a_probs], name='attention_mul')
26 |     return output_attention_mul
27 | 
28 | 
29 | def DeepFM(linear_feature_columns, dnn_feature_columns, embedding_size=8, use_fm=True, dnn_hidden_units=(128, 128),
30 |            l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0,
31 |            dnn_activation='relu', dnn_use_bn=False, task='binary', att=False, seq_len=None, cate_feats=None,
32 |            cate2nunique=None):
33 |     """Instantiates the DeepFM network architecture, extended with an LSTM branch over the click sequence.
34 | 
35 |     :param linear_feature_columns: An iterable containing all the features used by the linear part of the model.
36 |     :param dnn_feature_columns: An iterable containing all the features used by the deep part of the model.
37 |     :param embedding_size: positive integer, sparse feature embedding size
38 |     :param use_fm: bool, whether to use the FM part
39 |     :param dnn_hidden_units: list of positive integers (or empty), the units of each DNN layer
40 |     :param l2_reg_linear: float, L2 regularizer strength applied to the linear part
41 |     :param l2_reg_embedding: float, L2 regularizer strength applied to the embedding vectors
42 |     :param l2_reg_dnn: float, L2 regularizer strength applied to the DNN
43 |     :param init_std: float, standard deviation used to initialize the embedding vectors
44 |     :param seed: integer, random seed
45 |     :param dnn_dropout: float in [0,1), the probability of dropping a given DNN coordinate
46 |     :param dnn_activation: activation function to use in the DNN
47 |     :param dnn_use_bn: bool, whether to use BatchNormalization before activation in the DNN
48 |     :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
49 |     :param att: bool, whether to apply the attention block to the LSTM outputs
50 |     :param seq_len: length of the click sequence fed to the LSTM branch
51 |     :param cate_feats: list of categorical feature names carried in the LSTM input (besides the gap channel)
52 |     :param cate2nunique: dict mapping each name in ``cate_feats`` to its vocabulary size
53 |     :return: A Keras model instance.
54 |     """
55 |     cate_feats = cate_feats or []  # avoid mutable default arguments
56 |     cate2nunique = cate2nunique or {}
57 | 
58 |     features = build_input_features(linear_feature_columns + dnn_feature_columns)
59 | 
60 |     inputs_list = list(features.values())
61 | 
62 |     sparse_embedding_list, dense_value_list, embedding_dict = input_from_feature_columns(features, dnn_feature_columns,
63 |                                                                                          embedding_size,
64 |                                                                                          l2_reg_embedding, init_std,
65 |                                                                                          seed)
66 | 
67 |     linear_logit = get_linear_logit(features, linear_feature_columns, l2_reg=l2_reg_linear, init_std=init_std,
68 |                                     seed=seed, prefix='linear')
69 | 
70 |     fm_input = concat_fun(sparse_embedding_list, axis=1)
71 |     fm_logit = FM()(fm_input)
72 | 
73 |     dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)
74 | 
75 |     # LSTM branch: channel 0 is the (scaled) time gap, the remaining channels are categorical ids.
76 |     input_lstm = Input(shape=(seq_len, 1 + len(cate_feats)), name='lstm_input')
77 |     input_lstm_gap = Lambda(lambda x: x[:, :, 0:1])(input_lstm)
78 |     concat_list = [input_lstm_gap]
79 |     for i, cate in enumerate(cate_feats):
80 |         # bind the index by value so the slice stays correct if the Lambda is ever re-traced
81 |         input_cate = Lambda(lambda x, idx=i + 1: x[:, :, idx])(input_lstm)
82 |         # reuse the DeepFM embedding when the column name matches, otherwise create a new one
83 |         emb = embedding_dict.get(cate)
84 |         if emb is None:
85 |             emb = Embedding(output_dim=8, input_dim=cate2nunique[cate])
86 |         concat_list.append(emb(input_cate))
87 |     input_lstm_concat = Concatenate(axis=-1)(concat_list)
88 |     if att:
89 |         lstm_out = LSTM(units=128, return_sequences=True)(input_lstm_concat)
90 |         attention_mul = attention_3d_block(lstm_out, seq_len)
91 |         lstm_out = Lambda(lambda x: K.sum(x, axis=1))(attention_mul)
92 |     else:
93 |         lstm_out = LSTM(units=128, return_sequences=False)(input_lstm_concat)
94 | 
95 |     dnn_input = concat_fun([dnn_input, lstm_out])
96 |     dnn_out = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
97 |                   dnn_use_bn, seed)(dnn_input)
98 |     dnn_logit = tf.keras.layers.Dense(
99 |         1, use_bias=False, activation=None)(dnn_out)
100 | 
101 |     if len(dnn_hidden_units) == 0 and not use_fm:  # only linear
102 |         final_logit = linear_logit
103 |     elif len(dnn_hidden_units) == 0 and use_fm:  # linear + FM
104 |         final_logit = tf.keras.layers.add([linear_logit, fm_logit])
105 |     elif len(dnn_hidden_units) > 0 and not use_fm:  # linear + Deep
106 |         final_logit = tf.keras.layers.add([linear_logit, dnn_logit])
107 |     elif len(dnn_hidden_units) > 0 and use_fm:  # linear + FM + Deep
108 |         final_logit = tf.keras.layers.add([linear_logit, fm_logit, dnn_logit])
109 |     else:
110 |         raise NotImplementedError
111 | 
112 |     output = PredictionLayer(task)(final_logit)
113 |     model = tf.keras.models.Model(inputs=inputs_list + [input_lstm], outputs=output)
114 |     return model
--------------------------------------------------------------------------------

/inputs.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | """
3 | 
4 | Author:
5 |     Weichen Shen,wcshen1994@163.com
6 | 
7 | """
8 | 
9 | from collections import OrderedDict, namedtuple
10 | from itertools import chain
11 | 
12 | from tensorflow.python.keras.initializers import RandomNormal
13 | from tensorflow.python.keras.layers import Embedding, Input, Flatten
14 | from tensorflow.python.keras.regularizers import l2
15 | 
16 | from deepctr.layers.sequence import SequencePoolingLayer
17 | from deepctr.layers.utils import Hash, concat_fun, Linear
18 | 
19 | 
20 | class SparseFeat(namedtuple('SparseFeat', ['name', 'dimension', 'use_hash', 'dtype', 'embedding_name', 'embedding'])):
21 |     __slots__ = ()
22 | 
23 |     def __new__(cls, name, dimension, use_hash=False, dtype="int32", embedding_name=None, embedding=True):
24 |         if embedding and embedding_name is None:
25 |             embedding_name = name
26 |         return super(SparseFeat, cls).__new__(cls, name, dimension, use_hash, dtype, embedding_name, embedding)
27 | 
28 | 
29 | class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype'])):
30 |     __slots__ = ()
31 | 
32 |     def __new__(cls, name, dimension=1, dtype="float32"):
33 |         return super(DenseFeat, cls).__new__(cls, name, dimension, dtype)
34 | 
35 | 
36 | class VarLenSparseFeat(namedtuple('VarLenFeat',
37 |                                   ['name', 'dimension', 'maxlen', 'combiner', 'use_hash', 'dtype', 'embedding_name',
38 |                                    'embedding'])):
39 |     __slots__ = ()
40 | 
41 |     def __new__(cls, name, dimension, maxlen, combiner="mean", use_hash=False, dtype="float32", embedding_name=None,
42 |                 embedding=True):
43 |         if embedding_name is None:
44 |             embedding_name = name
45 |         return super(VarLenSparseFeat, cls).__new__(cls, name, dimension, maxlen, combiner, use_hash, dtype,
46 |                                                     embedding_name, embedding)
47 | 
48 | 
49 | def get_feature_names(feature_columns):
50 |     features = build_input_features(feature_columns)
51 |     return list(features.keys())
52 | 
53 | 
54 | def get_inputs_list(inputs):
55 |     return list(chain(*list(map(lambda x: x.values(), filter(lambda x: x is not None, inputs)))))
56 | 
57 | 
58 | def build_input_features(feature_columns, mask_zero=True, prefix=''):
59 |     input_features = OrderedDict()
60 |     for fc in feature_columns:
61 |         if isinstance(fc, SparseFeat):
62 |             input_features[fc.name] = Input(
63 |                 shape=(1,), name=prefix + fc.name, dtype=fc.dtype)
64 |         elif isinstance(fc, DenseFeat):
65 |             input_features[fc.name] = Input(
66 |                 shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype)
67 |         elif isinstance(fc, VarLenSparseFeat):
68 |             input_features[fc.name] = Input(shape=(fc.maxlen,), name=prefix + fc.name,
69 |                                             dtype=fc.dtype)
70 |             if not mask_zero:
71 |                 input_features[fc.name + "_seq_length"] = Input(shape=(
72 |                     1,), name=prefix + 'seq_length_' + fc.name)
73 |                 input_features[fc.name + "_seq_max_length"] = fc.maxlen
74 |         else:
75 |             raise TypeError("Invalid feature column type, got", type(fc))
76 | 
77 |     return input_features
78 | 
79 | 
80 | def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, embedding_size, init_std, seed, l2_reg,
81 |                           prefix='sparse_', seq_mask_zero=True):
82 |     if embedding_size == 'auto':
83 |         print("Notice: do not use auto embedding in models other than DCN")
84 |         sparse_embedding = {feat.embedding_name: Embedding(feat.dimension, 6 * int(pow(feat.dimension, 0.25)),
85 |                                                            embeddings_initializer=RandomNormal(
86 |                                                                mean=0.0, stddev=init_std, seed=seed),
87 |                                                            embeddings_regularizer=l2(l2_reg),
88 |                                                            name=prefix + '_emb_' + feat.name) for feat in
89 |                             sparse_feature_columns}
90 |     else:
91 |         sparse_embedding = {feat.embedding_name: Embedding(feat.dimension, embedding_size,
92 |                                                            embeddings_initializer=RandomNormal(
93 |                                                                mean=0.0, stddev=init_std, seed=seed),
94 |                                                            embeddings_regularizer=l2(
95 |                                                                l2_reg),
96 |                                                            name=prefix + '_emb_' + feat.name) for feat in
97 |                             sparse_feature_columns}
98 | 
99 |     if varlen_sparse_feature_columns and len(varlen_sparse_feature_columns) > 0:
100 |         for feat in varlen_sparse_feature_columns:
101 |             if embedding_size == "auto":
102 |                 sparse_embedding[feat.embedding_name] = Embedding(feat.dimension, 6 * int(pow(feat.dimension, 0.25)),
103 |                                                                   embeddings_initializer=RandomNormal(
104 |                                                                       mean=0.0, stddev=init_std, seed=seed),
105 |                                                                   embeddings_regularizer=l2(
106 |                                                                       l2_reg),
107 |                                                                   name=prefix + '_seq_emb_' + feat.name,
108 |                                                                   mask_zero=seq_mask_zero)
109 |             else:
110 |                 sparse_embedding[feat.embedding_name] = Embedding(feat.dimension, embedding_size,
111 |                                                                   embeddings_initializer=RandomNormal(
112 |                                                                       mean=0.0, stddev=init_std, seed=seed),
113 |                                                                   embeddings_regularizer=l2(
114 |                                                                       l2_reg),
115 |                                                                   name=prefix + '_seq_emb_' + feat.name,
116 |                                                                   mask_zero=seq_mask_zero)
117 |     return sparse_embedding
118 | 
119 | 
120 | def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, return_feat_list=(), mask_feat_list=()):
121 |     embedding_vec_list = []
122 |     for fg in sparse_feature_columns:
123 |         feat_name = fg.name
124 |         if len(return_feat_list) == 0 or feat_name in return_feat_list:
125 |             if fg.use_hash:
126 |                 lookup_idx = Hash(fg.dimension, mask_zero=(feat_name in mask_feat_list))(input_dict[feat_name])
127 |             else:
128 |                 lookup_idx = input_dict[feat_name]
129 | 
130 |             embedding_vec_list.append(embedding_dict[feat_name](lookup_idx))
131 | 
132 |     return embedding_vec_list
133 | 
134 | 
135 | def create_embedding_matrix(feature_columns, l2_reg, init_std, seed, embedding_size, prefix="", seq_mask_zero=True):
136 |     sparse_feature_columns = list(
137 |         filter(lambda x: isinstance(x, SparseFeat) and x.embedding, feature_columns)) if feature_columns else []
138 |     varlen_sparse_feature_columns = list(
139 |         filter(lambda x: isinstance(x, VarLenSparseFeat) and x.embedding, feature_columns)) if feature_columns else []
140 |     sparse_emb_dict = create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, embedding_size,
141 |                                             init_std, seed,
142 |                                             l2_reg, prefix=prefix + 'sparse', seq_mask_zero=seq_mask_zero)
143 |     return sparse_emb_dict
144 | 
145 | 
146 | def get_linear_logit(features, feature_columns, units=1, l2_reg=0, init_std=0.0001, seed=1024, prefix='linear'):
147 |     linear_emb_list = [
148 |         input_from_feature_columns(features, feature_columns, 1, l2_reg, init_std, seed, prefix=prefix + str(i))[0] for
149 |         i in range(units)]
150 |     _, dense_input_list, _ = input_from_feature_columns(features, feature_columns, 1, l2_reg, init_std, seed,
151 |                                                         prefix=prefix)
152 | 
153 |     linear_logit_list = []
154 |     for i in range(units):
155 | 
156 |         if len(linear_emb_list[0]) > 0 and len(dense_input_list) > 0:
157 |             sparse_input = concat_fun(linear_emb_list[i])
158 |             dense_input = concat_fun(dense_input_list)
159 |             linear_logit = Linear(l2_reg, mode=2)([sparse_input, dense_input])
160 |         elif len(linear_emb_list[0]) > 0:
161 |             sparse_input = concat_fun(linear_emb_list[i])
162 |             linear_logit = Linear(l2_reg, mode=0)(sparse_input)
163 |         elif len(dense_input_list) > 0:
164 |             dense_input = concat_fun(dense_input_list)
165 |             linear_logit = Linear(l2_reg, mode=1)(dense_input)
166 |         else:
167 |             raise NotImplementedError
168 |         linear_logit_list.append(linear_logit)
169 | 
170 |     return concat_fun(linear_logit_list)
171 | 
172 | 
173 | def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(),
174 |                      mask_feat_list=()):
175 |     embedding_vec_list = []
176 |     for fc in sparse_feature_columns:
177 |         feature_name = fc.name
178 |         embedding_name = fc.embedding_name
179 |         # parenthesized so that columns with embedding=False are always skipped
180 |         if (len(return_feat_list) == 0 or feature_name in return_feat_list) and fc.embedding:
181 |             if fc.use_hash:
182 |                 lookup_idx = Hash(fc.dimension, mask_zero=(feature_name in mask_feat_list))(
183 |                     sparse_input_dict[feature_name])
184 |             else:
185 |                 lookup_idx = sparse_input_dict[feature_name]
186 | 
187 |             embedding_vec_list.append(sparse_embedding_dict[embedding_name](lookup_idx))
188 | 
189 |     return embedding_vec_list
190 | 
191 | 
192 | def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns):
193 |     varlen_embedding_vec_dict = {}
194 |     for fc in varlen_sparse_feature_columns:
195 |         feature_name = fc.name
196 |         embedding_name = fc.embedding_name
197 |         if fc.use_hash:
198 |             lookup_idx = Hash(fc.dimension, mask_zero=True)(sequence_input_dict[feature_name])
199 |         else:
200 |             lookup_idx = sequence_input_dict[feature_name]
201 |         varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx)
202 | 
203 |     return varlen_embedding_vec_dict
204 | 
205 | 
206 | def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns):
207 |     pooling_vec_list = []
208 |     for fc in varlen_sparse_feature_columns:
209 |         feature_name = fc.name
210 |         combiner = fc.combiner
211 |         feature_length_name = feature_name + '_seq_length'
212 |         if feature_length_name in features:
213 |             vec = SequencePoolingLayer(combiner, supports_masking=False)(
214 |                 [embedding_dict[feature_name], features[feature_length_name]])
215 |         else:
216 |             vec = SequencePoolingLayer(combiner, supports_masking=True)(
217 |                 embedding_dict[feature_name])
218 |         pooling_vec_list.append(vec)
219 |     return pooling_vec_list
220 | 
221 | 
222 | def get_dense_input(features, feature_columns):
223 |     dense_feature_columns = list(filter(lambda x: isinstance(x, DenseFeat), feature_columns)) if feature_columns else []
224 |     dense_input_list = []
225 |     for fc in dense_feature_columns:
226 |         dense_input_list.append(features[fc.name])
227 |     return dense_input_list
228 | 
229 | 
230 | def input_from_feature_columns(features, feature_columns, embedding_size, l2_reg, init_std, seed, prefix='',
231 |                                seq_mask_zero=True, support_dense=True):
232 |     sparse_feature_columns = list(
233 |         filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
234 |     varlen_sparse_feature_columns = list(
235 |         filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else []
236 | 
237 |     embedding_dict = create_embedding_matrix(feature_columns, l2_reg, init_std, seed, embedding_size, prefix=prefix,
238 |                                              seq_mask_zero=seq_mask_zero)
239 |     sparse_embedding_list = embedding_lookup(
240 |         embedding_dict, features, sparse_feature_columns)
241 |     dense_value_list = get_dense_input(features, feature_columns)
242 |     if not support_dense and len(dense_value_list) > 0:
243 |         raise ValueError("DenseFeat is not supported in dnn_feature_columns")
244 | 
245 |     sequence_embed_dict = varlen_embedding_lookup(embedding_dict, features, varlen_sparse_feature_columns)
246 |     sequence_embed_list = get_varlen_pooling_list(sequence_embed_dict, features, varlen_sparse_feature_columns)
247 |     sparse_embedding_list += sequence_embed_list
248 | 
249 |     return sparse_embedding_list, dense_value_list, embedding_dict
250 | 
251 | 
252 | def combined_dnn_input(sparse_embedding_list, dense_value_list):
253 |     if len(sparse_embedding_list) > 0 and len(dense_value_list) > 0:
254 |         sparse_dnn_input = Flatten()(concat_fun(sparse_embedding_list))
255 |         dense_dnn_input = Flatten()(concat_fun(dense_value_list))
256 |         return concat_fun([sparse_dnn_input, dense_dnn_input])
257 |     elif len(sparse_embedding_list) > 0:
258 |         return Flatten()(concat_fun(sparse_embedding_list))
259 |     elif len(dense_value_list) > 0:
260 |         return Flatten()(concat_fun(dense_value_list))
261 |     else:
262 |         raise NotImplementedError
--------------------------------------------------------------------------------

/packages.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.9.0
2 | astor==0.8.1
3 | bayesian-optimization==1.0.1
4 | boto==2.49.0
5 | boto3==1.10.43
6 | botocore==1.13.43
7 | catboost==0.20.1
8 | certifi==2019.11.28
9 | chardet==3.0.4
10 | colorama==0.4.3
11 | colorlog==4.0.2
12 | cycler==0.10.0
13 | docutils==0.15.2
14 | download==0.3.4
15 | gast==0.3.2
16 | gensim==3.8.1
17 | graphviz==0.13.2
18 | grpcio==1.26.0
19 | h5py==2.10.0
20 | idna==2.8
21 | jmespath==0.9.4
22 | joblib==0.14.1
23 | kashgari==0.2.6
24 | Keras==2.1.3
25 | Keras-Applications==1.0.8
26 | keras-gpt-2==0.11.1
27 | keras-layer-normalization==0.14.0
28 | keras-multi-head==0.22.0
29 | keras-pos-embd==0.11.0
30 | keras-position-wise-feed-forward==0.6.0
31 | Keras-Preprocessing==1.1.0
32 | keras-radam==0.15.0
33 | keras-self-attention==0.41.0
34 | keras-transformer==0.31.0
35 | kiwisolver==1.1.0
36 | lightgbm==2.2.3
37 | Markdown==3.1.1
38 | matplotlib==3.1.2
39 | numpy==1.18.0
40 | pandas==0.25.1
41 | Pillow==6.2.1
42 | plotly==4.4.1
43 | protobuf==3.11.1
44 | pyparsing==2.4.5
45 | python-dateutil==2.8.1
46 | pytz==2019.3
47 | PyYAML==5.2
48 | regex==2019.12.19
49 | requests==2.22.0
50 | retrying==1.3.3
51 | s3transfer==0.2.1
52 | scikit-learn==0.22
53 | scipy==1.4.0
54 | seaborn==0.9.0
55 | seqeval==0.0.12
56 | six==1.13.0
57 | sklearn==0.0
58 | smart-open==1.9.0
59 | tensorboard==1.10.0
60 | tensorflow-gpu==1.10.0
61 | termcolor==1.1.0
62 | tqdm==4.40.2
63 | urllib3==1.25.7
64 | Werkzeug==0.16.0
65 | xgboost==0.90
--------------------------------------------------------------------------------

/train_fm.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | 
3 | from inputs import SparseFeat, DenseFeat
4 | from deepfm import *
5 | import numpy as np
6 | import gc
7 | import pandas as pd
8 | from sklearn.preprocessing import StandardScaler, LabelEncoder
9 | from tensorflow.python.keras.callbacks import ReduceLROnPlateau, EarlyStopping, Callback
10 | from tqdm import tqdm
11 | from sklearn.metrics import f1_score
12 | import tensorflow as tf
13 | 
14 | config = tf.ConfigProto()
15 | config.gpu_options.allow_growth = True
16 | # the model is built with tf.keras, so configure the tf.keras session
17 | tf.keras.backend.set_session(tf.Session(config=config))
18 | 
19 | parser = argparse.ArgumentParser(description='nn')
20 | parser.add_argument('--l', type=int, default=14)     # half-window length of the click sequence
21 | parser.add_argument('--bs', type=int, default=1024)  # batch size
22 | parser.add_argument('--att', action='store_true')    # enable the attention block on the LSTM
23 | 
24 | args = parser.parse_args()
25 | 
26 | 
27 | def multi_category_focal_loss2(gamma=2., alpha=.25):
28 |     """
29 |     Focal loss for multi-class / multi-label problems (used here for binary CTR).
30 |     alpha controls the class weights: positives (y_true == 1) are weighted by
31 |     alpha, negatives by 1 - alpha.
32 |     If the model underfits or struggles to learn, try this as the loss.
33 |     If the model is too aggressive (always tends to predict 1), lower alpha.
34 |     If the model is too conservative (always predicts 0 or some fixed constant,
35 |     i.e. it has not learned useful features), raise alpha to encourage it to
36 |     predict 1.
37 |     Usage:
38 |         model.compile(loss=[multi_category_focal_loss2(alpha=0.25, gamma=2)], metrics=["accuracy"], optimizer=adam)
39 |     """
40 |     epsilon = 1.e-7
41 |     gamma = float(gamma)
42 |     alpha = tf.constant(alpha, dtype=tf.float32)
43 | 
44 |     def multi_category_focal_loss2_fixed(y_true, y_pred):
45 |         y_true = tf.cast(y_true, tf.float32)
46 |         y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
47 | 
48 |         alpha_t = y_true * alpha + (tf.ones_like(y_true) - y_true) * (1 - alpha)
49 |         y_t = tf.multiply(y_true, y_pred) + tf.multiply(1 - y_true, 1 - y_pred)
50 |         ce = -tf.log(y_t)
51 |         weight = tf.pow(tf.subtract(1., y_t), gamma)
52 |         fl = tf.multiply(tf.multiply(weight, ce), alpha_t)
53 |         loss = tf.reduce_mean(fl)
54 |         return loss
55 | 
56 |     return multi_category_focal_loss2_fixed
57 | 
58 | 
59 | def reduce_mem_usage(df, verbose=True):
60 |     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
61 |     start_mem = df.memory_usage().sum() / 1024 ** 2
62 |     for col in df.columns:
63 |         col_type = df[col].dtypes
64 |         if col_type in numerics:
65 |             c_min = df[col].min()
66 |             c_max = df[col].max()
67 |             if str(col_type)[:3] == 'int':
68 |                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
69 |                     df[col] = df[col].astype(np.int8)
70 |                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
71 |                     df[col] = df[col].astype(np.int16)
72 |                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
73 |                     df[col] = df[col].astype(np.int32)
74 |                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
75 |                     df[col] = df[col].astype(np.int64)
76 |             else:
77 |                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
78 |                     df[col] = df[col].astype(np.float16)
79 |                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
80 |                     df[col] = df[col].astype(np.float32)
81 |                 else:
82 |                     df[col] = df[col].astype(np.float64)
83 |     end_mem = df.memory_usage().sum() / 1024 ** 2
84 |     if verbose:
85 |         print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
86 |             end_mem, 100 * (start_mem - end_mem) / start_mem))
87 |     return df
88 | 
89 | 
90 | class Metrics(Callback):
91 | 
92 |     def on_epoch_end(self, epoch, logs=None):
93 |         # validation F1: binarize at the 89.4th percentile of the predicted scores
94 |         val_targ = label_validate
95 |         val_pre = self.model.predict(val_model_input, batch_size=2048)
96 |         threshold_test = np.percentile(val_pre, 89.4)
97 |         val_pre = [1 if i > threshold_test else 0 for i in val_pre]
98 |         f1 = f1_score(val_targ, val_pre)
99 |         print(f'val f1: {f1}')
100 | 
101 |         # predict on the test set, binarize with the same percentile rule, and write a
102 |         # submission file per epoch; the file to submit is chosen by validation F1
103 |         test['target'] = self.model.predict(test_model_input, verbose=0, batch_size=2048)
104 |         sub = test[['id', 'target']]
105 |         threshold_test = np.percentile(sub['target'], 89.4)
106 |         sub['target'] = [1 if i > threshold_test else 0 for i in sub['target']]
107 |         sub.to_csv('sub_l{}_{:02d}_bs{}_att{}_{:.5f}.csv'.format(args.l, epoch + 1, args.bs, args.att, f1), index=False)
108 | 
109 | 
110 | train = pd.read_csv("../data/train.csv")
111 | train['isTest'] = 0
112 | test = pd.read_csv("../data/test.csv")
113 | test['isTest'] = 1
114 | data = train.append(test).reset_index(drop=True)
115 | del train, test
116 | gc.collect()
117 | 
118 | data = data.sort_values(['deviceid', 'ts']).reset_index(drop=True)
119 | data['device_vendor'] = data['device_vendor'].apply(lambda x: x.strip().lower())
120 | 
121 | data.drop(['timestamp', 'lng', 'lat', 'osversion', 'guid'], axis=1, inplace=True)
122 | # input features for the LSTM branch (plus the time-gap channel)
123 | cate_feats = ['pos', 'newsid']
124 | cate2nunique = {}
125 | for cate in cate_feats:
126 |     data[cate] = LabelEncoder().fit_transform(data[cate])
127 |     cate2nunique[cate] = data[cate].nunique() + 1
128 | # DeepFM input features
129 | cate_feats_concat = ['netmodel', 'device_vendor', 'device_version', 'app_version'] + ['deviceid', 'newsid', 'pos']
130 | print(cate_feats_concat)
131 | cate_concat2nunique = {}
132 | for cate in cate_feats_concat:
133 |     data[cate] = LabelEncoder().fit_transform(data[cate])
134 |     cate_concat2nunique[cate] = data[cate].nunique() + 1
135 | 
136 | data = reduce_mem_usage(data)
137 | 
138 | # time gap to the previous record of the same device
139 | group = data.groupby('deviceid')['ts']
140 | data['gap'] = group.shift(0) - group.shift(1)
141 | del group
142 | gc.collect()
143 | 
144 | data['gap'] = data['gap'].fillna(data['gap'].mean())
145 | data['gap'] = np.log(data['gap'] + 1)
146 | scaler = StandardScaler()
147 | data[['gap']] = np.float16(scaler.fit_transform(data[['gap']]))
148 | 
149 | # window length before and after the current record (default 14)
150 | l = args.l
151 | timing_cols = []  # list of sequence feature columns
152 | len_pos = data['pos'].nunique() + 1
153 | print(f"pos:{data['pos'].unique()}")
154 | 
155 | # build the sequence features: for each offset in [-l, l], shift gap/pos/newsid
156 | for i in tqdm(range(l * 2 + 1)):
157 |     data['gap_%s' % (i - l)] = data['gap'].shift(i - l)
158 |     data['gap_%s' % (i - l)] = data['gap_%s' % (i - l)].fillna(0)
159 |     timing_cols += ['gap_%s' % (i - l)]
160 | 
161 |     for cate in cate_feats:
162 |         new_col = f'{cate}_{(i - l)}'
163 |         if cate in ['pos', 'newsid']:
164 |             data[new_col] = data[[cate]].shift(i - l).fillna(cate2nunique[cate] - 1)
165 |         else:
166 |             data[new_col] = data[cate]
167 |         timing_cols += [new_col]
168 | 
169 | data[timing_cols] = reduce_mem_usage(data[timing_cols])
170 | data = data.sort_values(['ts']).reset_index(drop=True)
171 | 
172 | train = data[data['isTest'] != 1]
173 | test = data[data['isTest'] == 1]
174 | del data
175 | train_data_use = np.array(train[timing_cols]).reshape(len(train), l * 2 + 1, len(cate_feats) + 1)  # LSTM input
176 | train_label = train['target'].values
177 | train_data_sideinfo = train[cate_feats_concat].values  # DeepFM input
178 | del train
179 | test_data_use = np.array(test[timing_cols]).reshape(len(test), l * 2 + 1, len(cate_feats) + 1)  # LSTM input
180 | test_data_sideinfo = test[cate_feats_concat].values  # DeepFM input
181 | 
182 | test = test[['id']]  # keep only the id column for the submission
183 | gc.collect()
184 | 
185 | # split train/validation 4:1
186 | train_size = int(len(train_data_use) * 0.8)
187 | # LSTM features for train / validation
188 | X_train, X_validate = train_data_use[:train_size], train_data_use[train_size:]
189 | label_train, label_validate = train_label[:train_size], train_label[train_size:]
190 | 
191 | # DeepFM features for train / validation
192 | X_train_side, X_validate_side = train_data_sideinfo[:train_size], train_data_sideinfo[train_size:]
193 | 
194 | sparse_features = cate_feats_concat
195 | dense_features = []
196 | 
197 | # build the model; see the deepctr DeepFM example for the feature-column API
198 | fixlen_feature_columns = [SparseFeat(feat, cate_concat2nunique[feat])
199 |                           for feat in sparse_features] + [DenseFeat(feat, 1) for feat in dense_features]
200 | 
201 | dnn_feature_columns = fixlen_feature_columns
202 | linear_feature_columns = fixlen_feature_columns
203 | model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', att=args.att, seq_len=l * 2 + 1,
204 |                cate_feats=cate_feats,
205 |                cate2nunique=cate2nunique)
206 | model.compile(loss=multi_category_focal_loss2(alpha=0.106535), optimizer='adam', metrics=['acc'])
207 | print(model.summary())
208 | 
209 | plateau = ReduceLROnPlateau(monitor="val_acc", verbose=1, mode='max', factor=0.1, patience=5)
210 | early_stopping = EarlyStopping(monitor='val_acc', patience=9, mode='max')
211 | train_model_input = {'lstm_input': X_train}
212 | val_model_input = {'lstm_input': X_validate}
213 | test_model_input = {'lstm_input': test_data_use}
214 | for i, cate in enumerate(cate_feats_concat):
215 |     train_model_input[cate] = X_train_side[:, i]
216 |     val_model_input[cate] = X_validate_side[:, i]
217 |     test_model_input[cate] = test_data_sideinfo[:, i]
218 | # start training; the Metrics callback writes a submission file for every epoch
219 | history = model.fit(train_model_input, label_train, epochs=50, batch_size=args.bs,
220 |                     verbose=2, shuffle=True,
221 |                     validation_data=(val_model_input, label_validate),
222 |                     callbacks=[early_stopping, plateau, Metrics()])
--------------------------------------------------------------------------------

/代码说明.txt:
--------------------------------------------------------------------------------
1 | Environment:
2 | 
3 | Python 3.6.8
4 | CentOS Linux release 7.2.1511 (Core), 20 GB RAM + 20 GB swap, 8 GB GPU
5 | 
6 | CUDA version:
7 | - 9.0.176
8 | 
9 | cuDNN version:
10 | - #define CUDNN_MAJOR 7
11 | - #define CUDNN_MINOR 6
12 | - #define CUDNN_PATCHLEVEL 3
13 | - #define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
14 | 
15 | Python dependencies: see packages.txt
16 | 
17 | Code:
18 | train_fm.py is the training script (data processing, feature engineering, model training,
19 |     and submission-file output; see the inline comments for details)
20 | deepfm.py builds the model; adapted from the deepctr framework, with an added LSTM module
21 | inputs.py is a utility module; adapted from the deepctr framework, with support for sharing
22 |     an Embedding across columns of the same name
--------------------------------------------------------------------------------

/奖杯.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouc16020021031/video-click-TOP1/b11346cb9334c8e3268f2cf54739fe5b4c80a88d/奖杯.jpg
--------------------------------------------------------------------------------

/证书.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouc16020021031/video-click-TOP1/b11346cb9334c8e3268f2cf54739fe5b4c80a88d/证书.jpg
--------------------------------------------------------------------------------
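
Usage note: a minimal sketch of how training is launched, inferred from the argparse flags
and the ../data/train.csv and ../data/test.csv paths hard-coded in train_fm.py; the flag
values below are the script's defaults, not values confirmed by the authors:

    # half-window l=14 (sequence length 29), batch size 1024, with the attention block
    python train_fm.py --l 14 --bs 1024 --att

    # plain LSTM head, no attention
    python train_fm.py --l 14 --bs 1024

Each epoch writes a submission file named sub_l{l}_{epoch}_bs{bs}_att{att}_{f1}.csv; per the
Metrics callback, the file with the best validation F1 is the one to submit.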