├── CNS-决赛答辩文档.pdf
├── README.md
├── deepfm.py
├── inputs.py
├── packages.txt
├── train_fm.py
├── 代码说明.txt
├── 奖杯.jpg
└── 证书.jpg
/CNS-决赛答辩文档.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouc16020021031/video-click-TOP1/b11346cb9334c8e3268f2cf54739fe5b4c80a88d/CNS-决赛答辩文档.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### Video Click Prediction Competition - TOP1 Solution
2 |
3 | [Video Click Prediction Competition](https://www.turingtopia.com/competitionnew/detail/e4880352b6ef4f9f8f28e8f98498dbc4/dataset)
4 |
5 | CNS team members:
6 |
7 | 沈琢乔, Ocean University of China, fourth-year undergraduate
8 |
9 | 朱锐, YOHO, algorithm engineer
10 |
--------------------------------------------------------------------------------
/deepfm.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | """
3 | Author:
4 | Weichen Shen,wcshen1994@163.com
5 |
6 | Reference:
7 | [1] Guo H, Tang R, Ye Y, et al. Deepfm: a factorization-machine based neural network for ctr prediction[J]. arXiv preprint arXiv:1703.04247, 2017.(https://arxiv.org/abs/1703.04247)
8 |
9 | """
10 |
11 | import tensorflow as tf
12 | from inputs import build_input_features, input_from_feature_columns, get_linear_logit, combined_dnn_input
13 | from deepctr.layers import FM, DNN, PredictionLayer
14 | from deepctr.layers.utils import concat_fun
15 | from tensorflow.python.keras import Input
16 | from tensorflow.python.keras import backend as K
17 | from tensorflow.python.keras.layers import Lambda, Concatenate, Embedding, LSTM, Permute, Dense, multiply
18 |
19 |
20 | def attention_3d_block(inputs, seq_len=21):
21 |     # Attention over the time axis: learn a softmax weight for each of the
22 |     # seq_len timesteps and rescale the LSTM outputs with it.
23 |     a = Permute((2, 1))(inputs)
24 |     a = Dense(seq_len, activation='softmax')(a)
25 |     a_probs = Permute((2, 1), name='attention_vec')(a)
26 |     output_attention_mul = multiply([inputs, a_probs], name='attention_mul')
27 |     return output_attention_mul
28 |
29 |
30 | def DeepFM(linear_feature_columns, dnn_feature_columns, embedding_size=8, use_fm=True, dnn_hidden_units=(128, 128),
31 | l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0,
32 | dnn_activation='relu', dnn_use_bn=False, task='binary', att=False, seq_len=None, cate_feats=[],
33 | cate2nunique={}):
34 | """Instantiates the DeepFM Network architecture.
35 |
36 | :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
37 | :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
38 | :param embedding_size: positive integer,sparse feature embedding_size
39 | :param use_fm: bool,use FM part or not
40 | :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN
41 | :param l2_reg_linear: float. L2 regularizer strength applied to linear part
42 | :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
43 | :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
44 | :param init_std: float,to use as the initialize std of embedding vector
45 | :param seed: integer ,to use as random seed.
46 | :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
47 | :param dnn_activation: Activation function to use in DNN
48 | :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN
49 |     :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
50 |     :param att: bool, whether to apply the attention block to the LSTM outputs
51 |     :param seq_len: int, length of the sequence fed to the LSTM input
52 |     :param cate_feats: list of str, categorical features embedded per timestep in the LSTM branch
53 |     :param cate2nunique: dict, feature name -> cardinality, used when no shared embedding exists
54 |     :return: A Keras model instance.
55 |     """
52 |
53 | features = build_input_features(linear_feature_columns + dnn_feature_columns)
54 |
55 | inputs_list = list(features.values())
56 |
57 | sparse_embedding_list, dense_value_list, embedding_dict = input_from_feature_columns(features, dnn_feature_columns,
58 | embedding_size,
59 | l2_reg_embedding, init_std,
60 | seed)
61 |
62 | linear_logit = get_linear_logit(features, linear_feature_columns, l2_reg=l2_reg_linear, init_std=init_std,
63 | seed=seed, prefix='linear')
64 |
65 | fm_input = concat_fun(sparse_embedding_list, axis=1)
66 | fm_logit = FM()(fm_input)
67 |
68 | dnn_input = combined_dnn_input(sparse_embedding_list, dense_value_list)
69 |
70 |     input_lstm = Input(shape=(seq_len, 1 + len(cate_feats)), name='lstm_input')
71 |     input_lstm_gap = Lambda(lambda x: x[:, :, 0:1])(input_lstm)  # time-gap channel
72 |     concate_list = [input_lstm_gap]
73 |     for i, cate in enumerate(cate_feats):
74 |         input_cate = Lambda(lambda x, idx=i + 1: x[:, :, idx])(input_lstm)  # bind idx now, avoiding the late-binding closure pitfall
75 |         emb = embedding_dict.get(cate)  # share the DeepFM embedding when one exists for this feature
76 |         if emb is None:
77 |             emb = Embedding(output_dim=8, input_dim=cate2nunique[cate])  # fallback embedding for sequence-only features
78 |         concate_list.append(emb(input_cate))
79 |     input_lstm_concat = Concatenate(axis=-1)(concate_list)
80 | if att:
81 | lstm_out = LSTM(units=128, return_sequences=True)(input_lstm_concat)
82 | attention_mul = attention_3d_block(lstm_out, seq_len)
83 | lstm_out = Lambda(lambda x: K.sum(x, axis=1))(attention_mul)
84 | else:
85 | lstm_out = LSTM(units=128, return_sequences=False)(input_lstm_concat)
86 |
87 | dnn_input = concat_fun([dnn_input, lstm_out])
88 | dnn_out = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
89 | dnn_use_bn, seed)(dnn_input)
90 | dnn_logit = tf.keras.layers.Dense(
91 | 1, use_bias=False, activation=None)(dnn_out)
92 |
93 |     if len(dnn_hidden_units) == 0 and not use_fm:  # only linear
94 |         final_logit = linear_logit
95 |     elif len(dnn_hidden_units) == 0 and use_fm:  # linear + FM
96 |         final_logit = tf.keras.layers.add([linear_logit, fm_logit])
97 |     elif len(dnn_hidden_units) > 0 and not use_fm:  # linear + Deep
98 |         final_logit = tf.keras.layers.add([linear_logit, dnn_logit])
99 |     elif len(dnn_hidden_units) > 0 and use_fm:  # linear + FM + Deep
100 |         final_logit = tf.keras.layers.add([linear_logit, fm_logit, dnn_logit])
101 |     else:
102 |         raise NotImplementedError
103 |
104 | output = PredictionLayer(task)(final_logit)
105 | model = tf.keras.models.Model(inputs=inputs_list + [input_lstm], outputs=output)
106 | return model
107 |
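108 | if __name__ == '__main__':
109 |     # Minimal smoke test: a sketch with hypothetical feature names and
110 |     # cardinalities (the real feature setup lives in train_fm.py).
111 |     from inputs import SparseFeat
112 |     feats = [SparseFeat('pos', 12), SparseFeat('newsid', 1000)]
113 |     model = DeepFM(feats, feats, task='binary', att=True, seq_len=21,
114 |                    cate_feats=['pos', 'newsid'],
115 |                    cate2nunique={'pos': 12, 'newsid': 1000})
116 |     model.summary()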
--------------------------------------------------------------------------------
/inputs.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | """
3 |
4 | Author:
5 | Weichen Shen,wcshen1994@163.com
6 |
7 | """
8 |
9 | from collections import OrderedDict, namedtuple
10 | from itertools import chain
11 |
12 | from tensorflow.python.keras.initializers import RandomNormal
13 | from tensorflow.python.keras.layers import Embedding, Input, Flatten
14 | from tensorflow.python.keras.regularizers import l2
15 |
16 | from deepctr.layers.sequence import SequencePoolingLayer
17 | from deepctr.layers.utils import Hash, concat_fun, Linear
18 |
19 |
20 | class SparseFeat(namedtuple('SparseFeat', ['name', 'dimension', 'use_hash', 'dtype', 'embedding_name', 'embedding'])):
21 | __slots__ = ()
22 |
23 | def __new__(cls, name, dimension, use_hash=False, dtype="int32", embedding_name=None, embedding=True):
24 | if embedding and embedding_name is None:
25 | embedding_name = name
26 | return super(SparseFeat, cls).__new__(cls, name, dimension, use_hash, dtype, embedding_name, embedding)
27 |
28 |
29 | class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype'])):
30 | __slots__ = ()
31 |
32 | def __new__(cls, name, dimension=1, dtype="float32"):
33 | return super(DenseFeat, cls).__new__(cls, name, dimension, dtype)
34 |
35 |
36 | class VarLenSparseFeat(namedtuple('VarLenFeat',
37 | ['name', 'dimension', 'maxlen', 'combiner', 'use_hash', 'dtype', 'embedding_name',
38 | 'embedding'])):
39 | __slots__ = ()
40 |
41 | def __new__(cls, name, dimension, maxlen, combiner="mean", use_hash=False, dtype="float32", embedding_name=None,
42 | embedding=True):
43 | if embedding_name is None:
44 | embedding_name = name
45 | return super(VarLenSparseFeat, cls).__new__(cls, name, dimension, maxlen, combiner, use_hash, dtype,
46 | embedding_name, embedding)
47 |
48 |
49 | def get_feature_names(feature_columns):
50 | features = build_input_features(feature_columns)
51 | return list(features.keys())
52 |
53 |
54 | def get_inputs_list(inputs):
55 | return list(chain(*list(map(lambda x: x.values(), filter(lambda x: x is not None, inputs)))))
56 |
57 |
58 | def build_input_features(feature_columns, mask_zero=True, prefix=''):
59 | input_features = OrderedDict()
60 | for fc in feature_columns:
61 | if isinstance(fc, SparseFeat):
62 | input_features[fc.name] = Input(
63 | shape=(1,), name=prefix + fc.name, dtype=fc.dtype)
64 | elif isinstance(fc, DenseFeat):
65 | input_features[fc.name] = Input(
66 | shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype)
67 | elif isinstance(fc, VarLenSparseFeat):
68 | input_features[fc.name] = Input(shape=(fc.maxlen,), name=prefix + fc.name,
69 | dtype=fc.dtype)
70 | if not mask_zero:
71 | input_features[fc.name + "_seq_length"] = Input(shape=(
72 | 1,), name=prefix + 'seq_length_' + fc.name)
73 | input_features[fc.name + "_seq_max_length"] = fc.maxlen
74 | else:
75 |             raise TypeError("Invalid feature column type, got %s" % type(fc))
76 |
77 | return input_features
78 |
79 |
80 | def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, embedding_size, init_std, seed, l2_reg,
81 | prefix='sparse_', seq_mask_zero=True):
82 |     if embedding_size == 'auto':
83 |         print("Notice: do not use embedding_size='auto' in models other than DCN")
84 | sparse_embedding = {feat.embedding_name: Embedding(feat.dimension, 6 * int(pow(feat.dimension, 0.25)),
85 | embeddings_initializer=RandomNormal(
86 | mean=0.0, stddev=init_std, seed=seed),
87 | embeddings_regularizer=l2(l2_reg),
88 | name=prefix + '_emb_' + feat.name) for feat in
89 | sparse_feature_columns}
90 | else:
91 |
92 | sparse_embedding = {feat.embedding_name: Embedding(feat.dimension, embedding_size,
93 | embeddings_initializer=RandomNormal(
94 | mean=0.0, stddev=init_std, seed=seed),
95 | embeddings_regularizer=l2(
96 | l2_reg),
97 | name=prefix + '_emb_' + feat.name) for feat in
98 | sparse_feature_columns}
99 |
100 | if varlen_sparse_feature_columns and len(varlen_sparse_feature_columns) > 0:
101 | for feat in varlen_sparse_feature_columns:
102 | # if feat.name not in sparse_embedding:
103 | if embedding_size == "auto":
104 | sparse_embedding[feat.embedding_name] = Embedding(feat.dimension, 6 * int(pow(feat.dimension, 0.25)),
105 | embeddings_initializer=RandomNormal(
106 | mean=0.0, stddev=init_std, seed=seed),
107 | embeddings_regularizer=l2(
108 | l2_reg),
109 | name=prefix + '_seq_emb_' + feat.name,
110 | mask_zero=seq_mask_zero)
111 |
112 | else:
113 | sparse_embedding[feat.embedding_name] = Embedding(feat.dimension, embedding_size,
114 | embeddings_initializer=RandomNormal(
115 | mean=0.0, stddev=init_std, seed=seed),
116 | embeddings_regularizer=l2(
117 | l2_reg),
118 | name=prefix + '_seq_emb_' + feat.name,
119 | mask_zero=seq_mask_zero)
120 | return sparse_embedding
121 |
122 |
123 | def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, return_feat_list=(), mask_feat_list=()):
124 | embedding_vec_list = []
125 | for fg in sparse_feature_columns:
126 | feat_name = fg.name
127 | if len(return_feat_list) == 0 or feat_name in return_feat_list:
128 | if fg.use_hash:
129 | lookup_idx = Hash(fg.dimension, mask_zero=(feat_name in mask_feat_list))(input_dict[feat_name])
130 | else:
131 | lookup_idx = input_dict[feat_name]
132 |
133 | embedding_vec_list.append(embedding_dict[feat_name](lookup_idx))
134 |
135 | return embedding_vec_list
136 |
137 |
138 | def create_embedding_matrix(feature_columns, l2_reg, init_std, seed, embedding_size, prefix="", seq_mask_zero=True):
139 | sparse_feature_columns = list(
140 | filter(lambda x: isinstance(x, SparseFeat) and x.embedding, feature_columns)) if feature_columns else []
141 | varlen_sparse_feature_columns = list(
142 | filter(lambda x: isinstance(x, VarLenSparseFeat) and x.embedding, feature_columns)) if feature_columns else []
143 | sparse_emb_dict = create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, embedding_size,
144 | init_std, seed,
145 | l2_reg, prefix=prefix + 'sparse', seq_mask_zero=seq_mask_zero)
146 | return sparse_emb_dict
147 |
148 |
149 | def get_linear_logit(features, feature_columns, units=1, l2_reg=0, init_std=0.0001, seed=1024, prefix='linear'):
150 | linear_emb_list = [
151 | input_from_feature_columns(features, feature_columns, 1, l2_reg, init_std, seed, prefix=prefix + str(i))[0] for
152 | i in range(units)]
153 |     _, dense_input_list, _ = input_from_feature_columns(features, feature_columns, 1, l2_reg, init_std, seed,
154 | prefix=prefix)
155 |
156 | linear_logit_list = []
157 | for i in range(units):
158 |
159 | if len(linear_emb_list[0]) > 0 and len(dense_input_list) > 0:
160 | sparse_input = concat_fun(linear_emb_list[i])
161 | dense_input = concat_fun(dense_input_list)
162 | linear_logit = Linear(l2_reg, mode=2)([sparse_input, dense_input])
163 | elif len(linear_emb_list[0]) > 0:
164 | sparse_input = concat_fun(linear_emb_list[i])
165 | linear_logit = Linear(l2_reg, mode=0)(sparse_input)
166 | elif len(dense_input_list) > 0:
167 | dense_input = concat_fun(dense_input_list)
168 | linear_logit = Linear(l2_reg, mode=1)(dense_input)
169 | else:
170 | raise NotImplementedError
171 | linear_logit_list.append(linear_logit)
172 |
173 | return concat_fun(linear_logit_list)
174 |
175 |
176 | def embedding_lookup(sparse_embedding_dict, sparse_input_dict, sparse_feature_columns, return_feat_list=(),
177 | mask_feat_list=()):
178 | embedding_vec_list = []
179 | for fc in sparse_feature_columns:
180 | feature_name = fc.name
181 | embedding_name = fc.embedding_name
182 |         if (len(return_feat_list) == 0 or feature_name in return_feat_list) and fc.embedding:  # parentheses matter: skip features declared with embedding=False
183 | if fc.use_hash:
184 | lookup_idx = Hash(fc.dimension, mask_zero=(feature_name in mask_feat_list))(
185 | sparse_input_dict[feature_name])
186 | else:
187 | lookup_idx = sparse_input_dict[feature_name]
188 |
189 | embedding_vec_list.append(sparse_embedding_dict[embedding_name](lookup_idx))
190 |
191 | return embedding_vec_list
192 |
193 |
194 | def varlen_embedding_lookup(embedding_dict, sequence_input_dict, varlen_sparse_feature_columns):
195 | varlen_embedding_vec_dict = {}
196 | for fc in varlen_sparse_feature_columns:
197 | feature_name = fc.name
198 | embedding_name = fc.embedding_name
199 | if fc.use_hash:
200 | lookup_idx = Hash(fc.dimension, mask_zero=True)(sequence_input_dict[feature_name])
201 | else:
202 | lookup_idx = sequence_input_dict[feature_name]
203 | varlen_embedding_vec_dict[feature_name] = embedding_dict[embedding_name](lookup_idx)
204 |
205 | return varlen_embedding_vec_dict
206 |
207 |
208 | def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_columns):
209 | pooling_vec_list = []
210 | for fc in varlen_sparse_feature_columns:
211 | feature_name = fc.name
212 | combiner = fc.combiner
213 | feature_length_name = feature_name + '_seq_length'
214 | if feature_length_name in features:
215 | vec = SequencePoolingLayer(combiner, supports_masking=False)(
216 | [embedding_dict[feature_name], features[feature_length_name]])
217 | else:
218 | vec = SequencePoolingLayer(combiner, supports_masking=True)(
219 | embedding_dict[feature_name])
220 | pooling_vec_list.append(vec)
221 | return pooling_vec_list
222 |
223 |
224 | def get_dense_input(features, feature_columns):
225 | dense_feature_columns = list(filter(lambda x: isinstance(x, DenseFeat), feature_columns)) if feature_columns else []
226 | dense_input_list = []
227 | for fc in dense_feature_columns:
228 | dense_input_list.append(features[fc.name])
229 | return dense_input_list
230 |
231 |
232 | def input_from_feature_columns(features, feature_columns, embedding_size, l2_reg, init_std, seed, prefix='',
233 | seq_mask_zero=True, support_dense=True):
234 | sparse_feature_columns = list(
235 | filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
236 | varlen_sparse_feature_columns = list(
237 | filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else []
238 |
239 | embedding_dict = create_embedding_matrix(feature_columns, l2_reg, init_std, seed, embedding_size, prefix=prefix,
240 | seq_mask_zero=seq_mask_zero)
241 | sparse_embedding_list = embedding_lookup(
242 | embedding_dict, features, sparse_feature_columns)
243 | dense_value_list = get_dense_input(features, feature_columns)
244 | if not support_dense and len(dense_value_list) > 0:
245 | raise ValueError("DenseFeat is not supported in dnn_feature_columns")
246 |
247 | sequence_embed_dict = varlen_embedding_lookup(embedding_dict, features, varlen_sparse_feature_columns)
248 | sequence_embed_list = get_varlen_pooling_list(sequence_embed_dict, features, varlen_sparse_feature_columns)
249 | sparse_embedding_list += sequence_embed_list
250 |
251 |     return sparse_embedding_list, dense_value_list, embedding_dict
252 |
253 |
254 | def combined_dnn_input(sparse_embedding_list, dense_value_list):
255 | if len(sparse_embedding_list) > 0 and len(dense_value_list) > 0:
256 | sparse_dnn_input = Flatten()(concat_fun(sparse_embedding_list))
257 | dense_dnn_input = Flatten()(concat_fun(dense_value_list))
258 | return concat_fun([sparse_dnn_input, dense_dnn_input])
259 | elif len(sparse_embedding_list) > 0:
260 | return Flatten()(concat_fun(sparse_embedding_list))
261 | elif len(dense_value_list) > 0:
262 | return Flatten()(concat_fun(dense_value_list))
263 | else:
264 | raise NotImplementedError
265 |
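266 | if __name__ == '__main__':
267 |     # Sanity-check sketch for the feature-column helpers, using hypothetical
268 |     # feature names (not part of the training pipeline).
269 |     cols = [SparseFeat('pos', 12), DenseFeat('gap', 1)]
270 |     print(get_feature_names(cols))  # ['pos', 'gap']
271 |     inputs = build_input_features(cols)
272 |     print({name: tensor.shape for name, tensor in inputs.items()})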
--------------------------------------------------------------------------------
/packages.txt:
--------------------------------------------------------------------------------
1 | absl-py==0.9.0
2 | astor==0.8.1
3 | bayesian-optimization==1.0.1
4 | boto==2.49.0
5 | boto3==1.10.43
6 | botocore==1.13.43
7 | catboost==0.20.1
8 | certifi==2019.11.28
9 | chardet==3.0.4
10 | colorama==0.4.3
11 | colorlog==4.0.2
12 | cycler==0.10.0
13 | docutils==0.15.2
14 | download==0.3.4
15 | gast==0.3.2
16 | gensim==3.8.1
17 | graphviz==0.13.2
18 | grpcio==1.26.0
19 | h5py==2.10.0
20 | idna==2.8
21 | jmespath==0.9.4
22 | joblib==0.14.1
23 | kashgari==0.2.6
24 | Keras==2.1.3
25 | Keras-Applications==1.0.8
26 | keras-gpt-2==0.11.1
27 | keras-layer-normalization==0.14.0
28 | keras-multi-head==0.22.0
29 | keras-pos-embd==0.11.0
30 | keras-position-wise-feed-forward==0.6.0
31 | Keras-Preprocessing==1.1.0
32 | keras-radam==0.15.0
33 | keras-self-attention==0.41.0
34 | keras-transformer==0.31.0
35 | kiwisolver==1.1.0
36 | lightgbm==2.2.3
37 | Markdown==3.1.1
38 | matplotlib==3.1.2
39 | numpy==1.18.0
40 | pandas==0.25.1
41 | Pillow==6.2.1
42 | plotly==4.4.1
43 | protobuf==3.11.1
44 | pyparsing==2.4.5
45 | python-dateutil==2.8.1
46 | pytz==2019.3
47 | PyYAML==5.2
48 | regex==2019.12.19
49 | requests==2.22.0
50 | retrying==1.3.3
51 | s3transfer==0.2.1
52 | scikit-learn==0.22
53 | scipy==1.4.0
54 | seaborn==0.9.0
55 | seqeval==0.0.12
56 | six==1.13.0
57 | sklearn==0.0
58 | smart-open==1.9.0
59 | tensorboard==1.10.0
60 | tensorflow-gpu==1.10.0
61 | termcolor==1.1.0
62 | tqdm==4.40.2
63 | urllib3==1.25.7
64 | Werkzeug==0.16.0
65 | xgboost==0.90
66 |
--------------------------------------------------------------------------------
/train_fm.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from inputs import SparseFeat, DenseFeat
4 | from deepfm import *
5 | import numpy as np
6 | import gc
7 | import keras
8 | import pandas as pd
9 | from sklearn.preprocessing import StandardScaler, LabelEncoder
10 | from tensorflow.python.keras.callbacks import ReduceLROnPlateau, EarlyStopping, Callback
11 | from tqdm import tqdm
12 | from sklearn.metrics import f1_score
13 | import tensorflow as tf
14 |
15 | config = tf.ConfigProto()
16 | config.gpu_options.allow_growth = True
17 | keras.backend.set_session(tf.Session(config=config))
18 |
19 | parser = argparse.ArgumentParser(description='nn')
20 | parser.add_argument('--l', type=int, default=14, help='half length of the sequence window')
21 | parser.add_argument('--bs', type=int, default=1024, help='batch size')
22 | parser.add_argument('--att', action='store_true', help='use attention over the LSTM outputs')
23 |
24 | args = parser.parse_args()
25 |
26 |
27 | def multi_category_focal_loss2(gamma=2., alpha=.25):
28 | """
29 | focal loss for multi category of multi label problem
30 | 适用于多分类或多标签问题的focal loss
31 | alpha控制真值y_true为1/0时的权重
32 | 1的权重为alpha, 0的权重为1-alpha
33 | 当你的模型欠拟合,学习存在困难时,可以尝试适用本函数作为loss
34 | 当模型过于激进(无论何时总是倾向于预测出1),尝试将alpha调小
35 | 当模型过于惰性(无论何时总是倾向于预测出0,或是某一个固定的常数,说明没有学到有效特征)
36 | 尝试将alpha调大,鼓励模型进行预测出1。
37 | Usage:
38 | model.compile(loss=[multi_category_focal_loss2(alpha=0.25, gamma=2)], metrics=["accuracy"], optimizer=adam)
39 | """
40 | epsilon = 1.e-7
41 | gamma = float(gamma)
42 | alpha = tf.constant(alpha, dtype=tf.float32)
43 |
44 | def multi_category_focal_loss2_fixed(y_true, y_pred):
45 | y_true = tf.cast(y_true, tf.float32)
46 | y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
47 |
48 | alpha_t = y_true * alpha + (tf.ones_like(y_true) - y_true) * (1 - alpha)
49 | y_t = tf.multiply(y_true, y_pred) + tf.multiply(1 - y_true, 1 - y_pred)
50 | ce = -tf.log(y_t)
51 | weight = tf.pow(tf.subtract(1., y_t), gamma)
52 | fl = tf.multiply(tf.multiply(weight, ce), alpha_t)
53 | loss = tf.reduce_mean(fl)
54 | return loss
55 |
56 | return multi_category_focal_loss2_fixed
57 |
58 |
59 | def reduce_mem_usage(df, verbose=True):
60 | numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
61 | start_mem = df.memory_usage().sum() / 1024 ** 2
62 | for col in df.columns:
63 | col_type = df[col].dtypes
64 | if col_type in numerics:
65 | c_min = df[col].min()
66 | c_max = df[col].max()
67 | if str(col_type)[:3] == 'int':
68 | if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
69 | df[col] = df[col].astype(np.int8)
70 | elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
71 | df[col] = df[col].astype(np.int16)
72 | elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
73 | df[col] = df[col].astype(np.int32)
74 | elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
75 | df[col] = df[col].astype(np.int64)
76 | else:
77 | if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
78 | df[col] = df[col].astype(np.float16)
79 | elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
80 | df[col] = df[col].astype(np.float32)
81 | else:
82 | df[col] = df[col].astype(np.float64)
83 | end_mem = df.memory_usage().sum() / 1024 ** 2
84 | if verbose:
85 |         print('Mem. usage decreased to {:5.2f} MB ({:.1f}% reduction)'.format(
86 |             end_mem, 100 * (start_mem - end_mem) / start_mem))
87 | return df
88 |
89 |
90 | class Metrics(Callback):
91 |
92 |     def on_epoch_end(self, epoch, logs=None):
93 |         # Compute F1 on the validation set.
94 |         val_targ = label_validate
95 |         val_pre = self.model.predict(val_model_input, batch_size=2048)
96 |         threshold_test = np.percentile(val_pre, 89.4)  # top ~10.6% of scores count as positive
97 |         val_pre = [1 if i > threshold_test else 0 for i in val_pre]
98 |         f1 = f1_score(val_targ, val_pre)
99 |         print(f'val f1: {f1}')
100 |
101 |         # Predict on the test set, binarize at the same percentile threshold, and write one submission file per epoch; pick the epoch with the best validation F1 to submit.
102 |         test['target'] = self.model.predict(test_model_input, verbose=0, batch_size=2048)
103 |         sub = test[['id', 'target']].copy()
104 |         threshold_test = np.percentile(sub['target'], 89.4)
105 |         sub['target'] = [1 if i > threshold_test else 0 for i in sub['target']]
106 |         sub.to_csv('sub_l{}_{:02d}_bs{}_att{}_{:.5f}.csv'.format(args.l, epoch + 1, args.bs, args.att, f1), index=False)
107 |
108 |         return
109 |
110 |
111 | train = pd.read_csv("../data/train.csv")
112 | train['isTest'] = 0
113 | test = pd.read_csv("../data/test.csv")
114 | test['isTest'] = 1
115 | data = train.append(test).reset_index(drop=True)
116 | del train, test
117 | gc.collect()
118 |
120 |
121 | data = data.sort_values(['deviceid', 'ts']).reset_index(drop=True)
122 | data['device_vendor'] = data['device_vendor'].apply(lambda x: x.strip().lower())
123 |
124 | data.drop(['timestamp', 'lng', 'lat', 'osversion', 'guid'], axis=1, inplace=True)
125 | # Input features for the LSTM branch (plus the gap feature)
126 | cate_feats = ['pos', 'newsid']
127 | cate2nunique = {}
128 | for cate in cate_feats:
129 | data[cate] = LabelEncoder().fit_transform(data[cate])
130 | cate2nunique[cate] = data[cate].nunique() + 1
131 | # Input features for the DeepFM part
132 | cate_feats_concat = ['netmodel', 'device_vendor', 'device_version', 'app_version'] + ['deviceid', 'newsid', 'pos']
133 | print(cate_feats_concat)
134 | cate_concat2nunique = {}
135 | for cate in cate_feats_concat:
136 | data[cate] = LabelEncoder().fit_transform(data[cate])
137 | cate_concat2nunique[cate] = data[cate].nunique() + 1
138 |
139 | data = reduce_mem_usage(data)
140 |
141 | group = data.groupby('deviceid')['ts']
142 | data['gap'] = group.shift(0) - group.shift(1)  # time since the device's previous record
143 | del group
144 | gc.collect()
145 |
146 | data['gap'] = data['gap'].fillna(data['gap'].mean())
147 | data['gap'] = np.log(data['gap'] + 1)
148 | scaler = StandardScaler()
149 | data[['gap']] = np.float16(scaler.fit_transform(data[['gap']]))
150 |
151 | # Half-window length before/after the current record (default 14)
152 | l = args.l
153 | timing_cols = []  # list of sequence feature column names
154 | len_pos = data['pos'].nunique() + 1
155 | print(f"pos:{data['pos'].unique()}")
156 |
157 | # Build sequence features: for each offset in [-l, l], shift gap/pos/newsid so every row carries a window of its neighbours (the shift runs over the globally sorted frame, so a window may cross device boundaries at the edges)
158 | for i in tqdm(range(l * 2 + 1)):
159 | data['gap_%s' % (i - l)] = data['gap'].shift(i - l)
160 | data['gap_%s' % (i - l)] = data['gap_%s' % (i - l)].fillna(0)
161 | timing_cols += ['gap_%s' % (i - l)]
162 |
163 | for cate in cate_feats:
164 | new_col = f'{cate}_{(i - l)}'
165 | if cate in ['pos', 'newsid']:
166 |             data[new_col] = data[cate].shift(i - l).fillna(cate2nunique[cate] - 1)
167 | else:
168 | data[new_col] = data[cate]
169 | timing_cols += [new_col]
170 |
171 | data[timing_cols] = reduce_mem_usage(data[timing_cols])
172 | data = data.sort_values(['ts']).reset_index(drop=True)
173 |
174 | train = data[data['isTest'] != 1]
175 | test = data[data['isTest'] == 1]
176 | del data
177 | train_data_use = np.array(train[timing_cols]).reshape(len(train), l * 2 + 1, len(cate_feats) + 1) # lstm input
178 | train_label = train['target'].values
179 | train_data_sideinfo = train[cate_feats_concat].values # deepfm input
180 | del train
181 | test_data_use = np.array(test[timing_cols]).reshape(len(test), l * 2 + 1, len(cate_feats) + 1) # lstm input
182 | test_data_sideinfo = test[cate_feats_concat].values # deepfm input
183 | # del test
184 |
185 | test = test[['id']]
186 | gc.collect()
187 |
188 | # Split train/validation 4:1 (time-ordered, since the data is sorted by ts)
189 | train_size = int(len(train_data_use) * 0.8)
190 | # LSTM features for the train / validation sets
191 | X_train, X_validate = train_data_use[:train_size], train_data_use[train_size:]
192 | label_train, label_validate = train_label[:train_size], train_label[train_size:]
193 |
194 | # DeepFM features for the train / validation sets
195 | X_train_side, X_validate_side = train_data_sideinfo[:train_size], train_data_sideinfo[train_size:]
196 |
197 | sparse_features = cate_feats_concat
198 | dense_features = []
199 |
200 | # Build the model (cf. the DeepFM example in deepctr)
201 | fixlen_feature_columns = [SparseFeat(feat, cate_concat2nunique[feat])
202 | for feat in sparse_features] + [DenseFeat(feat, 1) for feat in dense_features]
203 |
204 | dnn_feature_columns = fixlen_feature_columns
205 | linear_feature_columns = fixlen_feature_columns
206 | model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary', att=args.att, seq_len=l * 2 + 1,
207 | cate_feats=cate_feats,
208 | cate2nunique=cate2nunique)
209 | model.compile(loss=multi_category_focal_loss2(alpha=0.106535), optimizer='adam', metrics=['acc'])
210 | model.summary()
211 |
212 | plateau = ReduceLROnPlateau(monitor="val_acc", verbose=1, mode='max', factor=0.1, patience=5)
213 | early_stopping = EarlyStopping(monitor='val_acc', patience=9, mode='max')
214 | train_model_input = {'lstm_input': X_train}
215 | val_model_input = {'lstm_input': X_validate}
216 | test_model_input = {'lstm_input': test_data_use}
217 | for i, cate in enumerate(cate_feats_concat):
218 | train_model_input[cate] = X_train_side[:, i]
219 | val_model_input[cate] = X_validate_side[:, i]
220 | test_model_input[cate] = test_data_sideinfo[:, i]
221 | # Start training; the Metrics callback writes a submission file with the predictions of every epoch
222 | history = model.fit(train_model_input, label_train, epochs=50, batch_size=args.bs,
223 | verbose=2, shuffle=True,
224 | validation_data=(val_model_input, label_validate),
225 | callbacks=[early_stopping, plateau, Metrics()])
226 |
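227 | # Note: alpha=0.106535 in the focal loss and the 89.4th-percentile cutoff in the
228 | # Metrics callback encode the same assumption, namely that roughly 10.6% of
229 | # records are positives; if the class balance changes, both values presumably
230 | # need to be retuned together.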
--------------------------------------------------------------------------------
/代码说明.txt:
--------------------------------------------------------------------------------
1 | Environment:
2 |
3 | Python 3.6.8
4 | CentOS Linux release 7.2.1511 (Core); 20 GB RAM + 20 GB virtual memory; GPU with 8 GB memory
5 |
6 | CUDA Version:
7 | - 9.0.176
8 |
9 | cuDNN Version:
10 | - 7.6.3 (CUDNN_MAJOR 7, CUDNN_MINOR 6, CUDNN_PATCHLEVEL 3, as reported by cudnn.h)
14 |
15 | See packages.txt for the Python dependencies (e.g. pip install -r packages.txt).
16 |
17 |
18 |
19 |
20 |
21 | Code:
22 | train_fm.py is the training script (data processing, feature engineering, model training and submission-file output; detailed comments inside; see the example run command below)
23 | deepfm.py builds the model; modified from the deepctr framework, with an added LSTM module
24 | inputs.py is a utility module; modified from the deepctr framework to support sharing one embedding across columns with the same name
25 |
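26 | Example run (arguments as defined by the argparse parser in train_fm.py: --l is
27 | the half window length, --bs the batch size, --att enables the attention variant):
28 |
29 |     python train_fm.py --l 14 --bs 1024 --att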
--------------------------------------------------------------------------------
/奖杯.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouc16020021031/video-click-TOP1/b11346cb9334c8e3268f2cf54739fe5b4c80a88d/奖杯.jpg
--------------------------------------------------------------------------------
/证书.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ouc16020021031/video-click-TOP1/b11346cb9334c8e3268f2cf54739fe5b4c80a88d/证书.jpg
--------------------------------------------------------------------------------