├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── input
│   └── README.md
├── model.py
└── train.py

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
input
.idea

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Weichen Shen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ICME2019 & ByteDance Short Video Content Understanding and Recommendation Challenge

## Solution overview
- Features: raw features only; no multimedia content features are used. The feature fields are ['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel', 'music_id', 'did'].
- Model: a multi-task model based on a lightly modified xDeepFM (predicting the two tasks with separate models was not tested and might work better). A sketch of how the model is built follows at the end of this README.
- Result: track2 score 0.77094938716636, with finish = 0.70671501437 and like = 0.920829590357.

## Requirements

python 3.6
deepctr==0.9.2
tensorflow-gpu (or tensorflow)
pandas
scikit-learn

### Installing deepctr
- CPU version
```bash
$ pip install deepctr==0.9.2
```
- GPU version
First make sure `tensorflow-gpu` is installed locally, then run
```bash
$ pip install deepctr==0.9.2 --no-deps
```

## How to run
1. Download the track2 data and extract it into the `input` directory.
2. Set the `ONLINE_FLAG` variable in `train.py` for either offline validation or online submission, then run `train.py`.
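
The snippet below is a minimal sketch of how the two-task model in `model.py` is assembled and compiled. The vocabulary size here is a hypothetical placeholder; `train.py` derives the real ones from the data with `data[feat].nunique()`.

```python
from deepctr.feature_column import SparseFeat, DenseFeat
from model import xDeepFM_MTL

# Hypothetical vocabulary size, used only for illustration.
sparse_features = ['uid', 'user_city', 'item_id', 'author_id',
                   'item_city', 'channel', 'music_id', 'did']
feature_columns = [SparseFeat(feat, vocabulary_size=100000, embedding_dim=8)
                   for feat in sparse_features]
feature_columns += [DenseFeat('video_duration', 1)]

# The two sigmoid heads (p_finish, p_like) share the linear, CIN and DNN parts
# and are trained jointly with a weighted binary cross-entropy loss.
model = xDeepFM_MTL(feature_columns)
model.compile("adagrad", "binary_crossentropy", loss_weights=[1, 1])
model.summary()
```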

--------------------------------------------------------------------------------
/input/README.md:
--------------------------------------------------------------------------------
# Place the extracted data in this folder

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from deepctr.layers.core import DNN, PredictionLayer
from deepctr.layers.interaction import CIN
from deepctr.layers.utils import concat_func, add_func

from deepctr.feature_column import build_input_features, get_linear_logit, input_from_feature_columns


def xDeepFM_MTL(feature_columns, hidden_size=(256, 256), cin_layer_size=(256, 256),
                cin_split_half=True,
                task_net_size=(128,), l2_reg_linear=0.00001, l2_reg_embedding=0.00001,
                seed=1024, ):
    if len(task_net_size) < 1:
        raise ValueError('task_net_size must be at least one layer')

    # video_input = tf.keras.layers.Input((128,))
    # inputs_list.append(video_input)

    features = build_input_features(feature_columns)

    inputs_list = list(features.values())

    # Linear (wide) part, shared by both tasks.
    linear_logit = get_linear_logit(features, feature_columns, seed=seed, prefix='linear',
                                    l2_reg=l2_reg_linear)

    deep_emb_list, dense_value_list = input_from_feature_columns(features, feature_columns,
                                                                 l2_reg_embedding, seed)

    fm_input = concat_func(deep_emb_list, axis=1)

    # Shared DNN trunk on the flattened embeddings; dense_value_list is not
    # fed to the deep part in this variant.
    deep_input = tf.keras.layers.Flatten()(fm_input)
    deep_out = DNN(hidden_size)(deep_input)

    # Task-specific towers on top of the shared trunk.
    finish_out = DNN(task_net_size)(deep_out)
    finish_logit = tf.keras.layers.Dense(
        1, use_bias=False, activation=None)(finish_out)

    like_out = DNN(task_net_size)(deep_out)
    like_logit = tf.keras.layers.Dense(
        1, use_bias=False, activation=None)(like_out)

    finish_logit = add_func(
        [linear_logit, finish_logit])
    like_logit = add_func(
        [linear_logit, like_logit])

    # CIN (compressed interaction network) of xDeepFM, also shared by both tasks.
    if len(cin_layer_size) > 0:
        exFM_out = CIN(cin_layer_size, 'relu',
                       cin_split_half, seed)(fm_input)
        exFM_logit = tf.keras.layers.Dense(1, activation=None, )(exFM_out)
        finish_logit = add_func([finish_logit, exFM_logit])
        like_logit = add_func([like_logit, exFM_logit])

    output_finish = PredictionLayer('binary', name='p_finish')(finish_logit)
    output_like = PredictionLayer('binary', name='p_like')(like_logit)
    model = tf.keras.models.Model(inputs=inputs_list, outputs=[
        output_finish, output_like])
    return model

--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import pandas as pd
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score

from model import xDeepFM_MTL

ONLINE_FLAG = False
loss_weights = [1, 1, ]  # task weights; something like [0.7, 0.3] may be worth trying
VALIDATION_FRAC = 0.2  # fraction of the data used for offline validation

if __name__ == "__main__":
    data = pd.read_csv('./input/final_track2_train.txt', sep='\t', names=[
        'uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel', 'finish', 'like', 'music_id', 'did',
        'creat_time', 'video_duration'])
    if ONLINE_FLAG:
        test_data = pd.read_csv('./input/final_track2_test_no_anwser.txt', sep='\t', names=[
            'uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel', 'finish', 'like', 'music_id', 'did',
            'creat_time', 'video_duration'])
        train_size = data.shape[0]
        # DataFrame.append is removed in newer pandas; concatenate instead.
        data = pd.concat([data, test_data], ignore_index=True)
    else:
        train_size = int(data.shape[0] * (1 - VALIDATION_FRAC))

    sparse_features = ['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel',
                       'music_id', 'did', ]
    dense_features = ['video_duration']  # 'creat_time',

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )

    target = ['finish', 'like']

    # Ordinal-encode the sparse ids and scale the dense features to [0, 1].
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    sparse_feature_list = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=8)
                           for feat in sparse_features]
    dense_feature_list = [DenseFeat(feat, 1)
                          for feat in dense_features]

    feature_columns = sparse_feature_list + dense_feature_list
    feature_names = get_feature_names(feature_columns)

    train = data.iloc[:train_size]
    test = data.iloc[train_size:]

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    train_labels = [train[target[0]].values, train[target[1]].values]
    test_labels = [test[target[0]].values, test[target[1]].values]

    model = xDeepFM_MTL(feature_columns)
    model.compile("adagrad", "binary_crossentropy", loss_weights=loss_weights, )

    if ONLINE_FLAG:
        history = model.fit(train_model_input, train_labels,
                            batch_size=4096, epochs=1, verbose=1)
        pred_ans = model.predict(test_model_input, batch_size=2 ** 14)

    else:
        history = model.fit(train_model_input, train_labels,
                            batch_size=4096, epochs=1, verbose=1)
        pred_ans = model.predict(test_model_input, batch_size=2 ** 14)

        print("finish AUC", round(roc_auc_score(test_labels[0], pred_ans[0]), 4))
        print("finish LogLoss", round(log_loss(test_labels[0], pred_ans[0]), 4))

        print("like AUC", round(roc_auc_score(test_labels[1], pred_ans[1]), 4))
        print("like LogLoss", round(log_loss(test_labels[1], pred_ans[1]), 4))
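
    # When ONLINE_FLAG is True, the block below converts the predictions on the
    # online test set into the competition submission format: one row per
    # (uid, item_id) pair with its finish_probability and like_probability.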
    if ONLINE_FLAG:
        result = test_data[['uid', 'item_id', 'finish', 'like']].copy()
        result.rename(columns={'finish': 'finish_probability',
                               'like': 'like_probability'}, inplace=True)
        result['finish_probability'] = pred_ans[0]
        result['like_probability'] = pred_ans[1]
        result[['uid', 'item_id', 'finish_probability', 'like_probability']].to_csv(
            'result.csv', index=None, float_format='%.6f')
--------------------------------------------------------------------------------