├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── input
│   └── README.md
├── model.py
└── train.py

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
input
.idea

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Weichen Shen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ICME2019 & ByteDance Short Video Content Understanding and Recommendation Challenge

## Solution overview
- Features: raw features only; no multimedia content features are used. The feature fields are ['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel', 'music_id', 'did'].
- Model: a multi-task model based on a lightly modified xDeepFM (predicting the two tasks with separate models was not tested and might work better). A sketch of how the model is built follows at the end of this README.
- Result: track2 score 0.77094938716636, with finish = 0.70671501437 and like = 0.920829590357.

## Requirements

python 3.6
deepctr==0.9.2
tensorflow-gpu (or tensorflow)
pandas
scikit-learn

### Installing deepctr
- CPU version
```bash
$ pip install deepctr==0.9.2
```
- GPU version
First make sure `tensorflow-gpu` is installed locally, then run
```bash
$ pip install deepctr==0.9.2 --no-deps
```

## How to run
1. Download the track2 data and extract it into the `input` directory.
2. Set the `ONLINE_FLAG` variable in `train.py` for either offline validation or online submission, then run `train.py`.
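
The snippet below is a minimal sketch of how the two-task model in `model.py` is assembled and compiled. The vocabulary size here is a hypothetical placeholder; `train.py` derives the real ones from the data with `data[feat].nunique()`.

```python
from deepctr.feature_column import SparseFeat, DenseFeat
from model import xDeepFM_MTL

# Hypothetical vocabulary size, used only for illustration.
sparse_features = ['uid', 'user_city', 'item_id', 'author_id',
                   'item_city', 'channel', 'music_id', 'did']
feature_columns = [SparseFeat(feat, vocabulary_size=100000, embedding_dim=8)
                   for feat in sparse_features]
feature_columns += [DenseFeat('video_duration', 1)]

# The two sigmoid heads (p_finish, p_like) share the linear, CIN and DNN parts
# and are trained jointly with a weighted binary cross-entropy loss.
model = xDeepFM_MTL(feature_columns)
model.compile("adagrad", "binary_crossentropy", loss_weights=[1, 1])
model.summary()
```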

--------------------------------------------------------------------------------
/input/README.md:
--------------------------------------------------------------------------------
# Place the extracted data in this folder

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import tensorflow as tf
from deepctr.layers.core import DNN, PredictionLayer
from deepctr.layers.interaction import CIN
from deepctr.layers.utils import concat_func, add_func

from deepctr.feature_column import build_input_features, get_linear_logit, input_from_feature_columns


def xDeepFM_MTL(feature_columns, hidden_size=(256, 256), cin_layer_size=(256, 256),
                cin_split_half=True,
                task_net_size=(128,), l2_reg_linear=0.00001, l2_reg_embedding=0.00001,
                seed=1024, ):
    if len(task_net_size) < 1:
        raise ValueError('task_net_size must be at least one layer')

    # video_input = tf.keras.layers.Input((128,))
    # inputs_list.append(video_input)

    features = build_input_features(feature_columns)

    inputs_list = list(features.values())

    # Linear (wide) part, shared by both tasks.
    linear_logit = get_linear_logit(features, feature_columns, seed=seed, prefix='linear',
                                    l2_reg=l2_reg_linear)

    deep_emb_list, dense_value_list = input_from_feature_columns(features, feature_columns,
                                                                 l2_reg_embedding, seed)

    fm_input = concat_func(deep_emb_list, axis=1)

    # Shared DNN trunk on the flattened embeddings; dense_value_list is not
    # fed to the deep part in this variant.
    deep_input = tf.keras.layers.Flatten()(fm_input)
    deep_out = DNN(hidden_size)(deep_input)

    # Task-specific towers on top of the shared trunk.
    finish_out = DNN(task_net_size)(deep_out)
    finish_logit = tf.keras.layers.Dense(
        1, use_bias=False, activation=None)(finish_out)

    like_out = DNN(task_net_size)(deep_out)
    like_logit = tf.keras.layers.Dense(
        1, use_bias=False, activation=None)(like_out)

    finish_logit = add_func(
        [linear_logit, finish_logit])
    like_logit = add_func(
        [linear_logit, like_logit])

    # CIN (compressed interaction network) of xDeepFM, also shared by both tasks.
    if len(cin_layer_size) > 0:
        exFM_out = CIN(cin_layer_size, 'relu',
                       cin_split_half, seed)(fm_input)
        exFM_logit = tf.keras.layers.Dense(1, activation=None, )(exFM_out)
        finish_logit = add_func([finish_logit, exFM_logit])
        like_logit = add_func([like_logit, exFM_logit])

    output_finish = PredictionLayer('binary', name='p_finish')(finish_logit)
    output_like = PredictionLayer('binary', name='p_like')(like_logit)
    model = tf.keras.models.Model(inputs=inputs_list, outputs=[
        output_finish, output_like])
    return model

--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import pandas as pd
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score

from model import xDeepFM_MTL

ONLINE_FLAG = False
loss_weights = [1, 1, ]  # task weights; something like [0.7, 0.3] may be worth trying
VALIDATION_FRAC = 0.2  # fraction of the data used for offline validation

if __name__ == "__main__":
    data = pd.read_csv('./input/final_track2_train.txt', sep='\t', names=[
        'uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel', 'finish', 'like', 'music_id', 'did',
        'creat_time', 'video_duration'])
    if ONLINE_FLAG:
        test_data = pd.read_csv('./input/final_track2_test_no_anwser.txt', sep='\t', names=[
            'uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel', 'finish', 'like', 'music_id', 'did',
            'creat_time', 'video_duration'])
        train_size = data.shape[0]
        # DataFrame.append is removed in newer pandas; concatenate instead.
        data = pd.concat([data, test_data], ignore_index=True)
    else:
        train_size = int(data.shape[0] * (1 - VALIDATION_FRAC))

    sparse_features = ['uid', 'user_city', 'item_id', 'author_id', 'item_city', 'channel',
                       'music_id', 'did', ]
    dense_features = ['video_duration']  # 'creat_time',

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )

    target = ['finish', 'like']

    # Ordinal-encode the sparse ids and scale the dense features to [0, 1].
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    sparse_feature_list = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=8)
                           for feat in sparse_features]
    dense_feature_list = [DenseFeat(feat, 1)
                          for feat in dense_features]

    feature_columns = sparse_feature_list + dense_feature_list
    feature_names = get_feature_names(feature_columns)

    train = data.iloc[:train_size]
    test = data.iloc[train_size:]

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    train_labels = [train[target[0]].values, train[target[1]].values]
    test_labels = [test[target[0]].values, test[target[1]].values]

    model = xDeepFM_MTL(feature_columns)
    model.compile("adagrad", "binary_crossentropy", loss_weights=loss_weights, )

    if ONLINE_FLAG:
        history = model.fit(train_model_input, train_labels,
                            batch_size=4096, epochs=1, verbose=1)
        pred_ans = model.predict(test_model_input, batch_size=2 ** 14)

    else:
        history = model.fit(train_model_input, train_labels,
                            batch_size=4096, epochs=1, verbose=1)
        pred_ans = model.predict(test_model_input, batch_size=2 ** 14)

        print("finish AUC", round(roc_auc_score(test_labels[0], pred_ans[0]), 4))
        print("finish LogLoss", round(log_loss(test_labels[0], pred_ans[0]), 4))

        print("like AUC", round(roc_auc_score(test_labels[1], pred_ans[1]), 4))
        print("like LogLoss", round(log_loss(test_labels[1], pred_ans[1]), 4))
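
    # When ONLINE_FLAG is True, the block below converts the predictions on the
    # online test set into the competition submission format: one row per
    # (uid, item_id) pair with its finish_probability and like_probability.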
    if ONLINE_FLAG:
        result = test_data[['uid', 'item_id', 'finish', 'like']].copy()
        result.rename(columns={'finish': 'finish_probability',
                               'like': 'like_probability'}, inplace=True)
        result['finish_probability'] = pred_ans[0]
        result['like_probability'] = pred_ans[1]
        result[['uid', 'item_id', 'finish_probability', 'like_probability']].to_csv(
            'result.csv', index=None, float_format='%.6f')
--------------------------------------------------------------------------------