├── 1.data-preprocessing-and-feature-engineering.ipynb
├── 2.shared-bottom-DeepFM-example.ipynb
├── 3.MMoE-example.ipynb
├── 4.PLE-example.ipynb
├── 5.evaluation.ipynb
└── README.md
/1.data-preprocessing-and-feature-engineering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "BcNP6wnl8Yfq"
7 | },
8 | "source": [
9 | "# Data Preprocessing and Feature Engineering"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {
15 | "id": "kgpyTPSZdkxv"
16 | },
17 | "source": [
18 | "## Data Overview"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {
25 | "colab": {
26 | "base_uri": "https://localhost:8080/"
27 | },
28 | "id": "JEBQLW6V8t9F",
29 | "outputId": "1c10ecd1-55bc-46cd-8066-3c0d404fd86f"
30 | },
31 | "outputs": [
32 | {
33 | "name": "stdout",
34 | "output_type": "stream",
35 | "text": [
36 | "Archive: wechat_algo_data.zip\n",
37 | " creating: wechat_algo_data/\n",
38 | " inflating: wechat_algo_data/test_a.csv \n",
39 | " inflating: wechat_algo_data/feed_info.csv \n",
40 | " inflating: wechat_algo_data/feed_embeddings.csv \n",
41 | " inflating: wechat_algo_data/README.md \n",
42 | " inflating: wechat_algo_data/user_action.csv \n",
43 | " inflating: wechat_algo_data/submit_demo_初赛a.csv \n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "!unzip wechat_algo_data.zip"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 2,
54 | "metadata": {
55 | "id": "6XXpcyyLdsjL"
56 | },
57 | "outputs": [],
58 | "source": [
59 | "# Data processing libraries\n",
60 | "import numpy as np\n",
61 | "import pandas as pd"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "metadata": {
68 | "colab": {
69 | "base_uri": "https://localhost:8080/"
70 | },
71 | "id": "RjCNQCURdvc0",
72 | "outputId": "c2d4ff16-2bc3-46bb-9462-8b0b43bf6dfc"
73 | },
74 | "outputs": [
75 | {
76 | "name": "stdout",
77 | "output_type": "stream",
78 | "text": [
79 | "feed_embeddings.csv README.md\t\t test_a.csv\n",
80 | "feed_info.csv\t submit_demo_初赛a.csv user_action.csv\n"
81 | ]
82 | }
83 | ],
84 | "source": [
85 | "!ls wechat_algo_data"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 4,
91 | "metadata": {
92 | "colab": {
93 | "base_uri": "https://localhost:8080/",
94 | "height": 474
95 | },
96 | "id": "55sHVsRxeSnj",
97 | "outputId": "24681838-7a30-4026-f2a1-f4a7d53c4e0e"
98 | },
99 | "outputs": [
100 | {
101 | "data": {
197 | "text/plain": [
198 | " feedid ... asr_char\n",
199 | "0 43549 ... 2203 26439 6243 33054 16435 16307 17070 24908 ...\n",
200 | "1 77432 ... 7259 20851 5061 26207 17573 17531 15117 20072 ...\n",
201 | "2 12921 ... NaN\n",
202 | "\n",
203 | "[3 rows x 15 columns]"
204 | ]
205 | },
206 | "execution_count": 4,
207 | "metadata": {},
208 | "output_type": "execute_result"
209 | }
210 | ],
211 | "source": [
212 | "# Feed data\n",
213 | "feed_df = pd.read_csv(\"wechat_algo_data/feed_info.csv\")\n",
214 | "feed_df.head(3)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 5,
220 | "metadata": {
221 | "colab": {
222 | "base_uri": "https://localhost:8080/",
223 | "height": 143
224 | },
225 | "id": "4Bl0Wuz0evy8",
226 | "outputId": "abad5a6b-226f-4694-8c1f-0f57c835201b"
227 | },
228 | "outputs": [
229 | {
230 | "data": {
318 | "text/plain": [
319 | " userid feedid date_ device ... click_avatar forward follow favorite\n",
320 | "0 8 71474 1 1 ... 0 0 0 0\n",
321 | "1 8 73916 1 1 ... 0 0 0 0\n",
322 | "2 8 50282 1 1 ... 0 0 0 0\n",
323 | "\n",
324 | "[3 rows x 13 columns]"
325 | ]
326 | },
327 | "execution_count": 5,
328 | "metadata": {},
329 | "output_type": "execute_result"
330 | }
331 | ],
332 | "source": [
333 | "# User action data\n",
334 | "action_df = pd.read_csv('./wechat_algo_data/user_action.csv')\n",
335 | "action_df.head(3)"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 6,
341 | "metadata": {
342 | "colab": {
343 | "base_uri": "https://localhost:8080/",
344 | "height": 143
345 | },
346 | "id": "u4ZFXRojfAr4",
347 | "outputId": "a5593f42-641f-4ed5-f8d6-084ab83f0608"
348 | },
349 | "outputs": [
350 | {
351 | "data": {
399 | "text/plain": [
400 | " userid feedid device\n",
401 | "0 14298 67227 1\n",
402 | "1 68356 91864 2\n",
403 | "2 49925 104657 2"
404 | ]
405 | },
406 | "execution_count": 6,
407 | "metadata": {},
408 | "output_type": "execute_result"
409 | }
410 | ],
411 | "source": [
412 | "# Test data (to be predicted)\n",
413 | "test = pd.read_csv('./wechat_algo_data/test_a.csv')\n",
414 | "test.head(3)"
415 | ]
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "metadata": {
420 | "id": "s2pFscGhfOEq"
421 | },
422 | "source": [
423 | "## Data Merging and Preprocessing"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 7,
429 | "metadata": {
430 | "id": "FQkm-YYp9uE8"
431 | },
432 | "outputs": [],
433 | "source": [
434 | "import os\n",
435 | "import copy\n",
436 | "from tensorflow.python.keras.preprocessing.sequence import pad_sequences"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": 8,
442 | "metadata": {
443 | "id": "HP6u0ypb9y-v"
444 | },
445 | "outputs": [],
446 | "source": [
447 | "# Split list-type fields and map each key to an integer index\n",
448 | "def split(column):\n",
449 | " if not isinstance(column,str):\n",
450 | " return []\n",
451 | " keys = column.strip().split(';')\n",
452 | " for key in keys:\n",
453 | " if key not in key2index:\n",
454 | " key2index[key] = len(key2index) + 1\n",
455 | " return list(map(lambda x: key2index[x], keys))"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": 9,
461 | "metadata": {
462 | "id": "dSsqSI2p-FKb"
463 | },
464 | "outputs": [],
465 | "source": [
466 | "def preprocess(sample,dense_features):\n",
467 | " '''\n",
468 | "    Feature engineering: log-transform numerical features, shift id features by +1, and fill missing values with 0.\n",
469 | " '''\n",
470 | " sample[dense_features] = sample[dense_features].fillna(0.0)\n",
471 | " sample[dense_features] = np.log(sample[dense_features] + 1.0)\n",
472 | " \n",
473 | "    sample[[\"authorid\", \"bgm_song_id\", \"bgm_singer_id\"]] += 1 # 0 is reserved for unknown values\n",
474 | " sample[[\"authorid\", \"bgm_song_id\", \"bgm_singer_id\", \"videoplayseconds\"]] = sample[[\"authorid\", \"bgm_song_id\", \"bgm_singer_id\", \"videoplayseconds\"]].fillna(0)\n",
475 | " sample[\"videoplayseconds\"] = np.log(sample[\"videoplayseconds\"] + 1.0)\n",
476 | " sample[[\"authorid\", \"bgm_song_id\", \"bgm_singer_id\"]] = sample[[\"authorid\", \"bgm_song_id\", \"bgm_singer_id\"]].astype(int)\n",
477 | " return sample"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": 10,
483 | "metadata": {
484 | "id": "6_k0sj47-HwE"
485 | },
486 | "outputs": [],
487 | "source": [
488 | "# Merge data: append the test set (as day 15) to the action data\n",
489 | "test['date_'] = 15\n",
490 | "action_df = pd.concat([action_df,test])"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 11,
496 | "metadata": {
497 | "id": "5vfgLz46-kAz"
498 | },
499 | "outputs": [],
500 | "source": [
501 | "# Label columns\n",
502 | "target = [\"read_comment\", \"like\", \"click_avatar\", \"forward\"]\n",
503 | "# Sparse (categorical) features\n",
504 | "sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']\n",
505 | "# Variable-length sequence features\n",
506 | "varlen_features = ['manual_tag_list','manual_keyword_list']\n",
507 | "# Dense (numerical) features\n",
508 | "dense_features = ['videoplayseconds']"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": 12,
514 | "metadata": {
515 | "id": "aTkpJTAU--I3"
516 | },
517 | "outputs": [],
518 | "source": [
519 | "# Join feed features onto the action data\n",
520 | "feed_df = feed_df[['feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id','manual_tag_list','manual_keyword_list']]\n",
521 | "data = action_df.merge(feed_df, how='left',on='feedid') # join feed-side features (authorid, bgm ids, etc.) onto the action data\n",
522 | "data = preprocess(data,dense_features) # feature processing\n",
523 | "data = data[dense_features+sparse_features+varlen_features+['date_']+target]"
524 | ]
525 | },
526 | {
527 | "cell_type": "code",
528 | "execution_count": 13,
529 | "metadata": {
530 | "colab": {
531 | "base_uri": "https://localhost:8080/"
532 | },
533 | "id": "1n1xXawW_CtH",
534 | "outputId": "484f7fef-026f-4005-9d2a-38015d5ef26c"
535 | },
536 | "outputs": [
537 | {
538 | "name": "stdout",
539 | "output_type": "stream",
540 | "text": [
541 | "manual_keyword_list 字段最长的取值序列长度为 18\n",
542 | "manual_tag_list 字段最长的取值序列长度为 11\n"
543 | ]
544 | }
545 | ],
546 | "source": [
547 | "# Encode variable-length features\n",
548 | "encoder = {}\n",
549 | "global key2index\n",
550 | "for f in ['manual_keyword_list','manual_tag_list']:\n",
551 | " key2index = {}\n",
552 | " f_list = list(map(split, data[f].values))\n",
553 | " f_length = np.array(list(map(len, f_list)))\n",
554 | " max_len = max(f_length)\n",
555 |     "    print(f'{f} 字段最长的取值序列长度为 {max_len}')\n",
556 | " # Notice : padding=`post`\n",
557 | " data[f] = list(pad_sequences(f_list, maxlen=max_len, padding='post', ))\n",
558 | " encoder[f] = copy.copy(key2index)"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": 14,
564 | "metadata": {
565 | "colab": {
566 | "base_uri": "https://localhost:8080/"
567 | },
568 | "id": "4FL9zD0r_LQ2",
569 | "outputId": "74890c1b-f668-4cdd-c08c-8510af66eb30"
570 | },
571 | "outputs": [
572 | {
573 | "name": "stdout",
574 | "output_type": "stream",
575 | "text": [
576 | "编码ID字段:userid\n",
577 | "编码ID字段:feedid\n",
578 | "编码ID字段:authorid\n",
579 | "编码ID字段:bgm_song_id\n",
580 | "编码ID字段:bgm_singer_id\n"
581 | ]
582 | }
583 | ],
584 | "source": [
585 | "# Encode sparse ID features\n",
586 | "for featid in sparse_features:\n",
587 | " print(f\"编码ID字段:{featid}\")\n",
588 | " encoder[featid] = {uid:ucode+1 for ucode,uid in enumerate(data[featid].unique())} \n",
589 | " data[featid] = data[featid].apply(lambda x: encoder[featid].get(x,0))"
590 | ]
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": 15,
595 | "metadata": {
596 | "colab": {
597 | "base_uri": "https://localhost:8080/"
598 | },
599 | "id": "Vn_7SE6Z_O6u",
600 | "outputId": "ed6b594c-3b91-4c93-c3f0-2135f374aff8"
601 | },
602 | "outputs": [
603 | {
604 | "name": "stdout",
605 | "output_type": "stream",
606 | "text": [
607 | "数据维度: (7739867, 13)\n",
608 | "数据字段: ['videoplayseconds', 'userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id', 'manual_tag_list', 'manual_keyword_list', 'date_', 'read_comment', 'like', 'click_avatar', 'forward']\n",
609 | "不同的date_取值: [ 1 2 3 5 6 7 8 10 11 12 13 14 4 9 15]\n"
610 | ]
611 | }
612 | ],
613 | "source": [
614 | "print('数据维度:', data.shape)\n",
615 | "print('数据字段:', data.columns.tolist())\n",
616 | "print('不同的date_取值: ', data['date_'].unique())\n",
617 | "# If compute or memory is limited, sample a smaller fraction here (frac=1.0 only shuffles the rows)\n",
618 | "data = data.sample(frac = 1.0)"
619 | ]
620 | },
621 | {
622 | "cell_type": "code",
623 | "execution_count": 16,
624 | "metadata": {
625 | "colab": {
626 | "base_uri": "https://localhost:8080/"
627 | },
628 | "id": "FS41CUfzxlMe",
629 | "outputId": "4490cbb4-021c-46ba-94bd-0163d359a7e4"
630 | },
631 | "outputs": [
632 | {
633 | "name": "stdout",
634 | "output_type": "stream",
635 | "text": [
636 | "mkdir: cannot create directory ‘data_and_feature’: File exists\n"
637 | ]
638 | }
639 | ],
640 | "source": [
641 | "# Create the data_and_feature folder (it can also be created manually)\n",
642 | "!mkdir data_and_feature"
643 | ]
644 | },
645 | {
646 | "cell_type": "code",
647 | "execution_count": 17,
648 | "metadata": {
649 | "id": "rEn01L9K_a_N"
650 | },
651 | "outputs": [],
652 | "source": [
653 | "# Build the train, validation, and test sets\n",
654 | "# Day-14 samples are used as the validation set\n",
655 | "train = data[data['date_'] < 14].drop(['date_'],axis = 1)\n",
656 | "val = data[data['date_'] == 14].drop(['date_'],axis = 1) \n",
657 | "test = data[data['date_'] == 15].drop(['date_'],axis = 1)"
658 | ]
659 | },
660 | {
661 | "cell_type": "code",
662 | "execution_count": 18,
663 | "metadata": {
664 | "colab": {
665 | "base_uri": "https://localhost:8080/"
666 | },
667 | "id": "O0osmrVe5xVU",
668 | "outputId": "70f7991c-aa57-4be9-cee2-27033d8b15cf"
669 | },
670 | "outputs": [
671 | {
672 | "data": {
673 | "text/plain": [
674 | "0"
675 | ]
676 | },
677 | "execution_count": 18,
678 | "metadata": {},
679 | "output_type": "execute_result"
680 | }
681 | ],
682 | "source": [
683 | "import gc\n",
684 | "import joblib\n",
685 | "del action_df\n",
686 | "del feed_df\n",
687 | "del data\n",
688 | "gc.collect()"
689 | ]
690 | },
691 | {
692 | "cell_type": "code",
693 | "execution_count": null,
694 | "metadata": {
695 | "id": "DyhIAqXE58U3"
696 | },
697 | "outputs": [],
698 | "source": [
699 | "joblib.dump(train, './data_and_feature/train.txt')\n",
700 | "joblib.dump(val, './data_and_feature/val.txt')\n",
701 | "joblib.dump(test, './data_and_feature/test.txt')\n",
702 | "joblib.dump(encoder, './data_and_feature/encoder.txt')"
703 | ]
704 | }
705 | ],
706 | "metadata": {
707 | "accelerator": "GPU",
708 | "colab": {
709 | "collapsed_sections": [],
710 | "name": "1.data_preprocessing_and_feature_engineering.ipynb",
711 | "provenance": []
712 | },
713 | "kernelspec": {
714 | "display_name": "Python 3 (ipykernel)",
715 | "language": "python",
716 | "name": "python3"
717 | },
718 | "language_info": {
719 | "codemirror_mode": {
720 | "name": "ipython",
721 | "version": 3
722 | },
723 | "file_extension": ".py",
724 | "mimetype": "text/x-python",
725 | "name": "python",
726 | "nbconvert_exporter": "python",
727 | "pygments_lexer": "ipython3",
728 | "version": "3.9.5"
729 | },
730 | "toc": {
731 | "base_numbering": 1,
732 | "nav_menu": {},
733 | "number_sections": true,
734 | "sideBar": true,
735 | "skip_h1_title": false,
736 | "title_cell": "Table of Contents",
737 | "title_sidebar": "Contents",
738 | "toc_cell": false,
739 | "toc_position": {},
740 | "toc_section_display": true,
741 | "toc_window_display": false
742 | }
743 | },
744 | "nbformat": 4,
745 | "nbformat_minor": 1
746 | }
747 |
--------------------------------------------------------------------------------
/2.shared-bottom-DeepFM-example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "BcNP6wnl8Yfq"
7 | },
8 | "source": [
9 | "# DeepFM Modeling"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {
15 | "id": "s6srBeg382ee"
16 | },
17 | "source": [
18 | "## Load Data"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "id": "kloudYEG6Cws"
26 | },
27 | "outputs": [],
28 | "source": [
29 | "import joblib\n",
30 | "train = joblib.load('./data_and_feature/train.txt')\n",
31 | "val = joblib.load('./data_and_feature/val.txt')\n",
32 | "test = joblib.load('./data_and_feature/test.txt')\n",
33 | "encoder = joblib.load('./data_and_feature/encoder.txt')\n",
34 | "\n",
35 | "train_num = len(train)"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {
41 | "id": "TfVav1G3NM_Q"
42 | },
43 | "source": [
44 | "## Import Libraries"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 20,
50 | "metadata": {
51 | "id": "atU3210yKot0"
52 | },
53 | "outputs": [],
54 | "source": [
55 | "import numpy as np\n",
56 | "\n",
57 | "import gc\n",
58 | "import os\n",
59 | "import matplotlib.pyplot as plt\n",
60 | "\n",
61 | "import tensorflow as tf\n",
62 | "import tensorflow.keras.backend as K\n",
63 | "\n",
64 | "from tensorflow.keras.layers import *\n",
65 | "from tensorflow.python.keras.layers import Layer\n",
66 | "from tensorflow.keras import regularizers\n",
67 | "\n",
68 | "from tensorflow.keras.models import Model,load_model\n",
69 | "from tensorflow.keras.utils import plot_model\n",
70 | "from tensorflow.keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping\n",
71 | "\n",
72 | "from tensorflow.keras import optimizers,initializers\n",
73 | "from tensorflow.python.keras.initializers import glorot_normal"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {
79 | "id": "XlJgKlOlNTcV"
80 | },
81 | "source": [
82 | "## Build the DeepFM Model"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 21,
88 | "metadata": {
89 | "id": "6FunCHvVMzKq"
90 | },
91 | "outputs": [],
92 | "source": [
93 | "class MeanPoolLayer(Layer):\n",
94 | " def __init__(self, axis, **kwargs):\n",
95 | " super(MeanPoolLayer, self).__init__(**kwargs)\n",
96 | " self.axis = axis\n",
97 | "\n",
98 | " def call(self, x, mask):\n",
99 | " mask = tf.expand_dims(tf.cast(mask,tf.float32),axis = -1)\n",
100 | " x = x * mask\n",
101 | " return K.sum(x, axis=self.axis) / (K.sum(mask, axis=self.axis) + 1e-9)\n",
102 | "\n",
103 | "def secondary_fm(W):\n",
104 | "    # sum the embeddings, then square\n",
105 | " frs_part = Add()(W)\n",
106 | " frs_part = Multiply()([frs_part,frs_part]) \n",
107 | "    # square the embeddings, then sum\n",
108 | " scd_part = Add()([Multiply()([_x,_x]) for _x in W])\n",
109 | "    # subtract and multiply by 0.5\n",
110 | " fm_part = Subtract()([frs_part,scd_part])\n",
111 | " fm_part = Lambda(lambda x:K.sum(x,axis = 1,keepdims = True)*0.5)(fm_part)\n",
112 | " return fm_part\n",
113 | "\n",
114 | "\n",
115 | "def build_FM(sparse_cols,dense_cols,sparse_max_len,embed_dim = 16, \n",
116 | " dnn_hidden_units=(128, 128),varlens_cols = [],varlens_max_len = {},\n",
117 | " dropout = 0,embedding_reg_l2 = 1e-6,dnn_reg_l2 = 0.0):\n",
118 | " ''' \n",
119 | "    sparse_cols, dense_cols: names of the categorical and numerical features.\n",
120 | "    sparse_max_len: dict mapping each categorical feature to its vocabulary size.\n",
121 | "    varlens_cols: names of the variable-length categorical features.\n",
122 | "    varlens_max_len: dict mapping each variable-length feature to its vocabulary size.\n",
123 | " '''\n",
124 | " \n",
125 | "    # Inputs: sparse, variable-length, and dense parts.\n",
126 | " sparse_inputs = {f:Input([1],name = f) for f in sparse_cols}\n",
127 | " dense_inputs = {f:Input([1],name = f) for f in dense_cols}\n",
128 | " varlens_inputs = {f:Input([None,1],name = f) for f in varlens_cols}\n",
129 | " \n",
130 | " input_embed = {}\n",
131 | "    # Categorical features: embed to k dimensions to obtain the FM latent vectors w_i\n",
132 | " for f in sparse_cols:\n",
133 | " _input = sparse_inputs[f]\n",
134 | " embedding = Embedding(sparse_max_len[f], embed_dim, \n",
135 | " embeddings_regularizer=tf.keras.regularizers.l2(embedding_reg_l2)) \n",
136 | " input_embed[f] =Flatten()(embedding(_input)) #(bs,k)\n",
137 | " \n",
138 | "    # Multi-valued (variable-length) categorical features\n",
139 | " for f in varlens_inputs:\n",
140 | " _input = varlens_inputs[f]\n",
141 | " mask = Masking(mask_value = 0).compute_mask(_input)\n",
142 | " embedding = Embedding(varlens_max_len[f], embed_dim,\n",
143 | " embeddings_regularizer=tf.keras.regularizers.l2(1e-6))\n",
144 | " _embed =Reshape([-1,embed_dim])(embedding(_input))\n",
145 | " out_embed = MeanPoolLayer(axis=1)(_embed,mask)\n",
146 | " input_embed[f] = out_embed\n",
147 | " \n",
148 | "    # Numerical features\n",
149 | " for f in dense_inputs:\n",
150 | " _input = dense_inputs[f]\n",
151 | " _embed = Dense(embed_dim,use_bias = False,activation = 'linear')(_input)\n",
152 | " input_embed[f] = _embed\n",
153 | " \n",
154 | " feature_name = sparse_cols+varlens_cols+dense_cols\n",
155 | " fm_embed = [input_embed[f] for f in feature_name]\n",
156 | " fm_part = secondary_fm(fm_embed)\n",
157 | " \n",
158 | "    # Concatenate categorical and numerical embeddings into the DNN input\n",
159 | " dnn_feature = Concatenate(axis = -1)(fm_embed)\n",
160 | " for num in dnn_hidden_units:\n",
161 | " dnn_feature = Dropout(dropout)(Dense(num,activation='relu',\n",
162 | " kernel_regularizer=regularizers.l2(dnn_reg_l2))(dnn_feature))\n",
163 | " \n",
164 | " dnn_output = Dense(1,activation = 'linear', kernel_regularizer=regularizers.l2(dnn_reg_l2),\n",
165 | " use_bias = True)(dnn_feature)\n",
166 | " logits = Activation('sigmoid')(Add()([fm_part,dnn_output]))\n",
167 | " inputs = [sparse_inputs[f] for f in sparse_inputs]+[varlens_inputs[f] for f in varlens_inputs]\\\n",
168 | " +[dense_inputs[f] for f in dense_inputs]\n",
169 | " model = Model(inputs,logits) \n",
170 | " return model"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 22,
176 | "metadata": {
177 | "colab": {
178 | "base_uri": "https://localhost:8080/"
179 | },
180 | "id": "bRpBQkEcNDBs",
181 | "outputId": "2bc19606-f2a4-4e12-e7ed-038cb7343e1f"
182 | },
183 | "outputs": [
184 | {
185 | "name": "stdout",
186 | "output_type": "stream",
187 | "text": [
188 | "Epoch 1/4\n",
189 | "656/656 [==============================] - 23s 28ms/step - loss: 0.1056 - auc: 0.9169 - val_loss: 0.1022 - val_auc: 0.9197\n",
190 | "Epoch 2/4\n",
191 | "656/656 [==============================] - 17s 25ms/step - loss: 0.0992 - auc: 0.9353 - val_loss: 0.1023 - val_auc: 0.9224\n",
192 | "Epoch 3/4\n",
193 | "656/656 [==============================] - 16s 25ms/step - loss: 0.0990 - auc: 0.9376 - val_loss: 0.1023 - val_auc: 0.9248\n",
194 | "Epoch 4/4\n",
195 | "656/656 [==============================] - 17s 25ms/step - loss: 0.0986 - auc: 0.9394 - val_loss: 0.1022 - val_auc: 0.9253\n"
196 | ]
197 | }
198 | ],
199 | "source": [
200 | "# Features and labels\n",
201 | "target = [\"read_comment\", \"like\", \"click_avatar\", \"forward\"]\n",
202 | "sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']\n",
203 | "varlen_features = ['manual_tag_list','manual_keyword_list']\n",
204 | "dense_features = ['videoplayseconds']\n",
205 | "\n",
206 | "# Feature configuration\n",
207 | "sparse_max_len = {f:len(encoder[f]) + 1 for f in sparse_features}\n",
208 | "varlens_max_len = {f:len(encoder[f]) + 1 for f in varlen_features}\n",
209 | "feature_names = sparse_features+varlen_features+dense_features\n",
210 | "\n",
211 | "# Build model inputs\n",
212 | "train_model_input = {name: train[name] if name not in varlen_features else np.stack(train[name]) for name in feature_names } # model inputs: a dict mapping feature name to values\n",
213 | "val_model_input = {name: val[name] if name not in varlen_features else np.stack(val[name]) for name in feature_names }\n",
214 | "test_model_input = {name: test[name] if name not in varlen_features else np.stack(test[name]) for name in feature_names}\n",
215 | "\n",
216 | "train_labels = train['read_comment'].values\n",
217 | "val_labels = val['read_comment'].values\n",
218 | "\n",
219 | "# Drop data that is no longer needed to free memory\n",
220 | "del train,val \n",
221 | "gc.collect()\n",
222 | "\n",
223 | "model = build_FM(sparse_features,dense_features,sparse_max_len,embed_dim = 16, \n",
224 | " dnn_hidden_units=(64,64),varlens_cols = varlen_features,varlens_max_len = varlens_max_len,\n",
225 | " dropout = 0.1,embedding_reg_l2 = 1e-6,dnn_reg_l2 = 0.0)\n",
226 | "\n",
227 | "adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)\n",
228 | "model.compile(adam, loss = 'binary_crossentropy' ,metrics = [tf.keras.metrics.AUC()],)\n",
229 | "\n",
230 | "history = model.fit(train_model_input, train_labels,validation_data = (val_model_input,val_labels),\n",
231 | " batch_size=10240, epochs=4, verbose=1)"
232 | ]
233 | }
234 | ],
235 | "metadata": {
236 | "accelerator": "GPU",
237 | "colab": {
238 | "collapsed_sections": [],
239 | "name": "2.shared-bottom-DeepFM.ipynb",
240 | "provenance": []
241 | },
242 | "kernelspec": {
243 | "display_name": "Python 3 (ipykernel)",
244 | "language": "python",
245 | "name": "python3"
246 | },
247 | "language_info": {
248 | "codemirror_mode": {
249 | "name": "ipython",
250 | "version": 3
251 | },
252 | "file_extension": ".py",
253 | "mimetype": "text/x-python",
254 | "name": "python",
255 | "nbconvert_exporter": "python",
256 | "pygments_lexer": "ipython3",
257 | "version": "3.9.5"
258 | },
259 | "toc": {
260 | "base_numbering": 1,
261 | "nav_menu": {},
262 | "number_sections": true,
263 | "sideBar": true,
264 | "skip_h1_title": false,
265 | "title_cell": "Table of Contents",
266 | "title_sidebar": "Contents",
267 | "toc_cell": false,
268 | "toc_position": {},
269 | "toc_section_display": true,
270 | "toc_window_display": false
271 | }
272 | },
273 | "nbformat": 4,
274 | "nbformat_minor": 1
275 | }
276 |
--------------------------------------------------------------------------------
/3.MMoE-example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "3.MMoE-example.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | },
17 | "accelerator": "GPU"
18 | },
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "37Lzwxf-fDWT"
24 | },
25 | "source": [
26 | "## MMoE Multi-Task, Multi-Objective Modeling"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {
32 | "id": "s6srBeg382ee"
33 | },
34 | "source": [
35 | "## Load Data"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "metadata": {
41 | "id": "kloudYEG6Cws"
42 | },
43 | "source": [
44 | "import joblib\n",
45 | "train = joblib.load('./data_and_feature/train.txt')\n",
46 | "val = joblib.load('./data_and_feature/val.txt')\n",
47 | "test = joblib.load('./data_and_feature/test.txt')\n",
48 | "encoder = joblib.load('./data_and_feature/encoder.txt')\n",
49 | "\n",
50 | "train_num = len(train)"
51 | ],
52 | "execution_count": null,
53 | "outputs": []
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {
58 | "id": "TfVav1G3NM_Q"
59 | },
60 | "source": [
61 | "## Import Libraries"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "metadata": {
67 | "id": "atU3210yKot0"
68 | },
69 | "source": [
70 | "import numpy as np\n",
71 | "\n",
72 | "import gc\n",
73 | "import os\n",
74 | "import matplotlib.pyplot as plt\n",
75 | "\n",
76 | "import tensorflow as tf\n",
77 | "import tensorflow.keras.backend as K\n",
78 | "\n",
79 | "from tensorflow.keras.layers import *\n",
80 | "from tensorflow.python.keras.layers import Layer\n",
81 | "from tensorflow.keras import regularizers\n",
82 | "\n",
83 | "from tensorflow.keras.models import Model,load_model\n",
84 | "from tensorflow.keras.utils import plot_model\n",
85 | "from tensorflow.keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping\n",
86 | "\n",
87 | "from tensorflow.keras import optimizers,initializers\n",
88 | "from tensorflow.python.keras.initializers import glorot_normal"
89 | ],
90 | "execution_count": 19,
91 | "outputs": []
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {
96 | "id": "XlJgKlOlNTcV"
97 | },
98 | "source": [
99 | "## Build the MMoE Model"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "metadata": {
105 | "id": "6FunCHvVMzKq"
106 | },
107 | "source": [
108 | "class MeanPoolLayer(Layer):\n",
109 | " def __init__(self, axis, **kwargs):\n",
110 | " super(MeanPoolLayer, self).__init__(**kwargs)\n",
111 | " self.axis = axis\n",
112 | "\n",
113 | " def call(self, x, mask):\n",
114 | " mask = tf.expand_dims(tf.cast(mask,tf.float32),axis = -1)\n",
115 | " x = x * mask\n",
116 | " return K.sum(x, axis=self.axis) / (K.sum(mask, axis=self.axis) + 1e-9)\n",
117 | "\n",
118 | "class MmoeLayer(tf.keras.layers.Layer):\n",
119 | " def __init__(self,expert_dim,n_expert,n_task):\n",
120 | " super(MmoeLayer, self).__init__()\n",
121 | " self.n_task = n_task\n",
122 | " self.expert_layer = [Dense(expert_dim,activation = 'relu') for i in range(n_expert)]\n",
123 | " self.gate_layers = [Dense(n_expert,activation = 'softmax') for i in range(n_task)]\n",
124 | " \n",
125 | " def call(self,x):\n",
126 | "        # Expert networks\n",
127 | " E_net = [expert(x) for expert in self.expert_layer]\n",
128 | " E_net = Concatenate(axis = 1)([e[:,tf.newaxis,:] for e in E_net]) #(bs,n_expert,n_dims)\n",
129 | "        # Gate networks\n",
130 | "        gate_net = [gate(x) for gate in self.gate_layers] # n_task tensors of shape (bs,n_expert)\n",
131 | " \n",
132 | "        # Each tower is its gate's weights applied to all expert outputs.\n",
133 | " towers = []\n",
134 | " for i in range(self.n_task):\n",
135 | " g = tf.expand_dims(gate_net[i],axis = -1) #(bs,n_expert,1)\n",
136 | " _tower = tf.matmul(E_net, g,transpose_a=True)\n",
137 | " towers.append(Flatten()(_tower)) #(bs,expert_dim)\n",
138 | " \n",
139 | " return towers\n",
140 | "\n",
141 | "def build_mmoe(sparse_cols,dense_cols,sparse_max_len,embed_dim,expert_dim,\n",
142 | " varlens_cols,varlens_max_len,n_expert,n_task,target = [],\n",
143 | " dnn_hidden_units = (64,),dnn_reg_l2 = 1e-5,drop_rate = 0.1,\n",
144 | " embedding_reg_l2 = 1e-6):\n",
145 | " \n",
146 | " \n",
147 | "    # Inputs: sparse, variable-length, and dense parts.\n",
148 | " sparse_inputs = {f:Input([1],name = f) for f in sparse_cols}\n",
149 | " dense_inputs = {f:Input([1],name = f) for f in dense_cols}\n",
150 | " varlens_inputs = {f:Input([None,1],name = f) for f in varlens_cols}\n",
151 | " \n",
152 | " input_embed = {}\n",
153 | "    # Categorical features: embed to k dimensions\n",
154 | " for f in sparse_cols:\n",
155 | " _input = sparse_inputs[f]\n",
156 | " embedding = Embedding(sparse_max_len[f], embed_dim, \n",
157 | " embeddings_regularizer=tf.keras.regularizers.l2(embedding_reg_l2)) \n",
158 | " input_embed[f] =Flatten()(embedding(_input)) #(bs,k)\n",
159 | " \n",
160 | "    # Multi-valued (variable-length) categorical features\n",
161 | " for f in varlens_inputs:\n",
162 | " _input = varlens_inputs[f]\n",
163 | " mask = Masking(mask_value = 0).compute_mask(_input)\n",
164 | " embedding = Embedding(varlens_max_len[f], embed_dim,\n",
165 | " embeddings_regularizer=tf.keras.regularizers.l2(1e-6))\n",
166 | " _embed =Reshape([-1,embed_dim])(embedding(_input))\n",
167 | " out_embed = MeanPoolLayer(axis=1)(_embed,mask)\n",
168 | " input_embed[f] = out_embed\n",
169 | " \n",
170 | "    input_embed.update(dense_inputs) # add the numerical inputs\n",
171 | " input_embed = Concatenate(axis = -1)([input_embed[f] for f in input_embed])\n",
172 | " for num in dnn_hidden_units:\n",
173 | " input_embed = Dropout(drop_rate)(Dense(num,activation = 'relu',\n",
174 | " kernel_regularizer=regularizers.l2(dnn_reg_l2))(input_embed))\n",
175 | " \n",
176 | "    # MMoE layer\n",
177 | " towers = MmoeLayer(expert_dim,n_expert,n_task)(input_embed)\n",
178 | " outputs = [Dense(1,activation = 'sigmoid', kernel_regularizer=regularizers.l2(dnn_reg_l2),\n",
179 | " name = f,use_bias = True)(_t) for _t,f in zip(towers,target)]\n",
180 | " inputs = [sparse_inputs[f] for f in sparse_inputs]+[varlens_inputs[f] for f in varlens_inputs]\\\n",
181 | " +[dense_inputs[f] for f in dense_inputs]\n",
182 | " model = Model(inputs,outputs) \n",
183 | " return model"
184 | ],
185 | "execution_count": 20,
186 | "outputs": []
187 | },
188 | {
189 | "cell_type": "code",
190 | "metadata": {
191 | "id": "hSbZkTbuOzYp",
192 | "colab": {
193 | "base_uri": "https://localhost:8080/"
194 | },
195 | "outputId": "1b830a16-bde5-4647-e96d-00db9e46d9e1"
196 | },
197 | "source": [
198 | "target = [\"read_comment\", \"like\", \"click_avatar\", \"forward\"]\n",
199 | "sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']\n",
200 | "varlen_features = ['manual_tag_list','manual_keyword_list']\n",
201 | "dense_features = ['videoplayseconds']\n",
202 | "\n",
203 | "# Feature configuration\n",
204 | "sparse_max_len = {f:len(encoder[f]) + 1 for f in sparse_features}\n",
205 | "varlens_max_len = {f:len(encoder[f]) + 1 for f in varlen_features}\n",
206 | "feature_names = sparse_features+varlen_features+dense_features\n",
207 | "\n",
208 | "# Build model inputs\n",
209 | "train_model_input = {name: train[name] if name not in varlen_features else np.stack(train[name]) for name in feature_names } # model inputs: a dict mapping feature name to values\n",
210 | "val_model_input = {name: val[name] if name not in varlen_features else np.stack(val[name]) for name in feature_names }\n",
211 | "test_model_input = {name: test[name] if name not in varlen_features else np.stack(test[name]) for name in feature_names}\n",
212 | "\n",
213 | "train_labels = [train[y].values for y in target]\n",
214 | "val_labels = [val[y].values for y in target]\n",
215 | "\n",
216 | "# Drop data that is no longer needed to free memory\n",
217 | "del train,val\n",
218 | "gc.collect()\n",
219 | "\n",
220 | "# Build, train, and evaluate the model\n",
221 | "model = build_mmoe(sparse_features,dense_features,sparse_max_len,embed_dim = 16,expert_dim = 32,\n",
222 | " n_task = 4,n_expert = 4,varlens_cols = varlen_features,varlens_max_len = varlens_max_len,\n",
223 | " dnn_hidden_units = (64,64),target = target,dnn_reg_l2 = 1e-5,drop_rate = 0.1)\n",
224 | "\n",
225 | "adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)\n",
226 | "model.compile(adam, loss = 'binary_crossentropy' ,metrics = [tf.keras.metrics.AUC()],)\n",
227 | "\n",
228 | "history = model.fit(train_model_input, train_labels,validation_data = (val_model_input,val_labels),\n",
229 | " batch_size=10240, epochs=4, verbose=1)"
230 | ],
231 | "execution_count": 21,
232 | "outputs": [
233 | {
234 | "output_type": "stream",
235 | "name": "stdout",
236 | "text": [
237 | "Epoch 1/4\n",
238 | "656/656 [==============================] - 37s 46ms/step - loss: 0.2885 - read_comment_loss: 0.1024 - like_loss: 0.1022 - click_avatar_loss: 0.0443 - forward_loss: 0.0263 - read_comment_auc: 0.9066 - like_auc: 0.8086 - click_avatar_auc: 0.7547 - forward_auc: 0.7429 - val_loss: 0.2560 - val_read_comment_loss: 0.0938 - val_like_loss: 0.0907 - val_click_avatar_loss: 0.0369 - val_forward_loss: 0.0190 - val_read_comment_auc: 0.9195 - val_like_auc: 0.8290 - val_click_avatar_auc: 0.8163 - val_forward_auc: 0.7755\n",
239 | "Epoch 2/4\n",
240 | "656/656 [==============================] - 29s 44ms/step - loss: 0.2531 - read_comment_loss: 0.0908 - like_loss: 0.0901 - click_avatar_loss: 0.0358 - forward_loss: 0.0199 - read_comment_auc: 0.9337 - like_auc: 0.8581 - click_avatar_auc: 0.8376 - forward_auc: 0.8379 - val_loss: 0.2535 - val_read_comment_loss: 0.0931 - val_like_loss: 0.0894 - val_click_avatar_loss: 0.0364 - val_forward_loss: 0.0186 - val_read_comment_auc: 0.9245 - val_like_auc: 0.8338 - val_click_avatar_auc: 0.8166 - val_forward_auc: 0.7871\n",
241 | "Epoch 3/4\n",
242 | "656/656 [==============================] - 29s 44ms/step - loss: 0.2513 - read_comment_loss: 0.0898 - like_loss: 0.0892 - click_avatar_loss: 0.0351 - forward_loss: 0.0195 - read_comment_auc: 0.9363 - like_auc: 0.8632 - click_avatar_auc: 0.8490 - forward_auc: 0.8507 - val_loss: 0.2545 - val_read_comment_loss: 0.0933 - val_like_loss: 0.0894 - val_click_avatar_loss: 0.0364 - val_forward_loss: 0.0184 - val_read_comment_auc: 0.9241 - val_like_auc: 0.8369 - val_click_avatar_auc: 0.8237 - val_forward_auc: 0.8138\n",
243 | "Epoch 4/4\n",
244 | "656/656 [==============================] - 29s 44ms/step - loss: 0.2504 - read_comment_loss: 0.0892 - like_loss: 0.0886 - click_avatar_loss: 0.0347 - forward_loss: 0.0192 - read_comment_auc: 0.9376 - like_auc: 0.8661 - click_avatar_auc: 0.8543 - forward_auc: 0.8574 - val_loss: 0.2545 - val_read_comment_loss: 0.0928 - val_like_loss: 0.0894 - val_click_avatar_loss: 0.0363 - val_forward_loss: 0.0184 - val_read_comment_auc: 0.9241 - val_like_auc: 0.8370 - val_click_avatar_auc: 0.8223 - val_forward_auc: 0.8267\n"
245 | ]
246 | }
247 | ]
248 | }
260 | ]
261 | }
--------------------------------------------------------------------------------
/4.PLE-example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "4.PLE-example.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | },
17 | "accelerator": "GPU"
18 | },
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {
23 | "id": "BKaDNTCPZl4n"
24 | },
25 | "source": [
26 | "# PLE Multi-Task Learning"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {
32 | "id": "s6srBeg382ee"
33 | },
34 | "source": [
35 | "## Load Data"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "metadata": {
41 | "id": "kloudYEG6Cws"
42 | },
43 | "source": [
44 | "import joblib\n",
45 | "train = joblib.load('./data_and_feature/train.txt')\n",
46 | "val = joblib.load('./data_and_feature/val.txt')\n",
47 | "test = joblib.load('./data_and_feature/test.txt')\n",
48 | "encoder = joblib.load('./data_and_feature/encoder.txt')\n",
49 | "\n",
50 | "train_num = len(train)"
51 | ],
52 | "execution_count": null,
53 | "outputs": []
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {
58 | "id": "TfVav1G3NM_Q"
59 | },
60 | "source": [
61 | "## Import Libraries"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "metadata": {
67 | "id": "atU3210yKot0"
68 | },
69 | "source": [
70 | "import numpy as np\n",
71 | "\n",
72 | "import gc\n",
73 | "import os\n",
74 | "import matplotlib.pyplot as plt\n",
75 | "\n",
76 | "import tensorflow as tf\n",
77 | "import tensorflow.keras.backend as K\n",
78 | "\n",
79 | "from tensorflow.keras.layers import *\n",
80 | "from tensorflow.python.keras.layers import Layer\n",
81 | "from tensorflow.keras import regularizers\n",
82 | "\n",
83 | "from tensorflow.keras.models import Model,load_model\n",
84 | "from tensorflow.keras.utils import plot_model\n",
85 | "from tensorflow.keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping\n",
86 | "\n",
87 | "from tensorflow.keras import optimizers,initializers\n",
88 | "from tensorflow.python.keras.initializers import glorot_normal"
89 | ],
90 | "execution_count": 19,
91 | "outputs": []
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {
96 | "id": "XlJgKlOlNTcV"
97 | },
98 | "source": [
99 | "## Build the PLE Model"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "metadata": {
105 | "id": "6FunCHvVMzKq"
106 | },
107 | "source": [
108 | "class MeanPoolLayer(Layer):\n",
109 | " def __init__(self, axis, **kwargs):\n",
110 | " super(MeanPoolLayer, self).__init__(**kwargs)\n",
111 | " self.axis = axis\n",
112 | "\n",
113 | " def call(self, x, mask):\n",
114 | " mask = tf.expand_dims(tf.cast(mask,tf.float32),axis = -1)\n",
115 | " x = x * mask\n",
116 | " return K.sum(x, axis=self.axis) / (K.sum(mask, axis=self.axis) + 1e-9)\n",
117 | "\n",
118 | "class PleLayer(tf.keras.layers.Layer):\n",
119 | " '''\n",
120 | "    n_experts: list, number of task-specific experts per task, e.g. [2,3] means task 1 uses 2 experts and task 2 uses 3.\n",
121 | "    n_expert_share: int, number of shared experts.\n",
122 | "    expert_dim: int, output dimension of each expert network.\n",
123 | "    n_task: int, number of tasks.\n",
124 | " '''\n",
125 | " def __init__(self,n_task,n_experts,expert_dim,n_expert_share,dnn_reg_l2 = 1e-5):\n",
126 | " super(PleLayer, self).__init__()\n",
127 | " self.n_task = n_task\n",
128 | " \n",
129 | "        # Task-specific expert groups plus one group of shared experts.\n",
130 | " self.E_layer = []\n",
131 | " for i in range(n_task):\n",
132 | " sub_exp = [Dense(expert_dim,activation = 'relu') for j in range(n_experts[i])]\n",
133 | " self.E_layer.append(sub_exp)\n",
134 | " \n",
135 | " self.share_layer = [Dense(expert_dim,activation = 'relu') for j in range(n_expert_share)]\n",
136 | "        # Gate networks\n",
137 | " self.gate_layers = [Dense(n_expert_share+n_experts[i],kernel_regularizer=regularizers.l2(dnn_reg_l2),\n",
138 | " activation = 'softmax') for i in range(n_task)]\n",
139 | "\n",
140 | " def call(self,x):\n",
141 | "        # Task-specific and shared expert outputs\n",
142 | " E_net = [[expert(x) for expert in sub_expert] for sub_expert in self.E_layer]\n",
143 | " share_net = [expert(x) for expert in self.share_layer]\n",
144 | " \n",
145 | "        # Weight the task-specific and shared expert outputs by the gate.\n",
146 | " towers = []\n",
147 | " for i in range(self.n_task):\n",
148 | " g = self.gate_layers[i](x)\n",
149 | " g = tf.expand_dims(g,axis = -1) #(bs,n_expert_share+n_experts[i],1)\n",
150 | " _e = share_net+E_net[i] \n",
151 | " _e = Concatenate(axis = 1)([expert[:,tf.newaxis,:] for expert in _e]) #(bs,n_expert_share+n_experts[i],expert_dim)\n",
152 | " _tower = tf.matmul(_e, g,transpose_a=True)\n",
153 | " towers.append(Flatten()(_tower)) #(bs,expert_dim)\n",
154 | " return towers\n",
155 | "\n",
156 | "def build_ple(sparse_cols,dense_cols,sparse_max_len,embed_dim,expert_dim = 4,\n",
157 | " varlens_cols = [],varlens_max_len = [],dnn_hidden_units = (64,64),\n",
158 | " n_task = 2,n_experts = [2,2],n_expert_share = 4,dnn_reg_l2 = 1e-6,\n",
159 | " drop_rate = 0.0,embedding_reg_l2 = 1e-6,targets = []):\n",
160 | "\n",
161 | "    # Inputs: sparse, variable-length, and dense parts.\n",
162 | " sparse_inputs = {f:Input([1],name = f) for f in sparse_cols}\n",
163 | " dense_inputs = {f:Input([1],name = f) for f in dense_cols}\n",
164 | " varlens_inputs = {f:Input([None,1],name = f) for f in varlens_cols}\n",
165 | " \n",
166 | " input_embed = {}\n",
167 | "    # Categorical features: embed to k dimensions\n",
168 | " for f in sparse_cols:\n",
169 | " _input = sparse_inputs[f]\n",
170 | " embedding = Embedding(sparse_max_len[f], embed_dim, \n",
171 | " embeddings_regularizer=tf.keras.regularizers.l2(embedding_reg_l2)) \n",
172 | " input_embed[f] =Flatten()(embedding(_input)) #(bs,k)\n",
173 | " \n",
174 | "    # Multi-valued (variable-length) categorical features\n",
175 | " for f in varlens_inputs:\n",
176 | " _input = varlens_inputs[f]\n",
177 | " mask = Masking(mask_value = 0).compute_mask(_input)\n",
178 | " embedding = Embedding(varlens_max_len[f], embed_dim,\n",
179 | " embeddings_regularizer=tf.keras.regularizers.l2(1e-6))\n",
180 | " _embed =Reshape([-1,embed_dim])(embedding(_input))\n",
181 | " out_embed = MeanPoolLayer(axis=1)(_embed,mask)\n",
182 | " input_embed[f] = out_embed\n",
183 | " \n",
184 | "    input_embed.update(dense_inputs) # add the numerical inputs\n",
185 | " input_embed = Concatenate(axis = -1)([input_embed[f] for f in input_embed]) \n",
186 | " \n",
187 | " for num in dnn_hidden_units:\n",
188 | " input_embed = Dropout(drop_rate)(Dense(num,activation = 'relu',\n",
189 | " kernel_regularizer=regularizers.l2(dnn_reg_l2))(input_embed))\n",
190 | "    # PLE layer\n",
191 | " towers = PleLayer(n_task,n_experts,expert_dim,n_expert_share)(input_embed)\n",
192 | " outputs = [Dense(1,activation = 'sigmoid',kernel_regularizer=regularizers.l2(dnn_reg_l2),\n",
193 | " name = f,use_bias = True)(_t) for f,_t in zip(targets,towers)]\n",
194 | " inputs = [sparse_inputs[f] for f in sparse_inputs]+[varlens_inputs[f] for f in varlens_inputs]\\\n",
195 | " +[dense_inputs[f] for f in dense_inputs]\n",
196 | " model = Model(inputs,outputs) \n",
197 | " return model"
198 | ],
199 | "execution_count": 21,
200 | "outputs": []
201 | },
202 | {
203 | "cell_type": "code",
204 | "metadata": {
205 | "colab": {
206 | "base_uri": "https://localhost:8080/"
207 | },
208 | "id": "hSbZkTbuOzYp",
209 | "outputId": "c01c962f-05e2-47d4-dac2-c667066669a0"
210 | },
211 | "source": [
212 | "target = [\"read_comment\", \"like\", \"click_avatar\", \"forward\"]\n",
213 | "sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']\n",
214 | "varlen_features = ['manual_tag_list','manual_keyword_list']\n",
215 | "dense_features = ['videoplayseconds']\n",
216 | "\n",
217 | "# Feature configuration\n",
218 | "sparse_max_len = {f:len(encoder[f]) + 1 for f in sparse_features}\n",
219 | "varlens_max_len = {f:len(encoder[f]) + 1 for f in varlen_features}\n",
220 | "feature_names = sparse_features+varlen_features+dense_features\n",
221 | "\n",
222 | "# Build model inputs\n",
223 | "train_model_input = {name: train[name] if name not in varlen_features else np.stack(train[name]) for name in feature_names } # model inputs: a dict mapping feature name to values\n",
224 | "val_model_input = {name: val[name] if name not in varlen_features else np.stack(val[name]) for name in feature_names }\n",
225 | "test_model_input = {name: test[name] if name not in varlen_features else np.stack(test[name]) for name in feature_names}\n",
226 | "\n",
227 | "train_labels = [train[y].values for y in target]\n",
228 | "val_labels = [val[y].values for y in target]\n",
229 | "\n",
230 | "# Drop data that is no longer needed to free memory\n",
231 | "del train,val\n",
232 | "gc.collect()\n",
233 | "\n",
234 | "# Build, train, and evaluate the model\n",
235 | "model = build_ple(sparse_features,dense_features,sparse_max_len,embed_dim = 16,expert_dim = 32,\n",
236 | " varlens_cols = varlen_features,varlens_max_len = varlens_max_len,dnn_hidden_units = (64,),\n",
237 | " n_task = 4,n_experts = [4,4,4,4],n_expert_share = 8,dnn_reg_l2 = 1e-6,\n",
238 | " drop_rate = 0.1,embedding_reg_l2 = 1e-6,targets = target)\n",
239 | "\n",
240 | "adam = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)\n",
241 | "model.compile(adam, loss = 'binary_crossentropy' ,metrics = [tf.keras.metrics.AUC()],)\n",
242 | "\n",
243 | "history = model.fit(train_model_input, train_labels,validation_data = (val_model_input,val_labels),\n",
244 | " batch_size=10240, epochs=4, verbose=1)"
245 | ],
246 | "execution_count": 22,
247 | "outputs": [
248 | {
249 | "output_type": "stream",
250 | "name": "stdout",
251 | "text": [
252 | "Epoch 1/4\n",
253 | "656/656 [==============================] - 60s 78ms/step - loss: 0.2822 - read_comment_loss: 0.1019 - like_loss: 0.0989 - click_avatar_loss: 0.0423 - forward_loss: 0.0263 - read_comment_auc: 0.9084 - like_auc: 0.8201 - click_avatar_auc: 0.7767 - forward_auc: 0.7613 - val_loss: 0.2538 - val_read_comment_loss: 0.0936 - val_like_loss: 0.0905 - val_click_avatar_loss: 0.0364 - val_forward_loss: 0.0186 - val_read_comment_auc: 0.9194 - val_like_auc: 0.8285 - val_click_avatar_auc: 0.8236 - val_forward_auc: 0.7960\n",
254 | "Epoch 2/4\n",
255 | "656/656 [==============================] - 49s 75ms/step - loss: 0.2495 - read_comment_loss: 0.0899 - like_loss: 0.0892 - click_avatar_loss: 0.0352 - forward_loss: 0.0193 - read_comment_auc: 0.9359 - like_auc: 0.8629 - click_avatar_auc: 0.8484 - forward_auc: 0.8568 - val_loss: 0.2522 - val_read_comment_loss: 0.0924 - val_like_loss: 0.0892 - val_click_avatar_loss: 0.0368 - val_forward_loss: 0.0183 - val_read_comment_auc: 0.9238 - val_like_auc: 0.8365 - val_click_avatar_auc: 0.8240 - val_forward_auc: 0.8120\n",
256 | "Epoch 3/4\n",
257 | "656/656 [==============================] - 49s 74ms/step - loss: 0.2475 - read_comment_loss: 0.0887 - like_loss: 0.0883 - click_avatar_loss: 0.0346 - forward_loss: 0.0190 - read_comment_auc: 0.9389 - like_auc: 0.8675 - click_avatar_auc: 0.8567 - forward_auc: 0.8675 - val_loss: 0.2523 - val_read_comment_loss: 0.0923 - val_like_loss: 0.0891 - val_click_avatar_loss: 0.0362 - val_forward_loss: 0.0182 - val_read_comment_auc: 0.9244 - val_like_auc: 0.8361 - val_click_avatar_auc: 0.8255 - val_forward_auc: 0.8159\n",
258 | "Epoch 4/4\n",
259 | "656/656 [==============================] - 49s 74ms/step - loss: 0.2466 - read_comment_loss: 0.0881 - like_loss: 0.0878 - click_avatar_loss: 0.0342 - forward_loss: 0.0187 - read_comment_auc: 0.9402 - like_auc: 0.8702 - click_avatar_auc: 0.8622 - forward_auc: 0.8740 - val_loss: 0.2522 - val_read_comment_loss: 0.0926 - val_like_loss: 0.0886 - val_click_avatar_loss: 0.0361 - val_forward_loss: 0.0182 - val_read_comment_auc: 0.9245 - val_like_auc: 0.8390 - val_click_avatar_auc: 0.8202 - val_forward_auc: 0.8220\n"
260 | ]
261 | }
262 | ]
263 | }
264 | ]
265 | }
--------------------------------------------------------------------------------
/5.evaluation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "7926bb67",
6 | "metadata": {},
7 | "source": [
8 | "# Weighted uAUC Evaluation"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "9d0b9069",
14 | "metadata": {},
15 | "source": [
16 | "## Import Libraries"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "id": "b3bddeaf",
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "import time\n",
27 | "import numpy as np\n",
28 | "from collections import defaultdict\n",
29 | "from sklearn.metrics import roc_auc_score"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "id": "2d5bad5f",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "def uAUC(labels, preds, user_id_list):\n",
40 | " \"\"\"Calculate user AUC\"\"\"\n",
41 | " user_pred = defaultdict(lambda: [])\n",
42 | " user_truth = defaultdict(lambda: [])\n",
43 | " for idx, truth in enumerate(labels):\n",
44 | " user_id = user_id_list[idx]\n",
45 | " pred = preds[idx]\n",
47 | " user_pred[user_id].append(pred)\n",
48 | " user_truth[user_id].append(truth)\n",
49 | "\n",
50 | " user_flag = defaultdict(lambda: False)\n",
51 | " for user_id in set(user_id_list):\n",
52 | " truths = user_truth[user_id]\n",
53 | " flag = False\n",
54 | "        # flag stays False if the user's labels are all positive or all negative\n",
55 | " for i in range(len(truths) - 1):\n",
56 | " if truths[i] != truths[i + 1]:\n",
57 | " flag = True\n",
58 | " break\n",
59 | " user_flag[user_id] = flag\n",
60 | "\n",
61 | " total_auc = 0.0\n",
62 | " size = 0.0\n",
63 | " for user_id in user_flag:\n",
64 | " if user_flag[user_id]:\n",
65 | " auc = roc_auc_score(np.asarray(user_truth[user_id]), np.asarray(user_pred[user_id]))\n",
66 | " total_auc += auc \n",
67 | " size += 1.0\n",
68 | " user_auc = float(total_auc)/size\n",
69 | " return user_auc"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 3,
75 | "id": "b061bf71",
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "def compute_weighted_score(score_dict, weight_dict):\n",
80 | "    '''Compute the weighted uAUC from per-behavior uAUC values\n",
81 | "    Input:\n",
82 | "        score_dict: dict mapping each behavior to its uAUC value\n",
83 | "        weight_dict: dict mapping each behavior to its weight\n",
84 | "    Output:\n",
85 | "        score: weighted uAUC, float\n",
86 | " '''\n",
87 | " score = 0.0\n",
88 | " weight_sum = 0.0\n",
89 | " for action in score_dict:\n",
90 | " weight = float(weight_dict[action])\n",
91 | " score += weight*score_dict[action]\n",
92 | " weight_sum += weight\n",
93 | " score /= float(weight_sum)\n",
94 | " score = round(score, 6)\n",
95 | " return score"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 4,
101 | "id": "87692a86",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "def evaluate_deepctr(val_labels,val_pred_ans,userid_list):\n",
106 | " eval_dict = {}\n",
107 | " target = [\"read_comment\", \"like\", \"click_avatar\", \"forward\"]\n",
108 | " for i, action in enumerate(target):\n",
109 | " eval_dict[action] = uAUC(val_labels[i], val_pred_ans[i], userid_list)\n",
110 | " print(eval_dict)\n",
111 | " weight_dict = {\"read_comment\": 4, \"like\": 3, \"click_avatar\": 2, \"favorite\": 1, \"forward\": 1,\n",
112 | " \"comment\": 1, \"follow\": 1}\n",
113 | " weight_auc = compute_weighted_score(eval_dict, weight_dict)\n",
114 | " print(\"Weighted uAUC: \", weight_auc)\n",
115 | " return weight_auc"
116 | ]
117 | }
118 | ],
119 | "metadata": {
120 | "kernelspec": {
121 | "display_name": "Python 3 (ipykernel)",
122 | "language": "python",
123 | "name": "python3"
124 | },
125 | "language_info": {
126 | "codemirror_mode": {
127 | "name": "ipython",
128 | "version": 3
129 | },
130 | "file_extension": ".py",
131 | "mimetype": "text/x-python",
132 | "name": "python",
133 | "nbconvert_exporter": "python",
134 | "pygments_lexer": "ipython3",
135 | "version": "3.9.5"
136 | },
137 | "toc": {
138 | "base_numbering": 1,
139 | "nav_menu": {},
140 | "number_sections": true,
141 | "sideBar": true,
142 | "skip_h1_title": false,
143 | "title_cell": "Table of Contents",
144 | "title_sidebar": "Contents",
145 | "toc_cell": false,
146 | "toc_position": {},
147 | "toc_section_display": true,
148 | "toc_window_display": false
149 | }
150 | },
151 | "nbformat": 4,
152 | "nbformat_minor": 5
153 | }
154 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Multi-Task Learning with TensorFlow in Practice
2 |
3 | ## Overview
4 | This repository contains implementations of several multi-task / multi-objective learning models (Share-Bottom, MMoE, PLE, etc.), using the dataset of the 2021 WeChat Big Data Challenge (《微信大数据挑战赛》).
5 |
6 | **[韩信子](https://github.com/HanXinzi-AI)@[ShowMeAI](http://www.showmeai.tech/)**
7 |
8 | For a summary and walkthrough of **multi-objective learning** methods covering both **network architecture design** and **optimization strategies**, see the blog post [多目标优化及应用(含代码实现)]. To obtain the complete WeChat Channels multi-objective learning dataset, follow the WeChat official account 「AI算法研究所」 for the download link.
9 |
10 | ## WeChat Channels Recommendation Dataset
11 |
12 | 
13 |
14 | For a given set of users who visited the "Hot Recommendations" (热门推荐) section of WeChat Channels, the dataset was built by collecting and organizing these users' (anonymized) in-app behavior over the previous n days. The task is to predict, on the test set, the probability that each user performs various interactions (including **like, click avatar, favorite, forward**, etc.) with different video content. The business metric is the multi-objective weighted uAUC.
15 |
16 | ### Dataset and Field Descriptions
17 |
18 | The dataset consists of a training set and a test set: the training set is used to train models and the test set to evaluate them, while a demo submission file illustrates the expected submission format. All data files are .csv files with a header row, with columns separated by commas.
19 |
20 | #### Training Set
21 |
22 | **(1) Feed information table**
23 |
24 | This table contains basic information about each video (referred to as a feed below) together with multi-modal features from its text, audio, and visual content. Fields:
25 |
26 | | Field                | Type   | Description                                                   | Notes                    |
27 | | -------------------- | ------ | ------------------------------------------------------------- | ------------------------ |
28 | | feedid               | String | Feed (video) ID                                               | Anonymized               |
29 | | authorid             | String | Author ID of the feed                                         | Anonymized               |
30 | | videoplayseconds     | Int    | Feed duration                                                 | Unit: seconds            |
31 | | description          | String | Feed caption, word-level tokens separated by spaces           | Anonymized; may be empty |
32 | | ocr                  | String | OCR text, word-level tokens separated by spaces               | Anonymized; may be empty |
33 | | asr                  | String | ASR text, word-level tokens separated by spaces               | Anonymized; may be empty |
34 | | description_char     | String | Feed caption, character-level tokens separated by spaces      | Anonymized; may be empty |
35 | | ocr_char             | String | OCR text, character-level tokens separated by spaces          | Anonymized; may be empty |
36 | | asr_char             | String | ASR text, character-level tokens separated by spaces          | Anonymized; may be empty |
37 | | bgm_song_id          | Int    | Background music ID                                           | Anonymized; may be empty |
38 | | bgm_singer_id        | Int    | Background music singer ID                                    | Anonymized; may be empty |
39 | | manual_keyword_list  | String | Manually annotated keywords, separated by semicolons ";"      | Anonymized; may be empty |
40 | | machine_keyword_list | String | Machine-annotated keywords, separated by semicolons ";"       | Anonymized; may be empty |
41 | | manual_tag_list      | String | Manually annotated category tags, separated by semicolons ";" | Anonymized; may be empty |
42 | | machine_tag_list     | String | Machine-annotated category tags, separated by semicolons ";"  | Anonymized; may be empty |
43 | | feed_embedding       | String | Multi-modal content embedding fusing OCR, ASR, image, and text | 512-dimensional vector   |
44 |
45 | _Notes_
46 |
47 | - All feeds involved in the training and test sets are covered by this table.
48 | - The description, ocr, and asr fields are obtained from the raw text by word-level tokenization (space separated) and anonymization. For example, the text “我参加了中国高校计算机大赛” becomes something like “2 32 100 25 12 89 27” (this is only an illustration, not an actual anonymization result). Character-level, space-separated, anonymized versions are also provided as description_char, ocr_char, and asr_char.
49 | - Compared with manual_tag_list, machine_tag_list additionally carries a predicted probability (in [0, 1]) for each tag; the anonymized tag and its probability are separated by a space, e.g. “1025 0.32657512;2034 0.87653981;35 0.47265462” (see the parsing sketch after this list).
50 | - manual_keyword_list and machine_keyword_list share the same anonymization mapping: if the two fields originally contained the same keyword, they contain the same id after anonymization.
51 | - manual_tag_list and machine_tag_list share the same anonymization mapping: if the two fields originally contained the same category tag, they contain the same id after anonymization.
52 | - feed_embedding is a String of 512 values separated by spaces.
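
The semicolon- and space-encoded fields above can be parsed with a few lines of Python. The helpers below are a minimal sketch of ours (names such as `parse_machine_tags` are not part of the dataset toolkit):

```python
import numpy as np

def parse_machine_tags(s):
    """Parse 'tag prob;tag prob;...' into a list of (tag_id, probability) pairs."""
    pairs = []
    for item in s.split(';'):
        tag, prob = item.split(' ')
        pairs.append((int(tag), float(prob)))
    return pairs

def parse_feed_embedding(s):
    """Parse the space-separated 512-dim embedding string into a float array."""
    return np.asarray(s.split(' '), dtype=np.float32)

print(parse_machine_tags("1025 0.32657512;2034 0.87653981;35 0.47265462"))
# [(1025, 0.32657512), (2034, 0.87653981), (35, 0.47265462)]
```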
53 |
54 | **(2) User action table**
55 |
56 | This table contains users' historical behavior inside WeChat Channels over a period of time, including stay time, play time, and the various interaction signals. Fields:
57 |
58 | | Field        | Type   | Description                             | Notes                                                                |
59 | | ------------ | ------ | --------------------------------------- | -------------------------------------------------------------------- |
60 | | userid       | String | User ID                                 | Anonymized                                                            |
61 | | feedid       | String | Feed (video) ID                         | Anonymized                                                            |
62 | | device       | Int    | Device type ID                          | Anonymized                                                            |
63 | | date_        | Int    | Date                                    | Anonymized to 1–n, where n means the n-th day                         |
64 | | play         | Int    | Video play time                         | Unit: ms; play time greater than the video duration indicates replays |
65 | | stay         | Int    | User stay time                          | Unit: ms                                                              |
66 | | read_comment | Bool   | Whether comments were viewed            | {0, 1}: 0 = "no", 1 = "yes"                                           |
67 | | like         | Bool   | Whether the feed was liked              | {0, 1}: 0 = "no", 1 = "yes"                                           |
68 | | click_avatar | Bool   | Whether the author's avatar was clicked | {0, 1}: 0 = "no", 1 = "yes"                                           |
69 | | favorite     | Bool   | Whether the feed was favorited          | {0, 1}: 0 = "no", 1 = "yes"                                           |
70 | | forward      | Bool   | Whether the feed was forwarded          | {0, 1}: 0 = "no", 1 = "yes"                                           |
71 | | comment      | Bool   | Whether a comment was posted            | {0, 1}: 0 = "no", 1 = "yes"                                           |
72 | | follow       | Bool   | Whether the author was followed         | {0, 1}: 0 = "no", 1 = "yes"                                           |
73 |
74 | _Notes_
75 |
76 | - In the user action table, each user's rows are already sorted by timestamp in ascending order; the timestamp field itself is not provided.
77 |
78 | #### Test Set
79 |
80 | The test set contains the following fields:
81 |
82 | | Field  | Type   | Description     | Notes      |
83 | | ------ | ------ | --------------- | ---------- |
84 | | userid | String | User ID         | Anonymized |
85 | | feedid | String | Feed (video) ID | Anonymized |
86 | | device | Int    | Device type ID  | Anonymized |
87 |
88 | #### Submission Format
89 |
90 | - Preliminary round: for every (userid, feedid) row in the test set, predict the probability of four interactions: view comments, like, click avatar, and forward.
91 | - Final round: for every (userid, feedid) row in the test set, predict the probability of seven interactions: view comments, like, click avatar, forward, favorite, comment, and follow.
92 |
93 | The per-field format is as follows:
94 |
95 | | Field        | Type   | Description                     | Round             | Notes                                                                                   |
96 | | ------------ | ------ | ------------------------------- | ----------------- | ---------------------------------------------------------------------------------------- |
97 | | userid       | String | User ID                         | Preliminary/Final | Anonymized                                                                                 |
98 | | feedid       | String | Feed (video) ID                 | Preliminary/Final | Anonymized                                                                                 |
99 | | read_comment | Float  | Whether comments are viewed     | Preliminary/Final | Predicted probability of the behavior, in [0, 1] (0 = "no", 1 = "yes"); keep at most six decimal places |
100 | | like         | Float  | Whether the feed is liked       | Preliminary/Final |                                                                                            |
101 | | click_avatar | Float  | Whether the avatar is clicked   | Preliminary/Final |                                                                                            |
102 | | forward      | Float  | Whether the feed is forwarded   | Preliminary/Final |                                                                                            |
103 | | favorite     | Float  | Whether the feed is favorited   | Final only        |                                                                                            |
104 | | comment      | Float  | Whether a comment is posted     | Final only        |                                                                                            |
105 | | follow       | Float  | Whether the author is followed  | Final only        |                                                                                            |
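
As a hedged illustration of the preliminary-round format (the output file name `submit.csv` and the random placeholder predictions are ours; real predictions would come from one of the trained models):

```python
import numpy as np
import pandas as pd

test = pd.read_csv('wechat_algo_data/test_a.csv')
submission = test[['userid', 'feedid']].copy()

# Placeholder predictions; in practice these come from model.predict(test_model_input).
for action in ['read_comment', 'like', 'click_avatar', 'forward']:
    submission[action] = np.round(np.random.rand(len(test)), 6)  # at most six decimal places

submission.to_csv('submit.csv', index=False)
```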
106 |
107 |
108 | ### Evaluation Metric
109 |
110 | uAUC is used as the evaluation metric for each individual behavior. It is defined as the average of the per-user AUC values:
111 |
112 | $$uAUC = \frac{1}{n}\sum_{i=1}^{n} AUC_i$$
113 |
114 | where n is the number of valid users in the test set; for a given behavior, a valid user is one that remains after filtering out users whose test samples are all positive or all negative. $AUC_i$ is the AUC (Area Under Curve) of the predictions for the i-th valid user.
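
A condensed sketch of this computation (the repository's reference implementation is the `uAUC` function in `5.evaluation.ipynb`):

```python
import numpy as np
from collections import defaultdict
from sklearn.metrics import roc_auc_score

def uauc(labels, preds, user_ids):
    """Average AUC over users whose labels contain both classes."""
    by_user = defaultdict(lambda: ([], []))
    for y, p, u in zip(labels, preds, user_ids):
        by_user[u][0].append(y)
        by_user[u][1].append(p)
    aucs = [roc_auc_score(ys, ps) for ys, ps in by_user.values()
            if len(set(ys)) > 1]  # keep only "valid" users
    return float(np.mean(aucs))
```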
115 |
116 | The final preliminary-round score is the weighted average of the uAUC values of 4 behaviors (view comments, like, click avatar, forward); the final-round score is the weighted average over 7 behaviors (view comments, like, click avatar, forward, favorite, comment, follow). Higher scores rank higher.
117 |
118 | The weights of the 7 behaviors in the overall score are as follows:
119 |
120 | | Field        | Description    | Weight |
121 | | ------------ | -------------- | ------ |
122 | | read_comment | View comments  | 4      |
123 | | like         | Like           | 3      |
124 | | click_avatar | Click avatar   | 2      |
125 | | forward      | Forward        | 1      |
126 | | favorite     | Favorite       | 1      |
127 | | comment      | Post a comment | 1      |
128 | | follow       | Follow         | 1      |
129 |
130 | The weighted uAUC is computed as:
131 |
132 | $$score = \frac{\sum_{i=1}^{k} W_i \cdot uAUC_i}{\sum_{i=1}^{k} W_i}$$
133 |
134 | where k is the number of interaction behaviors to predict (k = 4 in the preliminary round, k = 7 in the final round), $uAUC_i$ is the uAUC of the i-th behavior, and $W_i$ is its weight.
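
A short sketch mirroring `compute_weighted_score` in `5.evaluation.ipynb` (the numbers in the example call are purely illustrative):

```python
WEIGHTS = {"read_comment": 4, "like": 3, "click_avatar": 2, "forward": 1,
           "favorite": 1, "comment": 1, "follow": 1}

def weighted_uauc(uauc_by_action):
    """uauc_by_action: dict mapping a behavior name to its uAUC value."""
    num = sum(WEIGHTS[a] * s for a, s in uauc_by_action.items())
    den = sum(WEIGHTS[a] for a in uauc_by_action)
    return round(num / den, 6)

print(weighted_uauc({"read_comment": 0.92, "like": 0.84, "click_avatar": 0.82, "forward": 0.81}))
# 0.865
```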
135 |
136 | ## Multi-Objective Optimization Methods
137 |
138 | ### Overview
139 | Multi-objective ranking is a common building block of recommendation ranking systems. Many recommendation and ranking scenarios involve several business objectives, and the goal is to find a combined ranking method that brings all of them close to a joint optimum and yields the best overall return.
140 |
141 | 
142 |
143 | ### Network Architecture Approaches
144 |
145 | - Share Bottom: the long-standing early approach that models multiple tasks through (hard or soft) parameter sharing.
146 | - MMoE (Google, 2018): replaces hard parameter sharing with multiple experts and uses gates to control how strongly each task's loss influences each expert.
147 | - PLE (Tencent, 2020): builds on MMoE by adding experts that are exclusive to each task.
148 |
149 | 
150 |
151 | #### MMoE
152 |
153 | In MMoE, different experts are responsible for learning different aspects of the input, and a per-task gate combines them. The differing softmax distributions of the gates across tasks show that the experts specialize in different objectives, which is where the improvement comes from.
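
A condensed sketch of the gating computation (the full model, including embeddings and variable-length features, is in `3.MMoE-example.ipynb`):

```python
import tensorflow as tf
from tensorflow.keras.layers import Dense

class MMoE(tf.keras.layers.Layer):
    """Gist of the MMoE block: shared experts, one softmax gate per task."""
    def __init__(self, expert_dim, n_expert, n_task):
        super().__init__()
        self.experts = [Dense(expert_dim, activation='relu') for _ in range(n_expert)]
        self.gates = [Dense(n_expert, activation='softmax') for _ in range(n_task)]

    def call(self, x):
        e = tf.stack([expert(x) for expert in self.experts], axis=1)  # (batch, n_expert, expert_dim)
        towers = []
        for gate in self.gates:
            g = tf.expand_dims(gate(x), axis=-1)                      # (batch, n_expert, 1)
            towers.append(tf.reduce_sum(e * g, axis=1))               # gate-weighted sum of experts
        return towers                                                 # one tower per task
```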
154 |
155 | 
156 |
157 | #### PLE
158 |
159 | PLE builds on MMoE by giving every task its own specific experts, whose gradients are updated only by that task.
160 | MMoE "divides and conquers": it splits one shared parameter matrix into several shared experts combined through gates, so when losses conflict, each loss can express itself with different strength on different experts and mutual cancellation becomes less likely; some experts end up mostly influenced by one task and others dominated by another, so every task "gets its share". PLE adds task-specific experts on top of this, which further guarantees that each task gets its share and keeps the optimization stable.
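
A condensed sketch of the single-extraction-layer PLE block (the full model is in `4.PLE-example.ipynb`):

```python
import tensorflow as tf
from tensorflow.keras.layers import Dense

class PLE(tf.keras.layers.Layer):
    """Gist of the PLE block: shared experts plus task-specific experts, gated per task."""
    def __init__(self, expert_dim, n_task, n_specific, n_shared):
        super().__init__()
        self.specific = [[Dense(expert_dim, activation='relu') for _ in range(n_specific)]
                         for _ in range(n_task)]
        self.shared = [Dense(expert_dim, activation='relu') for _ in range(n_shared)]
        self.gates = [Dense(n_shared + n_specific, activation='softmax') for _ in range(n_task)]

    def call(self, x):
        shared_out = [expert(x) for expert in self.shared]
        towers = []
        for i, gate in enumerate(self.gates):
            # Each task gates over the shared experts plus its own specific experts.
            experts = tf.stack(shared_out + [e(x) for e in self.specific[i]], axis=1)
            g = tf.expand_dims(gate(x), axis=-1)
            towers.append(tf.reduce_sum(experts * g, axis=1))
        return towers
```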
161 |
162 | 
163 |
164 | ## Code
165 |
166 | The code consists of several Jupyter notebooks. `data-preprocessing-and-feature-engineering` carries out the data preprocessing and feature engineering; once it has been run, the notebooks for the different model architectures can be run one after another, in the order listed below.
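
Suggested run order:

1. `1.data-preprocessing-and-feature-engineering.ipynb`: builds the train/validation/test splits and the feature encoders under `data_and_feature/`.
2. `2.shared-bottom-DeepFM-example.ipynb`: single-task DeepFM baseline (trained on `read_comment`).
3. `3.MMoE-example.ipynb`: MMoE multi-task model over the four target behaviors.
4. `4.PLE-example.ipynb`: PLE multi-task model over the four target behaviors.
5. `5.evaluation.ipynb`: uAUC and weighted-uAUC evaluation utilities.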
--------------------------------------------------------------------------------