"""Load every KuaiRec table from ``data/`` and print each one.

The script reads the CSV files relative to the current working directory,
so it has to be started from the repository root after the dataset has been
unpacked into ``data/``.
"""
import ast

import pandas as pd

print("Loading big matrix...")
big_matrix = pd.read_csv("data/big_matrix.csv")
print("Loading small matrix...")
small_matrix = pd.read_csv("data/small_matrix.csv")

print("Loading social network...")
social_network = pd.read_csv("data/social_network.csv")
# The column is read as text and has to be turned back into Python objects.
# ast.literal_eval only accepts literals, so a crafted CSV cell cannot run
# arbitrary code the way it could with eval().
social_network["friend_list"] = social_network["friend_list"].map(ast.literal_eval)

print("Loading item features...")
item_categories = pd.read_csv("data/item_categories.csv")
# Same text-to-literal conversion as for "friend_list" above.
item_categories["feat"] = item_categories["feat"].map(ast.literal_eval)

print("Loading user features...")
user_features = pd.read_csv("data/user_features.csv")

print("Loading items' daily features...")
item_daily_feat = pd.read_csv("data/item_daily_features.csv")

print("All data loaded.")
print("1. Big matrix:")
print(big_matrix)
print("2. Small matrix:")
print(small_matrix)
print("3. Social network of users in big matrix:")
print(social_network)
print("4. Items' basic features of all items in big matrix")
print(item_categories)
print("5. User features of all users in big matrix. \nNote: this table is added in KuaiRec v2.0")
print(user_features)
print("6. Item daily features. \nNote: this table is added in KuaiRec v2.0")
print(item_daily_feat)
pd.read_csv(\"data/social_network.csv\")\n", 41 | "social_network[\"friend_list\"] = social_network[\"friend_list\"].map(eval)\n", 42 | "\n", 43 | "print(\"Loading item features...\")\n", 44 | "item_feat = pd.read_csv(\"data/item_categories.csv\")\n", 45 | "item_feat[\"feat\"] = item_feat[\"feat\"].map(eval)\n", 46 | "\n", 47 | "print(\"Loading user features...\")\n", 48 | "user_feat = pd.read_csv(\"data/user_features.csv\")\n", 49 | "\n", 50 | "print(\"Loading items' daily features...\")\n", 51 | "item_daily_feat = pd.read_csv(\"data/item_daily_feat.csv\")\n", 52 | "\n", 53 | "print(\"All data loaded.\")" 54 | ], 55 | "metadata": { 56 | "collapsed": false, 57 | "pycharm": { 58 | "name": "#%%\n" 59 | } 60 | } 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "source": [ 65 | "## Visualization of the four tables" 66 | ], 67 | "metadata": { 68 | "collapsed": false, 69 | "pycharm": { 70 | "name": "#%% md\n" 71 | } 72 | } 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "outputs": [], 78 | "source": [ 79 | "big_matrix" 80 | ], 81 | "metadata": { 82 | "collapsed": false, 83 | "pycharm": { 84 | "name": "#%%\n" 85 | } 86 | } 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "outputs": [], 92 | "source": [ 93 | "small_matrix" 94 | ], 95 | "metadata": { 96 | "collapsed": false, 97 | "pycharm": { 98 | "name": "#%%\n" 99 | } 100 | } 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "outputs": [], 106 | "source": [ 107 | "item_feat" 108 | ], 109 | "metadata": { 110 | "collapsed": false, 111 | "pycharm": { 112 | "name": "#%%\n" 113 | } 114 | } 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "outputs": [], 120 | "source": [ 121 | "social_network" 122 | ], 123 | "metadata": { 124 | "collapsed": false, 125 | "pycharm": { 126 | "name": "#%%\n" 127 | } 128 | } 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "outputs": [], 134 | 
"source": [ 135 | "item_daily_feat" 136 | ], 137 | "metadata": { 138 | "collapsed": false, 139 | "pycharm": { 140 | "name": "#%%\n" 141 | } 142 | } 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "outputs": [], 148 | "source": [ 149 | "user_feat" 150 | ], 151 | "metadata": { 152 | "collapsed": false, 153 | "pycharm": { 154 | "name": "#%%\n" 155 | } 156 | } 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "outputs": [], 162 | "source": [ 163 | "import seaborn as sns\n", 164 | "import matplotlib.pyplot as plt\n", 165 | "from matplotlib.font_manager import FontProperties\n", 166 | "myfont = FontProperties(fname=\"./SimHei.ttf\")\n", 167 | "\n", 168 | "def visual_continue(df, func=None):\n", 169 | " ax = sns.distplot(df)\n", 170 | " if func:\n", 171 | " func(ax)\n", 172 | " \n", 173 | " gca = plt.gca()\n", 174 | " fig_title = \"Statistcis of {}\".format(df.name)\n", 175 | " gca.set_title(fig_title, fontsize=14)\n", 176 | " gca.set_ylabel(\"Density\", fontsize=14)\n", 177 | " gca.set_xlabel(df.name, fontsize=14)\n", 178 | " \n", 179 | " plt.show()\n", 180 | "\n", 181 | "def visual_statistics_discrete(df, var=\"my_variable\", display_ratio=True, func=None, order=None, size=(6, 4.5)):\n", 182 | " ncount = len(df)\n", 183 | "\n", 184 | " fig = plt.figure(figsize=size)\n", 185 | " ax1 = fig.add_axes([0.14, 0.15, 0.74, 0.75])\n", 186 | " sns.countplot(x=df, color=\"#9fc5e8\", linewidth=.6, edgecolor='k', ax=ax1, order=order)\n", 187 | "\n", 188 | "\n", 189 | " plt.grid(axis='y', linestyle='-.')\n", 190 | "\n", 191 | " gca = plt.gca()\n", 192 | " fig_title = \"Statistcis of {}\".format(var)\n", 193 | " gca.set_title(fig_title, fontsize=14)\n", 194 | " gca.set_ylabel(\"Count\", fontsize=14)\n", 195 | " gca.set_xlabel(var, fontsize=14)\n", 196 | " \n", 197 | " if func:\n", 198 | " func(ax1)\n", 199 | "\n", 200 | " if display_ratio:\n", 201 | " # Make twin axis\n", 202 | " ax2 = ax1.twinx()\n", 203 | " 
ax2.set_ylabel(\"ratio (%)\", fontsize=14)\n", 204 | "\n", 205 | "\n", 206 | " for p in ax1.patches:\n", 207 | " x = p.get_bbox().get_points()[:, 0]\n", 208 | " y = p.get_bbox().get_points()[1, 1]\n", 209 | " ax1.annotate('{:.1f}%'.format(100. * y / ncount), (x.mean(), y),\n", 210 | " ha='center', va='bottom', fontsize=10, rotation=30) # set the alignment of the text\n", 211 | "\n", 212 | " ax2.set_ylim(0, ax1.get_ylim()[1] / ncount * 100)\n", 213 | "\n", 214 | " plt.savefig(\"f{var}.png\")\n", 215 | " plt.show()\n", 216 | " " 217 | ], 218 | "metadata": { 219 | "collapsed": false, 220 | "pycharm": { 221 | "name": "#%%\n" 222 | } 223 | } 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "outputs": [], 229 | "source": [ 230 | "import warnings; warnings.simplefilter('ignore')" 231 | ], 232 | "metadata": { 233 | "collapsed": false, 234 | "pycharm": { 235 | "name": "#%%\n" 236 | } 237 | } 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "source": [ 242 | "## Statistics of social network" 243 | ], 244 | "metadata": { 245 | "collapsed": false, 246 | "pycharm": { 247 | "name": "#%% md\n" 248 | } 249 | } 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "outputs": [], 255 | "source": [ 256 | "print(social_network.friend_list.map(len).describe())\n", 257 | "visual_statistics_discrete(social_network.friend_list.map(len), \"number of friends\")" 258 | ], 259 | "metadata": { 260 | "collapsed": false, 261 | "pycharm": { 262 | "name": "#%%\n" 263 | } 264 | } 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "source": [ 269 | "## Statistics of video features" 270 | ], 271 | "metadata": { 272 | "collapsed": false, 273 | "pycharm": { 274 | "name": "#%% md\n" 275 | } 276 | } 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "outputs": [], 282 | "source": [ 283 | "num_feat = item_feat.feat.map(len)\n", 284 | "print(num_feat)\n", 285 | "visual_statistics_discrete(num_feat, \"number of 
tags\")" 286 | ], 287 | "metadata": { 288 | "collapsed": false, 289 | "pycharm": { 290 | "name": "#%%\n" 291 | } 292 | } 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "source": [ 297 | "## Count of 31 tags " 298 | ], 299 | "metadata": { 300 | "collapsed": false, 301 | "pycharm": { 302 | "name": "#%% md\n" 303 | } 304 | } 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "outputs": [], 310 | "source": [ 311 | "import collections\n", 312 | "import itertools\n", 313 | "\n", 314 | "cnt = item_feat.feat.map(collections.Counter)\n", 315 | "cnt_all = collections.Counter()\n", 316 | "for d in cnt:\n", 317 | " cnt_all.update(d)\n", 318 | "# print(dict(cnt_all))\n", 319 | "all_feat = pd.Series(sorted(list(itertools.chain.from_iterable([[i]*k for i,k in cnt_all.items()]))),name=\"feat\")\n", 320 | "# print(all_feat)\n", 321 | "visual_statistics_discrete(all_feat, \"tag\", size=(12,4.5))" 322 | ], 323 | "metadata": { 324 | "collapsed": false, 325 | "pycharm": { 326 | "name": "#%%\n" 327 | } 328 | } 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "source": [ 333 | "## Watch_ratio in big matrix" 334 | ], 335 | "metadata": { 336 | "collapsed": false, 337 | "pycharm": { 338 | "name": "#%% md\n" 339 | } 340 | } 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "outputs": [], 346 | "source": [ 347 | "big_watch_ratio = big_matrix.watch_ratio[big_matrix.watch_ratio <= 5]\n", 348 | "print(big_watch_ratio.describe())\n", 349 | "visual_continue(big_watch_ratio)" 350 | ], 351 | "metadata": { 352 | "collapsed": false, 353 | "pycharm": { 354 | "name": "#%%\n" 355 | } 356 | } 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "source": [ 361 | "## Watch_ratio in small matrix" 362 | ], 363 | "metadata": { 364 | "collapsed": false, 365 | "pycharm": { 366 | "name": "#%% md\n" 367 | } 368 | } 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "outputs": [], 374 | "source": [ 375 | 
"small_watch_ratio = small_matrix.watch_ratio[small_matrix.watch_ratio <= 5]\n", 376 | "print(\"watch_ratio in small matrix\")\n", 377 | "print(small_watch_ratio.describe())\n", 378 | "visual_continue(small_watch_ratio)" 379 | ], 380 | "metadata": { 381 | "collapsed": false, 382 | "pycharm": { 383 | "name": "#%%\n" 384 | } 385 | } 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "source": [ 390 | "## Video duration of the big matrix (in millisecond)" 391 | ], 392 | "metadata": { 393 | "collapsed": false, 394 | "pycharm": { 395 | "name": "#%% md\n" 396 | } 397 | } 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "outputs": [], 403 | "source": [ 404 | "big_video_duration = big_matrix.video_duration\n", 405 | "print(big_video_duration.describe())\n", 406 | "# visual_continue(big_video_duration)\n", 407 | "visual_continue(big_video_duration[big_video_duration < 100000])" 408 | ], 409 | "metadata": { 410 | "collapsed": false, 411 | "pycharm": { 412 | "name": "#%%\n" 413 | } 414 | } 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "source": [ 419 | "## Video duration of the small matrix (in millisecond)" 420 | ], 421 | "metadata": { 422 | "collapsed": false, 423 | "pycharm": { 424 | "name": "#%% md\n" 425 | } 426 | } 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "outputs": [], 432 | "source": [ 433 | "small_video_duration = small_matrix.video_duration\n", 434 | "print(small_video_duration.describe())\n", 435 | "# visual_continue(small_video_duration)\n", 436 | "visual_continue(small_video_duration[small_video_duration < 100000])" 437 | ], 438 | "metadata": { 439 | "collapsed": false, 440 | "pycharm": { 441 | "name": "#%%\n" 442 | } 443 | } 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "source": [ 448 | "## 大矩阵中每个用户的总播放次数" 449 | ], 450 | "metadata": { 451 | "collapsed": false, 452 | "pycharm": { 453 | "name": "#%% md\n" 454 | } 455 | } 456 | }, 457 | { 458 | "cell_type": "code", 459 | 
"execution_count": null, 460 | "outputs": [], 461 | "source": [ 462 | "big_play_time = big_matrix.groupby('user_id').agg({\"date\":len})\n", 463 | "big_play_time.name = \"play times\"\n", 464 | "print(big_play_time.describe())\n", 465 | "visual_continue(big_play_time)" 466 | ], 467 | "metadata": { 468 | "collapsed": false, 469 | "pycharm": { 470 | "name": "#%%\n" 471 | } 472 | } 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "source": [ 477 | "## 小矩阵中每个用户的总播放次数" 478 | ], 479 | "metadata": { 480 | "collapsed": false, 481 | "pycharm": { 482 | "name": "#%% md\n" 483 | } 484 | } 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "outputs": [], 490 | "source": [ 491 | "small_play_time = small_matrix.groupby('user_id').agg({\"date\":len})\n", 492 | "small_play_time.name = \"play times\"\n", 493 | "print(small_play_time.describe())\n", 494 | "visual_continue(small_play_time)" 495 | ], 496 | "metadata": { 497 | "collapsed": false, 498 | "pycharm": { 499 | "name": "#%%\n" 500 | } 501 | } 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "source": [ 506 | "## 大矩阵中每个用户的日播放次数" 507 | ], 508 | "metadata": { 509 | "collapsed": false, 510 | "pycharm": { 511 | "name": "#%% md\n" 512 | } 513 | } 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "outputs": [], 519 | "source": [ 520 | "big_daily_play_time = big_matrix.groupby(['user_id', 'date']).size()\n", 521 | "big_daily_play_time.name = \"play times\"\n", 522 | "print(big_daily_play_time.describe())\n", 523 | "visual_continue(big_daily_play_time)" 524 | ], 525 | "metadata": { 526 | "collapsed": false, 527 | "pycharm": { 528 | "name": "#%%\n" 529 | } 530 | } 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "source": [ 535 | "## 小矩阵中每个用户的日播放次数" 536 | ], 537 | "metadata": { 538 | "collapsed": false, 539 | "pycharm": { 540 | "name": "#%% md\n" 541 | } 542 | } 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "outputs": [], 548 | 
"source": [ 549 | "small_daily_play_time = small_matrix.groupby(['user_id', 'date']).size()\n", 550 | "small_daily_play_time.name = \"play times\"\n", 551 | "print(small_daily_play_time.describe())\n", 552 | "visual_continue(small_daily_play_time)" 553 | ], 554 | "metadata": { 555 | "collapsed": false, 556 | "pycharm": { 557 | "name": "#%%\n" 558 | } 559 | } 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "source": [ 564 | "## 大矩阵中播放日期分布" 565 | ], 566 | "metadata": { 567 | "collapsed": false, 568 | "pycharm": { 569 | "name": "#%% md\n" 570 | } 571 | } 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "outputs": [], 577 | "source": [ 578 | "import functools\n", 579 | "def adjust_xticks(ax):\n", 580 | " # print(ax.get_xticklabels())\n", 581 | " ax.set_xticks(list(range(0,len(ax.get_xticklabels()),3)))\n", 582 | " # print(ax.get_xticklabels())\n", 583 | " # ax.set_xticklabels(rotation = 45)\n", 584 | " for tick in ax.get_xticklabels():\n", 585 | " tick.set_rotation(45)\n", 586 | " # plt.xticks(fontsize=8)\n", 587 | " # ax.set_xticklabels([\"{}\".format(str(i)) for i in ax.get_xticks()])" 588 | ], 589 | "metadata": { 590 | "collapsed": false, 591 | "pycharm": { 592 | "name": "#%%\n" 593 | } 594 | } 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "outputs": [], 600 | "source": [ 601 | "visual_statistics_discrete(big_matrix.date, \"date\", display_ratio=False, func=adjust_xticks, order=small_matrix.date[~small_matrix.date.isna()].map(int).unique())" 602 | ], 603 | "metadata": { 604 | "collapsed": false, 605 | "pycharm": { 606 | "name": "#%%\n" 607 | } 608 | } 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "source": [ 613 | "## 小矩阵中播放日期分布" 614 | ], 615 | "metadata": { 616 | "collapsed": false, 617 | "pycharm": { 618 | "name": "#%% md\n" 619 | } 620 | } 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "outputs": [], 626 | "source": [ 627 | 
"visual_statistics_discrete(small_matrix.date[~small_matrix.date.isna()].map(int), \"date\", display_ratio=False, func=adjust_xticks)" 628 | ], 629 | "metadata": { 630 | "collapsed": false, 631 | "pycharm": { 632 | "name": "#%%\n" 633 | } 634 | } 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "outputs": [], 640 | "source": [], 641 | "metadata": { 642 | "collapsed": false, 643 | "pycharm": { 644 | "name": "#%%\n" 645 | } 646 | } 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "id": "54567a88-88ae-4060-bd67-78f39601e59c", 652 | "metadata": { 653 | "pycharm": { 654 | "name": "#%%\n" 655 | } 656 | }, 657 | "outputs": [], 658 | "source": [] 659 | } 660 | ], 661 | "metadata": { 662 | "kernelspec": { 663 | "display_name": "Python 3 (ipykernel)", 664 | "language": "python", 665 | "name": "python3" 666 | }, 667 | "language_info": { 668 | "codemirror_mode": { 669 | "name": "ipython", 670 | "version": 3 671 | }, 672 | "file_extension": ".py", 673 | "mimetype": "text/x-python", 674 | "name": "python", 675 | "nbconvert_exporter": "python", 676 | "pygments_lexer": "ipython3", 677 | "version": "3.9.1" 678 | } 679 | }, 680 | "nbformat": 4, 681 | "nbformat_minor": 5 682 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. 
Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. 
Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-ShareAlike 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-ShareAlike 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. 
For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. 
Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. Share means to provide material to the public by any means or 126 | process that requires permission under the Licensed Rights, such 127 | as reproduction, public display, public performance, distribution, 128 | dissemination, communication, or importation, and to make material 129 | available to the public including in ways that members of the 130 | public may access the material from a place and at a time 131 | individually chosen by them. 132 | 133 | l. Sui Generis Database Rights means rights other than copyright 134 | resulting from Directive 96/9/EC of the European Parliament and of 135 | the Council of 11 March 1996 on the legal protection of databases, 136 | as amended and/or succeeded, as well as other essentially 137 | equivalent rights anywhere in the world. 138 | 139 | m. You means the individual or entity exercising the Licensed Rights 140 | under this Public License. Your has a corresponding meaning. 141 | 142 | 143 | Section 2 -- Scope. 144 | 145 | a. License grant. 146 | 147 | 1. Subject to the terms and conditions of this Public License, 148 | the Licensor hereby grants You a worldwide, royalty-free, 149 | non-sublicensable, non-exclusive, irrevocable license to 150 | exercise the Licensed Rights in the Licensed Material to: 151 | 152 | a. reproduce and Share the Licensed Material, in whole or 153 | in part; and 154 | 155 | b. produce, reproduce, and Share Adapted Material. 156 | 157 | 2. Exceptions and Limitations. 
For the avoidance of doubt, where 158 | Exceptions and Limitations apply to Your use, this Public 159 | License does not apply, and You do not need to comply with 160 | its terms and conditions. 161 | 162 | 3. Term. The term of this Public License is specified in Section 163 | 6(a). 164 | 165 | 4. Media and formats; technical modifications allowed. The 166 | Licensor authorizes You to exercise the Licensed Rights in 167 | all media and formats whether now known or hereafter created, 168 | and to make technical modifications necessary to do so. The 169 | Licensor waives and/or agrees not to assert any right or 170 | authority to forbid You from making technical modifications 171 | necessary to exercise the Licensed Rights, including 172 | technical modifications necessary to circumvent Effective 173 | Technological Measures. For purposes of this Public License, 174 | simply making modifications authorized by this Section 2(a) 175 | (4) never produces Adapted Material. 176 | 177 | 5. Downstream recipients. 178 | 179 | a. Offer from the Licensor -- Licensed Material. Every 180 | recipient of the Licensed Material automatically 181 | receives an offer from the Licensor to exercise the 182 | Licensed Rights under the terms and conditions of this 183 | Public License. 184 | 185 | b. Additional offer from the Licensor -- Adapted Material. 186 | Every recipient of Adapted Material from You 187 | automatically receives an offer from the Licensor to 188 | exercise the Licensed Rights in the Adapted Material 189 | under the conditions of the Adapter's License You apply. 190 | 191 | c. No downstream restrictions. You may not offer or impose 192 | any additional or different terms or conditions on, or 193 | apply any Effective Technological Measures to, the 194 | Licensed Material if doing so restricts exercise of the 195 | Licensed Rights by any recipient of the Licensed 196 | Material. 197 | 198 | 6. No endorsement. 
Nothing in this Public License constitutes or 199 | may be construed as permission to assert or imply that You 200 | are, or that Your use of the Licensed Material is, connected 201 | with, or sponsored, endorsed, or granted official status by, 202 | the Licensor or others designated to receive attribution as 203 | provided in Section 3(a)(1)(A)(i). 204 | 205 | b. Other rights. 206 | 207 | 1. Moral rights, such as the right of integrity, are not 208 | licensed under this Public License, nor are publicity, 209 | privacy, and/or other similar personality rights; however, to 210 | the extent possible, the Licensor waives and/or agrees not to 211 | assert any such rights held by the Licensor to the limited 212 | extent necessary to allow You to exercise the Licensed 213 | Rights, but not otherwise. 214 | 215 | 2. Patent and trademark rights are not licensed under this 216 | Public License. 217 | 218 | 3. To the extent possible, the Licensor waives any right to 219 | collect royalties from You for the exercise of the Licensed 220 | Rights, whether directly or through a collecting society 221 | under any voluntary or waivable statutory or compulsory 222 | licensing scheme. In all other cases the Licensor expressly 223 | reserves any right to collect such royalties. 224 | 225 | 226 | Section 3 -- License Conditions. 227 | 228 | Your exercise of the Licensed Rights is expressly made subject to the 229 | following conditions. 230 | 231 | a. Attribution. 232 | 233 | 1. If You Share the Licensed Material (including in modified 234 | form), You must: 235 | 236 | a. retain the following if it is supplied by the Licensor 237 | with the Licensed Material: 238 | 239 | i. identification of the creator(s) of the Licensed 240 | Material and any others designated to receive 241 | attribution, in any reasonable manner requested by 242 | the Licensor (including by pseudonym if 243 | designated); 244 | 245 | ii. a copyright notice; 246 | 247 | iii. 
a notice that refers to this Public License; 248 | 249 | iv. a notice that refers to the disclaimer of 250 | warranties; 251 | 252 | v. a URI or hyperlink to the Licensed Material to the 253 | extent reasonably practicable; 254 | 255 | b. indicate if You modified the Licensed Material and 256 | retain an indication of any previous modifications; and 257 | 258 | c. indicate the Licensed Material is licensed under this 259 | Public License, and include the text of, or the URI or 260 | hyperlink to, this Public License. 261 | 262 | 2. You may satisfy the conditions in Section 3(a)(1) in any 263 | reasonable manner based on the medium, means, and context in 264 | which You Share the Licensed Material. For example, it may be 265 | reasonable to satisfy the conditions by providing a URI or 266 | hyperlink to a resource that includes the required 267 | information. 268 | 269 | 3. If requested by the Licensor, You must remove any of the 270 | information required by Section 3(a)(1)(A) to the extent 271 | reasonably practicable. 272 | 273 | b. ShareAlike. 274 | 275 | In addition to the conditions in Section 3(a), if You Share 276 | Adapted Material You produce, the following conditions also apply. 277 | 278 | 1. The Adapter's License You apply must be a Creative Commons 279 | license with the same License Elements, this version or 280 | later, or a BY-SA Compatible License. 281 | 282 | 2. You must include the text of, or the URI or hyperlink to, the 283 | Adapter's License You apply. You may satisfy this condition 284 | in any reasonable manner based on the medium, means, and 285 | context in which You Share Adapted Material. 286 | 287 | 3. You may not offer or impose any additional or different terms 288 | or conditions on, or apply any Effective Technological 289 | Measures to, Adapted Material that restrict exercise of the 290 | rights granted under the Adapter's License You apply. 291 | 292 | 293 | Section 4 -- Sui Generis Database Rights. 
294 | 295 | Where the Licensed Rights include Sui Generis Database Rights that 296 | apply to Your use of the Licensed Material: 297 | 298 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 299 | to extract, reuse, reproduce, and Share all or a substantial 300 | portion of the contents of the database; 301 | 302 | b. if You include all or a substantial portion of the database 303 | contents in a database in which You have Sui Generis Database 304 | Rights, then the database in which You have Sui Generis Database 305 | Rights (but not its individual contents) is Adapted Material, 306 | including for purposes of Section 3(b); and 307 | 308 | c. You must comply with the conditions in Section 3(a) if You Share 309 | all or a substantial portion of the contents of the database. 310 | 311 | For the avoidance of doubt, this Section 4 supplements and does not 312 | replace Your obligations under this Public License where the Licensed 313 | Rights include other Copyright and Similar Rights. 314 | 315 | 316 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 317 | 318 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 319 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 320 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 321 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 322 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 323 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 324 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 325 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 326 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 327 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 328 | 329 | b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 330 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 331 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 332 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 333 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 334 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 335 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 336 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 337 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 338 | 339 | c. The disclaimer of warranties and limitation of liability provided 340 | above shall be interpreted in a manner that, to the extent 341 | possible, most closely approximates an absolute disclaimer and 342 | waiver of all liability. 343 | 344 | 345 | Section 6 -- Term and Termination. 346 | 347 | a. This Public License applies for the term of the Copyright and 348 | Similar Rights licensed here. However, if You fail to comply with 349 | this Public License, then Your rights under this Public License 350 | terminate automatically. 351 | 352 | b. Where Your right to use the Licensed Material has terminated under 353 | Section 6(a), it reinstates: 354 | 355 | 1. automatically as of the date the violation is cured, provided 356 | it is cured within 30 days of Your discovery of the 357 | violation; or 358 | 359 | 2. upon express reinstatement by the Licensor. 360 | 361 | For the avoidance of doubt, this Section 6(b) does not affect any 362 | right the Licensor may have to seek remedies for Your violations 363 | of this Public License. 364 | 365 | c. For the avoidance of doubt, the Licensor may also offer the 366 | Licensed Material under separate terms or conditions or stop 367 | distributing the Licensed Material at any time; however, doing so 368 | will not terminate this Public License. 369 | 370 | d. 
Sections 1, 5, 6, 7, and 8 survive termination of this Public 371 | License. 372 | 373 | 374 | Section 7 -- Other Terms and Conditions. 375 | 376 | a. The Licensor shall not be bound by any additional or different 377 | terms or conditions communicated by You unless expressly agreed. 378 | 379 | b. Any arrangements, understandings, or agreements regarding the 380 | Licensed Material not stated herein are separate from and 381 | independent of the terms and conditions of this Public License. 382 | 383 | 384 | Section 8 -- Interpretation. 385 | 386 | a. For the avoidance of doubt, this Public License does not, and 387 | shall not be interpreted to, reduce, limit, restrict, or impose 388 | conditions on any use of the Licensed Material that could lawfully 389 | be made without permission under this Public License. 390 | 391 | b. To the extent possible, if any provision of this Public License is 392 | deemed unenforceable, it shall be automatically reformed to the 393 | minimum extent necessary to make it enforceable. If the provision 394 | cannot be reformed, it shall be severed from this Public License 395 | without affecting the enforceability of the remaining terms and 396 | conditions. 397 | 398 | c. No term or condition of this Public License will be waived and no 399 | failure to comply consented to unless expressly agreed to by the 400 | Licensor. 401 | 402 | d. Nothing in this Public License constitutes or may be interpreted 403 | as a limitation upon, or waiver of, any privileges and immunities 404 | that apply to the Licensor or You, including from the legal 405 | processes of any jurisdiction or authority. 406 | 407 | 408 | ======================================================================= 409 | 410 | Creative Commons is not a party to its public licenses. 
411 | Notwithstanding, Creative Commons may elect to apply one of its public 412 | licenses to material it publishes and in those instances will be 413 | considered the “Licensor.” The text of the Creative Commons public 414 | licenses is dedicated to the public domain under the CC0 Public Domain 415 | Dedication. Except for the limited purpose of indicating that material 416 | is shared under a Creative Commons public license or as otherwise 417 | permitted by the Creative Commons policies published at 418 | creativecommons.org/policies, Creative Commons does not authorize the 419 | use of the trademark "Creative Commons" or any other trademark or logo 420 | of Creative Commons without its prior written consent including, 421 | without limitation, in connection with any unauthorized modifications 422 | to any of its public licenses or any other arrangements, 423 | understandings, or agreements concerning use of licensed material. For 424 | the avoidance of doubt, this paragraph does not form part of the public 425 | licenses. 426 | 427 | Creative Commons may be contacted at creativecommons.org. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # KuaiRec: A Fully-observed Dataset for Recommender Systems (Density: Almost 100%) 2 | 3 | [![LICENSE](https://img.shields.io/badge/license-CC%20BY--SA%204.0-green)](https://github.com/chongminggao/KuaiRec/blob/main/LICENSE) 4 | 5 | *KuaiRec* is a real-world dataset collected from the recommendation logs of the video-sharing mobile app [Kuaishou](https://www.kuaishou.com/cn). For now, it is the first dataset that contains a fully observed user-item interaction matrix. For the term "fully observed", we mean there are almost no missing values in the user-item matrix, i.e., each user has viewed each video and then left feedback. 
6 | 7 | Other related **open-sourced** datasets are **[KuaiRand](https://kuairand.com/)** and **[KuaiSAR](https://kuaisar.github.io/)**. 8 | 9 | The following figure illustrates the user-item matrices in traditional datasets and *KuaiRec*. 10 | 11 | ![kuaidata](figs/kuairec-long.png) 12 | 13 | With all user preferences known, KuaiRec can be used in offline evaluation (i.e., offline A/B test) for recommendation models. It can benefit lots of research directions, such as unbiased recommendation, interactive/conversational recommendation, reinforcement learning (RL), and off-policy evaluation (OPE) for recommendation. 14 | 15 | If you use it in your work, please cite our paper: 16 | [![LINK](https://img.shields.io/badge/-Paper%20Link-lightgrey)](https://arxiv.org/abs/2202.10842) [![PDF](https://img.shields.io/badge/-PDF-red)](https://arxiv.org/pdf/2202.10842.pdf) 17 | 18 | ``` 19 | @inproceedings{gao2022kuairec, 20 | author = {Gao, Chongming and Li, Shijun and Lei, Wenqiang and Chen, Jiawei and Li, Biao and Jiang, Peng and He, Xiangnan and Mao, Jiaxin and Chua, Tat-Seng}, 21 | title = {KuaiRec: A Fully-Observed Dataset and Insights for Evaluating Recommender Systems}, 22 | booktitle = {Proceedings of the 31st ACM International Conference on Information \& Knowledge Management}, 23 | series = {CIKM '22}, 24 | location = {Atlanta, GA, USA}, 25 | url = {https://doi.org/10.1145/3511808.3557220}, 26 | doi = {10.1145/3511808.3557220}, 27 | numpages = {11}, 28 | year = {2022}, 29 | pages = {540–550} 30 | } 31 | ``` 32 | 33 | [This repository](https://github.com/xiwenchao/fully_observed_demo) provides example code for evaluating conversational recommendations as described in the paper. 34 | 35 | We provide some simple statistics of this dataset [here](https://kuairec.com/Statistics_KuaiRec.html). They are generated by [Statistics_KuaiRec.ipynb](https://github.com/chongminggao/KuaiRec/blob/main/Statistics_KuaiRec.ipynb).
You can run the notebook online in Google Colab [![colab](./figs/colab-badge.svg)](https://colab.research.google.com/github/chongminggao/KuaiRec/blob/main/Statistics_KuaiRec.ipynb). 36 | 37 | --- 38 | 39 | ## News! 40 | 41 | **2024.06.02: To facilitate the application of large language models (LLMs) in recommendation systems, we collected caption information and category information for all videos and present them in text format!** 42 | 43 | - The corresponding caption and category information can be downloaded here: [kuairec_caption_category.csv](https://github.com/chongminggao/KuaiRec/blob/main/kuairec_caption_category.csv). Additionally, we have packaged them into the `KuaiRec.zip` file, which can be downloaded from the download section below. 44 | - The descriptions of the caption and category information are available [here](#6-descriptions-of-the-caption-and-category-fields-in-kuairec_caption_categorycsv-added-on-20240602). 45 | 46 | **2022.05.16: We updated the dataset to version 2.0.** We made the following changes: 47 | 48 | - We removed the unused video `ID=1225` from all tables having the field `video_id` and re-indexed the remaining videos, i.e., `ID = ID - 1 if ID > 1225`. 49 | - We added two tables to enhance the side information for users and videos, respectively. See `4.item_daily_features.csv` and `5. user_features.csv` under the data description section for details. 50 | 51 | 52 | 53 | 54 | ## Download the data 55 | 56 | We provide several options to download this dataset: 57 | 58 | Option 1. Download via the "wget" command. 59 | 60 | ```shell 61 | wget https://nas.chongminggao.top:4430/datasets/KuaiRec.zip --no-check-certificate 62 | unzip KuaiRec.zip 63 | ``` 64 | Option 2.
Download manually through the following links: 65 | 66 | - Optional link 1: [Google Drive](https://drive.google.com/file/d/1qe5hOSBxzIuxBb1G_Ih5X-O65QElollE/view?usp=sharing) 67 | 68 | - Optional link 2: [USTC Drive (中科大)](https://rec.ustc.edu.cn/share/00cc9940-aa72-11ec-aa7d-d38daca349cb) 69 | 70 | The script `loaddata.py` provides a simple way to load the data via Pandas in Python. 71 | 72 | --- 73 | 74 | ## Data Descriptions 75 | 76 | *KuaiRec* contains millions of user-item interactions as well as side information including the item categories and a social network. Seven files are included in the downloaded data: 77 | 78 | ```shell 79 | KuaiRec 80 | ├── data 81 | │   ├── big_matrix.csv 82 | │   ├── small_matrix.csv 83 | │   ├── social_network.csv 84 | │   ├── user_features.csv 85 | │   ├── item_daily_features.csv 86 | │   ├── item_categories.csv 87 | │   └── kuairec_caption_category.csv 88 | ``` 89 | 90 | The statistics of the small matrix and the big matrix in *KuaiRec* are as follows: 91 | 92 | | | #Users | #Items | #Interactions | Density | 93 | | -------------- | :----: | :----: | :-----------: | :-----: | 94 | | *small matrix* | 1,411 | 3,327 | 4,676,570 | 99.6% | 95 | | *big matrix* | 7,176 | 10,728 | 12,530,806 | 16.3% | 96 | 97 | Note that the density of the small matrix is 99.6% instead of 100% because some users have explicitly indicated that they would not be willing to receive recommendations from certain authors. In other words, these users blocked the videos of those authors. 98 | 99 | #### 1. Descriptions of the fields in `big_matrix.csv` and `small_matrix.csv`. 100 | 101 | | Field Name: | Description | Type | Example | 102 | | -------------- | -------------------------------------------------------- | ------- | ------------------------- | 103 | | user_id | The ID of the user. | int64 | 0 | 104 | | video_id | The ID of the viewed video. | int64 | 3650 | 105 | | play_duration | Viewing time of the video in this interaction (in milliseconds).
| int64 | 13838 | 106 | | video_duration | Time of this video (millisecond). | int64 | 10867 | 107 | | time | Human-readable date for this interaction | str | "2020-07-05 00:08:23.438" | 108 | | date | Date of this interaction | int64 | 20200705 | 109 | | timestamp | Unix timestamp | float64 | 1593878903.438 | 110 | | watch_ratio | The video watching ratio (=play_duration/video_duration) | float64 | 1.273397 | 111 | 112 | The "watch_ratio" can be deemed as the label of the interaction. Note: there is no "like" signal for this dataset. If you need this binary signal in your scenarios, you can create it yourself. E.g., `like = 1 if watch_ratio > 2.0`. 113 | 114 | #### 2. Descriptions of the fields in `social_network.csv` 115 | 116 | | Field Name: | Description | Type | Example | 117 | | ----------- | -------------------------------------------- | ----- | ----------- | 118 | | user_id | The ID of the user. | int64 | 5352 | 119 | | friend_list | The list of IDs of the friends of this user. | list | [4202,7126] | 120 | 121 | #### 3. Descriptions of the fields in `item_categories.csv`. 122 | 123 | | Field Name: | Description | Type | Example | 124 | | ----------- | ------------------------------- | ----- | ------- | 125 | | video_id | The ID of the video. | int64 | 1 | 126 | | feat | The list of tags of this video. | list | [27,9] | 127 | 128 | #### 4. Descriptions of the fields in `item_daily_features.csv`. (Added on 2022.05.16) 129 | 130 | | Field Name: | Description | Type | Example | 131 | | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------ | ------- | ------------- | 132 | | video_id | The ID of the video. | int64 | 3784 | 133 | | date | Date of the statistics of this video. | int64 | 20200730 | 134 | | author_id | The ID of the author of this video. | int64 | 441 | 135 | | video_type | Type of this video (NORMAL or AD). 
| str | "NORMAL" | 136 | | upload_dt | Upload date of this video. | str | "2020-07-08" | 137 | | upload_type | The upload type of this video. | str | "ShortImport" | 138 | | visible_status | The visible state of this video on the APP now. | str | "public" | 139 | | video_duration | The time duration of this video (in milliseconds). | float64 | 17200.0 | 140 | | video_width | The width of this video on the server. | int64 | 720 | 141 | | video_height | The height of this video on the server. | int64 | 1280 | 142 | | music_id | Background music ID of this video. | int64 | 989206467 | 143 | | video_tag_id | The ID of the tag of this video. | int64 | 2522 | 144 | | video_tag_name | The name of the tag of this video. | str | "祝福" | 145 | | show_cnt | The number of shows of this video **within this day (the same applies to all following fields)** | int64 | 7716 | 146 | | show_user_num | The number of users who received the recommendation of this video. | int64 | 5256 | 147 | | play_cnt | The number of plays. | int64 | 7701 | 148 | | play_user_num | The number of users who play this video. | int64 | 5034 | 149 | | play_duration | The total time duration of playing this video (in milliseconds). | int64 | 138333346 | 150 | | complete_play_cnt | The number of complete plays. *complete play*: finishing playing the whole video, i.e., `#(play_duration >= video_duration)`. | int64 | 3446 | 151 | | complete_play_user_num | The number of users who perform the *complete play*. | int64 | 2033 | 152 | | valid_play_cnt | The number of valid plays. *valid play*: `play_duration >= video_duration if video_duration <= 7s`, or `play_duration > 7s if video_duration > 7s`. | int64 | 5099 | 153 | | valid_play_user_num | The number of users who perform the *valid play*. | int64 | 3195 | 154 | | long_time_play_cnt | The number of long time plays. *long time play*: `play_duration >= video_duration if video_duration <= 18s`, or `play_duration >= 18s if video_duration > 18s`.
| int64 | 3299 | 155 | | long_time_play_user_num | The number of users who perform the *long time play*. | int64 | 1940 | 156 | | short_time_play_cnt | The number of short time plays. *short time play*: `play_duration < min(3s, video_duration)`. | int64 | 1538 | 157 | | short_time_play_user_num | The number of users who perform the *short time play*. | int64 | 1190 | 158 | | play_progress | The average video playing ratio (`=play_duration/video_duration`) | float64 | 0.579695 | 159 | | comment_stay_duration | Total time spent in the comments section. | int64 | 467865 | 160 | | like_cnt | The total number of likes. | int64 | 659 | 161 | | like_user_num | The number of users who hit the "like" button. | int64 | 657 | 162 | | click_like_cnt | The number of "likes" resulting from a double click. | int64 | 496 | 163 | | double_click_cnt | The number of users who double-click the video. | int64 | 163 | 164 | | cancel_like_cnt | The number of likes that are canceled by users. | int64 | 15 | 165 | | cancel_like_user_num | The number of users who cancel their likes. | int64 | 15 | 166 | | comment_cnt | The number of comments within this day. | int64 | 13 | 167 | | comment_user_num | The number of users who comment on this video. | int64 | 12 | 168 | | direct_comment_cnt | The number of direct comments (depth=1). | int64 | 13 | 169 | | reply_comment_cnt | The number of reply comments (depth>1). | int64 | 0 | 170 | | delete_comment_cnt | The number of deleted comments. | int64 | 0 | 171 | | delete_comment_user_num | The number of users who delete their comments. | int64 | 0 | 172 | | comment_like_cnt | The number of comment likes. | int64 | 2 | 173 | | comment_like_user_num | The number of users who like the comments. | int64 | 2 | 174 | | follow_cnt | The number of increased follows from this video. | int64 | 151 | 175 | | follow_user_num | The number of users who follow the author of this video due to this video. | int64 | 151 | 176 | | cancel_follow_cnt | The number of decreased follows from this video.
| int64 | 0 | 177 | | cancel_follow_user_num | The number of users who cancel their following of the author of this video due to this video. | int64 | 0 | 178 | | share_cnt | The times of sharing this video. | int64 | 1 | 179 | | share_user_num | The number of users who share this video. | int64 | 1 | 180 | | download_cnt | The times of downloading this video. | int64 | 2 | 181 | | download_user_num | The number of users who download this video. | int64 | 2 | 182 | | report_cnt | The times of reporting this video. | int64 | 0 | 183 | | report_user_num | The number of users who report this video. | int64 | 0 | 184 | | reduce_similar_cnt | The times of reducing similar content of this video. | int64 | 2 | 185 | | reduce_similar_user_num | The number of users who choose to reduce similar content of this video. | int64 | 2 | 186 | | collect_cnt | The times of adding this video to favorite videos. | int64 | 0 | 187 | | collect_user_num | The number of users who add this video to their favorite videos. | int64 | 0 | 188 | | cancel_collect_cnt | The times of removing this video from favorite videos. | int64 | 0 | 189 | | cancel_collect_user_num | The number of users who remove this video from their favorite videos | int64 | 0 | 190 | 191 | 192 | #### 5. Descriptions of the fields in `user_features.csv` (Added on 2022.05.16) 193 | 194 | | Field Name: | Description | Type | Example | 195 | | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | ------------- | 196 | | user_id | The ID of the user. | int64 | 0 | 197 | | user_active_degree | In the set of {'high_active', 'full_active', 'middle_active', 'UNKNOWN'}. | str | "high_active" | 198 | | is_lowactive_period | Is this user in its low active period | int64 | 0 | 199 | | is_live_streamer | Is this user a live streamer? 
| int64 | 0 | 200 | | is_video_author | Has this user uploaded any video? | int64 | 0 | 201 | | follow_user_num | The number of users that this user follows. | int64 | 5 | 202 | | follow_user_num_range | The range of the number of users that this user follows. In the set of {'0', '(0,10]', '(10,50]', '(100,150]', '(150,250]', '(250,500]', '(50,100]', '500+'} | str | "(0,10]" | 203 | | fans_user_num | The number of the fans of this user. | int64 | 0 | 204 | | fans_user_num_range | The range of the number of fans of this user. In the set of {'0', '[1,10)', '[10,100)', '[100,1k)', '[1k,5k)', '[5k,1w)', '[1w,10w)'} | str | "0" | 205 | | friend_user_num | The number of friends that this user has. | int64 | 0 | 206 | | friend_user_num_range | The range of the number of friends that this user has. In the set of {'0', '[1,5)', '[5,30)', '[30,60)', '[60,120)', '[120,250)', '250+'} | str | "0" | 207 | | register_days | The days since this user has registered. | int64 | 107 | 208 | | register_days_range | The range of the registered days. In the set of {'15-30', '31-60', '61-90', '91-180', '181-365', '366-730', '730+'}. | str | "61-90" | 209 | | onehot_feat0 | An encrypted feature of the user. Each value indicate the position of "1" in the one-hot vector. Range: {0,1} | int64 | 0 | 210 | | onehot_feat1 | An encrypted feature. Range: {0, 1, ..., 7} | int64 | 1 | 211 | | onehot_feat2 | An encrypted feature. Range: {0, 1, ..., 29} | int64 | 17 | 212 | | onehot_feat3 | An encrypted feature. Range: {0, 1, ..., 1075} | int64 | 638 | 213 | | onehot_feat4 | An encrypted feature. Range: {0, 1, ..., 11} | int64 | 2 | 214 | | onehot_feat5 | An encrypted feature. Range: {0, 1, ..., 9} | int64 | 0 | 215 | | onehot_feat6 | An encrypted feature. Range: {0, 1, 2} | int64 | 1 | 216 | | onehot_feat7 | An encrypted feature. Range: {0, 1, ..., 46} | int64 | 6 | 217 | | onehot_feat8 | An encrypted feature. Range: {0, 1, ..., 339} | int64 | 184 | 218 | | onehot_feat9 | An encrypted feature. 
Range: {0, 1, ..., 6} | int64 | 6 | 219 | | onehot_feat10 | An encrypted feature. Range: {0, 1, ..., 4} | int64 | 3 | 220 | | onehot_feat11 | An encrypted feature. Range: {0, 1, ..., 2} | int64 | 0 | 221 | | onehot_feat12 | An encrypted feature. Range: {0, 1} | int64 | 0 | 222 | | onehot_feat13 | An encrypted feature. Range: {0, 1} | int64 | 0 | 223 | | onehot_feat14 | An encrypted feature. Range: {0, 1} | int64 | 0 | 224 | | onehot_feat15 | An encrypted feature. Range: {0, 1} | int64 | 0 | 225 | | onehot_feat16 | An encrypted feature. Range: {0, 1} | int64 | 0 | 226 | | onehot_feat17 | An encrypted feature. Range: {0, 1} | int64 | 0 | 227 | 228 | 229 | #### 6. Descriptions of the caption and category fields in `kuairec_caption_category.csv` (Added on 2024.06.02) 230 | 231 | 232 | | Field Name: | Description | Type | Example | 233 | | -------------------------- | ------------------------------------------------------ | ----- | ------------------------------------------------------------ | 234 | | video_id | The ID of the video | int64 | 2418 | 235 | | manual_cover_text | Cover text (封面文字) of this video (added by its author) | str | "被小可爱发现了" | 236 | | caption | Caption, i.e., the description title (简介标题) of this video (added by its author) | str | "这是什么狗狗,这么可爱真的可以这么遛吗?#喜欢的双击加关注 #直播 #博美俊介 #萌宠驾到" | 237 | | topic_tag | Tags of the topics of this video (added by its author) | str | "[博美俊介,喜欢的双击加关注,直播,萌宠驾到]" | 238 | | first_level_category_id | First-level category ID | int64 | 17 | 239 | | first_level_category_name | First-level category name | str | "宠物" | 240 | | second_level_category_id | Second-level category ID | int64 | 233 | 241 | | second_level_category_name | Second-level category name | str | "宠物日常记录" | 242 | | third_level_category_id | Third-level category ID | int64 | 1169 | 243 | | third_level_category_name | Third-level category name | str | "宠物狗" | --------------------------------------------------------------------------------