├── README.md
└── notebook
    ├── baseline.ipynb
    └── edge GCN.ipynb


/README.md:
--------------------------------------------------------------------------------
# CIKM
### The code is in the notebook folder
1. The neural network in baseline.ipynb is buggy; do not use it
2. edge GCN.ipynb contains both a model that does node embedding with the vanilla GCN, and my hand-modified GCN that brings edge embeddings into the convolution
--------------------------------------------------------------------------------
/notebook/baseline.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  { "cell_type": "code", "execution_count": 1, "metadata": {},
    "outputs": [
     { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 6.3 s\n" ] }
    ],
    "source": [
     "%%time\n",
     "import pandas as pd\n",
     "import os\n",
     "import numpy as np\n",
     "import torch\n",
     "import torch_geometric\n",
     "from sklearn.preprocessing import LabelEncoder\n",
     "import gc\n",
     "from torch_geometric.data import Data\n",
     "\n",
     "# item_feature = pd.read_csv('../data/item_feature.csv')\n",
     "# test = pd.read_csv('../data/test.csv')\n",
     "# user_feature = pd.read_csv('../data/user_feature.csv')\n",
     "train = pd.read_pickle('../data/sml_train.pkl')"
    ] },
  { "cell_type": "code", "execution_count": 2, "metadata": {},
    "outputs": [
     { "data": {
\n", 42 | "\n", 55 | "\n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | "
user_iditem_idbehavior_typedate
01.732029e+093.193364e+08clk2019-06-19
21.732029e+091.197152e+09clk2019-06-19
31.732029e+091.145630e+09clk2019-06-19
51.732029e+091.162473e+09clk2019-06-19
71.732029e+091.128524e+09clk2019-06-19
\n", 103 | "
" 104 | ], 105 | "text/plain": [ 106 | " user_id item_id behavior_type date\n", 107 | "0 1.732029e+09 3.193364e+08 clk 2019-06-19\n", 108 | "2 1.732029e+09 1.197152e+09 clk 2019-06-19\n", 109 | "3 1.732029e+09 1.145630e+09 clk 2019-06-19\n", 110 | "5 1.732029e+09 1.162473e+09 clk 2019-06-19\n", 111 | "7 1.732029e+09 1.128524e+09 clk 2019-06-19" 112 | ] 113 | }, 114 | "execution_count": 2, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "train.head()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "### baseline思路(时间序列思路,未解决推荐问题和冷启动问题)\n", 128 | "1. 根据历史clk记录预测未来可能重复clk的user-item pair\n", 129 | "2. 历史clk记录可以用user-item二部图来表示,user和item作为节点,其中的边作为clk记录\n", 130 | "3. feature为user_embedding和item_embedding,通过concat两者的embedding后接MLP得到user-item pair的预测值\n", 131 | "4. label为历史clk边中重复clk的边 \n", 132 | "\n", 133 | "---\n", 134 | "由于内存不足,事先在服务器筛了训练数据中18-20号的数据,用18-19号的clk预测20号会重复clk的。 " 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 3, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "Wall time: 54.3 s\n" 147 | ] 148 | }, 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "42" 153 | ] 154 | }, 155 | "execution_count": 3, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "%%time\n", 162 | "train = train[train.behavior_type=='clk']\n", 163 | "his = train[train.date<'2019-06-20'].drop_duplicates(subset=['user_id','item_id'])\n", 164 | "now = train[train.date>='2019-06-20']\n", 165 | "del train\n", 166 | "train = his[['user_id','item_id']].merge(now[['user_id','item_id','behavior_type']],how='left')\n", 167 | "del his,now\n", 168 | "gc.collect()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "## build model" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 4, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Wall time: 9.81 s\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "%%time\n", 193 | "train = pd.concat([train[train.behavior_type.isnull()==False],train[train.behavior_type.isnull()==True].sample(3000000)],axis=0)\n", 194 | "# 显存不足,下采样\n", 195 | "train['behavior_type'] = train['behavior_type'].fillna(0)\n", 196 | "train['behavior_type'] = train['behavior_type'].map({'clk':1})\n", 197 | "\n", 198 | "u_enc,i_enc = LabelEncoder().fit(train['user_id']),LabelEncoder().fit(train['item_id'])\n", 199 | "train['user_id'] = u_enc.transform(train['user_id'])\n", 200 | "train['item_id'] = i_enc.transform(train['item_id'])+u_enc.classes_.shape[0]\n", 201 | "\n", 202 | "edge_index = torch.tensor([train['user_id'].values,train['item_id'].values])\n", 203 | "u = torch.tensor(train['user_id'].unique().reshape(-1,1))\n", 204 | "i = torch.tensor(train['item_id'].unique().reshape(-1,1))\n", 205 | "y = torch.tensor(train['behavior_type'].fillna(0).values,dtype=torch.long)\n", 206 | "data = Data(u=u,i=i,edge_index=edge_index,y=y)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", 216 | "data = data.to(device)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 59, 222 | "metadata": {}, 223 | "outputs": [ 
  { "cell_type": "code", "execution_count": 59, "metadata": {},
    "outputs": [
     { "name": "stdout", "output_type": "stream", "text": [ "positive label ratio:  0.2816764522287305\n" ] }
    ],
    "source": [ "print('positive label ratio: ',data.y.sum().item()/data.y.shape[0])" ] },
  { "cell_type": "code", "execution_count": 60, "metadata": {},
    "outputs": [
     { "name": "stderr", "output_type": "stream",
       "text": [
        "D:\\anaconda\\envs\\torch_env\\lib\\site-packages\\torch_geometric\\data\\data.py:191: UserWarning: The number of nodes in your data object can only be inferred by its edge indices, and hence may result in unexpected batch-wise behavior, e.g., in case there exists isolated nodes. Please consider explicitly setting the number of nodes for this data object by assigning it to data.num_nodes.\n",
        "  warnings.warn(__num_nodes_warn_msg__.format('edge'))\n"
       ] },
     { "data": { "text/plain": [ "False" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" }
    ],
    "source": [ "data.contains_isolated_nodes()" ] },
  { "cell_type": "code", "execution_count": 69, "metadata": {},
    "outputs": [
     { "name": "stderr", "output_type": "stream",
       "text": [
        "D:\\anaconda\\envs\\torch_env\\lib\\site-packages\\torch_geometric\\data\\data.py:191: UserWarning: The number of nodes in your data object can only be inferred by its edge indices, and hence may result in unexpected batch-wise behavior, e.g., in case there exists isolated nodes. Please consider explicitly setting the number of nodes for this data object by assigning it to data.num_nodes.\n",
        "  warnings.warn(__num_nodes_warn_msg__.format('edge'))\n"
       ] },
     { "data": { "text/plain": [ "2243363" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" }
    ],
    "source": [ "data.num_nodes" ] },
  { "cell_type": "code", "execution_count": 68, "metadata": {},
    "outputs": [ { "data": { "text/plain": [ "4176391" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ],
    "source": [ "data.num_edges" ] },
  { "cell_type": "code", "execution_count": 66, "metadata": {},
    "outputs": [ { "data": { "text/plain": [ "torch.Size([776543, 1])" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ],
    "source": [ "data.u.shape # number of users" ] },
  { "cell_type": "code", "execution_count": 67, "metadata": {},
    "outputs": [ { "data": { "text/plain": [ "torch.Size([1466820, 1])" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ],
    "source": [ "data.i.shape # number of items" ] },
  { "cell_type": "code", "execution_count": 70, "metadata": {},
    "outputs": [ { "data": { "text/plain": [ "3.6665601930725125e-06" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ],
    "source": [
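     "# edge density of the bipartite graph: num_edges / (num_users * num_items)\n",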
"data.num_edges/(data.u.shape[0]*data.i.shape[0]) # very sparse" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 6, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "from torch_geometric.nn import GCNConv\n", 379 | "import torch.nn.functional as F\n", 380 | "\n", 381 | "\n", 382 | "# hyper param\n", 383 | "EMB_DIM = 10\n", 384 | "\n", 385 | "class Net(torch.nn.Module):\n", 386 | " def __init__(self):\n", 387 | " super(Net, self).__init__()\n", 388 | " self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)\n", 389 | " self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)\n", 390 | " self.conv1 = GCNConv(EMB_DIM*2,EMB_DIM)\n", 391 | " self.conv2 = GCNConv(EMB_DIM,8)\n", 392 | " self.lin = torch.nn.Linear(8,2)\n", 393 | " \n", 394 | " def forward(self, data):\n", 395 | " u,i,edge_index = data.u,data.i,data.edge_index\n", 396 | " \n", 397 | " emb_u = self.u_emb(u[edge_index[0]]).view(-1,EMB_DIM)\n", 398 | " emb_i = self.i_emb(i[(edge_index[1]-u_enc.classes_.shape[0])]-u_enc.classes_.shape[0]).view(-1,EMB_DIM)\n", 399 | " \n", 400 | " x = torch.cat([emb_u,emb_i],dim=1)\n", 401 | " x = self.conv1(x,edge_index)\n", 402 | " x = F.relu(x)\n", 403 | " x = F.dropout(x,training=self.training)\n", 404 | " x = self.conv2(x,edge_index)\n", 405 | " x = F.relu(x)\n", 406 | " x = F.dropout(x,training=self.training)\n", 407 | " \n", 408 | " x = self.lin(x)\n", 409 | " return F.log_softmax(x,dim=1)\n", 410 | " \n" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 8, 416 | "metadata": { 417 | "scrolled": true 418 | }, 419 | "outputs": [], 420 | "source": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 58, 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "name": "stdout", 429 | "output_type": "stream", 430 | "text": [ 431 | "tensor(0.7049, device='cuda:0', grad_fn=)\n", 432 | "0.6750931605781164\n", 433 | "tensor(0.7053, device='cuda:0', grad_fn=)\n", 434 | "0.674996905222715\n", 435 | "tensor(0.7048, device='cuda:0', grad_fn=)\n", 436 | "0.6750962733134901\n", 437 | "tensor(0.7049, device='cuda:0', grad_fn=)\n", 438 | "0.6753460104669319\n", 439 | "tensor(0.7050, device='cuda:0', grad_fn=)\n", 440 | "0.6751274006672268\n", 441 | "tensor(0.7052, device='cuda:0', grad_fn=)\n", 442 | "0.6751913314629785\n", 443 | "tensor(0.7050, device='cuda:0', grad_fn=)\n", 444 | "0.6752758542004329\n", 445 | "tensor(0.7051, device='cuda:0', grad_fn=)\n", 446 | "0.675027314252904\n", 447 | "tensor(0.7051, device='cuda:0', grad_fn=)\n", 448 | "0.6748395923657531\n", 449 | "tensor(0.7052, device='cuda:0', grad_fn=)\n", 450 | "0.6751312317261482\n" 451 | ] 452 | } 453 | ], 454 | "source": [ 455 | "model = Net().to(device)\n", 456 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\n", 457 | "weight = torch.tensor([1,1.075],dtype=torch.float).to(device)\n", 458 | "\n", 459 | "model.train()\n", 460 | "for epoch in range(50):\n", 461 | " optimizer.zero_grad()\n", 462 | " out = model(data)\n", 463 | " loss = F.nll_loss(out[],data.y,weight=weight)\n", 464 | " loss.backward()\n", 465 | " optimizer.step\n", 466 | " if epoch%5==0:\n", 467 | " print(loss)\n", 468 | " _,pred = model(data).max(dim=1) \n", 469 | " print(pred.eq(data.y).sum().item()/data.y.shape[0])" 470 | ] 471 | }, 472 | { 473 | "cell_type": "markdown", 474 | "metadata": { 475 | "scrolled": true 476 | }, 477 | "source": [ 478 | "### evalutate" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 55, 484 | "metadata": {}, 485 | 
"outputs": [ 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "accuracy: 0.6830433261636661\n", 491 | "recall: 0.149021881330272\n", 492 | "precision: 0.3520523736846333\n", 493 | "f1: 0.2094041213580665\n" 494 | ] 495 | } 496 | ], 497 | "source": [ 498 | "from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score\n", 499 | "train['pred'] = pred.to('cpu').numpy()\n", 500 | "print('accuracy: ',accuracy_score(train['behavior_type'].fillna(0),train['pred']))\n", 501 | "print('recall: ',recall_score(train['behavior_type'].fillna(0), train['pred']))\n", 502 | "print('precision: ',precision_score(train['behavior_type'].fillna(0), train['pred']))\n", 503 | "print('f1: ',f1_score(train['behavior_type'].fillna(0), train['pred']))" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 56, 509 | "metadata": {}, 510 | "outputs": [ 511 | { 512 | "data": { 513 | "text/plain": [ 514 | "0 3678431\n", 515 | "1 497960\n", 516 | "Name: pred, dtype: int64" 517 | ] 518 | }, 519 | "execution_count": 56, 520 | "metadata": {}, 521 | "output_type": "execute_result" 522 | } 523 | ], 524 | "source": [ 525 | "train.pred.value_counts()" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 57, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "data": { 535 | "text/plain": [ 536 | "0.0 3000000\n", 537 | "1.0 1176391\n", 538 | "Name: behavior_type, dtype: int64" 539 | ] 540 | }, 541 | "execution_count": 57, 542 | "metadata": {}, 543 | "output_type": "execute_result" 544 | } 545 | ], 546 | "source": [ 547 | "train.behavior_type.fillna(0).value_counts()" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [] 556 | } 557 | ], 558 | "metadata": { 559 | "kernelspec": { 560 | "display_name": "Python 3", 561 | "language": "python", 562 | "name": "python3" 563 | }, 564 | "language_info": { 565 | "codemirror_mode": { 566 | "name": "ipython", 567 | "version": 3 568 | }, 569 | "file_extension": ".py", 570 | "mimetype": "text/x-python", 571 | "name": "python", 572 | "nbconvert_exporter": "python", 573 | "pygments_lexer": "ipython3", 574 | "version": "3.7.3" 575 | } 576 | }, 577 | "nbformat": 4, 578 | "nbformat_minor": 2 579 | } 580 | -------------------------------------------------------------------------------- /notebook/edge GCN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "C:\\Users\\user\\Anaconda3\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
{
 "cells": [
  { "cell_type": "code", "execution_count": 1, "metadata": {},
    "outputs": [
     { "name": "stderr", "output_type": "stream",
       "text": [
        "C:\\Users\\user\\Anaconda3\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
        "  from ._conv import register_converters as _register_converters\n"
       ] },
     { "name": "stdout", "output_type": "stream", "text": [ "Wall time: 53.9 s\n" ] }
    ],
    "source": [
     "%%time\n",
     "import pandas as pd\n",
     "import os\n",
     "import numpy as np\n",
     "import torch\n",
     "import torch_geometric\n",
     "from sklearn.preprocessing import LabelEncoder\n",
     "import gc\n",
     "from torch_geometric.data import Data\n",
     "\n",
     "# item_feature = pd.read_csv('../data/item_feature.csv')\n",
     "# test = pd.read_csv('../data/test.csv')\n",
     "# user_feature = pd.read_csv('../data/user_feature.csv')\n",
     "train = pd.read_pickle('../data/sml_train.pkl')"
    ] },
  { "cell_type": "code", "execution_count": 2, "metadata": {},
    "outputs": [
     { "data": {
\n", 50 | "\n", 63 | "\n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | "
user_iditem_idbehavior_typedate
01.732029e+093.193364e+08clk2019-06-19
21.732029e+091.197152e+09clk2019-06-19
31.732029e+091.145630e+09clk2019-06-19
51.732029e+091.162473e+09clk2019-06-19
71.732029e+091.128524e+09clk2019-06-19
\n", 111 | "
" 112 | ], 113 | "text/plain": [ 114 | " user_id item_id behavior_type date\n", 115 | "0 1.732029e+09 3.193364e+08 clk 2019-06-19\n", 116 | "2 1.732029e+09 1.197152e+09 clk 2019-06-19\n", 117 | "3 1.732029e+09 1.145630e+09 clk 2019-06-19\n", 118 | "5 1.732029e+09 1.162473e+09 clk 2019-06-19\n", 119 | "7 1.732029e+09 1.128524e+09 clk 2019-06-19" 120 | ] 121 | }, 122 | "execution_count": 2, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "train.head()" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### 加入edge信息的GCN(时间序列思路,未解决推荐问题和冷启动问题)\n", 136 | "1. 根据历史 __所有behavior边__ 预测未来可能在这些边中是clk的概率\n", 137 | "2. __behavior__ 用 __user-item__ 二部图来表示, __user__ 和 __item__ 作为节点,边属性为 __behavior_type__\n", 138 | "3. 更新每个user/item feature时,concat 1-hop node embedding和edge_embedding,然后求均值作为新的user/item feature\n", 139 | "4. 最后concat __user_embedding__ , __item_embedding__ 作为user-item pair 的feature, 该user-item pair在未来是否发生clk作为label" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 3, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "Wall time: 1min 29s\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "%%time\n", 157 | "his = train[train.date<'2019-06-20'].drop_duplicates(subset=['user_id','item_id','behavior_type'])\n", 158 | "now = train[(train.date>='2019-06-20')&(train.behavior_type=='clk')].drop_duplicates(subset=['user_id','item_id'])\n", 159 | "del train\n", 160 | "now.rename(columns={'behavior_type':'label'},inplace=True)\n", 161 | "train = his[['user_id','item_id','behavior_type']].merge(now[['user_id','item_id','label']],how='left')\n", 162 | "del his,now\n", 163 | "gc.collect()\n", 164 | "train['label'] = train['label'].map({'clk':1})\n", 165 | "train['label'] = train['label'].fillna(0) " 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### build model" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 5, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "Wall time: 18.8 s\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "%%time\n", 190 | "train = pd.concat([train[train.label==1],train[train.label==0].sample(2000000)],axis=0)\n", 191 | "# 显存不足,下采样\n", 192 | "\n", 193 | "u_enc,i_enc,e_enc = LabelEncoder().fit(train['user_id']),LabelEncoder().fit(train['item_id']),LabelEncoder().fit(train['behavior_type'])\n", 194 | "train['user_id'] = u_enc.transform(train['user_id'])\n", 195 | "train['item_id'] = i_enc.transform(train['item_id'])\n", 196 | "train['behavior_type'] = e_enc.transform(train['behavior_type'])" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 11, 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "0.860387" 208 | ] 209 | }, 210 | "execution_count": 11, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "# clk rate\n", 217 | "train.label.sum()/(train.shape[0]-train.label.sum())" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "#### no edge GCN \n", 225 | "不利用边属性信息,只用节点的embedding进行GCN" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 6, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "edge_index = 
  { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [],
    "source": [
     "edge_index = torch.tensor([train['user_id'].values,train['item_id'].values+1+train['user_id'].max()])\n",
     "edge_attr = torch.tensor(train.behavior_type.values, dtype=torch.long)\n",
     "u = torch.tensor(train['user_id'].unique().reshape(-1,1))\n",
     "i = torch.tensor(train['item_id'].unique().reshape(-1,1))\n",
     "e = torch.tensor(train['behavior_type'].unique().reshape(-1,1))\n",
     "y = torch.tensor(train.label.values,dtype=torch.long)\n",
     "data = Data(u=u,i=i,e=e,edge_index=edge_index,edge_attr=edge_attr,y=y)"
    ] },
  { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [],
    "source": [
     "from torch_geometric.nn import GCNConv,MessagePassing\n",
     "import torch.nn.functional as F\n",
     "\n",
     "# hyper param\n",
     "EMB_DIM = 10\n",
     "\n",
     "class noedge_GCN(torch.nn.Module):\n",
     "    def __init__(self):\n",
     "        super(noedge_GCN, self).__init__()\n",
     "        self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)\n",
     "        self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)\n",
     "        self.conv1 = GCNConv(EMB_DIM,6)\n",
     "        self.conv2 = GCNConv(6,4)\n",
     "        self.lin = torch.nn.Linear(8,2)\n",
     "\n",
     "    def forward(self, data):\n",
     "        u,i,e,edge_index,edge_attr = data.u,data.i,data.e,data.edge_index,data.edge_attr\n",
     "\n",
     "        # one row per node: users first, then items (matches the id offset above)\n",
     "        emb_u = self.u_emb(u).view(-1,EMB_DIM)\n",
     "        emb_i = self.i_emb(i).view(-1,EMB_DIM)\n",
     "        x = torch.cat([emb_u, emb_i],dim=0)\n",
     "        x = self.conv1(x,edge_index)\n",
     "        x = F.relu(x)\n",
     "        x = F.dropout(x,training=self.training)\n",
     "        x = self.conv2(x,edge_index)\n",
     "        x = F.relu(x)\n",
     "        x = F.dropout(x,training=self.training)\n",
     "        # per-edge pair feature: concat the two endpoint embeddings\n",
     "        x = torch.cat([x[edge_index[0]],x[edge_index[1]]],dim=1)\n",
     "        x = self.lin(x)\n",
     "        x = F.dropout(x,training=self.training)\n",
     "\n",
     "        return F.log_softmax(x,dim=1)\n",
     "\n",
     "\n",
     "model = noedge_GCN()"
    ] },
  { "cell_type": "code", "execution_count": 12, "metadata": {},
    "outputs": [
     { "name": "stdout", "output_type": "stream",
       "text": [
        "epoch_0 loss: 0.8651493787765503\n",
        "epoch_1 loss: 0.7528172135353088\n",
        "epoch_2 loss: 0.7403311133384705\n",
        "epoch_3 loss: 0.7339281439781189\n",
        "epoch_4 loss: 0.7297651171684265\n",
        "epoch_5 loss: 0.7220962643623352\n",
        "epoch_6 loss: 0.7202253341674805\n",
        "epoch_7 loss: 0.7201514840126038\n",
        "epoch_8 loss: 0.7203397750854492\n",
        "epoch_9 loss: 0.721700131893158\n",
        "Wall time: 3min 35s\n"
       ] }
    ],
    "source": [
     "%%time\n",
     "model = noedge_GCN()\n",
     "optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\n",
     "weight = torch.tensor([0.860387,1],dtype=torch.float) # the label weights are a hyperparameter and need tuning!!!\n",
     "\n",
     "model.train()\n",
     "for epoch in range(10):\n",
     "    optimizer.zero_grad()\n",
     "    out = model(data)\n",
     "    loss = F.nll_loss(out, data.y,weight=weight)\n",
     "    loss.backward()\n",
     "    optimizer.step()\n",
     "    if epoch%1==0:\n",
     "        model.eval()  # note: never switched back to train(), so dropout stays off after epoch 0\n",
     "        print('epoch_{} loss: {}'.format(epoch,loss.item()))"
    ] },
  { "cell_type": "code", "execution_count": 14, "metadata": {},
    "outputs": [
     { "name": "stdout", "output_type": "stream",
       "text": [
        "accuracy: 0.46687463414870134\n",
        "recall: 0.9499719312355951\n",
        "precision: 0.4627905982579322\n",
        "f1: 0.6223807175044113\n"
       ] }
    ],
    "source": [
     "from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score\n",
     "model.eval()\n",
     "_,pred = model(data).max(dim=1)\n",
     "train['pred'] = pred.numpy()\n",
     "print('accuracy: ',accuracy_score(train['label'],train['pred']))\n",
     "print('recall: ',recall_score(train['label'], train['pred']))\n",
     "print('precision: ',precision_score(train['label'], train['pred']))\n",
     "print('f1: ',f1_score(train['label'], train['pred']))"
    ] },
  { "cell_type": "markdown", "metadata": {},
    "source": [
     "#### edge GCN\n",
     "Run the GCN with both node_embeddings and edge_embeddings, so that each node's final embedding fuses the node_embeddings and edge_embeddings of its neighbourhood. \n",
     "A hand-modified GCN"
    ] },
  { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [],
    "source": [
     "edge_index = torch.tensor([train['user_id'].values,train['item_id'].values])\n",
     "edge_attr = torch.tensor(train.behavior_type.values, dtype=torch.long)\n",
     "u = torch.tensor(train['user_id'].unique().reshape(-1,1))\n",
     "i = torch.tensor(train['item_id'].unique().reshape(-1,1))\n",
     "e = torch.tensor(train['behavior_type'].unique().reshape(-1,1))\n",
     "y = torch.tensor(train.label.values,dtype=torch.long)\n",
     "data = Data(u=u,i=i,e=e,edge_index=edge_index,edge_attr=edge_attr,y=y)"
    ] },
  { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [],
    "source": [
     "import torch\n",
     "from torch_geometric.nn import MessagePassing\n",
     "from torch_geometric.utils import add_self_loops, degree\n",
     "\n",
     "class edgeGCN(MessagePassing):\n",
     "\n",
     "    def __init__(self, in_channels, out_channels, flow, aggr='add', **kwargs):\n",
     "        super(edgeGCN, self).__init__(aggr=aggr,flow=flow)\n",
     "        self.lin_u = torch.nn.Linear(in_channels, out_channels)\n",
     "        self.lin_i = torch.nn.Linear(in_channels, out_channels)\n",
     "        self.lin_e = torch.nn.Linear(in_channels, out_channels)\n",
     "        self.lin_aggr = torch.nn.Linear(out_channels*2, out_channels)\n",
     "        self.flow = flow\n",
     "\n",
     "    def forward(self,u,i,e,edge_index,edge_type):\n",
     "\n",
     "        # linear transformation of the user, item and edge embeddings\n",
     "        u = self.lin_u(u)\n",
     "        u = F.relu(u)\n",
     "        u = F.dropout(u)\n",
     "        i = self.lin_i(i)\n",
     "        i = F.relu(i)\n",
     "        i = F.dropout(i)\n",
     "        e = self.lin_e(e)\n",
     "        e = F.relu(e)\n",
     "        e = F.dropout(e)\n",
     "\n",
     "        return self.propagate(x=(u,i),e=e,edge_index=edge_index,edge_type=edge_type,size=(u.size(0), i.size(0)))\n",
     "\n",
     "    def message(self,x_j, x_i, e, edge_index, edge_type,size):\n",
     "\n",
     "        # x_i is the user_embedding\n",
     "        # x_j is the item_embedding\n",
     "\n",
     "        # symmetric normalization from the node degrees (normalized-Laplacian weights)\n",
     "        row,col = edge_index\n",
     "        deg_i = degree(row, size[0], dtype=x_i.dtype)\n",
     "        deg_j = degree(col, size[1], dtype=x_j.dtype)\n",
     "        deg_inv_sqrt_i = deg_i.pow(-0.5)\n",
     "        deg_inv_sqrt_j = deg_j.pow(-0.5)\n",
     "        norm = deg_inv_sqrt_i[row]*deg_inv_sqrt_j[col]\n",
     "\n",
     "        # concat the neighbour node embeddings with the edge embeddings\n",
     "        if self.flow == 'target_to_source':\n",
     "            emb = torch.cat([x_j, e[edge_type]], dim=1)\n",
     "        else:\n",
     "            emb = torch.cat([x_i, e[edge_type]], dim=1)\n",
     "        return norm.view(-1,1)*emb\n",
     "\n",
     "    def update(self, aggr_out):\n",
     "        return self.lin_aggr(aggr_out)"
    ] },
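  { "cell_type": "markdown", "metadata": {},
    "source": [
     "A quick toy check (a made-up 2x2 bipartite graph) of the symmetric normalization computed in `edgeGCN.message()` above: each edge (u, i) gets the weight deg(u)^-0.5 * deg(i)^-0.5."
    ] },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
    "source": [
     "import torch\n",
     "from torch_geometric.utils import degree\n",
     "\n",
     "edge_index = torch.tensor([[0, 0, 1],\n",
     "                           [0, 1, 1]])  # 2 users x 2 items, 3 edges\n",
     "row, col = edge_index\n",
     "deg_u = degree(row, 2, dtype=torch.float)  # user degrees: [2., 1.]\n",
     "deg_i = degree(col, 2, dtype=torch.float)  # item degrees: [1., 2.]\n",
     "print(deg_u.pow(-0.5)[row] * deg_i.pow(-0.5)[col])  # tensor([0.7071, 0.5000, 0.7071])"
    ] },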
  { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [],
    "source": [
     "# hyper param\n",
     "EMB_DIM = 10\n",
     "\n",
     "class Net(torch.nn.Module):\n",
     "    def __init__(self):\n",
     "        super(Net, self).__init__()\n",
     "        self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)\n",
     "        self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)\n",
     "        self.e_emb = torch.nn.Embedding(len(e),EMB_DIM)\n",
     "        self.e_lin_1 = torch.nn.Linear(EMB_DIM,6)\n",
     "        self.u_gcn_1 = edgeGCN(EMB_DIM,6,flow='target_to_source')\n",
     "        self.i_gcn_1 = edgeGCN(EMB_DIM,6,flow='source_to_target')\n",
     "        self.u_gcn_2 = edgeGCN(6,2,flow='target_to_source')\n",
     "        self.i_gcn_2 = edgeGCN(6,2,flow='source_to_target')\n",
     "        self.lin = torch.nn.Linear(4,2)\n",
     "\n",
     "    def forward(self, data):\n",
     "        u,i,e,edge_index,edge_attr = data.u,data.i,data.e,data.edge_index,data.edge_attr\n",
     "\n",
     "        u_emb = self.u_emb(u).view(-1,EMB_DIM)\n",
     "        i_emb = self.i_emb(i).view(-1,EMB_DIM)\n",
     "        e_emb = self.e_emb(e).view(-1,EMB_DIM)\n",
     "\n",
     "        x_u,x_i = self.u_gcn_1(u=u_emb, i=i_emb, e=e_emb, edge_index=edge_index, edge_type = edge_attr),\\\n",
     "                  self.i_gcn_1(u=u_emb, i=i_emb, e=e_emb, edge_index=edge_index, edge_type = edge_attr)\n",
     "        e_emb = self.e_lin_1(e_emb)\n",
     "        x_u,x_i = self.u_gcn_2(u=x_u, i=x_i, e=e_emb, edge_index=edge_index, edge_type = edge_attr),\\\n",
     "                  self.i_gcn_2(u=x_u, i=x_i, e=e_emb, edge_index=edge_index, edge_type = edge_attr)\n",
     "\n",
     "        x = torch.cat([x_u[edge_index[0]],x_i[edge_index[1]]],dim=1)\n",
     "        x = self.lin(x)\n",
     "        x = F.dropout(x,training=self.training)\n",
     "\n",
     "        return F.log_softmax(x,dim=1)"
    ] },
  { "cell_type": "code", "execution_count": 73, "metadata": {},
    "outputs": [
     { "name": "stdout", "output_type": "stream",
       "text": [
        "epoch_0 loss: 0.8134878873825073\n",
        "epoch_1 loss: 0.7332140207290649\n",
        "epoch_2 loss: 0.7305666208267212\n",
        "epoch_3 loss: 0.7313019633293152\n",
        "Wall time: 1min 36s\n"
       ] }
    ],
    "source": [
     "%%time\n",
     "model = Net()\n",
     "optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\n",
     "weight = torch.tensor([0.860387,1],dtype=torch.float) # the label weights are a hyperparameter and need tuning!!!\n",
     "\n",
     "model.train()\n",
     "\n",
     "# early-stopping state\n",
     "patience = 0\n",
     "best_loss = 1\n",
     "\n",
     "for epoch in range(10):\n",
     "    optimizer.zero_grad()\n",
     "    out = model(data)\n",
     "    loss = F.nll_loss(out, data.y,weight=weight)\n",
     "\n",
     "    if epoch%1==0:\n",
     "        model.eval()  # note: never switched back to train(), so dropout stays off after epoch 0\n",
     "        print('epoch_{} loss: {}'.format(epoch,loss.item()))\n",
     "\n",
     "    # stop once the loss fails to improve twice in a row\n",
     "    if loss>best_loss:\n",
     "        patience += 1\n",
     "        if patience == 2:\n",
     "            break\n",
     "    else:\n",
     "        patience = 0\n",
     "        best_loss = loss\n",
     "\n",
     "    loss.backward()\n",
     "    optimizer.step()"
    ] },
"metadata": {}, 534 | "outputs": [ 535 | { 536 | "name": "stdout", 537 | "output_type": "stream", 538 | "text": [ 539 | "accuracy: 0.5050347051446823\n", 540 | "recall: 0.5282611197054349\n", 541 | "precision: 0.468827943036212\n", 542 | "f1: 0.4967732239615924\n" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score\n", 548 | "model.eval()\n", 549 | "_,pred = model(data).max(dim=1)\n", 550 | "train['pred'] = pred.numpy()\n", 551 | "print('accuracy: ',accuracy_score(train['label'],train['pred']))\n", 552 | "print('recall: ',recall_score(train['label'], train['pred']))\n", 553 | "print('precision: ',precision_score(train['label'], train['pred']))\n", 554 | "print('f1: ',f1_score(train['label'], train['pred']))" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": null, 574 | "metadata": {}, 575 | "outputs": [], 576 | "source": [] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [] 640 | } 641 | ], 642 | "metadata": { 643 | "kernelspec": { 644 | "display_name": "Python 3", 645 | "language": "python", 646 | "name": "python3" 647 | }, 648 | "language_info": { 649 | "codemirror_mode": { 650 | "name": "ipython", 651 | "version": 3 652 | }, 653 | "file_extension": ".py", 654 | "mimetype": "text/x-python", 655 | "name": "python", 656 | "nbconvert_exporter": "python", 657 | "pygments_lexer": "ipython3", 658 | "version": "3.6.5" 659 | } 660 | }, 661 | "nbformat": 4, 662 | "nbformat_minor": 2 663 | } 664 | --------------------------------------------------------------------------------