├── .ipynb_checkpoints
│   └── baseline-checkpoint.ipynb
├── README.md
└── notebook
    ├── baseline.ipynb
    └── edge GCN.ipynb
/README.md:
--------------------------------------------------------------------------------
# CIKM
### The code is in the notebook folder
1. The neural network in baseline is buggy; do not use it
2. edge GCN includes a model that uses the vanilla GCN for node embeddings, and my customized GCN model that adds edge embeddings
--------------------------------------------------------------------------------
/notebook/baseline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "Wall time: 6.3 s\n"
13 | ]
14 | }
15 | ],
16 | "source": [
17 | "%%time\n",
18 | "import pandas as pd\n",
19 | "import os\n",
20 | "import numpy as np\n",
21 | "import torch\n",
22 | "import torch_geometric\n",
23 | "from sklearn.preprocessing import LabelEncoder\n",
24 | "import gc\n",
25 | "from torch_geometric.data import Data\n",
26 | "\n",
27 | "# item_feature = pd.read_csv('../data/item_feature.csv')\n",
28 | "# test = pd.read_csv('../data/test.csv')\n",
29 | "# user_feature = pd.read_csv('../data/user_feature.csv')\n",
30 | "train = pd.read_pickle('../data/sml_train.pkl')"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/html": [
41 | "\n",
42 | "\n",
55 | "
\n",
56 | " \n",
57 | " \n",
58 | " | \n",
59 | " user_id | \n",
60 | " item_id | \n",
61 | " behavior_type | \n",
62 | " date | \n",
63 | "
\n",
64 | " \n",
65 | " \n",
66 | " \n",
67 | " 0 | \n",
68 | " 1.732029e+09 | \n",
69 | " 3.193364e+08 | \n",
70 | " clk | \n",
71 | " 2019-06-19 | \n",
72 | "
\n",
73 | " \n",
74 | " 2 | \n",
75 | " 1.732029e+09 | \n",
76 | " 1.197152e+09 | \n",
77 | " clk | \n",
78 | " 2019-06-19 | \n",
79 | "
\n",
80 | " \n",
81 | " 3 | \n",
82 | " 1.732029e+09 | \n",
83 | " 1.145630e+09 | \n",
84 | " clk | \n",
85 | " 2019-06-19 | \n",
86 | "
\n",
87 | " \n",
88 | " 5 | \n",
89 | " 1.732029e+09 | \n",
90 | " 1.162473e+09 | \n",
91 | " clk | \n",
92 | " 2019-06-19 | \n",
93 | "
\n",
94 | " \n",
95 | " 7 | \n",
96 | " 1.732029e+09 | \n",
97 | " 1.128524e+09 | \n",
98 | " clk | \n",
99 | " 2019-06-19 | \n",
100 | "
\n",
101 | " \n",
102 | "
\n",
103 | "
"
104 | ],
105 | "text/plain": [
106 | " user_id item_id behavior_type date\n",
107 | "0 1.732029e+09 3.193364e+08 clk 2019-06-19\n",
108 | "2 1.732029e+09 1.197152e+09 clk 2019-06-19\n",
109 | "3 1.732029e+09 1.145630e+09 clk 2019-06-19\n",
110 | "5 1.732029e+09 1.162473e+09 clk 2019-06-19\n",
111 | "7 1.732029e+09 1.128524e+09 clk 2019-06-19"
112 | ]
113 | },
114 | "execution_count": 2,
115 | "metadata": {},
116 | "output_type": "execute_result"
117 | }
118 | ],
119 | "source": [
120 | "train.head()"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "### baseline思路(时间序列思路,未解决推荐问题和冷启动问题)\n",
128 | "1. 根据历史clk记录预测未来可能重复clk的user-item pair\n",
129 | "2. 历史clk记录可以用user-item二部图来表示,user和item作为节点,其中的边作为clk记录\n",
130 | "3. feature为user_embedding和item_embedding,通过concat两者的embedding后接MLP得到user-item pair的预测值\n",
131 | "4. label为历史clk边中重复clk的边 \n",
132 | "\n",
133 | "---\n",
134 | "由于内存不足,事先在服务器筛了训练数据中18-20号的数据,用18-19号的clk预测20号会重复clk的。 "
135 | ]
136 | },
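{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of step 2 above, on hypothetical toy ids (not the competition data): users and items share one node index space, item node ids are offset by the user count, and each historical clk becomes one column of `edge_index`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy illustration (hypothetical ids): 2 users and 3 items in one\n",
"# node index space; item node ids are offset by the number of users.\n",
"import torch\n",
"\n",
"toy_num_users = 2\n",
"toy_users = torch.tensor([0, 0, 1])                  # user end of each clk edge\n",
"toy_items = torch.tensor([0, 2, 1]) + toy_num_users  # item end, with offset ids\n",
"toy_edge_index = torch.stack([toy_users, toy_items]) # shape [2, num_edges]\n",
"print(toy_edge_index)\n",
"# tensor([[0, 0, 1],\n",
"#         [2, 4, 3]])"
]
},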
137 | {
138 | "cell_type": "code",
139 | "execution_count": 3,
140 | "metadata": {},
141 | "outputs": [
142 | {
143 | "name": "stdout",
144 | "output_type": "stream",
145 | "text": [
146 | "Wall time: 54.3 s\n"
147 | ]
148 | },
149 | {
150 | "data": {
151 | "text/plain": [
152 | "42"
153 | ]
154 | },
155 | "execution_count": 3,
156 | "metadata": {},
157 | "output_type": "execute_result"
158 | }
159 | ],
160 | "source": [
161 | "%%time\n",
162 | "train = train[train.behavior_type=='clk']\n",
163 | "his = train[train.date<'2019-06-20'].drop_duplicates(subset=['user_id','item_id'])\n",
164 | "now = train[train.date>='2019-06-20']\n",
165 | "del train\n",
166 | "train = his[['user_id','item_id']].merge(now[['user_id','item_id','behavior_type']],how='left')\n",
167 | "del his,now\n",
168 | "gc.collect()"
169 | ]
170 | },
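{
"cell_type": "markdown",
"metadata": {},
"source": [
"A toy example (hypothetical rows, not the competition data) of the labeling logic above: the left merge keeps every historical pair, and pairs that are not clicked again come back with NaN behavior_type, which is later filled as label 0."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical mini-frames showing what the left merge produces:\n",
"import pandas as pd\n",
"\n",
"his_toy = pd.DataFrame({'user_id': [1, 1, 2], 'item_id': [10, 11, 10]})\n",
"now_toy = pd.DataFrame({'user_id': [1], 'item_id': [10], 'behavior_type': ['clk']})\n",
"print(his_toy.merge(now_toy, how='left'))\n",
"#    user_id  item_id behavior_type\n",
"# 0        1       10           clk\n",
"# 1        1       11           NaN\n",
"# 2        2       10           NaN"
]
},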
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "## build model"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 4,
181 | "metadata": {},
182 | "outputs": [
183 | {
184 | "name": "stdout",
185 | "output_type": "stream",
186 | "text": [
187 | "Wall time: 9.81 s\n"
188 | ]
189 | }
190 | ],
191 | "source": [
192 | "%%time\n",
193 | "train = pd.concat([train[train.behavior_type.isnull()==False],train[train.behavior_type.isnull()==True].sample(3000000)],axis=0)\n",
194 | "# 显存不足,下采样\n",
195 | "train['behavior_type'] = train['behavior_type'].fillna(0)\n",
196 | "train['behavior_type'] = train['behavior_type'].map({'clk':1})\n",
197 | "\n",
198 | "u_enc,i_enc = LabelEncoder().fit(train['user_id']),LabelEncoder().fit(train['item_id'])\n",
199 | "train['user_id'] = u_enc.transform(train['user_id'])\n",
200 | "train['item_id'] = i_enc.transform(train['item_id'])+u_enc.classes_.shape[0]\n",
201 | "\n",
202 | "edge_index = torch.tensor([train['user_id'].values,train['item_id'].values])\n",
203 | "u = torch.tensor(train['user_id'].unique().reshape(-1,1))\n",
204 | "i = torch.tensor(train['item_id'].unique().reshape(-1,1))\n",
205 | "y = torch.tensor(train['behavior_type'].fillna(0).values,dtype=torch.long)\n",
206 | "data = Data(u=u,i=i,edge_index=edge_index,y=y)"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
216 | "data = data.to(device)"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 59,
222 | "metadata": {},
223 | "outputs": [
224 | {
225 | "name": "stdout",
226 | "output_type": "stream",
227 | "text": [
228 | "positive label ratio: 0.2816764522287305\n"
229 | ]
230 | }
231 | ],
232 | "source": [
233 | "print('positive label ratio: ',data.y.sum().item()/data.y.shape[0])"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 60,
239 | "metadata": {},
240 | "outputs": [
241 | {
242 | "name": "stderr",
243 | "output_type": "stream",
244 | "text": [
245 | "D:\\anaconda\\envs\\torch_env\\lib\\site-packages\\torch_geometric\\data\\data.py:191: UserWarning: The number of nodes in your data object can only be inferred by its edge indices, and hence may result in unexpected batch-wise behavior, e.g., in case there exists isolated nodes. Please consider explicitly setting the number of nodes for this data object by assigning it to data.num_nodes.\n",
246 | " warnings.warn(__num_nodes_warn_msg__.format('edge'))\n"
247 | ]
248 | },
249 | {
250 | "data": {
251 | "text/plain": [
252 | "False"
253 | ]
254 | },
255 | "execution_count": 60,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | }
259 | ],
260 | "source": [
261 | "data.contains_isolated_nodes()"
262 | ]
263 | },
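{
"cell_type": "markdown",
"metadata": {},
"source": [
"The warning above can be silenced by setting the node count explicitly, as it suggests; a one-line sketch, assuming every node id appears in `data.u` or `data.i`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set the node count explicitly to silence the PyG warning\n",
"# (assumes all node ids appear in data.u or data.i).\n",
"data.num_nodes = data.u.size(0) + data.i.size(0)"
]
},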
264 | {
265 | "cell_type": "code",
266 | "execution_count": 69,
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "name": "stderr",
271 | "output_type": "stream",
272 | "text": [
273 | "D:\\anaconda\\envs\\torch_env\\lib\\site-packages\\torch_geometric\\data\\data.py:191: UserWarning: The number of nodes in your data object can only be inferred by its edge indices, and hence may result in unexpected batch-wise behavior, e.g., in case there exists isolated nodes. Please consider explicitly setting the number of nodes for this data object by assigning it to data.num_nodes.\n",
274 | " warnings.warn(__num_nodes_warn_msg__.format('edge'))\n"
275 | ]
276 | },
277 | {
278 | "data": {
279 | "text/plain": [
280 | "2243363"
281 | ]
282 | },
283 | "execution_count": 69,
284 | "metadata": {},
285 | "output_type": "execute_result"
286 | }
287 | ],
288 | "source": [
289 | "data.num_nodes"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 68,
295 | "metadata": {},
296 | "outputs": [
297 | {
298 | "data": {
299 | "text/plain": [
300 | "4176391"
301 | ]
302 | },
303 | "execution_count": 68,
304 | "metadata": {},
305 | "output_type": "execute_result"
306 | }
307 | ],
308 | "source": [
309 | "data.num_edges"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 66,
315 | "metadata": {},
316 | "outputs": [
317 | {
318 | "data": {
319 | "text/plain": [
320 | "torch.Size([776543, 1])"
321 | ]
322 | },
323 | "execution_count": 66,
324 | "metadata": {},
325 | "output_type": "execute_result"
326 | }
327 | ],
328 | "source": [
329 | "data.u.shape # number of user"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 67,
335 | "metadata": {},
336 | "outputs": [
337 | {
338 | "data": {
339 | "text/plain": [
340 | "torch.Size([1466820, 1])"
341 | ]
342 | },
343 | "execution_count": 67,
344 | "metadata": {},
345 | "output_type": "execute_result"
346 | }
347 | ],
348 | "source": [
349 | "data.i.shape # number of item"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 70,
355 | "metadata": {},
356 | "outputs": [
357 | {
358 | "data": {
359 | "text/plain": [
360 | "3.6665601930725125e-06"
361 | ]
362 | },
363 | "execution_count": 70,
364 | "metadata": {},
365 | "output_type": "execute_result"
366 | }
367 | ],
368 | "source": [
369 | "data.num_edges/(data.u.shape[0]*data.i.shape[0]) # very sparse"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": 6,
375 | "metadata": {},
376 | "outputs": [],
377 | "source": [
378 | "from torch_geometric.nn import GCNConv\n",
379 | "import torch.nn.functional as F\n",
380 | "\n",
381 | "\n",
382 | "# hyper param\n",
383 | "EMB_DIM = 10\n",
384 | "\n",
385 | "class Net(torch.nn.Module):\n",
386 | " def __init__(self):\n",
387 | " super(Net, self).__init__()\n",
388 | " self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)\n",
389 | " self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)\n",
390 | " self.conv1 = GCNConv(EMB_DIM*2,EMB_DIM)\n",
391 | " self.conv2 = GCNConv(EMB_DIM,8)\n",
392 | " self.lin = torch.nn.Linear(8,2)\n",
393 | " \n",
394 | " def forward(self, data):\n",
395 | " u,i,edge_index = data.u,data.i,data.edge_index\n",
396 | " \n",
397 | " emb_u = self.u_emb(u[edge_index[0]]).view(-1,EMB_DIM)\n",
398 | " emb_i = self.i_emb(i[(edge_index[1]-u_enc.classes_.shape[0])]-u_enc.classes_.shape[0]).view(-1,EMB_DIM)\n",
399 | " \n",
400 | " x = torch.cat([emb_u,emb_i],dim=1)\n",
401 | " x = self.conv1(x,edge_index)\n",
402 | " x = F.relu(x)\n",
403 | " x = F.dropout(x,training=self.training)\n",
404 | " x = self.conv2(x,edge_index)\n",
405 | " x = F.relu(x)\n",
406 | " x = F.dropout(x,training=self.training)\n",
407 | " \n",
408 | " x = self.lin(x)\n",
409 | " return F.log_softmax(x,dim=1)\n",
410 | " \n"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 8,
416 | "metadata": {
417 | "scrolled": true
418 | },
419 | "outputs": [],
420 | "source": []
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": 58,
425 | "metadata": {},
426 | "outputs": [
427 | {
428 | "name": "stdout",
429 | "output_type": "stream",
430 | "text": [
431 | "tensor(0.7049, device='cuda:0', grad_fn=)\n",
432 | "0.6750931605781164\n",
433 | "tensor(0.7053, device='cuda:0', grad_fn=)\n",
434 | "0.674996905222715\n",
435 | "tensor(0.7048, device='cuda:0', grad_fn=)\n",
436 | "0.6750962733134901\n",
437 | "tensor(0.7049, device='cuda:0', grad_fn=)\n",
438 | "0.6753460104669319\n",
439 | "tensor(0.7050, device='cuda:0', grad_fn=)\n",
440 | "0.6751274006672268\n",
441 | "tensor(0.7052, device='cuda:0', grad_fn=)\n",
442 | "0.6751913314629785\n",
443 | "tensor(0.7050, device='cuda:0', grad_fn=)\n",
444 | "0.6752758542004329\n",
445 | "tensor(0.7051, device='cuda:0', grad_fn=)\n",
446 | "0.675027314252904\n",
447 | "tensor(0.7051, device='cuda:0', grad_fn=)\n",
448 | "0.6748395923657531\n",
449 | "tensor(0.7052, device='cuda:0', grad_fn=)\n",
450 | "0.6751312317261482\n"
451 | ]
452 | }
453 | ],
454 | "source": [
455 | "model = Net().to(device)\n",
456 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\n",
457 | "weight = torch.tensor([1,1.075],dtype=torch.float).to(device)\n",
458 | "\n",
459 | "model.train()\n",
460 | "for epoch in range(50):\n",
461 | " optimizer.zero_grad()\n",
462 | " out = model(data)\n",
463 | " loss = F.nll_loss(out[],data.y,weight=weight)\n",
464 | " loss.backward()\n",
465 | " optimizer.step\n",
466 | " if epoch%5==0:\n",
467 | " print(loss)\n",
468 | " _,pred = model(data).max(dim=1) \n",
469 | " print(pred.eq(data.y).sum().item()/data.y.shape[0])"
470 | ]
471 | },
472 | {
473 | "cell_type": "markdown",
474 | "metadata": {
475 | "scrolled": true
476 | },
477 | "source": [
478 | "### evalutate"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 55,
484 | "metadata": {},
485 | "outputs": [
486 | {
487 | "name": "stdout",
488 | "output_type": "stream",
489 | "text": [
490 | "accuracy: 0.6830433261636661\n",
491 | "recall: 0.149021881330272\n",
492 | "precision: 0.3520523736846333\n",
493 | "f1: 0.2094041213580665\n"
494 | ]
495 | }
496 | ],
497 | "source": [
498 | "from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score\n",
499 | "train['pred'] = pred.to('cpu').numpy()\n",
500 | "print('accuracy: ',accuracy_score(train['behavior_type'].fillna(0),train['pred']))\n",
501 | "print('recall: ',recall_score(train['behavior_type'].fillna(0), train['pred']))\n",
502 | "print('precision: ',precision_score(train['behavior_type'].fillna(0), train['pred']))\n",
503 | "print('f1: ',f1_score(train['behavior_type'].fillna(0), train['pred']))"
504 | ]
505 | },
506 | {
507 | "cell_type": "code",
508 | "execution_count": 56,
509 | "metadata": {},
510 | "outputs": [
511 | {
512 | "data": {
513 | "text/plain": [
514 | "0 3678431\n",
515 | "1 497960\n",
516 | "Name: pred, dtype: int64"
517 | ]
518 | },
519 | "execution_count": 56,
520 | "metadata": {},
521 | "output_type": "execute_result"
522 | }
523 | ],
524 | "source": [
525 | "train.pred.value_counts()"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": 57,
531 | "metadata": {},
532 | "outputs": [
533 | {
534 | "data": {
535 | "text/plain": [
536 | "0.0 3000000\n",
537 | "1.0 1176391\n",
538 | "Name: behavior_type, dtype: int64"
539 | ]
540 | },
541 | "execution_count": 57,
542 | "metadata": {},
543 | "output_type": "execute_result"
544 | }
545 | ],
546 | "source": [
547 | "train.behavior_type.fillna(0).value_counts()"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": null,
553 | "metadata": {},
554 | "outputs": [],
555 | "source": []
556 | }
557 | ],
558 | "metadata": {
559 | "kernelspec": {
560 | "display_name": "Python 3",
561 | "language": "python",
562 | "name": "python3"
563 | },
564 | "language_info": {
565 | "codemirror_mode": {
566 | "name": "ipython",
567 | "version": 3
568 | },
569 | "file_extension": ".py",
570 | "mimetype": "text/x-python",
571 | "name": "python",
572 | "nbconvert_exporter": "python",
573 | "pygments_lexer": "ipython3",
574 | "version": "3.7.3"
575 | }
576 | },
577 | "nbformat": 4,
578 | "nbformat_minor": 2
579 | }
580 |
--------------------------------------------------------------------------------
/notebook/edge GCN.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "C:\\Users\\user\\Anaconda3\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
13 | " from ._conv import register_converters as _register_converters\n"
14 | ]
15 | },
16 | {
17 | "name": "stdout",
18 | "output_type": "stream",
19 | "text": [
20 | "Wall time: 53.9 s\n"
21 | ]
22 | }
23 | ],
24 | "source": [
25 | "%%time\n",
26 | "import pandas as pd\n",
27 | "import os\n",
28 | "import numpy as np\n",
29 | "import torch\n",
30 | "import torch_geometric\n",
31 | "from sklearn.preprocessing import LabelEncoder\n",
32 | "import gc\n",
33 | "from torch_geometric.data import Data\n",
34 | "\n",
35 | "# item_feature = pd.read_csv('../data/item_feature.csv')\n",
36 | "# test = pd.read_csv('../data/test.csv')\n",
37 | "# user_feature = pd.read_csv('../data/user_feature.csv')\n",
38 | "train = pd.read_pickle('../data/sml_train.pkl')"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 2,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/html": [
49 | "\n",
50 | "\n",
63 | "
\n",
64 | " \n",
65 | " \n",
66 | " | \n",
67 | " user_id | \n",
68 | " item_id | \n",
69 | " behavior_type | \n",
70 | " date | \n",
71 | "
\n",
72 | " \n",
73 | " \n",
74 | " \n",
75 | " 0 | \n",
76 | " 1.732029e+09 | \n",
77 | " 3.193364e+08 | \n",
78 | " clk | \n",
79 | " 2019-06-19 | \n",
80 | "
\n",
81 | " \n",
82 | " 2 | \n",
83 | " 1.732029e+09 | \n",
84 | " 1.197152e+09 | \n",
85 | " clk | \n",
86 | " 2019-06-19 | \n",
87 | "
\n",
88 | " \n",
89 | " 3 | \n",
90 | " 1.732029e+09 | \n",
91 | " 1.145630e+09 | \n",
92 | " clk | \n",
93 | " 2019-06-19 | \n",
94 | "
\n",
95 | " \n",
96 | " 5 | \n",
97 | " 1.732029e+09 | \n",
98 | " 1.162473e+09 | \n",
99 | " clk | \n",
100 | " 2019-06-19 | \n",
101 | "
\n",
102 | " \n",
103 | " 7 | \n",
104 | " 1.732029e+09 | \n",
105 | " 1.128524e+09 | \n",
106 | " clk | \n",
107 | " 2019-06-19 | \n",
108 | "
\n",
109 | " \n",
110 | "
\n",
111 | "
"
112 | ],
113 | "text/plain": [
114 | " user_id item_id behavior_type date\n",
115 | "0 1.732029e+09 3.193364e+08 clk 2019-06-19\n",
116 | "2 1.732029e+09 1.197152e+09 clk 2019-06-19\n",
117 | "3 1.732029e+09 1.145630e+09 clk 2019-06-19\n",
118 | "5 1.732029e+09 1.162473e+09 clk 2019-06-19\n",
119 | "7 1.732029e+09 1.128524e+09 clk 2019-06-19"
120 | ]
121 | },
122 | "execution_count": 2,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "train.head()"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### 加入edge信息的GCN(时间序列思路,未解决推荐问题和冷启动问题)\n",
136 | "1. 根据历史 __所有behavior边__ 预测未来可能在这些边中是clk的概率\n",
137 | "2. __behavior__ 用 __user-item__ 二部图来表示, __user__ 和 __item__ 作为节点,边属性为 __behavior_type__\n",
138 | "3. 更新每个user/item feature时,concat 1-hop node embedding和edge_embedding,然后求均值作为新的user/item feature\n",
139 | "4. 最后concat __user_embedding__ , __item_embedding__ 作为user-item pair 的feature, 该user-item pair在未来是否发生clk作为label"
140 | ]
141 | },
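{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of step 3 above, on hypothetical toy tensors: for one user, concat each 1-hop item embedding with the matching edge embedding, then average the messages to get the new user feature."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy sketch (hypothetical tensors) of the concat-then-mean update:\n",
"import torch\n",
"\n",
"toy_item_emb = torch.randn(3, 4)   # 3 neighboring items, dim 4\n",
"toy_edge_emb = torch.randn(3, 4)   # their behavior_type embeddings\n",
"toy_msgs = torch.cat([toy_item_emb, toy_edge_emb], dim=1)  # [3, 8]\n",
"toy_new_user = toy_msgs.mean(dim=0)                        # [8]\n",
"print(toy_new_user.shape)  # torch.Size([8])"
]
},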
142 | {
143 | "cell_type": "code",
144 | "execution_count": 3,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "Wall time: 1min 29s\n"
152 | ]
153 | }
154 | ],
155 | "source": [
156 | "%%time\n",
157 | "his = train[train.date<'2019-06-20'].drop_duplicates(subset=['user_id','item_id','behavior_type'])\n",
158 | "now = train[(train.date>='2019-06-20')&(train.behavior_type=='clk')].drop_duplicates(subset=['user_id','item_id'])\n",
159 | "del train\n",
160 | "now.rename(columns={'behavior_type':'label'},inplace=True)\n",
161 | "train = his[['user_id','item_id','behavior_type']].merge(now[['user_id','item_id','label']],how='left')\n",
162 | "del his,now\n",
163 | "gc.collect()\n",
164 | "train['label'] = train['label'].map({'clk':1})\n",
165 | "train['label'] = train['label'].fillna(0) "
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "### build model"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 5,
178 | "metadata": {},
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "Wall time: 18.8 s\n"
185 | ]
186 | }
187 | ],
188 | "source": [
189 | "%%time\n",
190 | "train = pd.concat([train[train.label==1],train[train.label==0].sample(2000000)],axis=0)\n",
191 | "# 显存不足,下采样\n",
192 | "\n",
193 | "u_enc,i_enc,e_enc = LabelEncoder().fit(train['user_id']),LabelEncoder().fit(train['item_id']),LabelEncoder().fit(train['behavior_type'])\n",
194 | "train['user_id'] = u_enc.transform(train['user_id'])\n",
195 | "train['item_id'] = i_enc.transform(train['item_id'])\n",
196 | "train['behavior_type'] = e_enc.transform(train['behavior_type'])"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 11,
202 | "metadata": {},
203 | "outputs": [
204 | {
205 | "data": {
206 | "text/plain": [
207 | "0.860387"
208 | ]
209 | },
210 | "execution_count": 11,
211 | "metadata": {},
212 | "output_type": "execute_result"
213 | }
214 | ],
215 | "source": [
216 | "# clk rate\n",
217 | "train.label.sum()/(train.shape[0]-train.label.sum())"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "#### no edge GCN \n",
225 | "不利用边属性信息,只用节点的embedding进行GCN"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 6,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "edge_index = torch.tensor([train['user_id'].values,train['item_id'].values+1+train['user_id'].max()])\n",
235 | "edge_attr = torch.tensor(train.behavior_type.values, dtype=torch.long)\n",
236 | "u = torch.tensor(train['user_id'].unique().reshape(-1,1))\n",
237 | "i = torch.tensor(train['item_id'].unique().reshape(-1,1))\n",
238 | "e = torch.tensor(train['behavior_type'].unique().reshape(-1,1))\n",
239 | "y = torch.tensor(train.label.values,dtype=torch.long)\n",
240 | "data = Data(u=u,i=i,e=e,edge_index=edge_index,edge_attr=edge_attr,y=y)"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 7,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "from torch_geometric.nn import GCNConv,MessagePassing\n",
250 | "import torch.nn.functional as F\n",
251 | "\n",
252 | "# hyper param\n",
253 | "EMB_DIM = 10\n",
254 | "\n",
255 | "class noedge_GCN(torch.nn.Module):\n",
256 | " def __init__(self):\n",
257 | " super(noedge_GCN, self).__init__()\n",
258 | " self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)\n",
259 | " self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)\n",
260 | " self.conv1 = GCNConv(EMB_DIM,6)\n",
261 | " self.conv2 = GCNConv(6,4)\n",
262 | " self.lin = torch.nn.Linear(8,2)\n",
263 | " \n",
264 | " def forward(self, data):\n",
265 | " u,i,e,edge_index,edge_attr = data.u,data.i,data.e,data.edge_index,data.edge_attr\n",
266 | " \n",
267 | " emb_u = self.u_emb(u).view(-1,EMB_DIM)\n",
268 | " emb_i = self.i_emb(i).view(-1,EMB_DIM)\n",
269 | " x = torch.cat([emb_u, emb_i],dim=0)\n",
270 | " x = self.conv1(x,edge_index)\n",
271 | " x = F.relu(x)\n",
272 | " x = F.dropout(x,training=self.training)\n",
273 | " x = self.conv2(x,edge_index)\n",
274 | " x = F.relu(x)\n",
275 | " x = F.dropout(x,training=self.training)\n",
276 | " x = torch.cat([x[edge_index[0]],x[edge_index[1]]],dim=1)\n",
277 | " x = self.lin(x)\n",
278 | " x = F.dropout(x,training=self.training)\n",
279 | " \n",
280 | " return F.log_softmax(x,dim=1)\n",
281 | " \n",
282 | "\n",
283 | "model = noedge_GCN()"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 12,
289 | "metadata": {},
290 | "outputs": [
291 | {
292 | "name": "stdout",
293 | "output_type": "stream",
294 | "text": [
295 | "epoch_0 loss: 0.8651493787765503\n",
296 | "epoch_1 loss: 0.7528172135353088\n",
297 | "epoch_2 loss: 0.7403311133384705\n",
298 | "epoch_3 loss: 0.7339281439781189\n",
299 | "epoch_4 loss: 0.7297651171684265\n",
300 | "epoch_5 loss: 0.7220962643623352\n",
301 | "epoch_6 loss: 0.7202253341674805\n",
302 | "epoch_7 loss: 0.7201514840126038\n",
303 | "epoch_8 loss: 0.7203397750854492\n",
304 | "epoch_9 loss: 0.721700131893158\n",
305 | "Wall time: 3min 35s\n"
306 | ]
307 | }
308 | ],
309 | "source": [
310 | "%%time\n",
311 | "model = noedge_GCN()\n",
312 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\n",
313 | "weight = torch.tensor([0.860387,1],dtype=torch.float) # label的weight属于超参,需要调参!!!\n",
314 | "\n",
315 | "model.train()\n",
316 | "for epoch in range(10):\n",
317 | " optimizer.zero_grad()\n",
318 | " out = model(data)\n",
319 | " loss = F.nll_loss(out, data.y,weight=weight)\n",
320 | " loss.backward()\n",
321 | " optimizer.step()\n",
322 | " if epoch%1==0:\n",
323 | " model.eval()\n",
324 | " print('epoch_{} loss: {}'.format(epoch,loss.item()))"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 14,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "name": "stdout",
334 | "output_type": "stream",
335 | "text": [
336 | "accuracy: 0.46687463414870134\n",
337 | "recall: 0.9499719312355951\n",
338 | "precision: 0.4627905982579322\n",
339 | "f1: 0.6223807175044113\n"
340 | ]
341 | }
342 | ],
343 | "source": [
344 | "from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score\n",
345 | "model.eval()\n",
346 | "_,pred = model(data).max(dim=1)\n",
347 | "train['pred'] = pred.numpy()\n",
348 | "print('accuracy: ',accuracy_score(train['label'],train['pred']))\n",
349 | "print('recall: ',recall_score(train['label'], train['pred']))\n",
350 | "print('precision: ',precision_score(train['label'], train['pred']))\n",
351 | "print('f1: ',f1_score(train['label'], train['pred']))"
352 | ]
353 | },
354 | {
355 | "cell_type": "markdown",
356 | "metadata": {},
357 | "source": [
358 | "#### edge GCN\n",
359 | "同时利用node_embedding和edge_embedding做GCN,最终每个节点的embedding是融合了该节点领域的node_embedding和edge_embedding \n",
360 | "魔改GCN"
361 | ]
362 | },
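{
"cell_type": "markdown",
"metadata": {},
"source": [
"A toy sketch (hypothetical 2-user/2-item graph) of the symmetric degree normalization that the `message` function below computes with `torch_geometric.utils.degree`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Per-edge symmetric normalization: norm = deg(u)^(-1/2) * deg(i)^(-1/2)\n",
"import torch\n",
"from torch_geometric.utils import degree\n",
"\n",
"toy_edge_index = torch.tensor([[0, 0, 1], [0, 1, 1]])  # 2 users, 2 items\n",
"row, col = toy_edge_index\n",
"deg_u = degree(row, 2)  # user degrees: tensor([2., 1.])\n",
"deg_i = degree(col, 2)  # item degrees: tensor([1., 2.])\n",
"toy_norm = deg_u.pow(-0.5)[row] * deg_i.pow(-0.5)[col]\n",
"print(toy_norm)  # tensor([0.7071, 0.5000, 0.7071])"
]
},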
363 | {
364 | "cell_type": "code",
365 | "execution_count": 18,
366 | "metadata": {},
367 | "outputs": [],
368 | "source": [
369 | "edge_index = torch.tensor([train['user_id'].values,train['item_id'].values])\n",
370 | "edge_attr = torch.tensor(train.behavior_type.values, dtype=torch.long)\n",
371 | "u = torch.tensor(train['user_id'].unique().reshape(-1,1))\n",
372 | "i = torch.tensor(train['item_id'].unique().reshape(-1,1))\n",
373 | "e = torch.tensor(train['behavior_type'].unique().reshape(-1,1))\n",
374 | "y = torch.tensor(train.label.values,dtype=torch.long)\n",
375 | "data = Data(u=u,i=i,e=e,edge_index=edge_index,edge_attr=edge_attr,y=y)"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 32,
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "import torch\n",
385 | "from torch_geometric.nn import MessagePassing\n",
386 | "from torch_geometric.utils import add_self_loops, degree\n",
387 | "\n",
388 | "class edgeGCN(MessagePassing):\n",
389 | " \n",
390 | " def __init__(self, in_channels, out_channels, flow, aggr='add', **kwargs):\n",
391 | " super(edgeGCN, self).__init__(aggr=aggr,flow=flow)\n",
392 | " self.lin_u = torch.nn.Linear(in_channels, out_channels)\n",
393 | " self.lin_i = torch.nn.Linear(in_channels, out_channels)\n",
394 | " self.lin_e = torch.nn.Linear(in_channels, out_channels)\n",
395 | " self.lin_aggr = torch.nn.Linear(out_channels*2, out_channels)\n",
396 | " self.flow = flow\n",
397 | " \n",
398 | " def forward(self,u,i,e,edge_index,edge_type):\n",
399 | " \n",
400 | " # linear transformation\n",
401 | " u = self.lin_u(u)\n",
402 | " u = F.relu(u)\n",
403 | " u = F.dropout(u)\n",
404 | " i = self.lin_i(i)\n",
405 | " i = F.relu(i)\n",
406 | " i = F.dropout(i)\n",
407 | " e = self.lin_e(e)\n",
408 | " e = F.relu(e)\n",
409 | " e = F.dropout(e) \n",
410 | " \n",
411 | " return self.propagate(x=(u,i),e=e,edge_index=edge_index,edge_type=edge_type,size=(u.size(0), i.size(0)))\n",
412 | " \n",
413 | " def message(self,x_j, x_i, e, edge_index, edge_type,size):\n",
414 | " \n",
415 | " # x_i is user_embedding\n",
416 | " # x_j is item_embedding\n",
417 | " \n",
418 | " # get normalized laplacian\n",
419 | " row,col = edge_index\n",
420 | " deg_i = degree(row, size[0], dtype=x_i.dtype)\n",
421 | " deg_j = degree(col, size[1], dtype=x_j.dtype)\n",
422 | " deg_inv_sqrt_i = deg_i.pow(-0.5)\n",
423 | " deg_inv_sqrt_j = deg_j.pow(-0.5)\n",
424 | " norm = deg_inv_sqrt_i[row]*deg_inv_sqrt_j[col]\n",
425 | " \n",
426 | " # concat neighbor nodes embedding and edge embedding\n",
427 | " if self.flow == 'target_to_source':\n",
428 | " emb = torch.cat([x_j, e[edge_type]], dim=1)\n",
429 | " else:\n",
430 | " emb = torch.cat([x_i, e[edge_type]], dim=1)\n",
431 | " return norm.view(-1,1)*emb\n",
432 | " \n",
433 | " def update(self, aggr_out):\n",
434 | " return self.lin_aggr(aggr_out)"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 69,
440 | "metadata": {},
441 | "outputs": [],
442 | "source": [
443 | "# hyper param\n",
444 | "EMB_DIM = 10\n",
445 | "\n",
446 | "class Net(torch.nn.Module):\n",
447 | " def __init__(self):\n",
448 | " super(Net, self).__init__()\n",
449 | " self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)\n",
450 | " self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)\n",
451 | " self.e_emb = torch.nn.Embedding(len(e),EMB_DIM)\n",
452 | " self.e_lin_1 = torch.nn.Linear(EMB_DIM,6)\n",
453 | " self.u_gcn_1 = edgeGCN(EMB_DIM,6,flow='target_to_source')\n",
454 | " self.i_gcn_1 = edgeGCN(EMB_DIM,6,flow='source_to_target')\n",
455 | " self.u_gcn_2 = edgeGCN(6,2,flow='target_to_source')\n",
456 | " self.i_gcn_2 = edgeGCN(6,2,flow='source_to_target')\n",
457 | " self.lin = torch.nn.Linear(4,2)\n",
458 | " \n",
459 | " def forward(self, data):\n",
460 | " u,i,e,edge_index,edge_attr = data.u,data.i,data.e,data.edge_index,data.edge_attr\n",
461 | " \n",
462 | " u_emb = self.u_emb(u).view(-1,EMB_DIM)\n",
463 | " i_emb = self.i_emb(i).view(-1,EMB_DIM)\n",
464 | " e_emb = self.e_emb(e).view(-1,EMB_DIM)\n",
465 | " \n",
466 | " x_u,x_i = self.u_gcn_1(u=u_emb, i=i_emb, e=e_emb, edge_index=edge_index, edge_type = edge_attr),\\\n",
467 | " self.i_gcn_1(u=u_emb, i=i_emb, e=e_emb, edge_index=edge_index, edge_type = edge_attr)\n",
468 | " e_emb = self.e_lin_1(e_emb)\n",
469 | " x_u,x_i = self.u_gcn_2(u=x_u, i=x_i, e=e_emb, edge_index=edge_index, edge_type = edge_attr),\\\n",
470 | " self.i_gcn_2(u=x_u, i=x_i, e=e_emb, edge_index=edge_index, edge_type = edge_attr)\n",
471 | "\n",
472 | " \n",
473 | " x = torch.cat([x_u[edge_index[0]],x_i[edge_index[1]]],dim=1)\n",
474 | " x = self.lin(x)\n",
475 | " x = F.dropout(x,training=self.training)\n",
476 | " \n",
477 | " return F.log_softmax(x,dim=1)"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": 73,
483 | "metadata": {},
484 | "outputs": [
485 | {
486 | "name": "stdout",
487 | "output_type": "stream",
488 | "text": [
489 | "epoch_0 loss: 0.8134878873825073\n",
490 | "epoch_1 loss: 0.7332140207290649\n",
491 | "epoch_2 loss: 0.7305666208267212\n",
492 | "epoch_3 loss: 0.7313019633293152\n",
493 | "Wall time: 1min 36s\n"
494 | ]
495 | }
496 | ],
497 | "source": [
498 | "%%time\n",
499 | "model = Net()\n",
500 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\n",
501 | "weight = torch.tensor([0.860387,1],dtype=torch.float) # label的weight属于超参,需要调参!!!\n",
502 | "\n",
503 | "model.train()\n",
504 | "\n",
505 | "# train param\n",
506 | "patience = 0\n",
507 | "best_loss = 1\n",
508 | "\n",
509 | "for epoch in range(10):\n",
510 | " optimizer.zero_grad()\n",
511 | " out = model(data)\n",
512 | " loss = F.nll_loss(out, data.y,weight=weight)\n",
513 | " \n",
514 | " if epoch%1==0:\n",
515 | " model.eval()\n",
516 | " print('epoch_{} loss: {}'.format(epoch,loss.item()))\n",
517 | " \n",
518 | " if loss>best_loss:\n",
519 | " patience += 1\n",
520 | " if patience == 2: \n",
521 | " break\n",
522 | " else:\n",
523 | " patience = 0\n",
524 | " best_loss = loss\n",
525 | " \n",
526 | " loss.backward()\n",
527 | " optimizer.step()"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": 74,
533 | "metadata": {},
534 | "outputs": [
535 | {
536 | "name": "stdout",
537 | "output_type": "stream",
538 | "text": [
539 | "accuracy: 0.5050347051446823\n",
540 | "recall: 0.5282611197054349\n",
541 | "precision: 0.468827943036212\n",
542 | "f1: 0.4967732239615924\n"
543 | ]
544 | }
545 | ],
546 | "source": [
547 | "from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score\n",
548 | "model.eval()\n",
549 | "_,pred = model(data).max(dim=1)\n",
550 | "train['pred'] = pred.numpy()\n",
551 | "print('accuracy: ',accuracy_score(train['label'],train['pred']))\n",
552 | "print('recall: ',recall_score(train['label'], train['pred']))\n",
553 | "print('precision: ',precision_score(train['label'], train['pred']))\n",
554 | "print('f1: ',f1_score(train['label'], train['pred']))"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": null,
560 | "metadata": {},
561 | "outputs": [],
562 | "source": []
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": null,
567 | "metadata": {},
568 | "outputs": [],
569 | "source": []
570 | },
571 | {
572 | "cell_type": "code",
573 | "execution_count": null,
574 | "metadata": {},
575 | "outputs": [],
576 | "source": []
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": null,
581 | "metadata": {},
582 | "outputs": [],
583 | "source": []
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": null,
588 | "metadata": {},
589 | "outputs": [],
590 | "source": []
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": null,
595 | "metadata": {},
596 | "outputs": [],
597 | "source": []
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": null,
602 | "metadata": {},
603 | "outputs": [],
604 | "source": []
605 | },
606 | {
607 | "cell_type": "code",
608 | "execution_count": null,
609 | "metadata": {},
610 | "outputs": [],
611 | "source": []
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": null,
616 | "metadata": {},
617 | "outputs": [],
618 | "source": []
619 | },
620 | {
621 | "cell_type": "code",
622 | "execution_count": null,
623 | "metadata": {},
624 | "outputs": [],
625 | "source": []
626 | },
627 | {
628 | "cell_type": "code",
629 | "execution_count": null,
630 | "metadata": {},
631 | "outputs": [],
632 | "source": []
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": null,
637 | "metadata": {},
638 | "outputs": [],
639 | "source": []
640 | }
641 | ],
642 | "metadata": {
643 | "kernelspec": {
644 | "display_name": "Python 3",
645 | "language": "python",
646 | "name": "python3"
647 | },
648 | "language_info": {
649 | "codemirror_mode": {
650 | "name": "ipython",
651 | "version": 3
652 | },
653 | "file_extension": ".py",
654 | "mimetype": "text/x-python",
655 | "name": "python",
656 | "nbconvert_exporter": "python",
657 | "pygments_lexer": "ipython3",
658 | "version": "3.6.5"
659 | }
660 | },
661 | "nbformat": 4,
662 | "nbformat_minor": 2
663 | }
664 |
--------------------------------------------------------------------------------