├── .ipynb_checkpoints
│   └── baseline-checkpoint.ipynb
├── README.md
└── notebook
    ├── baseline.ipynb
    └── edge GCN.ipynb
/README.md:
--------------------------------------------------------------------------------
# CIKM
### The code is in the notebook folder
1. The neural network in baseline is buggy; do not use it
2. edge GCN includes a model that uses the vanilla GCN for node embeddings, and my customized GCN model that adds edge embeddings
--------------------------------------------------------------------------------
/notebook/baseline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "Wall time: 6.3 s\n"
13 | ]
14 | }
15 | ],
16 | "source": [
17 | "%%time\n",
18 | "import pandas as pd\n",
19 | "import os\n",
20 | "import numpy as np\n",
21 | "import torch\n",
22 | "import torch_geometric\n",
23 | "from sklearn.preprocessing import LabelEncoder\n",
24 | "import gc\n",
25 | "from torch_geometric.data import Data\n",
26 | "\n",
27 | "# item_feature = pd.read_csv('../data/item_feature.csv')\n",
28 | "# test = pd.read_csv('../data/test.csv')\n",
29 | "# user_feature = pd.read_csv('../data/user_feature.csv')\n",
30 | "train = pd.read_pickle('../data/sml_train.pkl')"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 2,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "data": {
40 | "text/html": [
41 | "\n",
42 | "\n",
55 | "
\n",
56 | " \n",
57 | " \n",
58 | " | \n",
59 | " user_id | \n",
60 | " item_id | \n",
61 | " behavior_type | \n",
62 | " date | \n",
63 | "
\n",
64 | " \n",
65 | " \n",
66 | " \n",
67 | " 0 | \n",
68 | " 1.732029e+09 | \n",
69 | " 3.193364e+08 | \n",
70 | " clk | \n",
71 | " 2019-06-19 | \n",
72 | "
\n",
73 | " \n",
74 | " 2 | \n",
75 | " 1.732029e+09 | \n",
76 | " 1.197152e+09 | \n",
77 | " clk | \n",
78 | " 2019-06-19 | \n",
79 | "
\n",
80 | " \n",
81 | " 3 | \n",
82 | " 1.732029e+09 | \n",
83 | " 1.145630e+09 | \n",
84 | " clk | \n",
85 | " 2019-06-19 | \n",
86 | "
\n",
87 | " \n",
88 | " 5 | \n",
89 | " 1.732029e+09 | \n",
90 | " 1.162473e+09 | \n",
91 | " clk | \n",
92 | " 2019-06-19 | \n",
93 | "
\n",
94 | " \n",
95 | " 7 | \n",
96 | " 1.732029e+09 | \n",
97 | " 1.128524e+09 | \n",
98 | " clk | \n",
99 | " 2019-06-19 | \n",
100 | "
\n",
101 | " \n",
102 | "
\n",
103 | "
"
104 | ],
105 | "text/plain": [
106 | " user_id item_id behavior_type date\n",
107 | "0 1.732029e+09 3.193364e+08 clk 2019-06-19\n",
108 | "2 1.732029e+09 1.197152e+09 clk 2019-06-19\n",
109 | "3 1.732029e+09 1.145630e+09 clk 2019-06-19\n",
110 | "5 1.732029e+09 1.162473e+09 clk 2019-06-19\n",
111 | "7 1.732029e+09 1.128524e+09 clk 2019-06-19"
112 | ]
113 | },
114 | "execution_count": 2,
115 | "metadata": {},
116 | "output_type": "execute_result"
117 | }
118 | ],
119 | "source": [
120 | "train.head()"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "### baseline思路(时间序列思路,未解决推荐问题和冷启动问题)\n",
128 | "1. 根据历史clk记录预测未来可能重复clk的user-item pair\n",
129 | "2. 历史clk记录可以用user-item二部图来表示,user和item作为节点,其中的边作为clk记录\n",
130 | "3. feature为user_embedding和item_embedding,通过concat两者的embedding后接MLP得到user-item pair的预测值\n",
131 | "4. label为历史clk边中重复clk的边 \n",
132 | "\n",
133 | "---\n",
134 | "由于内存不足,事先在服务器筛了训练数据中18-20号的数据,用18-19号的clk预测20号会重复clk的。 "
135 | ]
136 | },
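{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of step 2 above, on hypothetical toy ids (not the competition data): users and items share one node index space, item node ids are offset by the user count, and each historical clk becomes one column of `edge_index`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy illustration (hypothetical ids): 2 users and 3 items in one\n",
"# node index space; item node ids are offset by the number of users.\n",
"import torch\n",
"\n",
"toy_num_users = 2\n",
"toy_users = torch.tensor([0, 0, 1])                  # user end of each clk edge\n",
"toy_items = torch.tensor([0, 2, 1]) + toy_num_users  # item end, with offset ids\n",
"toy_edge_index = torch.stack([toy_users, toy_items]) # shape [2, num_edges]\n",
"print(toy_edge_index)\n",
"# tensor([[0, 0, 1],\n",
"#         [2, 4, 3]])"
]
},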
137 | {
138 | "cell_type": "code",
139 | "execution_count": 3,
140 | "metadata": {},
141 | "outputs": [
142 | {
143 | "name": "stdout",
144 | "output_type": "stream",
145 | "text": [
146 | "Wall time: 54.3 s\n"
147 | ]
148 | },
149 | {
150 | "data": {
151 | "text/plain": [
152 | "42"
153 | ]
154 | },
155 | "execution_count": 3,
156 | "metadata": {},
157 | "output_type": "execute_result"
158 | }
159 | ],
160 | "source": [
161 | "%%time\n",
162 | "train = train[train.behavior_type=='clk']\n",
163 | "his = train[train.date<'2019-06-20'].drop_duplicates(subset=['user_id','item_id'])\n",
164 | "now = train[train.date>='2019-06-20']\n",
165 | "del train\n",
166 | "train = his[['user_id','item_id']].merge(now[['user_id','item_id','behavior_type']],how='left')\n",
167 | "del his,now\n",
168 | "gc.collect()"
169 | ]
170 | },
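{
"cell_type": "markdown",
"metadata": {},
"source": [
"A toy example (hypothetical rows, not the competition data) of the labeling logic above: the left merge keeps every historical pair, and pairs that are not clicked again come back with NaN behavior_type, which is later filled as label 0."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical mini-frames showing what the left merge produces:\n",
"import pandas as pd\n",
"\n",
"his_toy = pd.DataFrame({'user_id': [1, 1, 2], 'item_id': [10, 11, 10]})\n",
"now_toy = pd.DataFrame({'user_id': [1], 'item_id': [10], 'behavior_type': ['clk']})\n",
"print(his_toy.merge(now_toy, how='left'))\n",
"#    user_id  item_id behavior_type\n",
"# 0        1       10           clk\n",
"# 1        1       11           NaN\n",
"# 2        2       10           NaN"
]
},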
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "## build model"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 4,
181 | "metadata": {},
182 | "outputs": [
183 | {
184 | "name": "stdout",
185 | "output_type": "stream",
186 | "text": [
187 | "Wall time: 9.81 s\n"
188 | ]
189 | }
190 | ],
191 | "source": [
192 | "%%time\n",
193 | "train = pd.concat([train[train.behavior_type.isnull()==False],train[train.behavior_type.isnull()==True].sample(3000000)],axis=0)\n",
194 | "# 显存不足,下采样\n",
195 | "train['behavior_type'] = train['behavior_type'].fillna(0)\n",
196 | "train['behavior_type'] = train['behavior_type'].map({'clk':1})\n",
197 | "\n",
198 | "u_enc,i_enc = LabelEncoder().fit(train['user_id']),LabelEncoder().fit(train['item_id'])\n",
199 | "train['user_id'] = u_enc.transform(train['user_id'])\n",
200 | "train['item_id'] = i_enc.transform(train['item_id'])+u_enc.classes_.shape[0]\n",
201 | "\n",
202 | "edge_index = torch.tensor([train['user_id'].values,train['item_id'].values])\n",
203 | "u = torch.tensor(train['user_id'].unique().reshape(-1,1))\n",
204 | "i = torch.tensor(train['item_id'].unique().reshape(-1,1))\n",
205 | "y = torch.tensor(train['behavior_type'].fillna(0).values,dtype=torch.long)\n",
206 | "data = Data(u=u,i=i,edge_index=edge_index,y=y)"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
216 | "data = data.to(device)"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 59,
222 | "metadata": {},
223 | "outputs": [
224 | {
225 | "name": "stdout",
226 | "output_type": "stream",
227 | "text": [
228 | "positive label ratio: 0.2816764522287305\n"
229 | ]
230 | }
231 | ],
232 | "source": [
233 | "print('positive label ratio: ',data.y.sum().item()/data.y.shape[0])"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 60,
239 | "metadata": {},
240 | "outputs": [
241 | {
242 | "name": "stderr",
243 | "output_type": "stream",
244 | "text": [
245 | "D:\\anaconda\\envs\\torch_env\\lib\\site-packages\\torch_geometric\\data\\data.py:191: UserWarning: The number of nodes in your data object can only be inferred by its edge indices, and hence may result in unexpected batch-wise behavior, e.g., in case there exists isolated nodes. Please consider explicitly setting the number of nodes for this data object by assigning it to data.num_nodes.\n",
246 | " warnings.warn(__num_nodes_warn_msg__.format('edge'))\n"
247 | ]
248 | },
249 | {
250 | "data": {
251 | "text/plain": [
252 | "False"
253 | ]
254 | },
255 | "execution_count": 60,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | }
259 | ],
260 | "source": [
261 | "data.contains_isolated_nodes()"
262 | ]
263 | },
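{
"cell_type": "markdown",
"metadata": {},
"source": [
"The warning above can be silenced by setting the node count explicitly, as it suggests; a one-line sketch, assuming every node id appears in `data.u` or `data.i`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set the node count explicitly to silence the PyG warning\n",
"# (assumes all node ids appear in data.u or data.i).\n",
"data.num_nodes = data.u.size(0) + data.i.size(0)"
]
},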
264 | {
265 | "cell_type": "code",
266 | "execution_count": 69,
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "name": "stderr",
271 | "output_type": "stream",
272 | "text": [
273 | "D:\\anaconda\\envs\\torch_env\\lib\\site-packages\\torch_geometric\\data\\data.py:191: UserWarning: The number of nodes in your data object can only be inferred by its edge indices, and hence may result in unexpected batch-wise behavior, e.g., in case there exists isolated nodes. Please consider explicitly setting the number of nodes for this data object by assigning it to data.num_nodes.\n",
274 | " warnings.warn(__num_nodes_warn_msg__.format('edge'))\n"
275 | ]
276 | },
277 | {
278 | "data": {
279 | "text/plain": [
280 | "2243363"
281 | ]
282 | },
283 | "execution_count": 69,
284 | "metadata": {},
285 | "output_type": "execute_result"
286 | }
287 | ],
288 | "source": [
289 | "data.num_nodes"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 68,
295 | "metadata": {},
296 | "outputs": [
297 | {
298 | "data": {
299 | "text/plain": [
300 | "4176391"
301 | ]
302 | },
303 | "execution_count": 68,
304 | "metadata": {},
305 | "output_type": "execute_result"
306 | }
307 | ],
308 | "source": [
309 | "data.num_edges"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 66,
315 | "metadata": {},
316 | "outputs": [
317 | {
318 | "data": {
319 | "text/plain": [
320 | "torch.Size([776543, 1])"
321 | ]
322 | },
323 | "execution_count": 66,
324 | "metadata": {},
325 | "output_type": "execute_result"
326 | }
327 | ],
328 | "source": [
329 | "data.u.shape # number of user"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 67,
335 | "metadata": {},
336 | "outputs": [
337 | {
338 | "data": {
339 | "text/plain": [
340 | "torch.Size([1466820, 1])"
341 | ]
342 | },
343 | "execution_count": 67,
344 | "metadata": {},
345 | "output_type": "execute_result"
346 | }
347 | ],
348 | "source": [
349 | "data.i.shape # number of item"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 70,
355 | "metadata": {},
356 | "outputs": [
357 | {
358 | "data": {
359 | "text/plain": [
360 | "3.6665601930725125e-06"
361 | ]
362 | },
363 | "execution_count": 70,
364 | "metadata": {},
365 | "output_type": "execute_result"
366 | }
367 | ],
368 | "source": [
369 | "data.num_edges/(data.u.shape[0]*data.i.shape[0]) # very sparse"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": 6,
375 | "metadata": {},
376 | "outputs": [],
377 | "source": [
378 | "from torch_geometric.nn import GCNConv\n",
379 | "import torch.nn.functional as F\n",
380 | "\n",
381 | "\n",
382 | "# hyper param\n",
383 | "EMB_DIM = 10\n",
384 | "\n",
385 | "class Net(torch.nn.Module):\n",
386 | " def __init__(self):\n",
387 | " super(Net, self).__init__()\n",
388 | " self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)\n",
389 | " self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)\n",
390 | " self.conv1 = GCNConv(EMB_DIM*2,EMB_DIM)\n",
391 | " self.conv2 = GCNConv(EMB_DIM,8)\n",
392 | " self.lin = torch.nn.Linear(8,2)\n",
393 | " \n",
394 | " def forward(self, data):\n",
395 | " u,i,edge_index = data.u,data.i,data.edge_index\n",
396 | " \n",
397 | " emb_u = self.u_emb(u[edge_index[0]]).view(-1,EMB_DIM)\n",
398 | " emb_i = self.i_emb(i[(edge_index[1]-u_enc.classes_.shape[0])]-u_enc.classes_.shape[0]).view(-1,EMB_DIM)\n",
399 | " \n",
400 | " x = torch.cat([emb_u,emb_i],dim=1)\n",
401 | " x = self.conv1(x,edge_index)\n",
402 | " x = F.relu(x)\n",
403 | " x = F.dropout(x,training=self.training)\n",
404 | " x = self.conv2(x,edge_index)\n",
405 | " x = F.relu(x)\n",
406 | " x = F.dropout(x,training=self.training)\n",
407 | " \n",
408 | " x = self.lin(x)\n",
409 | " return F.log_softmax(x,dim=1)\n",
410 | " \n"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 8,
416 | "metadata": {
417 | "scrolled": true
418 | },
419 | "outputs": [],
420 | "source": []
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": 58,
425 | "metadata": {},
426 | "outputs": [
427 | {
428 | "name": "stdout",
429 | "output_type": "stream",
430 | "text": [
431 | "tensor(0.7049, device='cuda:0', grad_fn=)\n",
432 | "0.6750931605781164\n",
433 | "tensor(0.7053, device='cuda:0', grad_fn=)\n",
434 | "0.674996905222715\n",
435 | "tensor(0.7048, device='cuda:0', grad_fn=)\n",
436 | "0.6750962733134901\n",
437 | "tensor(0.7049, device='cuda:0', grad_fn=)\n",
438 | "0.6753460104669319\n",
439 | "tensor(0.7050, device='cuda:0', grad_fn=)\n",
440 | "0.6751274006672268\n",
441 | "tensor(0.7052, device='cuda:0', grad_fn=)\n",
442 | "0.6751913314629785\n",
443 | "tensor(0.7050, device='cuda:0', grad_fn=)\n",
444 | "0.6752758542004329\n",
445 | "tensor(0.7051, device='cuda:0', grad_fn=)\n",
446 | "0.675027314252904\n",
447 | "tensor(0.7051, device='cuda:0', grad_fn=)\n",
448 | "0.6748395923657531\n",
449 | "tensor(0.7052, device='cuda:0', grad_fn=)\n",
450 | "0.6751312317261482\n"
451 | ]
452 | }
453 | ],
454 | "source": [
455 | "model = Net().to(device)\n",
456 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\n",
457 | "weight = torch.tensor([1,1.075],dtype=torch.float).to(device)\n",
458 | "\n",
459 | "model.train()\n",
460 | "for epoch in range(50):\n",
461 | " optimizer.zero_grad()\n",
462 | " out = model(data)\n",
463 | " loss = F.nll_loss(out[],data.y,weight=weight)\n",
464 | " loss.backward()\n",
465 | " optimizer.step\n",
466 | " if epoch%5==0:\n",
467 | " print(loss)\n",
468 | " _,pred = model(data).max(dim=1) \n",
469 | " print(pred.eq(data.y).sum().item()/data.y.shape[0])"
470 | ]
471 | },
472 | {
473 | "cell_type": "markdown",
474 | "metadata": {
475 | "scrolled": true
476 | },
477 | "source": [
478 | "### evalutate"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 55,
484 | "metadata": {},
485 | "outputs": [
486 | {
487 | "name": "stdout",
488 | "output_type": "stream",
489 | "text": [
490 | "accuracy: 0.6830433261636661\n",
491 | "recall: 0.149021881330272\n",
492 | "precision: 0.3520523736846333\n",
493 | "f1: 0.2094041213580665\n"
494 | ]
495 | }
496 | ],
497 | "source": [
498 | "from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score\n",
499 | "train['pred'] = pred.to('cpu').numpy()\n",
500 | "print('accuracy: ',accuracy_score(train['behavior_type'].fillna(0),train['pred']))\n",
501 | "print('recall: ',recall_score(train['behavior_type'].fillna(0), train['pred']))\n",
502 | "print('precision: ',precision_score(train['behavior_type'].fillna(0), train['pred']))\n",
503 | "print('f1: ',f1_score(train['behavior_type'].fillna(0), train['pred']))"
504 | ]
505 | },
506 | {
507 | "cell_type": "code",
508 | "execution_count": 56,
509 | "metadata": {},
510 | "outputs": [
511 | {
512 | "data": {
513 | "text/plain": [
514 | "0 3678431\n",
515 | "1 497960\n",
516 | "Name: pred, dtype: int64"
517 | ]
518 | },
519 | "execution_count": 56,
520 | "metadata": {},
521 | "output_type": "execute_result"
522 | }
523 | ],
524 | "source": [
525 | "train.pred.value_counts()"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": 57,
531 | "metadata": {},
532 | "outputs": [
533 | {
534 | "data": {
535 | "text/plain": [
536 | "0.0 3000000\n",
537 | "1.0 1176391\n",
538 | "Name: behavior_type, dtype: int64"
539 | ]
540 | },
541 | "execution_count": 57,
542 | "metadata": {},
543 | "output_type": "execute_result"
544 | }
545 | ],
546 | "source": [
547 | "train.behavior_type.fillna(0).value_counts()"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": null,
553 | "metadata": {},
554 | "outputs": [],
555 | "source": []
556 | }
557 | ],
558 | "metadata": {
559 | "kernelspec": {
560 | "display_name": "Python 3",
561 | "language": "python",
562 | "name": "python3"
563 | },
564 | "language_info": {
565 | "codemirror_mode": {
566 | "name": "ipython",
567 | "version": 3
568 | },
569 | "file_extension": ".py",
570 | "mimetype": "text/x-python",
571 | "name": "python",
572 | "nbconvert_exporter": "python",
573 | "pygments_lexer": "ipython3",
574 | "version": "3.7.3"
575 | }
576 | },
577 | "nbformat": 4,
578 | "nbformat_minor": 2
579 | }
580 |
--------------------------------------------------------------------------------
/notebook/edge GCN.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "C:\\Users\\user\\Anaconda3\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
13 | " from ._conv import register_converters as _register_converters\n"
14 | ]
15 | },
16 | {
17 | "name": "stdout",
18 | "output_type": "stream",
19 | "text": [
20 | "Wall time: 53.9 s\n"
21 | ]
22 | }
23 | ],
24 | "source": [
25 | "%%time\n",
26 | "import pandas as pd\n",
27 | "import os\n",
28 | "import numpy as np\n",
29 | "import torch\n",
30 | "import torch_geometric\n",
31 | "from sklearn.preprocessing import LabelEncoder\n",
32 | "import gc\n",
33 | "from torch_geometric.data import Data\n",
34 | "\n",
35 | "# item_feature = pd.read_csv('../data/item_feature.csv')\n",
36 | "# test = pd.read_csv('../data/test.csv')\n",
37 | "# user_feature = pd.read_csv('../data/user_feature.csv')\n",
38 | "train = pd.read_pickle('../data/sml_train.pkl')"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 2,
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/html": [
49 | "\n",
50 | "\n",
63 | "
\n",
64 | " \n",
65 | " \n",
66 | " | \n",
67 | " user_id | \n",
68 | " item_id | \n",
69 | " behavior_type | \n",
70 | " date | \n",
71 | "
\n",
72 | " \n",
73 | " \n",
74 | " \n",
75 | " 0 | \n",
76 | " 1.732029e+09 | \n",
77 | " 3.193364e+08 | \n",
78 | " clk | \n",
79 | " 2019-06-19 | \n",
80 | "
\n",
81 | " \n",
82 | " 2 | \n",
83 | " 1.732029e+09 | \n",
84 | " 1.197152e+09 | \n",
85 | " clk | \n",
86 | " 2019-06-19 | \n",
87 | "
\n",
88 | " \n",
89 | " 3 | \n",
90 | " 1.732029e+09 | \n",
91 | " 1.145630e+09 | \n",
92 | " clk | \n",
93 | " 2019-06-19 | \n",
94 | "
\n",
95 | " \n",
96 | " 5 | \n",
97 | " 1.732029e+09 | \n",
98 | " 1.162473e+09 | \n",
99 | " clk | \n",
100 | " 2019-06-19 | \n",
101 | "
\n",
102 | " \n",
103 | " 7 | \n",
104 | " 1.732029e+09 | \n",
105 | " 1.128524e+09 | \n",
106 | " clk | \n",
107 | " 2019-06-19 | \n",
108 | "
\n",
109 | " \n",
110 | "
\n",
111 | "
"
112 | ],
113 | "text/plain": [
114 | " user_id item_id behavior_type date\n",
115 | "0 1.732029e+09 3.193364e+08 clk 2019-06-19\n",
116 | "2 1.732029e+09 1.197152e+09 clk 2019-06-19\n",
117 | "3 1.732029e+09 1.145630e+09 clk 2019-06-19\n",
118 | "5 1.732029e+09 1.162473e+09 clk 2019-06-19\n",
119 | "7 1.732029e+09 1.128524e+09 clk 2019-06-19"
120 | ]
121 | },
122 | "execution_count": 2,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "train.head()"
129 | ]
130 | },
131 | {
132 | "cell_type": "markdown",
133 | "metadata": {},
134 | "source": [
135 | "### 加入edge信息的GCN(时间序列思路,未解决推荐问题和冷启动问题)\n",
136 | "1. 根据历史 __所有behavior边__ 预测未来可能在这些边中是clk的概率\n",
137 | "2. __behavior__ 用 __user-item__ 二部图来表示, __user__ 和 __item__ 作为节点,边属性为 __behavior_type__\n",
138 | "3. 更新每个user/item feature时,concat 1-hop node embedding和edge_embedding,然后求均值作为新的user/item feature\n",
139 | "4. 最后concat __user_embedding__ , __item_embedding__ 作为user-item pair 的feature, 该user-item pair在未来是否发生clk作为label"
140 | ]
141 | },
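{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of step 3 above, on hypothetical toy tensors: for one user, concat each 1-hop item embedding with the matching edge embedding, then average the messages to get the new user feature."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy sketch (hypothetical tensors) of the concat-then-mean update:\n",
"import torch\n",
"\n",
"toy_item_emb = torch.randn(3, 4)   # 3 neighboring items, dim 4\n",
"toy_edge_emb = torch.randn(3, 4)   # their behavior_type embeddings\n",
"toy_msgs = torch.cat([toy_item_emb, toy_edge_emb], dim=1)  # [3, 8]\n",
"toy_new_user = toy_msgs.mean(dim=0)                        # [8]\n",
"print(toy_new_user.shape)  # torch.Size([8])"
]
},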
142 | {
143 | "cell_type": "code",
144 | "execution_count": 3,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "Wall time: 1min 29s\n"
152 | ]
153 | }
154 | ],
155 | "source": [
156 | "%%time\n",
157 | "his = train[train.date<'2019-06-20'].drop_duplicates(subset=['user_id','item_id','behavior_type'])\n",
158 | "now = train[(train.date>='2019-06-20')&(train.behavior_type=='clk')].drop_duplicates(subset=['user_id','item_id'])\n",
159 | "del train\n",
160 | "now.rename(columns={'behavior_type':'label'},inplace=True)\n",
161 | "train = his[['user_id','item_id','behavior_type']].merge(now[['user_id','item_id','label']],how='left')\n",
162 | "del his,now\n",
163 | "gc.collect()\n",
164 | "train['label'] = train['label'].map({'clk':1})\n",
165 | "train['label'] = train['label'].fillna(0) "
166 | ]
167 | },
168 | {
169 | "cell_type": "markdown",
170 | "metadata": {},
171 | "source": [
172 | "### build model"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 5,
178 | "metadata": {},
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "Wall time: 18.8 s\n"
185 | ]
186 | }
187 | ],
188 | "source": [
189 | "%%time\n",
190 | "train = pd.concat([train[train.label==1],train[train.label==0].sample(2000000)],axis=0)\n",
191 | "# 显存不足,下采样\n",
192 | "\n",
193 | "u_enc,i_enc,e_enc = LabelEncoder().fit(train['user_id']),LabelEncoder().fit(train['item_id']),LabelEncoder().fit(train['behavior_type'])\n",
194 | "train['user_id'] = u_enc.transform(train['user_id'])\n",
195 | "train['item_id'] = i_enc.transform(train['item_id'])\n",
196 | "train['behavior_type'] = e_enc.transform(train['behavior_type'])"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 11,
202 | "metadata": {},
203 | "outputs": [
204 | {
205 | "data": {
206 | "text/plain": [
207 | "0.860387"
208 | ]
209 | },
210 | "execution_count": 11,
211 | "metadata": {},
212 | "output_type": "execute_result"
213 | }
214 | ],
215 | "source": [
216 | "# clk rate\n",
217 | "train.label.sum()/(train.shape[0]-train.label.sum())"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "#### no edge GCN \n",
225 | "不利用边属性信息,只用节点的embedding进行GCN"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 6,
231 | "metadata": {},
232 | "outputs": [],
233 | "source": [
234 | "edge_index = torch.tensor([train['user_id'].values,train['item_id'].values+1+train['user_id'].max()])\n",
235 | "edge_attr = torch.tensor(train.behavior_type.values, dtype=torch.long)\n",
236 | "u = torch.tensor(train['user_id'].unique().reshape(-1,1))\n",
237 | "i = torch.tensor(train['item_id'].unique().reshape(-1,1))\n",
238 | "e = torch.tensor(train['behavior_type'].unique().reshape(-1,1))\n",
239 | "y = torch.tensor(train.label.values,dtype=torch.long)\n",
240 | "data = Data(u=u,i=i,e=e,edge_index=edge_index,edge_attr=edge_attr,y=y)"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 7,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "from torch_geometric.nn import GCNConv,MessagePassing\n",
250 | "import torch.nn.functional as F\n",
251 | "\n",
252 | "# hyper param\n",
253 | "EMB_DIM = 10\n",
254 | "\n",
255 | "class noedge_GCN(torch.nn.Module):\n",
256 | " def __init__(self):\n",
257 | " super(noedge_GCN, self).__init__()\n",
258 | " self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)\n",
259 | " self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)\n",
260 | " self.conv1 = GCNConv(EMB_DIM,6)\n",
261 | " self.conv2 = GCNConv(6,4)\n",
262 | " self.lin = torch.nn.Linear(8,2)\n",
263 | " \n",
264 | " def forward(self, data):\n",
265 | " u,i,e,edge_index,edge_attr = data.u,data.i,data.e,data.edge_index,data.edge_attr\n",
266 | " \n",
267 | " emb_u = self.u_emb(u).view(-1,EMB_DIM)\n",
268 | " emb_i = self.i_emb(i).view(-1,EMB_DIM)\n",
269 | " x = torch.cat([emb_u, emb_i],dim=0)\n",
270 | " x = self.conv1(x,edge_index)\n",
271 | " x = F.relu(x)\n",
272 | " x = F.dropout(x,training=self.training)\n",
273 | " x = self.conv2(x,edge_index)\n",
274 | " x = F.relu(x)\n",
275 | " x = F.dropout(x,training=self.training)\n",
276 | " x = torch.cat([x[edge_index[0]],x[edge_index[1]]],dim=1)\n",
277 | " x = self.lin(x)\n",
278 | " x = F.dropout(x,training=self.training)\n",
279 | " \n",
280 | " return F.log_softmax(x,dim=1)\n",
281 | " \n",
282 | "\n",
283 | "model = noedge_GCN()"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 12,
289 | "metadata": {},
290 | "outputs": [
291 | {
292 | "name": "stdout",
293 | "output_type": "stream",
294 | "text": [
295 | "epoch_0 loss: 0.8651493787765503\n",
296 | "epoch_1 loss: 0.7528172135353088\n",
297 | "epoch_2 loss: 0.7403311133384705\n",
298 | "epoch_3 loss: 0.7339281439781189\n",
299 | "epoch_4 loss: 0.7297651171684265\n",
300 | "epoch_5 loss: 0.7220962643623352\n",
301 | "epoch_6 loss: 0.7202253341674805\n",
302 | "epoch_7 loss: 0.7201514840126038\n",
303 | "epoch_8 loss: 0.7203397750854492\n",
304 | "epoch_9 loss: 0.721700131893158\n",
305 | "Wall time: 3min 35s\n"
306 | ]
307 | }
308 | ],
309 | "source": [
310 | "%%time\n",
311 | "model = noedge_GCN()\n",
312 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\n",
313 | "weight = torch.tensor([0.860387,1],dtype=torch.float) # label的weight属于超参,需要调参!!!\n",
314 | "\n",
315 | "model.train()\n",
316 | "for epoch in range(10):\n",
317 | " optimizer.zero_grad()\n",
318 | " out = model(data)\n",
319 | " loss = F.nll_loss(out, data.y,weight=weight)\n",
320 | " loss.backward()\n",
321 | " optimizer.step()\n",
322 | " if epoch%1==0:\n",
323 | " model.eval()\n",
324 | " print('epoch_{} loss: {}'.format(epoch,loss.item()))"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 14,
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "name": "stdout",
334 | "output_type": "stream",
335 | "text": [
336 | "accuracy: 0.46687463414870134\n",
337 | "recall: 0.9499719312355951\n",
338 | "precision: 0.4627905982579322\n",
339 | "f1: 0.6223807175044113\n"
340 | ]
341 | }
342 | ],
343 | "source": [
344 | "from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score\n",
345 | "model.eval()\n",
346 | "_,pred = model(data).max(dim=1)\n",
347 | "train['pred'] = pred.numpy()\n",
348 | "print('accuracy: ',accuracy_score(train['label'],train['pred']))\n",
349 | "print('recall: ',recall_score(train['label'], train['pred']))\n",
350 | "print('precision: ',precision_score(train['label'], train['pred']))\n",
351 | "print('f1: ',f1_score(train['label'], train['pred']))"
352 | ]
353 | },
354 | {
355 | "cell_type": "markdown",
356 | "metadata": {},
357 | "source": [
358 | "#### edge GCN\n",
359 | "同时利用node_embedding和edge_embedding做GCN,最终每个节点的embedding是融合了该节点领域的node_embedding和edge_embedding \n",
360 | "魔改GCN"
361 | ]
362 | },
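{
"cell_type": "markdown",
"metadata": {},
"source": [
"A toy sketch (hypothetical 2-user/2-item graph) of the symmetric degree normalization that the `message` function below computes with `torch_geometric.utils.degree`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Per-edge symmetric normalization: norm = deg(u)^(-1/2) * deg(i)^(-1/2)\n",
"import torch\n",
"from torch_geometric.utils import degree\n",
"\n",
"toy_edge_index = torch.tensor([[0, 0, 1], [0, 1, 1]])  # 2 users, 2 items\n",
"row, col = toy_edge_index\n",
"deg_u = degree(row, 2)  # user degrees: tensor([2., 1.])\n",
"deg_i = degree(col, 2)  # item degrees: tensor([1., 2.])\n",
"toy_norm = deg_u.pow(-0.5)[row] * deg_i.pow(-0.5)[col]\n",
"print(toy_norm)  # tensor([0.7071, 0.5000, 0.7071])"
]
},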
363 | {
364 | "cell_type": "code",
365 | "execution_count": 18,
366 | "metadata": {},
367 | "outputs": [],
368 | "source": [
369 | "edge_index = torch.tensor([train['user_id'].values,train['item_id'].values])\n",
370 | "edge_attr = torch.tensor(train.behavior_type.values, dtype=torch.long)\n",
371 | "u = torch.tensor(train['user_id'].unique().reshape(-1,1))\n",
372 | "i = torch.tensor(train['item_id'].unique().reshape(-1,1))\n",
373 | "e = torch.tensor(train['behavior_type'].unique().reshape(-1,1))\n",
374 | "y = torch.tensor(train.label.values,dtype=torch.long)\n",
375 | "data = Data(u=u,i=i,e=e,edge_index=edge_index,edge_attr=edge_attr,y=y)"
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": 32,
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "import torch\n",
385 | "from torch_geometric.nn import MessagePassing\n",
386 | "from torch_geometric.utils import add_self_loops, degree\n",
387 | "\n",
388 | "class edgeGCN(MessagePassing):\n",
389 | " \n",
390 | " def __init__(self, in_channels, out_channels, flow, aggr='add', **kwargs):\n",
391 | " super(edgeGCN, self).__init__(aggr=aggr,flow=flow)\n",
392 | " self.lin_u = torch.nn.Linear(in_channels, out_channels)\n",
393 | " self.lin_i = torch.nn.Linear(in_channels, out_channels)\n",
394 | " self.lin_e = torch.nn.Linear(in_channels, out_channels)\n",
395 | " self.lin_aggr = torch.nn.Linear(out_channels*2, out_channels)\n",
396 | " self.flow = flow\n",
397 | " \n",
398 | " def forward(self,u,i,e,edge_index,edge_type):\n",
399 | " \n",
400 | " # linear transformation\n",
401 | " u = self.lin_u(u)\n",
402 | " u = F.relu(u)\n",
403 | " u = F.dropout(u)\n",
404 | " i = self.lin_i(i)\n",
405 | " i = F.relu(i)\n",
406 | " i = F.dropout(i)\n",
407 | " e = self.lin_e(e)\n",
408 | " e = F.relu(e)\n",
409 | " e = F.dropout(e) \n",
410 | " \n",
411 | " return self.propagate(x=(u,i),e=e,edge_index=edge_index,edge_type=edge_type,size=(u.size(0), i.size(0)))\n",
412 | " \n",
413 | " def message(self,x_j, x_i, e, edge_index, edge_type,size):\n",
414 | " \n",
415 | " # x_i is user_embedding\n",
416 | " # x_j is item_embedding\n",
417 | " \n",
418 | " # get normalized laplacian\n",
419 | " row,col = edge_index\n",
420 | " deg_i = degree(row, size[0], dtype=x_i.dtype)\n",
421 | " deg_j = degree(col, size[1], dtype=x_j.dtype)\n",
422 | " deg_inv_sqrt_i = deg_i.pow(-0.5)\n",
423 | " deg_inv_sqrt_j = deg_j.pow(-0.5)\n",
424 | " norm = deg_inv_sqrt_i[row]*deg_inv_sqrt_j[col]\n",
425 | " \n",
426 | " # concat neighbor nodes embedding and edge embedding\n",
427 | " if self.flow == 'target_to_source':\n",
428 | " emb = torch.cat([x_j, e[edge_type]], dim=1)\n",
429 | " else:\n",
430 | " emb = torch.cat([x_i, e[edge_type]], dim=1)\n",
431 | " return norm.view(-1,1)*emb\n",
432 | " \n",
433 | " def update(self, aggr_out):\n",
434 | " return self.lin_aggr(aggr_out)"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 69,
440 | "metadata": {},
441 | "outputs": [],
442 | "source": [
443 | "# hyper param\n",
444 | "EMB_DIM = 10\n",
445 | "\n",
446 | "class Net(torch.nn.Module):\n",
447 | " def __init__(self):\n",
448 | " super(Net, self).__init__()\n",
449 | " self.u_emb = torch.nn.Embedding(len(u),EMB_DIM)\n",
450 | " self.i_emb = torch.nn.Embedding(len(i),EMB_DIM)\n",
451 | " self.e_emb = torch.nn.Embedding(len(e),EMB_DIM)\n",
452 | " self.e_lin_1 = torch.nn.Linear(EMB_DIM,6)\n",
453 | " self.u_gcn_1 = edgeGCN(EMB_DIM,6,flow='target_to_source')\n",
454 | " self.i_gcn_1 = edgeGCN(EMB_DIM,6,flow='source_to_target')\n",
455 | " self.u_gcn_2 = edgeGCN(6,2,flow='target_to_source')\n",
456 | " self.i_gcn_2 = edgeGCN(6,2,flow='source_to_target')\n",
457 | " self.lin = torch.nn.Linear(4,2)\n",
458 | " \n",
459 | " def forward(self, data):\n",
460 | " u,i,e,edge_index,edge_attr = data.u,data.i,data.e,data.edge_index,data.edge_attr\n",
461 | " \n",
462 | " u_emb = self.u_emb(u).view(-1,EMB_DIM)\n",
463 | " i_emb = self.i_emb(i).view(-1,EMB_DIM)\n",
464 | " e_emb = self.e_emb(e).view(-1,EMB_DIM)\n",
465 | " \n",
466 | " x_u,x_i = self.u_gcn_1(u=u_emb, i=i_emb, e=e_emb, edge_index=edge_index, edge_type = edge_attr),\\\n",
467 | " self.i_gcn_1(u=u_emb, i=i_emb, e=e_emb, edge_index=edge_index, edge_type = edge_attr)\n",
468 | " e_emb = self.e_lin_1(e_emb)\n",
469 | " x_u,x_i = self.u_gcn_2(u=x_u, i=x_i, e=e_emb, edge_index=edge_index, edge_type = edge_attr),\\\n",
470 | " self.i_gcn_2(u=x_u, i=x_i, e=e_emb, edge_index=edge_index, edge_type = edge_attr)\n",
471 | "\n",
472 | " \n",
473 | " x = torch.cat([x_u[edge_index[0]],x_i[edge_index[1]]],dim=1)\n",
474 | " x = self.lin(x)\n",
475 | " x = F.dropout(x,training=self.training)\n",
476 | " \n",
477 | " return F.log_softmax(x,dim=1)"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": 73,
483 | "metadata": {},
484 | "outputs": [
485 | {
486 | "name": "stdout",
487 | "output_type": "stream",
488 | "text": [
489 | "epoch_0 loss: 0.8134878873825073\n",
490 | "epoch_1 loss: 0.7332140207290649\n",
491 | "epoch_2 loss: 0.7305666208267212\n",
492 | "epoch_3 loss: 0.7313019633293152\n",
493 | "Wall time: 1min 36s\n"
494 | ]
495 | }
496 | ],
497 | "source": [
498 | "%%time\n",
499 | "model = Net()\n",
500 | "optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\n",
501 | "weight = torch.tensor([0.860387,1],dtype=torch.float) # label的weight属于超参,需要调参!!!\n",
502 | "\n",
503 | "model.train()\n",
504 | "\n",
505 | "# train param\n",
506 | "patience = 0\n",
507 | "best_loss = 1\n",
508 | "\n",
509 | "for epoch in range(10):\n",
510 | " optimizer.zero_grad()\n",
511 | " out = model(data)\n",
512 | " loss = F.nll_loss(out, data.y,weight=weight)\n",
513 | " \n",
514 | " if epoch%1==0:\n",
515 | " model.eval()\n",
516 | " print('epoch_{} loss: {}'.format(epoch,loss.item()))\n",
517 | " \n",
518 | " if loss>best_loss:\n",
519 | " patience += 1\n",
520 | " if patience == 2: \n",
521 | " break\n",
522 | " else:\n",
523 | " patience = 0\n",
524 | " best_loss = loss\n",
525 | " \n",
526 | " loss.backward()\n",
527 | " optimizer.step()"
528 | ]
529 | },
530 | {
531 | "cell_type": "code",
532 | "execution_count": 74,
533 | "metadata": {},
534 | "outputs": [
535 | {
536 | "name": "stdout",
537 | "output_type": "stream",
538 | "text": [
539 | "accuracy: 0.5050347051446823\n",
540 | "recall: 0.5282611197054349\n",
541 | "precision: 0.468827943036212\n",
542 | "f1: 0.4967732239615924\n"
543 | ]
544 | }
545 | ],
546 | "source": [
547 | "from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score\n",
548 | "model.eval()\n",
549 | "_,pred = model(data).max(dim=1)\n",
550 | "train['pred'] = pred.numpy()\n",
551 | "print('accuracy: ',accuracy_score(train['label'],train['pred']))\n",
552 | "print('recall: ',recall_score(train['label'], train['pred']))\n",
553 | "print('precision: ',precision_score(train['label'], train['pred']))\n",
554 | "print('f1: ',f1_score(train['label'], train['pred']))"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": null,
560 | "metadata": {},
561 | "outputs": [],
562 | "source": []
563 | },
564 | {
565 | "cell_type": "code",
566 | "execution_count": null,
567 | "metadata": {},
568 | "outputs": [],
569 | "source": []
570 | },
571 | {
572 | "cell_type": "code",
573 | "execution_count": null,
574 | "metadata": {},
575 | "outputs": [],
576 | "source": []
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": null,
581 | "metadata": {},
582 | "outputs": [],
583 | "source": []
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": null,
588 | "metadata": {},
589 | "outputs": [],
590 | "source": []
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": null,
595 | "metadata": {},
596 | "outputs": [],
597 | "source": []
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": null,
602 | "metadata": {},
603 | "outputs": [],
604 | "source": []
605 | },
606 | {
607 | "cell_type": "code",
608 | "execution_count": null,
609 | "metadata": {},
610 | "outputs": [],
611 | "source": []
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": null,
616 | "metadata": {},
617 | "outputs": [],
618 | "source": []
619 | },
620 | {
621 | "cell_type": "code",
622 | "execution_count": null,
623 | "metadata": {},
624 | "outputs": [],
625 | "source": []
626 | },
627 | {
628 | "cell_type": "code",
629 | "execution_count": null,
630 | "metadata": {},
631 | "outputs": [],
632 | "source": []
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": null,
637 | "metadata": {},
638 | "outputs": [],
639 | "source": []
640 | }
641 | ],
642 | "metadata": {
643 | "kernelspec": {
644 | "display_name": "Python 3",
645 | "language": "python",
646 | "name": "python3"
647 | },
648 | "language_info": {
649 | "codemirror_mode": {
650 | "name": "ipython",
651 | "version": 3
652 | },
653 | "file_extension": ".py",
654 | "mimetype": "text/x-python",
655 | "name": "python",
656 | "nbconvert_exporter": "python",
657 | "pygments_lexer": "ipython3",
658 | "version": "3.6.5"
659 | }
660 | },
661 | "nbformat": 4,
662 | "nbformat_minor": 2
663 | }
664 |
--------------------------------------------------------------------------------