├── .idea
├── RecommendationSystem.iml
├── misc.xml
├── modules.xml
├── vcs.xml
└── workspace.xml
├── Model
├── DeepCrossNetwork_PyTorch.py
├── DeepFM_PyTorch.py
├── FFM_PyTorch.py
└── FM_PyTorch.py
├── README.md
├── data
├── Criteo
│ ├── __init__.py
│ ├── forDCN
│ │ ├── DCN_dataProcess.py
│ │ └── __init__.py
│ └── forDeepFM
│ │ ├── __init__.py
│ │ └── deepFM_dataProcess.py
├── Movielens100K
│ ├── __init__.py
│ ├── u.item
│ ├── u.user
│ ├── ua.base
│ └── ua.test
└── __init__.py
└── util
└── load_data_util.py
/.idea/RecommendationSystem.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
132 |
133 |
134 |
135 | print
136 | dropout
137 | Ada
138 | continue
139 | loss
140 | roc_auc_score
141 | batch
142 | data
143 | feat_dict
144 | get_criteo_data
145 | 200000
146 | file
147 | 0000
148 | train_filelist
149 | reg_l2
150 | train_DeepFM_model_demo
151 | ../data/
152 | batch_size
153 | torch.cat
154 | mul
155 | feat_dict_
156 | EACH_FILE_DATA_NUM
157 | generating
158 | train.txt
159 | num_field
160 | break
161 | reg
162 | Adam
163 | reg_l1
164 | Loss
165 |
166 |
167 | dropout
168 | BATCH_SIZE
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 | 1567680589653
433 |
434 |
435 | 1567680589653
436 |
437 |
438 | 1568003905805
439 |
440 |
441 |
442 | 1568003905805
443 |
444 |
445 | 1568019652943
446 |
447 |
448 |
449 | 1568019652943
450 |
451 |
452 | 1568118580357
453 |
454 |
455 |
456 | 1568118580358
457 |
458 |
459 | 1568891752509
460 |
461 |
462 |
463 | 1568891752509
464 |
465 |
466 | 1568892588960
467 |
468 |
469 |
470 | 1568892588960
471 |
472 |
473 | 1568893403001
474 |
475 |
476 |
477 | 1568893403001
478 |
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
492 |
493 |
494 |
495 |
496 |
497 |
498 |
499 |
500 |
501 |
502 |
503 |
504 |
505 |
506 |
507 |
508 |
509 |
510 |
511 |
512 |
513 |
514 |
515 |
516 |
517 |
518 |
519 |
520 |
521 |
522 |
523 |
524 |
525 |
526 |
527 |
528 |
529 |
530 |
531 |
532 |
533 |
534 |
535 |
536 |
537 |
538 |
539 |
561 |
562 |
563 |
575 |
576 |
577 |
578 |
579 |
580 |
581 |
582 |
583 |
584 |
585 |
586 |
587 |
588 |
589 |
590 |
591 |
592 |
593 | file://$PROJECT_DIR$/data/Criteo/dataProcess.py
594 | 122
595 |
596 |
597 |
598 | file://$PROJECT_DIR$/data/Criteo/DCN_dataProcess.py
599 | 125
600 |
601 |
602 |
603 | file://$PROJECT_DIR$/data/Criteo/DCN_dataProcess.py
604 | 65
605 |
606 |
607 |
608 | file://$PROJECT_DIR$/Model/Basic-DCN-Demo.py
609 | 57
610 |
611 |
612 |
613 |
614 |
615 |
616 |
617 |
618 |
619 |
620 |
621 |
622 |
623 |
624 |
625 |
626 |
627 |
628 |
629 |
630 |
631 |
632 |
633 |
634 |
635 |
636 |
637 |
638 |
639 |
640 |
641 |
642 |
643 |
644 |
645 |
646 |
647 |
648 |
649 |
650 |
651 |
652 |
653 |
654 |
655 |
656 |
657 |
658 |
659 |
660 |
661 |
662 |
663 |
664 |
665 |
666 |
667 |
668 |
669 |
670 |
671 |
672 |
673 |
674 |
675 |
676 |
677 |
678 |
679 |
680 |
681 |
682 |
683 |
684 |
685 |
686 |
687 |
688 |
689 |
690 |
691 |
692 |
693 |
694 |
695 |
696 |
697 |
698 |
699 |
700 |
701 |
702 |
703 |
704 |
705 |
706 |
707 |
708 |
709 |
710 |
711 |
712 |
713 |
714 |
715 |
716 |
717 |
718 |
719 |
720 |
721 |
722 |
723 |
724 |
725 |
726 |
727 |
728 |
729 |
730 |
731 |
732 |
733 |
734 |
735 |
736 |
737 |
738 |
739 |
740 |
741 |
742 |
743 |
744 |
745 |
746 |
747 |
748 |
749 |
750 |
751 |
752 |
753 |
754 |
755 |
756 |
757 |
758 |
759 |
760 |
761 |
762 |
763 |
764 |
765 |
766 |
767 |
768 |
769 |
770 |
771 |
772 |
773 |
774 |
775 |
776 |
777 |
778 |
779 |
780 |
781 |
782 |
783 |
784 |
785 |
786 |
787 |
788 |
789 |
790 |
791 |
792 |
793 |
794 |
795 |
796 |
797 |
798 |
799 |
800 |
801 |
802 |
803 |
804 |
805 |
806 |
807 |
808 |
809 |
810 |
811 |
812 |
813 |
814 |
815 |
816 |
817 |
818 |
819 |
820 |
821 |
822 |
823 |
824 |
825 |
826 |
827 |
828 |
829 |
830 |
831 |
832 |
833 |
834 |
835 |
836 |
837 |
838 |
839 |
840 |
841 |
842 |
843 |
844 |
845 |
846 |
847 |
848 |
849 |
850 |
851 |
852 |
853 |
854 |
855 |
856 |
857 |
858 |
859 |
860 |
861 |
862 |
863 |
864 |
865 |
866 |
867 |
868 |
869 |
870 |
871 |
872 |
873 |
874 |
875 |
876 |
877 |
878 |
--------------------------------------------------------------------------------
/Model/DeepCrossNetwork_PyTorch.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import math
4 | import torch
5 | import numpy as np
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from sklearn.metrics import roc_auc_score
9 | from time import time
10 |
11 | EPOCHS = 5
12 | BATCH_SIZE = 2048
13 | AID_DATA_DIR = '../data/Criteo/forDCN/' # 辅助用途的文件路径
14 | DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15 |
16 |
17 | """
18 | PyTorch implementation of Deep & Cross Network[1]
19 |
20 | Reference:
21 | [1] Deep & Cross Network for Ad Click Predictions,
22 | Ruoxi Wang, Bin Fu, Gang Fu, Zhenguo Li, Mingliang Wang
23 | [2] Keras implementation of Deep & Cross Network
24 | https://github.com/Nirvanada/Deep-and-Cross-Keras
25 | [3] PaddlePaddle implementation of Deep & Cross Network
26 | https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/ctr/dcn
27 | """
28 |
class DCN_layer(nn.Module):
    """Deep & Cross Network: a cross network and an MLP over a shared input.

    The shared input ``x0`` is the dense features concatenated with one
    embedding per sparse feature column. The outputs of the cross network and
    of the deep network are concatenated and fed to a final linear layer that
    produces one logit per sample.

    Args:
        num_dense_feat: number of dense (continuous) features, denoted D.
        num_sparse_feat_list: vocabulary size of every sparse feature column.
        dropout_deep: dropout probabilities; entry ``i + 1`` is applied after
            deep layer ``i`` (entry 0 is not read by this class).
        deep_layer_sizes: number of units of every deep layer.
        reg_l1: L1 coefficient; only stored on the module.
        reg_l2: L2 coefficient; only stored on the module (read by callers).
        num_cross_layers: number of cross layers, denoted C.

    The module does not pick a device on its own; move it as a whole with
    ``.to(device)`` after construction.
    """

    def __init__(self, num_dense_feat, num_sparse_feat_list, dropout_deep, deep_layer_sizes,
                 reg_l1=0.01, reg_l2=0.01, num_cross_layers=4):
        super(DCN_layer, self).__init__()
        self.reg_l1 = reg_l1
        self.reg_l2 = reg_l2
        self.num_dense_feat = num_dense_feat  # denoted D

        # Embedding and stacking layer: one embedding table per sparse column,
        # with dimension min(vocab, 6 * floor(vocab ** 1/4)).
        embedding_sizes = []
        self.sparse_feat_embeddings = nn.ModuleList()
        for num_sparse_feat in num_sparse_feat_list:
            embedding_dim = min(num_sparse_feat, 6 * int(np.power(num_sparse_feat, 1 / 4)))
            embedding_sizes.append(embedding_dim)
            feat_embedding = nn.Embedding(num_sparse_feat, embedding_dim)
            nn.init.xavier_uniform_(feat_embedding.weight)
            # No per-embedding device move here: a module whose embeddings live
            # on another device than its remaining parameters cannot run.
            self.sparse_feat_embeddings.append(feat_embedding)

        self.num_cross_layers = num_cross_layers  # denoted C
        self.deep_layer_sizes = deep_layer_sizes

        # Cross network parameters: one weight vector and one bias per layer.
        self.input_dim = num_dense_feat + sum(embedding_sizes)  # denoted In
        self.cross_bias = nn.Parameter(torch.randn(num_cross_layers, self.input_dim))  # C * In
        nn.init.zeros_(self.cross_bias)
        self.cross_W = nn.Parameter(torch.randn(num_cross_layers, self.input_dim))  # C * In
        nn.init.xavier_uniform_(self.cross_W)
        self.batchNorm_list = nn.ModuleList(
            nn.BatchNorm1d(self.input_dim) for _ in range(num_cross_layers))

        # Deep network: Linear -> BatchNorm -> ReLU -> Dropout per layer.
        # Attribute names are kept so that saved state dicts stay loadable.
        all_dims = [self.input_dim] + list(deep_layer_sizes)
        for i in range(len(deep_layer_sizes)):
            setattr(self, 'linear_' + str(i + 1), nn.Linear(all_dims[i], all_dims[i + 1]))
            setattr(self, 'batchNorm_' + str(i + 1), nn.BatchNorm1d(all_dims[i + 1]))
            setattr(self, 'dropout_' + str(i + 1), nn.Dropout(dropout_deep[i + 1]))

        # Combination layer: final linear layer over [cross output, deep output].
        self.fc = nn.Linear(self.input_dim + all_dims[-1], 1)
        nn.init.xavier_uniform_(self.fc.weight)

    def forward(self, feat_index_list, dense_x):
        """Return one logit per sample.

        Args:
            feat_index_list: one index tensor per sparse column, in the same
                order as ``num_sparse_feat_list``.
            dense_x: dense features, concatenated with the embeddings along
                dim 1.

        Returns:
            The output of the final linear layer (a single column).
        """
        x0 = dense_x
        for i, feat_index in enumerate(feat_index_list):
            sparse_x = self.sparse_feat_embeddings[i](feat_index)
            x0 = torch.cat((x0, sparse_x), dim=1)  # None * In

        # Cross network: x_{l+1} = x0 * (x_l . w_l) + b_l + x_l, then BatchNorm.
        x_cross = x0  # None * In
        for i in range(self.num_cross_layers):
            # Indexing a row gives a 1-D tensor; unsqueeze makes the In * 1
            # column directly (``.T`` on a 1-D tensor is deprecated).
            W = self.cross_W[i].unsqueeze(1)  # In * 1
            xT_W = torch.mm(x_cross, W)  # None * 1
            x_cross = torch.mul(x0, xT_W) + self.cross_bias[i] + x_cross  # None * In
            x_cross = self.batchNorm_list[i](x_cross)

        # Deep network.
        x_deep = x0  # None * In
        for i in range(1, len(self.deep_layer_sizes) + 1):
            x_deep = getattr(self, 'linear_' + str(i))(x_deep)
            x_deep = getattr(self, 'batchNorm_' + str(i))(x_deep)
            x_deep = F.relu(x_deep)
            x_deep = getattr(self, 'dropout_' + str(i))(x_deep)

        x_stack = torch.cat((x_cross, x_deep), dim=1)
        output = self.fc(x_stack)

        return output
100 |
101 |
102 | """ ************************************************************************************ """
103 | """ 训练和测试FM模型 """
104 | """ ************************************************************************************ """
def train_DeepFM_model_demo(device):
    """
    Train a DCN_layer model on the Criteo files and evaluate it after every epoch.

    The function keeps its historical name although the model built here is DCN.
    A snapshot of the whole model is saved as 'DCN_<epoch>.model' after each epoch.

    :param device: torch device passed on to train() and test() for the batch tensors
    :return: None
    """
    train_dir = AID_DATA_DIR + 'train/'
    test_dir = AID_DATA_DIR + 'test_valid/'
    train_filelist = ["%s%s" % (train_dir, x) for x in os.listdir(train_dir)]
    test_filelist = ["%s%s" % (test_dir, x) for x in os.listdir(test_dir)]

    # Sort the files by the number formed by all digits found in their path.
    train_file_id = [int(re.sub(r'[\D]', '', x)) for x in train_filelist]
    train_filelist = [train_filelist[idx] for idx in np.argsort(train_file_id)]

    test_file_id = [int(re.sub(r'[\D]', '', x)) for x in test_filelist]
    test_filelist = [test_filelist[idx] for idx in np.argsort(test_file_id)]

    # Vocabulary size of every sparse column. One extra slot is added for the
    # missing-value handling (see the data processing step).
    num_sparse_feat_list = []
    with open(AID_DATA_DIR + 'cat_feature_num.txt') as fin:
        for line in fin:
            cat_feat_num = line.rstrip().split(' ')
            num_sparse_feat_list.append(int(cat_feat_num[1]) + 1)

    dcn = DCN_layer(reg_l2=1e-5, num_dense_feat=13, num_sparse_feat_list=num_sparse_feat_list,
                    dropout_deep=[0.5, 0.5, 0.5], deep_layer_sizes=[1024, 1024],
                    num_cross_layers=6).to(DEVICE)
    print("Start Training DeepFM Model!")

    # The loss is computed inside train()/test(); only the optimizer lives here.
    optimizer = torch.optim.Adam(dcn.parameters(), lr=1e-4)

    # Count the number of train and test samples (one sample per line).
    train_item_count, test_item_count = 0, 0
    for fname in train_filelist:
        with open(fname.strip(), 'r') as fin:
            for _ in fin:
                train_item_count += 1

    for fname in test_filelist:
        with open(fname.strip(), 'r') as fin:
            for _ in fin:
                test_item_count += 1

    # The data set is large and a custom pytorch Dataset was found too slow,
    # so the files are read manually. Build one value -> index lookup per
    # categorical column; index 0 is reserved for the default value.
    cat_feat_idx_dict_list = [{} for _ in range(26)]
    for i in range(26):
        lookup_idx = 1
        with open(os.path.join(AID_DATA_DIR + 'vocab', 'C' + str(i + 1) + '.txt')) as fin:
            for line in fin:
                cat_feat_idx_dict_list[i][line.strip()] = lookup_idx
                lookup_idx += 1

    for epoch in range(1, EPOCHS + 1):
        tic = time()
        dcn.train()  # evaluation below switches to eval mode; switch back
        train(dcn, train_filelist, train_item_count, device, optimizer, epoch, cat_feat_idx_dict_list)
        torch.save(dcn, 'DCN_' + str(epoch) + '.model')
        toc = time()
        # Evaluate the in-memory model in eval mode instead of reloading the
        # snapshot just written: same weights, and no dependence on
        # torch.load being able to unpickle a whole module.
        dcn.eval()
        test(dcn, test_filelist, test_item_count, device, cat_feat_idx_dict_list)
        print('The Time of Epoch: %.5f min' % float((toc - tic) / 60.0))
        print('The Test Time of Epoch: %.5f min' % float((time() - toc) / 60.0))
168 |
169 |
def test(model, test_filelist, test_item_count, device, cat_feat_idx_dict_list):
    """
    Evaluate the model on the test files and print the ROC AUC and average loss.

    The files are read one at a time and consumed in batches of BATCH_SIZE
    samples; a batch may span a file boundary.

    :param model: module called as model(list_of_sparse_index_tensors, dense_tensor)
    :param test_filelist: test file paths, in the order they are to be read
    :param test_item_count: total number of samples (lines) over all test files
    :param device: torch device the batch tensors are moved to
    :param cat_feat_idx_dict_list: per-column value -> index lookups, passed to
        new_get_idx_value_label
    :return: None
    """
    fname_idx = 0
    pred_y, true_y = [], []
    sparse_features_idxs, dense_features_values, labels = None, None, None
    test_loss = 0.0
    num_batches = math.ceil(test_item_count / BATCH_SIZE)
    with torch.no_grad():
        pre_file_data_count = 0  # samples contained in the files already finished
        for batch_idx in range(num_batches):
            # Global [st_idx, ed_idx) range of this batch. The end is exclusive,
            # so it is capped at the item count itself; capping at count - 1
            # would silently drop the last sample.
            st_idx = batch_idx * BATCH_SIZE
            ed_idx = min((batch_idx + 1) * BATCH_SIZE, test_item_count)

            if sparse_features_idxs is None:
                sparse_features_idxs, dense_features_values, labels = new_get_idx_value_label(
                    test_filelist[fname_idx], cat_feat_idx_dict_list, shuffle=False)

            # Translate to positions inside the currently loaded file.
            st_idx -= pre_file_data_count
            ed_idx -= pre_file_data_count

            idx_parts, value_parts, label_parts = [], [], []
            while ed_idx > len(sparse_features_idxs):
                # The batch runs past the loaded file: keep its tail and
                # continue at the start of the next file.
                idx_parts.append(sparse_features_idxs[st_idx:, :])
                value_parts.append(dense_features_values[st_idx:, :])
                label_parts.append(labels[st_idx:, :])

                file_len = len(sparse_features_idxs)
                pre_file_data_count += file_len
                ed_idx -= file_len
                st_idx = 0
                fname_idx += 1
                sparse_features_idxs, dense_features_values, labels = new_get_idx_value_label(
                    test_filelist[fname_idx], cat_feat_idx_dict_list, shuffle=False)

            idx_parts.append(sparse_features_idxs[st_idx:ed_idx, :])
            value_parts.append(dense_features_values[st_idx:ed_idx, :])
            label_parts.append(labels[st_idx:ed_idx, :])

            batch_fea_idxs = np.vstack(idx_parts)
            batch_fea_values = torch.from_numpy(np.vstack(value_parts))
            batch_labels = torch.from_numpy(np.vstack(label_parts))

            # One int64 index tensor per sparse column.
            sparse_idx_list = []
            for i in range(batch_fea_idxs.shape[1]):
                column = np.ascontiguousarray(batch_fea_idxs[:, i], dtype=np.int64)
                sparse_idx_list.append(torch.from_numpy(column).to(device))

            dense_value = batch_fea_values.to(device, dtype=torch.float32)
            target = batch_labels.to(device, dtype=torch.float32)
            output = model(sparse_idx_list, dense_value)

            test_loss += F.binary_cross_entropy_with_logits(output, target).item()

            # Flatten so that the metric receives 1-D score/label vectors.
            pred_y.extend(output.cpu().numpy().ravel())
            true_y.extend(target.cpu().numpy().ravel())

        print('Roc AUC: %.5f' % roc_auc_score(y_true=np.array(true_y), y_score=np.array(pred_y)))
        test_loss /= num_batches
        print('Test set: Average loss: {:.5f}'.format(test_loss))
239 |
240 |
def train(model, train_filelist, train_item_count, device, optimizer, epoch, cat_feat_idx_dict_list):
    """
    Run one training epoch over the train files.

    The files are read one at a time (each one shuffled when loaded) and
    consumed in batches of BATCH_SIZE samples; a batch may span a file
    boundary. The loss is binary cross entropy on the logits plus
    model.reg_l2 times the sum of squares of every model parameter.

    :param model: module called as model(list_of_sparse_index_tensors, dense_tensor);
        must expose a ``reg_l2`` attribute
    :param train_filelist: train file paths, in the order they are to be read
    :param train_item_count: total number of samples (lines) over all train files
    :param device: torch device the batch tensors are moved to
    :param optimizer: optimizer stepping the model parameters
    :param epoch: epoch number, used only in the progress message
    :param cat_feat_idx_dict_list: per-column value -> index lookups, passed to
        new_get_idx_value_label
    :return: None
    """
    fname_idx = 0
    sparse_features_idxs, dense_features_values, labels = None, None, None
    pre_file_data_count = 0  # samples contained in the files already finished
    num_batches = math.ceil(train_item_count / BATCH_SIZE)
    for batch_idx in range(num_batches):
        # Global [st_idx, ed_idx) range of this batch. The end is exclusive,
        # so it is capped at the item count itself; capping at count - 1
        # would silently drop the last sample.
        st_idx = batch_idx * BATCH_SIZE
        ed_idx = min((batch_idx + 1) * BATCH_SIZE, train_item_count)

        if sparse_features_idxs is None:
            sparse_features_idxs, dense_features_values, labels = new_get_idx_value_label(
                train_filelist[fname_idx], cat_feat_idx_dict_list, shuffle=True)

        # Translate to positions inside the currently loaded file.
        st_idx -= pre_file_data_count
        ed_idx -= pre_file_data_count

        idx_parts, value_parts, label_parts = [], [], []
        while ed_idx > len(sparse_features_idxs):
            # The batch runs past the loaded file: keep its tail and continue
            # at the start of the next file.
            idx_parts.append(sparse_features_idxs[st_idx:, :])
            value_parts.append(dense_features_values[st_idx:, :])
            label_parts.append(labels[st_idx:, :])

            file_len = len(sparse_features_idxs)
            pre_file_data_count += file_len
            ed_idx -= file_len
            st_idx = 0
            fname_idx += 1
            sparse_features_idxs, dense_features_values, labels = new_get_idx_value_label(
                train_filelist[fname_idx], cat_feat_idx_dict_list, shuffle=True)

        idx_parts.append(sparse_features_idxs[st_idx:ed_idx, :])
        value_parts.append(dense_features_values[st_idx:ed_idx, :])
        label_parts.append(labels[st_idx:ed_idx, :])

        batch_fea_idxs = np.vstack(idx_parts)
        batch_fea_values = torch.from_numpy(np.vstack(value_parts))
        batch_labels = torch.from_numpy(np.vstack(label_parts))

        # One int64 index tensor per sparse column.
        sparse_idx_list = []
        for i in range(batch_fea_idxs.shape[1]):
            column = np.ascontiguousarray(batch_fea_idxs[:, i], dtype=np.int64)
            sparse_idx_list.append(torch.from_numpy(column).to(device))

        dense_value = batch_fea_values.to(device, dtype=torch.float32)
        target = batch_labels.to(device, dtype=torch.float32)
        optimizer.zero_grad()
        output = model(sparse_idx_list, dense_value)
        loss = F.binary_cross_entropy_with_logits(output, target)

        # L2 penalty over all parameters (an L1 term is intentionally not added).
        regularization_loss = 0
        for param in model.parameters():
            regularization_loss += model.reg_l2 * torch.sum(torch.pow(param, 2))
        loss += regularization_loss

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=100)
        optimizer.step()
        if batch_idx % 1000 == 0:
            # num_batches is at least 1 inside the loop, so the percentage
            # cannot divide by zero even when the data is smaller than a batch.
            print('Train Epoch: {} [{} / {} ({:.2f}%)]\tLoss:{:.6f}'.format(
                epoch, batch_idx * BATCH_SIZE, train_item_count,
                100. * batch_idx / num_batches, loss.item()))
315 |
316 |
def new_get_idx_value_label(fname, cat_feat_idx_dict_list, shuffle=True):
    """
    Read one tab-separated data file into sparse indices, dense values and labels.

    Every line holds the label in field 0, the continuous features in fields
    1..13 and the categorical features in fields 14..39.

    :param fname: path of the file to read (surrounding whitespace is stripped)
    :param cat_feat_idx_dict_list: one value -> index dict per categorical column;
        entry ``k`` is used for field ``14 + k``
    :param shuffle: when True, the rows are permuted with numpy's global RNG
    :return: (sparse_features_idxs, dense_features_values, labels) as numpy arrays
        of shape (N, 26) int64, (N, 13) float64 and (N, 1) int32; an empty file
        gives N == 0 with the same column counts
    """
    cont_idx_ = range(1, 14)
    cat_idx_ = range(14, 40)

    def new_process_line(line):
        """Convert one raw line into (sparse indices, dense values, [label])."""
        features = line.rstrip('\n').split('\t')

        dense_feat_value = []
        for idx in cont_idx_:
            raw = features[idx]
            if raw == '':
                dense_feat_value.append(0.0)
            else:
                # Log transform; field 2 is shifted by 4 instead of 1.
                offset = 4 if idx == 2 else 1
                dense_feat_value.append(math.log(offset + float(raw)))

        sparse_feat_idx = []
        for idx in cat_idx_:
            raw = features[idx]
            # Missing and unknown values both map to the default index 0.
            if raw == '':
                sparse_feat_idx.append(0)
            else:
                sparse_feat_idx.append(cat_feat_idx_dict_list[idx - 14].get(raw, 0))

        return sparse_feat_idx, dense_feat_value, [int(features[0])]

    sparse_features_idxs, dense_features_values, labels = [], [], []
    with open(fname.strip(), 'r') as fin:
        for line in fin:
            sparse_feat_idx, dense_feat_value, label = new_process_line(line)
            sparse_features_idxs.append(sparse_feat_idx)
            dense_features_values.append(dense_feat_value)
            labels.append(label)

    # Fixed dtypes and an explicit 2-D shape keep the result well-formed even
    # for an empty file (np.array([]) alone would be 1-D and break the
    # row/column indexing below and in the callers).
    sparse_features_idxs = np.array(sparse_features_idxs, dtype=np.int64).reshape(-1, len(cat_idx_))
    dense_features_values = np.array(dense_features_values, dtype=np.float64).reshape(-1, len(cont_idx_))
    labels = np.array(labels, dtype=np.int32).reshape(-1, 1)

    # Shuffle all three arrays with the same permutation.
    if shuffle:
        idx_list = np.arange(len(labels))
        np.random.shuffle(idx_list)
        sparse_features_idxs = sparse_features_idxs[idx_list, :]
        dense_features_values = dense_features_values[idx_list, :]
        labels = labels[idx_list, :]
    return sparse_features_idxs, dense_features_values, labels
362 |
363 |
def get_idx_value_label(fname, feat_dict_, shuffle=True):
    """
    Read one tab-separated data file into sparse indices, dense values and labels.

    Every line holds the label in field 0, the continuous features in fields
    1..13 and the categorical features in fields 14..39.

    :param fname: path of the file to read (surrounding whitespace is stripped)
    :param feat_dict_: dict holding, under the key ``'C' + str(field)`` for every
        categorical field 14..39, a value -> index dict
    :param shuffle: when True, the rows are permuted with numpy's global RNG
    :return: (sparse_features_idxs, dense_features_values, labels) as numpy arrays
        of shape (N, 26) int64, (N, 13) float64 and (N, 1) int32; an empty file
        gives N == 0 with the same column counts
    """
    continuous_range_ = range(1, 14)
    categorical_range_ = range(14, 40)

    def _process_line(line):
        """Convert one raw line into (sparse indices, dense values, [label])."""
        features = line.rstrip('\n').split('\t')

        # Continuous features are log transformed (the original comment credits
        # the Kaggle winner's approach); field 2 is shifted by 4 instead of 1.
        dense_feat_value = []
        for idx in continuous_range_:
            raw = features[idx]
            if raw == '':
                dense_feat_value.append(0.0)
            else:
                offset = 4 if idx == 2 else 1
                dense_feat_value.append(math.log(offset + float(raw)))

        # Categorical features go through embeddings in DCN, so only the
        # embedding index is needed. Missing and unknown values map to 0.
        sparse_feat_idx = []
        for idx in categorical_range_:
            raw = features[idx]
            if raw == '':
                sparse_feat_idx.append(0)
            else:
                sparse_feat_idx.append(feat_dict_['C' + str(idx)].get(raw, 0))

        return sparse_feat_idx, dense_feat_value, [int(features[0])]

    sparse_features_idxs, dense_features_values, labels = [], [], []
    with open(fname.strip(), 'r') as fin:
        for line in fin:
            sparse_feat_idx, dense_feat_value, label = _process_line(line)
            sparse_features_idxs.append(sparse_feat_idx)
            dense_features_values.append(dense_feat_value)
            labels.append(label)

    # Fixed dtypes and an explicit 2-D shape keep the result well-formed even
    # for an empty file (np.array([]) alone would be 1-D and break the
    # row/column indexing below and in the callers).
    sparse_features_idxs = np.array(sparse_features_idxs, dtype=np.int64).reshape(
        -1, len(categorical_range_))
    dense_features_values = np.array(dense_features_values, dtype=np.float64).reshape(
        -1, len(continuous_range_))
    labels = np.array(labels, dtype=np.int32).reshape(-1, 1)

    # Shuffle all three arrays with the same permutation.
    if shuffle:
        idx_list = np.arange(len(labels))
        np.random.shuffle(idx_list)
        sparse_features_idxs = sparse_features_idxs[idx_list, :]
        dense_features_values = dense_features_values[idx_list, :]
        labels = labels[idx_list, :]
    return sparse_features_idxs, dense_features_values, labels
410 |
411 | if __name__ == '__main__':
412 | train_DeepFM_model_demo(DEVICE)
413 |
--------------------------------------------------------------------------------
/Model/DeepFM_PyTorch.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import math
4 | import pickle
5 | import torch
6 | import numpy as np
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | from sklearn.metrics import roc_auc_score
10 |
11 | EPOCHS = 10
12 | BATCH_SIZE = 2048
13 | AID_DATA_DIR = '../data/Criteo/forDeepFM/' # 辅助用途的文件路径
14 | DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15 |
16 | """
17 | PyTorch implementation of DeepFM[1]
18 |
19 | Reference:
20 | [1] DeepFM: A Factorization-Machine based Neural Network for CTR Prediction,
21 | Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li, Xiuqiang He
22 | [2] Tensorflow implementation of DeepFM for CTR prediction
23 | https://github.com/ChenglongChen/tensorflow-DeepFM
24 | [3] PaddlePaddle implementation of DeepFM for CTR prediction
25 | https://github.com/PaddlePaddle/models/tree/develop/PaddleRec/ctr/deepfm
26 | """
27 |
class DeepFM(nn.Module):
    """DeepFM: first-order term, FM second-order term and an MLP, combined by a linear layer.

    Args:
        num_feat: size of the feature vocabulary, denoted M.
        num_field: number of fields per sample, denoted F.
        dropout_deep: dropout probabilities for the deep part; entry 0 is
            applied to the deep input, entry ``i`` after deep layer ``i``.
        dropout_fm: two dropout probabilities, for the first-order and the
            second-order term.
        reg_l1: L1 coefficient; only stored on the module.
        reg_l2: L2 coefficient; only stored on the module.
        layer_sizes: number of units of every deep layer; defaults to
            ``[400, 400, 400]``.
        embedding_size: embedding dimension, denoted K.
    """

    def __init__(self, num_feat, num_field, dropout_deep, dropout_fm,
                 reg_l1=0.01, reg_l2=0.01, layer_sizes=None, embedding_size=10):
        super(DeepFM, self).__init__()
        # A None default avoids sharing one mutable list between instances.
        layer_sizes = [400, 400, 400] if layer_sizes is None else list(layer_sizes)

        self.reg_l1 = reg_l1
        self.reg_l2 = reg_l2  # neither coefficient is applied inside the model
        self.num_feat = num_feat  # denoted M
        self.num_field = num_field  # denoted F
        self.embedding_size = embedding_size  # denoted K
        self.layer_sizes = layer_sizes

        self.dropout_deep = dropout_deep
        self.dropout_fm = dropout_fm

        # First-order term: one scalar weight per feature.
        self.first_weights = nn.Embedding(num_feat, 1)  # M * 1
        nn.init.xavier_uniform_(self.first_weights.weight.data)

        # Feature embeddings, shared by the FM and the deep part.
        self.feat_embeddings = nn.Embedding(num_feat, embedding_size)  # M * K
        nn.init.xavier_uniform_(self.feat_embeddings.weight.data)

        # Deep network: Linear -> BatchNorm -> ReLU -> Dropout per layer.
        # Attribute names are kept so that saved state dicts stay loadable.
        all_dims = [self.num_field * self.embedding_size] + layer_sizes
        for i in range(1, len(layer_sizes) + 1):
            setattr(self, 'linear_' + str(i), nn.Linear(all_dims[i - 1], all_dims[i]))
            setattr(self, 'batchNorm_' + str(i), nn.BatchNorm1d(all_dims[i]))
            setattr(self, 'dropout_' + str(i), nn.Dropout(dropout_deep[i]))

        # Final linear layer over [first order (F), second order (K), deep output].
        self.fc = nn.Linear(num_field + embedding_size + all_dims[-1], 1)

    def forward(self, feat_index, feat_value):
        """Return one logit per sample.

        Args:
            feat_index: feature indices, None * F.
            feat_value: feature values, None * F.

        Returns:
            The output of the final linear layer, None * 1.
        """
        feat_value = torch.unsqueeze(feat_value, dim=2)  # None * F * 1

        # Step 1: first-order (linear) part.
        # F.dropout with training=self.training is used throughout forward():
        # an nn.Dropout built on the fly is always in training mode and would
        # keep dropping units after model.eval().
        first_weights = self.first_weights(feat_index)  # None * F * 1
        first_weight_value = torch.mul(first_weights, feat_value)
        y_first_order = torch.sum(first_weight_value, dim=2)  # None * F
        y_first_order = F.dropout(y_first_order, p=self.dropout_fm[0], training=self.training)

        # Step 2: second-order part.
        secd_feat_emb = self.feat_embeddings(feat_index)  # None * F * K
        feat_emd_value = secd_feat_emb * feat_value  # None * F * K (broadcast)

        # Square of the sum over fields.
        summed_feat_emb = torch.sum(feat_emd_value, 1)  # None * K
        interaction_part1 = torch.pow(summed_feat_emb, 2)  # None * K

        # Sum over fields of the squares.
        squared_feat_emd_value = torch.pow(feat_emd_value, 2)  # None * F * K
        interaction_part2 = torch.sum(squared_feat_emd_value, dim=1)  # None * K

        y_secd_order = 0.5 * torch.sub(interaction_part1, interaction_part2)
        y_secd_order = F.dropout(y_secd_order, p=self.dropout_fm[1], training=self.training)

        # Step 3: deep part.
        y_deep = feat_emd_value.reshape(-1, self.num_field * self.embedding_size)  # None * (F * K)
        y_deep = F.dropout(y_deep, p=self.dropout_deep[0], training=self.training)

        for i in range(1, len(self.layer_sizes) + 1):
            y_deep = getattr(self, 'linear_' + str(i))(y_deep)
            y_deep = getattr(self, 'batchNorm_' + str(i))(y_deep)
            y_deep = F.relu(y_deep)
            y_deep = getattr(self, 'dropout_' + str(i))(y_deep)

        concat_input = torch.cat((y_first_order, y_secd_order, y_deep), dim=1)
        output = self.fc(concat_input)
        return output
97 |
98 |
99 | """ ************************************************************************************ """
100 | """ 训练和测试FM模型 """
101 | """ ************************************************************************************ """
def train_DeepFM_model_demo(device):
    """
    Train a DeepFM model on the Criteo files and evaluate it after every epoch.

    :param device: torch device passed on to train() and test() for the batch tensors
    :return: None
    """
    train_dir = AID_DATA_DIR + 'train_data/'
    test_dir = AID_DATA_DIR + 'test_data/'
    train_filelist = ["%s%s" % (train_dir, x) for x in os.listdir(train_dir)]
    test_filelist = ["%s%s" % (test_dir, x) for x in os.listdir(test_dir)]

    # Sort the files by the number formed by all digits found in their path.
    train_file_id = [int(re.sub(r'[\D]', '', x)) for x in train_filelist]
    train_filelist = [train_filelist[idx] for idx in np.argsort(train_file_id)]

    test_file_id = [int(re.sub(r'[\D]', '', x)) for x in test_filelist]
    test_filelist = [test_filelist[idx] for idx in np.argsort(test_file_id)]

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load feature dictionaries produced by this project's own preprocessing.
    with open(AID_DATA_DIR + 'aid_data/feat_dict_10.pkl2', 'rb') as fin:
        feat_dict_ = pickle.load(fin)

    # num_feat gets one extra slot for the missing-value handling.
    deepfm = DeepFM(num_feat=len(feat_dict_) + 1, num_field=39,
                    dropout_deep=[0.5, 0.5, 0.5, 0.5], dropout_fm=[0, 0],
                    layer_sizes=[400, 400, 400], embedding_size=10).to(DEVICE)
    print("Start Training DeepFM Model!")

    # The loss is computed inside train()/test(); only the optimizer lives here.
    optimizer = torch.optim.Adam(deepfm.parameters())

    # Count the number of train and test samples.
    train_item_count = get_in_filelist_item_num(train_filelist)
    test_item_count = get_in_filelist_item_num(test_filelist)

    # The data set is large and a custom pytorch Dataset was found too slow,
    # so train()/test() read the files manually.
    for epoch in range(1, EPOCHS + 1):
        deepfm.train()  # training behaviour for BatchNorm / Dropout
        train(deepfm, train_filelist, train_item_count, feat_dict_, device, optimizer, epoch)
        deepfm.eval()  # evaluation must use running statistics and no dropout
        test(deepfm, test_filelist, test_item_count, feat_dict_, device)
134 |
135 |
def get_in_filelist_item_num(filelist):
    """Return the total number of lines over all files named in *filelist*.

    Surrounding whitespace of every path is stripped before the file is opened.
    """
    total = 0
    for path in filelist:
        with open(path.strip(), 'r') as handle:
            total += sum(1 for _ in handle)
    return total
143 |
144 |
def test(model, test_filelist, test_item_count, feat_dict_, device):
    """
    Evaluate the model on the test files and print the ROC AUC and average loss.

    The files are read one at a time and consumed in batches of BATCH_SIZE
    samples; a batch may span a file boundary.

    :param model: module called as model(index_tensor, value_tensor)
    :param test_filelist: test file paths, in the order they are to be read
    :param test_item_count: total number of samples (lines) over all test files
    :param feat_dict_: feature dictionary passed to get_idx_value_label
    :param device: torch device the batch tensors are moved to
    :return: None
    """
    fname_idx = 0
    pred_y, true_y = [], []
    features_idxs, features_values, labels = None, None, None
    test_loss = 0.0
    num_batches = math.ceil(test_item_count / BATCH_SIZE)
    with torch.no_grad():
        pre_file_data_count = 0  # samples contained in the files already finished
        for batch_idx in range(num_batches):
            # Global [st_idx, ed_idx) range of this batch. The end is exclusive,
            # so it is capped at the item count itself; capping at count - 1
            # would silently drop the last sample.
            st_idx = batch_idx * BATCH_SIZE
            ed_idx = min((batch_idx + 1) * BATCH_SIZE, test_item_count)

            if features_idxs is None:
                features_idxs, features_values, labels = get_idx_value_label(
                    test_filelist[fname_idx], feat_dict_, shuffle=False)

            # Translate to positions inside the currently loaded file.
            st_idx -= pre_file_data_count
            ed_idx -= pre_file_data_count

            idx_parts, value_parts, label_parts = [], [], []
            while ed_idx > len(features_idxs):
                # The batch runs past the loaded file: keep its tail and
                # continue at the start of the next file.
                idx_parts.append(features_idxs[st_idx:, :])
                value_parts.append(features_values[st_idx:, :])
                label_parts.append(labels[st_idx:, :])

                file_len = len(features_idxs)
                pre_file_data_count += file_len
                ed_idx -= file_len
                st_idx = 0
                fname_idx += 1
                features_idxs, features_values, labels = get_idx_value_label(
                    test_filelist[fname_idx], feat_dict_, shuffle=False)

            idx_parts.append(features_idxs[st_idx:ed_idx, :])
            value_parts.append(features_values[st_idx:ed_idx, :])
            label_parts.append(labels[st_idx:ed_idx, :])

            batch_fea_idxs = np.vstack(idx_parts)
            batch_fea_values = torch.from_numpy(np.vstack(value_parts))
            batch_labels = torch.from_numpy(np.vstack(label_parts))

            # Convert to the tensor types the model is called with.
            idx = torch.LongTensor([[int(x) for x in x_idx] for x_idx in batch_fea_idxs])
            idx = idx.to(device)
            value = batch_fea_values.to(device, dtype=torch.float32)
            target = batch_labels.to(device, dtype=torch.float32)
            output = model(idx, value)

            test_loss += F.binary_cross_entropy_with_logits(output, target).item()

            # Flatten so that the metric receives 1-D score/label vectors.
            pred_y.extend(output.cpu().numpy().ravel())
            true_y.extend(target.cpu().numpy().ravel())

        print('Roc AUC: %.5f' % roc_auc_score(y_true=np.array(true_y), y_score=np.array(pred_y)))
        test_loss /= num_batches
        print('Test set: Average loss: {:.5f}'.format(test_loss))
211 |
212 |
def train(model, train_filelist, train_item_count, feat_dict_, device, optimizer, epoch):
    """Run one training epoch over the files in *train_filelist*.

    Files are loaded one at a time through ``get_idx_value_label`` and consumed
    in consecutive windows of BATCH_SIZE samples; a window that runs past the
    end of a file is completed from the following file(s).

    :param model: callable as ``model(idx, value)``, returning logits
    :param train_filelist: ordered list of training file names
    :param train_item_count: total number of samples across ``train_filelist``
    :param feat_dict_: feature dictionary passed to ``get_idx_value_label``
    :param device: torch device the batches are moved to
    :param optimizer: optimizer stepping the parameters of ``model``
    :param epoch: epoch number, used only in the progress message
    """
    model.train()
    fname_idx = 0
    features_idxs, features_values, labels = None, None, None
    batch_num = math.ceil(train_item_count / BATCH_SIZE)

    pre_file_data_count = 0  # samples contained in files that are already fully consumed
    for batch_idx in range(batch_num):
        # Global [st_idx, ed_idx) window of this batch. The end is exclusive, so it
        # is capped at train_item_count (capping at train_item_count - 1 silently
        # dropped the last sample of every epoch).
        st_idx = batch_idx * BATCH_SIZE
        ed_idx = min(st_idx + BATCH_SIZE, train_item_count)

        if features_idxs is None:
            features_idxs, features_values, labels = get_idx_value_label(train_filelist[fname_idx], feat_dict_)

        # Convert the window to positions inside the currently loaded file
        st_idx -= pre_file_data_count
        ed_idx -= pre_file_data_count

        # While the window runs past the loaded file, keep its tail and move on to
        # the next file. A window ending exactly at the end of a file does NOT
        # trigger a load, so the final batch never indexes past the file list.
        parts = []
        while ed_idx > len(features_idxs):
            if st_idx < len(features_idxs):
                parts.append((features_idxs[st_idx:], features_values[st_idx:], labels[st_idx:]))
            pre_file_data_count += len(features_idxs)
            ed_idx -= len(features_idxs)
            st_idx = 0
            fname_idx += 1
            features_idxs, features_values, labels = get_idx_value_label(train_filelist[fname_idx], feat_dict_)
        parts.append((features_idxs[st_idx:ed_idx], features_values[st_idx:ed_idx], labels[st_idx:ed_idx]))

        if len(parts) == 1:
            batch_fea_idxs, batch_fea_values, batch_labels = parts[0]
        else:
            batch_fea_idxs, batch_fea_values, batch_labels = (
                np.vstack(group) for group in zip(*parts))

        # Convert to tensors on the target device
        idx = torch.from_numpy(batch_fea_idxs).to(device, dtype=torch.long)
        value = torch.from_numpy(batch_fea_values).to(device, dtype=torch.float32)
        target = torch.from_numpy(batch_labels).to(device, dtype=torch.float32)

        optimizer.zero_grad()
        output = model(idx, value)
        loss = F.binary_cross_entropy_with_logits(output, target)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=100)
        optimizer.step()
        if batch_idx % 1000 == 0:
            print('Train Epoch: {} [{} / {} ({:.0f}%]\tLoss:{:.6f}'.format(
                epoch, batch_idx * len(idx), train_item_count,
                100. * batch_idx / batch_num, loss.item()))
276 |
277 |
def get_idx_value_label(fname, feat_dict_, shuffle=True):
    """Parse one tab-separated Criteo part file into index/value/label arrays.

    Column 0 of every line is the label, columns 1-13 are continuous features
    and columns 14-39 are categorical features.

    * A continuous feature is min-max scaled with the hard-coded per-column
      bounds below and indexed by ``feat_dict_[column]``; an empty cell
      becomes index 0 with value 0.0.
    * A categorical feature found in ``feat_dict_`` gets that index with value
      1.0; an empty or unknown one becomes index 0 with value 0.0.

    :param fname: file name (surrounding whitespace is stripped)
    :param feat_dict_: mapping from column number / category string to index
    :param shuffle: when true, rows are permuted with ``np.random.shuffle``
    :return: (feature indices, feature values, labels); labels are int32 with
        one column
    """
    cont_cols = range(1, 14)
    cat_cols = range(14, 40)
    lower = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    upper = [5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019, 46, 231, 4008, 7393]
    span = [upper[i] - lower[i] for i in range(len(lower))]

    idx_rows, value_rows, label_rows = [], [], []
    with open(fname.strip(), 'r') as fin:
        for raw in fin:
            fields = raw.rstrip('\n').split('\t')
            row_idx, row_val = [], []

            # Continuous columns: min-max scaling
            for col in cont_cols:
                cell = fields[col]
                if cell != '':
                    row_idx.append(feat_dict_[col])
                    row_val.append((float(cell) - lower[col - 1]) / span[col - 1])
                else:
                    row_idx.append(0)
                    row_val.append(0.0)

            # Categorical columns: dictionary lookup
            for col in cat_cols:
                cell = fields[col]
                if cell != '' and cell in feat_dict_:
                    row_idx.append(feat_dict_[cell])
                    row_val.append(1.0)
                else:
                    row_idx.append(0)
                    row_val.append(0.0)

            idx_rows.append(row_idx)
            value_rows.append(row_val)
            label_rows.append([int(fields[0])])

    features_idxs = np.array(idx_rows)
    features_values = np.array(value_rows)
    labels = np.array(label_rows).astype(np.int32)

    if not shuffle:
        return features_idxs, features_values, labels

    # Apply one shared random permutation to all three arrays
    order = np.arange(len(features_idxs))
    np.random.shuffle(order)
    return features_idxs[order, :], features_values[order, :], labels[order, :]
331 |
# Script entry point: run the DeepFM demo on the module-level DEVICE.
if __name__ == '__main__':
    train_DeepFM_model_demo(DEVICE)
334 |
--------------------------------------------------------------------------------
/Model/FFM_PyTorch.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import pandas as pd
4 | import torch.nn.functional as F
5 | import torch.nn as nn
6 | from util.load_data_util import get_batch_loader
7 | from sklearn import preprocessing
8 |
EPOCHS = 1000  # number of train/test rounds run by train_FFM_model_demo
BATCH_SIZE = 1000  # batch size handed to get_batch_loader
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
12 |
13 |
14 | """ ************************************************************************************ """
15 | """ 数据读取和转换 """
16 | """ ************************************************************************************ """
def load_dataset(data_dir='../data/Movielens100K'):
    """Load MovieLens-100K and build the inputs of the FFM demo.

    Same loading as in the FM model, plus a feature -> field mapping. User
    attributes (gender, occupation, binned age) are one-hot encoded and joined
    with the movie genre flags; ``user_id`` and ``item_id`` are only used for
    the join and are not part of the feature matrix.

    :param data_dir: directory containing ``u.user``, ``u.item``, ``ua.base``
        and ``ua.test``. The default points at the data shipped with this
        repository (the previous hard-coded ``../data/FM-Data`` directory is
        not part of the repository).
    :return: ``(x_train, y_train, x_test, y_test, feature2field)``; labels are
        int32 class ids in ``[0, n_class - 1]`` and ``feature2field`` maps a
        feature column position to its field id.
    """
    import os  # local import: this module does not import os at file level

    # Step 1: user and item side information
    user_header = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
    df_user = pd.read_csv(os.path.join(data_dir, 'u.user'), sep='|', names=user_header)
    item_header = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action',
                   'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                   'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    df_item = pd.read_csv(os.path.join(data_dir, 'u.item'), sep='|', names=item_header, encoding="ISO-8859-1")
    df_item = df_item.drop(columns=['title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown'])

    df_user['age'] = pd.cut(df_user['age'], [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                            labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90',
                                    '90-100'])
    df_user = pd.get_dummies(df_user, columns=['gender', 'occupation', 'age'])
    df_user = df_user.drop(columns=['zip_code'])

    user_features = df_user.columns.values.tolist()
    movie_features = df_item.columns.values.tolist()
    cols = user_features + movie_features
    cols.remove('user_id')
    cols.remove('item_id')

    # Step 2: group the features into fields.
    # One-hot columns are named "<field>_<value>" (gender, occupation, age);
    # every column without an underscore (the genre flags) goes to one shared
    # "other" field, which receives the last field id.
    field_index, feature2field = {}, {}
    other_idxs = []
    for idx, col in enumerate(cols):
        field, sep, _ = col.partition('_')
        if sep:
            field_index.setdefault(field, len(field_index))
            feature2field[idx] = field_index[field]
        else:
            other_idxs.append(idx)
    other_field = len(field_index)
    for idx in other_idxs:
        feature2field[idx] = other_field

    # Step 3: join the ratings with the user and item features
    rating_header = ['user_id', 'item_id', 'rating', 'timestamp']
    df_train = pd.read_csv(os.path.join(data_dir, 'ua.base'), sep='\t', names=rating_header)
    df_train = df_train.merge(df_user, on='user_id', how='left')
    df_train = df_train.merge(df_item, on='item_id', how='left')

    df_test = pd.read_csv(os.path.join(data_dir, 'ua.test'), sep='\t', names=rating_header)
    df_test = df_test.merge(df_user, on='user_id', how='left')
    df_test = df_test.merge(df_item, on='item_id', how='left')

    # Step 4: remap the labels.
    # The raw ratings are [1, 2, 3, 4, 5]; the loss on CUDA requires class ids
    # in [0, n_class - 1], so the sorted distinct ratings are mapped to 0..n-1.
    label_set = sorted(set(df_train['rating']) | set(df_test['rating']))
    map_dict = {label: class_id for class_id, label in enumerate(label_set)}

    df_train['rating'] = df_train['rating'].map(map_dict)
    df_test['rating'] = df_test['rating'].map(map_dict)

    # For binary classification, the ratings could instead be mapped with
    # ``1 if int(x) == 1 else 0``.

    # Step 5: export as numpy arrays
    train_labels = np.array(df_train['rating'].astype(np.int32))
    test_labels = np.array(df_test['rating'].astype(np.int32))
    return df_train[cols].values, train_labels, df_test[cols].values, test_labels, feature2field
88 |
89 |
90 | """ ************************************************************************************ """
91 | """ FFM层 """
92 | """ ************************************************************************************ """
class FFM_layer(nn.Module):
    """Field-aware factorization machine with a log-softmax output.

    For an input row ``x`` the score of each class is::

        linear(x) + sum_{i<j} <v[i, field(j)], v[j, field(i)]> * x_i * x_j

    :param field_map_dict: mapping from feature position to field id
    :param fea_num: number of input features
    :param reg_l1: L1 coefficient, stored for use by the training loop
    :param reg_l2: L2 coefficient, stored for use by the training loop
    :param class_num: number of output classes
    :param latent_factor_dim: size of each latent vector
    """

    def __init__(self, field_map_dict, fea_num, reg_l1=0.01, reg_l2=0.01, class_num=1, latent_factor_dim=10):
        super(FFM_layer, self).__init__()
        self.reg_l1 = reg_l1
        self.reg_l2 = reg_l2
        self.fea_num = fea_num
        self.field_map_dict = field_map_dict  # feature position -> field id

        # The number of fields is derived from the field ids themselves.
        # len(field_map_dict) is the number of *features*, which allocated a far
        # larger latent tensor than forward() ever indexes.
        field_num = max(field_map_dict.values(), default=-1) + 1
        self.linear = nn.Linear(fea_num, class_num)  # bias and linear term
        self.v = nn.Parameter(torch.randn(fea_num, field_num, latent_factor_dim, class_num))

    def forward(self, x):
        """Return per-class log-probabilities for a batch ``x``.

        ``x`` is indexed as ``x[:, i]`` and passed to the linear layer, i.e. it
        is a 2-D batch with ``fea_num`` columns.
        """
        # Linear part
        linear_part = self.linear(x)

        # Pairwise interaction part
        interaction_part = 0.0
        for i in range(self.fea_num):
            for j in range(i + 1, self.fea_num):
                v_ifj = self.v[i, self.field_map_dict[j], :, :]
                v_jfi = self.v[j, self.field_map_dict[i], :, :]

                # (batch, 1) feature product times (1, class_num) pair weight
                xij = torch.unsqueeze(x[:, i] * x[:, j], dim=1)
                v_ijji = torch.unsqueeze(torch.sum(v_ifj * v_jfi, dim=0), dim=0)

                interaction_part = interaction_part + torch.mm(xij, v_ijji)

        output = linear_part + interaction_part
        return torch.log_softmax(output, dim=1)
124 |
125 |
126 | """ ************************************************************************************ """
127 | """ 训练和测试FM模型 """
128 | """ ************************************************************************************ """
def train_FFM_model_demo():
    """Train and evaluate the FFM model on MovieLens-100K for EPOCHS rounds."""

    # Step 1: load the data.
    # Standardise with statistics estimated on the training set only and reuse
    # them for the test set; scaling the test set with its own mean/std puts
    # the two sets on different scales.
    x_train, y_train, x_test, y_test, feature2field = load_dataset()
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    class_num = len(set(y_train) | set(y_test))

    # FFM model
    ffm = FFM_layer(field_map_dict=feature2field, fea_num=x_train.shape[1], reg_l1=0.01, reg_l2=0.01,
                    class_num=class_num, latent_factor_dim=10).to(DEVICE)

    # Optimizer (the loss itself is computed inside train/test)
    optm = torch.optim.Adam(ffm.parameters())

    train_loader = get_batch_loader(x_train, y_train, BATCH_SIZE, shuffle=True)
    test_loader = get_batch_loader(x_test, y_test, BATCH_SIZE, shuffle=False)

    for epoch in range(1, EPOCHS + 1):
        train(ffm, DEVICE, train_loader, optm, epoch)
        test(ffm, DEVICE, test_loader)
150 |
151 |
def train(model, device, train_loader, optimizer, epoch):
    """Run one optimisation pass over *train_loader*.

    The loss of a batch is the NLL of the model output plus L1 and L2
    penalties over all model parameters, weighted by ``model.reg_l1`` and
    ``model.reg_l2``. A progress line is printed on every 500th batch.
    """
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device, dtype=torch.float32)
        labels = labels.to(device).long()

        optimizer.zero_grad()
        prediction = model(inputs)
        loss = F.nll_loss(prediction, labels)

        # L1 + L2 penalty over every parameter
        penalty = 0
        for weight in model.parameters():
            penalty = penalty + model.reg_l1 * torch.sum(torch.abs(weight))
            penalty = penalty + model.reg_l2 * torch.sum(torch.pow(weight, 2))
        loss = loss + penalty

        loss.backward()
        optimizer.step()

        if (step + 1) % 500 != 0:
            continue
        print('Train Epoch: {} [{} / {} ({:.0f}%]\tLoss:{:.6f}'.format(
            epoch, step * len(inputs), len(train_loader.dataset),
            100. * step / len(train_loader), loss.item()
        ))
173 |
174 |
def test(model, device, test_loader):
    """Evaluate *model* on *test_loader* and print average loss and accuracy.

    For every batch the summed NLL plus the L1/L2 parameter penalty is added
    to the running loss; the total is divided by the dataset size at the end.
    """
    model.eval()
    running_loss, hits = 0, 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device, dtype=torch.float32)
            labels = labels.to(device).long()
            prediction = model(inputs)
            running_loss += F.nll_loss(prediction, labels, reduction='sum').item()

            # L1 + L2 penalty over every parameter (added once per batch)
            penalty = 0
            for weight in model.parameters():
                penalty += model.reg_l1 * torch.sum(torch.abs(weight))
                penalty += model.reg_l2 * torch.sum(torch.pow(weight, 2))
            running_loss += penalty

            # Predicted class = position of the largest log-probability
            _, winners = prediction.max(1, keepdim=True)
            hits += winners.eq(labels.view_as(winners)).sum().item()

    sample_count = len(test_loader.dataset)
    running_loss /= sample_count
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        running_loss, hits, sample_count, 100. * hits / sample_count
    ))
197 |
198 |
# Script entry point: run the FFM demo.
if __name__ == '__main__':
    train_FFM_model_demo()
--------------------------------------------------------------------------------
/Model/FM_PyTorch.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import pandas as pd
4 | import torch.nn.functional as F
5 | import torch.nn as nn
6 | from sklearn import preprocessing
7 | from util.load_data_util import get_batch_loader
8 |
EPOCHS = 500  # number of train/test rounds run by train_FM_model_demo
BATCH_SIZE = 1000  # batch size handed to get_batch_loader
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
12 |
13 |
14 | """ ************************************************************************************ """
15 | """ 数据读取和转换 """
16 | """ ************************************************************************************ """
def load_dataset(data_dir='../data/Movielens100K'):
    """Load MovieLens-100K and build the feature matrices of the FM demo.

    User attributes (gender, occupation, binned age) are one-hot encoded and
    joined with the movie genre flags. ``user_id`` and ``item_id`` are kept as
    feature columns.

    :param data_dir: directory containing ``u.user``, ``u.item``, ``ua.base``
        and ``ua.test``. The default points at the data shipped with this
        repository (the previous hard-coded ``../data/FM-Data`` directory is
        not part of the repository).
    :return: ``(x_train, y_train, x_test, y_test)``; labels are int32 class
        ids in ``[0, n_class - 1]``.
    """
    import os  # local import: this module does not import os at file level

    user_header = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
    df_user = pd.read_csv(os.path.join(data_dir, 'u.user'), sep='|', names=user_header)
    item_header = ['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action',
                   'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                   'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
    df_item = pd.read_csv(os.path.join(data_dir, 'u.item'), sep='|', names=item_header, encoding="ISO-8859-1")
    df_item = df_item.drop(columns=['title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown'])

    df_user['age'] = pd.cut(df_user['age'], [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                            labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90',
                                    '90-100'])
    df_user = pd.get_dummies(df_user, columns=['gender', 'occupation', 'age'])
    df_user = df_user.drop(columns=['zip_code'])

    user_features = df_user.columns.values.tolist()
    movie_features = df_item.columns.values.tolist()
    cols = user_features + movie_features

    rating_header = ['user_id', 'item_id', 'rating', 'timestamp']
    df_train = pd.read_csv(os.path.join(data_dir, 'ua.base'), sep='\t', names=rating_header)
    df_train = df_train.merge(df_user, on='user_id', how='left')
    df_train = df_train.merge(df_item, on='item_id', how='left')

    df_test = pd.read_csv(os.path.join(data_dir, 'ua.test'), sep='\t', names=rating_header)
    df_test = df_test.merge(df_user, on='user_id', how='left')
    df_test = df_test.merge(df_item, on='item_id', how='left')

    # The raw ratings are [1, 2, 3, 4, 5]; the loss on CUDA requires class ids
    # in [0, n_class - 1], so the sorted distinct ratings are mapped to 0..n-1.
    label_set = sorted(set(df_train['rating']) | set(df_test['rating']))
    map_dict = {label: class_id for class_id, label in enumerate(label_set)}

    df_train['rating'] = df_train['rating'].map(map_dict)
    df_test['rating'] = df_test['rating'].map(map_dict)

    # For binary classification, the ratings could instead be mapped with
    # ``1 if int(x) == 1 else 0``.

    train_labels = np.array(df_train['rating'].astype(np.int32))
    test_labels = np.array(df_test['rating'].astype(np.int32))
    return df_train[cols].values, train_labels, df_test[cols].values, test_labels
63 |
64 |
65 | """ ************************************************************************************ """
66 | """ FM层 """
67 | """ ************************************************************************************ """
class FM_layer(nn.Module):
    """Factorization machine with a log-softmax output.

    For an input row ``x`` the score of each class is::

        linear(x) + 0.5 * sum_k ((sum_i v[i, k] x_i)^2 - sum_i v[i, k]^2 x_i^2)

    which equals the sum over feature pairs ``i < j`` of
    ``<v_i, v_j> * x_i * x_j`` added to the linear term.

    :param reg_l1: L1 coefficient, stored for use by the training loop
    :param reg_l2: L2 coefficient, stored for use by the training loop
    :param class_num: number of output classes
    :param feature_num: number of input features
    :param latent_factor_dim: size of each latent vector
    """

    def __init__(self, reg_l1=0.01, reg_l2=0.01, class_num=1, feature_num=10, latent_factor_dim=5):
        super(FM_layer, self).__init__()
        self.reg_l1 = reg_l1
        self.reg_l2 = reg_l2
        self.fea_num = feature_num
        self.k = latent_factor_dim
        self.class_num = class_num
        self.linear = nn.Linear(self.fea_num, class_num)  # bias and first-order term
        self.v = nn.Parameter(torch.randn(self.fea_num, self.k, class_num))  # interaction factors

    def forward(self, x):
        """Return log-probabilities of shape (batch, class_num) for a 2-D batch ``x``."""
        # Linear part
        linear_part = self.linear(x)

        # Interaction part, computed with matrix products:
        # (class, k, fea) @ (fea, batch) -> (class, k, batch) -> (batch, k, class)
        xv = torch.matmul(self.v.permute(2, 1, 0), x.T).permute(2, 1, 0)
        square_of_sum = torch.sum(torch.pow(xv, 2), dim=1)

        x_square, v_square = torch.pow(x, 2), torch.pow(self.v, 2)
        x2v2 = torch.matmul(v_square.permute(2, 1, 0), x_square.T).permute(2, 1, 0)
        sum_of_square = torch.sum(x2v2, dim=1)

        # Both terms are kept as (batch, class_num). Squeezing dim 1 collapsed them
        # to (batch,) when class_num == 1, and the sum with the (batch, 1) linear
        # part then broadcast to (batch, batch).
        interaction_part = 0.5 * (square_of_sum - sum_of_square)

        output = linear_part + interaction_part
        return F.log_softmax(output, dim=1)
99 |
100 |
101 | """ ************************************************************************************ """
102 | """ 训练和测试FM模型 """
103 | """ ************************************************************************************ """
def train_FM_model_demo():
    """Train and evaluate the FM model on MovieLens-100K for EPOCHS rounds."""

    # Step 1: load the data.
    # Standardise with statistics estimated on the training set only and reuse
    # them for the test set; scaling the test set with its own mean/std puts
    # the two sets on different scales.
    x_train, y_train, x_test, y_test = load_dataset()
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    class_num = len(set(y_train) | set(y_test))

    # FM model
    fm = FM_layer(class_num=class_num, feature_num=x_train.shape[1], latent_factor_dim=40).to(DEVICE)

    # Optimizer (the loss itself is computed inside train/test)
    optm = torch.optim.Adam(fm.parameters())

    train_loader = get_batch_loader(x_train, y_train, BATCH_SIZE, shuffle=True)
    test_loader = get_batch_loader(x_test, y_test, BATCH_SIZE, shuffle=False)

    for epoch in range(1, EPOCHS + 1):
        train(fm, DEVICE, train_loader, optm, epoch)
        test(fm, DEVICE, test_loader)
124 |
125 |
def train(model, device, train_loader, optimizer, epoch):
    """Run one optimisation pass over *train_loader*.

    The data loss is ``F.cross_entropy`` when ``model.class_num == 2`` and
    ``F.nll_loss`` otherwise; L1 and L2 penalties over all parameters, weighted
    by ``model.reg_l1`` and ``model.reg_l2``, are added on top. A progress line
    is printed on every 500th batch.
    """
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device, dtype=torch.float32)
        labels = labels.to(device).long()

        optimizer.zero_grad()
        prediction = model(inputs)

        loss_fn = F.cross_entropy if model.class_num == 2 else F.nll_loss
        loss = loss_fn(prediction, labels)

        # L1 + L2 penalty over every parameter
        penalty = 0
        for weight in model.parameters():
            penalty = penalty + model.reg_l1 * torch.sum(torch.abs(weight))
            penalty = penalty + model.reg_l2 * torch.sum(torch.pow(weight, 2))
        loss = loss + penalty

        loss.backward()
        optimizer.step()

        if (step + 1) % 500 != 0:
            continue
        print('Train Epoch: {} [{} / {} ({:.0f}%]\tLoss:{:.6f}'.format(
            epoch, step * len(inputs), len(train_loader.dataset),
            100. * step / len(train_loader), loss.item()
        ))
152 |
def test(model, device, test_loader):
    """Evaluate *model* on *test_loader* and print average loss and accuracy.

    For every batch the summed NLL plus the L1/L2 parameter penalty is added
    to the running loss; the total is divided by the dataset size at the end.
    """
    model.eval()
    test_loss, correct = 0.0, 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device, dtype=torch.float32), target.to(device).long()
            output = model(data)

            # The model output is already log-probabilities, so the summed NLL is
            # the right per-batch quantity for any class count. The former
            # class_num == 2 branch added a batch *mean* that was later divided
            # by the dataset size again.
            test_loss += F.nll_loss(output, target, reduction='sum').item()

            # L1 + L2 penalty over every parameter (added once per batch)
            regularization_loss = 0
            for param in model.parameters():
                regularization_loss += model.reg_l1 * torch.sum(torch.abs(param))
                regularization_loss += model.reg_l2 * torch.sum(torch.pow(param, 2))
            test_loss += float(regularization_loss)

            # Predicted class = position of the largest log-probability
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        test_loss, correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)
    ))
178 |
179 |
# Script entry point: run the FM demo.
if __name__ == '__main__':
    train_FM_model_demo()
182 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 基于PyTorch框架实现的推荐系统的经典模型
2 |
3 | ### 1. 相关数据集
4 | - ##### Criteo数据集
5 | - 整个数据集包含约4500W条记录. 每一行的第1列为Label, 表示点击与否,
6 | 然后接下来是13个整型特征(I1-I13)以及26个离散型特征(C1-C26)
7 | - 数据集的下载链接为http://labs.criteo.com/2014/02/download-kaggle-display-advertising-challenge-dataset/
8 | - 数据集下载后放置在data/Criteo/目录下
9 | - ##### Movielens100K
10 | - movielens100k数据集 ,包含943个用户对于1682个影片超过10万条评分信息。推荐算法研究最常用的数据集
11 | - 数据集包含 ua.base, ua.test, u.item, u.user 4个文件
12 | - 由于数据集比较小, 这里直接提供放置在data/Movielens100K目录下了
13 | ### 2. 实现的模型:
14 | - ##### FM: Factorization Machine
15 | - 论文链接: https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf
16 | - 使用测试数据集: Movielens100K
17 | - 支持多分类预测(论文没提供测试结果)
18 | - ##### FFM: Field-aware Factorization Machine
19 | - 论文链接: https://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf
20 | - 使用测试数据集: Movielens100K
21 | - 支持多分类预测(论文没提供测试结果)
22 | - ##### DeepFM: Factorization-Machine based Neural Network
23 | - 论文链接: https://www.ijcai.org/proceedings/2017/0239.pdf
24 | - 使用测试数据集: Criteo
25 | - 根据论文的方式将数据集按9:1分成训练集+测试集
26 | - 先运行data/Criteo/forDeepFM/deepFM_dataProcess.py进行数据预处理, 再运行Model/DeepFM_PyTorch.py
27 | - 经过3个Epoch训练之后, AUC可以达到0.795(略低于论文的效果)
28 | - ##### DCN: Deep&Cross Network
29 | - 论文链接: https://arxiv.org/pdf/1708.05123.pdf
30 | - 使用测试数据集: Criteo
31 | - 在处理数据集时, 仅简单地把数据按9:1分成训练集+测试集, 并没有按论文0.9:0.05:0.05的方式来划分
32 | - 先运行data/Criteo/forDCN/DCN_dataProcess.py进行数据预处理, 再运行Model/DeepCrossNetwork_PyTorch.py
33 | - 经过5轮Epoch训练之后, AUC为0.80696, LogLoss为0.44678 (略差于论文的效果), 相信经过更多轮训练的话, 效果会更好. 各轮的结果见下表:
34 |
35 | |轮数|AUC|LogLoss|
36 | |-----|---|-------|
37 | |1Epoch|0.80157|0.45192|
38 | |2Epoch|0.80430|0.44922|
39 | |3Epoch|0.80546|0.44817|
40 | |4Epoch|0.80639|0.44729|
41 | |5Epoch|0.80696|0.44678|
42 |
43 |
--------------------------------------------------------------------------------
/data/Criteo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuking/RecommendationSystem/79842abb12c7cb03454967ccaafd7a365678a8e3/data/Criteo/__init__.py
--------------------------------------------------------------------------------
/data/Criteo/forDCN/DCN_dataProcess.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | from __future__ import print_function, absolute_import, division
4 |
5 | import os
6 | import sys
7 | from collections import Counter
8 |
9 | """
10 | preprocess Criteo train data, generate extra statistic files for model input.
11 | """
# input filename: the raw Criteo training file, one tab-separated sample per line
FILENAME = '../train.txt'

# global vars
CAT_FEATURE_NUM = 'cat_feature_num.txt'  # output: "<column> <vocab size>" per categorical column
INT_FEATURE_MINMAX = 'int_feature_minmax.txt'  # output: "<column> <min> <max>" per integer column
VOCAB_DIR = 'vocab'  # output directory holding one vocab file per categorical column
TRAIN_DIR = 'train'  # output directory for the training part files
TEST_VALID_DIR = 'test_valid'  # output directory for the held-out part files
SPLIT_RATIO = 0.9  # fraction of lines written to TRAIN_DIR by split_data
LINE_NUMS = "line_nums.log"  # not referenced by the functions in this script
FREQ_THR = 10  # minimum occurrence count for a categorical value to enter the vocab

INT_COLUMN_NAMES = ['I' + str(i) for i in range(1, 14)]  # I1..I13
CAT_COLUMN_NAMES = ['C' + str(i) for i in range(1, 27)]  # C1..C26
27 |
28 |
def check_statfiles():
    """
    Tell whether every Criteo statistic file already exists: the categorical
    count file, the integer min/max file and one vocab file per categorical
    column.
    :return: True when all of them exist, False otherwise
    """
    required = [CAT_FEATURE_NUM, INT_FEATURE_MINMAX]
    for column in CAT_COLUMN_NAMES:
        required.append(os.path.join(VOCAB_DIR, column + '.txt'))

    for path in required:
        if not os.path.exists(path):
            return False
    return True
40 |
41 |
def create_statfiles():
    """
    create statistic files of Criteo, including:
    min/max of integer features
    counts of categorical features
    vocabs of each categorical features
    :return:
    :raises ValueError: if a line of FILENAME does not have exactly 40 fields
    """
    int_minmax_list = [[sys.maxsize, -sys.maxsize]
                       for _ in range(13)]  # running [min, max] of each integer feature
    cat_ct_list = [Counter() for _ in range(26)]  # value counts of each categorical feature

    # The input is read inside a context manager so the handle is always closed.
    with open(FILENAME) as fin:
        for line_no, line in enumerate(fin, 1):
            spls = line.rstrip('\n').split('\t')
            # Explicit check instead of assert, which is skipped under ``python -O``
            if len(spls) != 40:
                raise ValueError('{}: line {} has {} fields, expected 40'.format(
                    FILENAME, line_no, len(spls)))

            for i in range(13):
                if not spls[1 + i]:
                    continue  # missing integer value
                int_val = int(spls[1 + i])
                int_minmax_list[i][0] = min(int_minmax_list[i][0], int_val)
                int_minmax_list[i][1] = max(int_minmax_list[i][1], int_val)

            for i in range(26):
                cat_ct_list[i][spls[14 + i]] += 1

    # save min max of integer features
    with open(INT_FEATURE_MINMAX, 'w') as f:
        for name, minmax in zip(INT_COLUMN_NAMES, int_minmax_list):
            print("{} {} {}".format(name, minmax[0], minmax[1]), file=f)

    # drop the empty value and keep only categorical values seen at least FREQ_THR times
    cat_set_list = []
    for ct in cat_ct_list:
        cat_set_list.append(set(key for key, count in ct.items()
                                if key != '' and count >= FREQ_THR))

    del cat_ct_list  # the raw counters are no longer needed

    # create vocab dir
    if not os.path.exists(VOCAB_DIR):
        os.makedirs(VOCAB_DIR)

    # write vocab file of categorical features
    with open(CAT_FEATURE_NUM, 'w') as cat_feat_count_file:
        for name, s in zip(CAT_COLUMN_NAMES, cat_set_list):
            print('{} {}'.format(name, len(s)), file=cat_feat_count_file)

            vocabfile = os.path.join(VOCAB_DIR, name + '.txt')

            with open(vocabfile, 'w') as f:
                # sorted so the vocab files are identical from run to run
                for vocab_val in sorted(s):
                    print(vocab_val, file=f)
95 |
96 |
def split_data(total_lines=45840617, lines_per_part=200000):
    """
    split train.txt into train and test_valid files.

    The first ``int(total_lines * SPLIT_RATIO)`` lines go to TRAIN_DIR and the
    rest to TEST_VALID_DIR. Output is cut into ``part-<k>`` files, where ``k``
    is the line index divided by ``lines_per_part``; a new part is also started
    at the train/test boundary.

    :param total_lines: number of lines in FILENAME (default: size of the
        Criteo Kaggle training file, the value previously hard-coded here)
    :param lines_per_part: number of lines after which a new part file starts
    :return:
    """
    if not os.path.exists(TRAIN_DIR):
        os.makedirs(TRAIN_DIR)
    if not os.path.exists(TEST_VALID_DIR):
        os.makedirs(TEST_VALID_DIR)

    split_idx = int(total_lines * SPLIT_RATIO)
    data_dir = TRAIN_DIR
    with open(FILENAME, 'r') as fin:
        fout = open(os.path.join(data_dir, 'part-0'), 'w')
        try:
            for line_idx, line in enumerate(fin):
                if line_idx == split_idx:
                    # train/test boundary: switch directory and start a new part
                    fout.close()
                    data_dir = TEST_VALID_DIR
                    fout = open(data_dir + '/part-' + str(line_idx // lines_per_part), 'w')
                elif line_idx % lines_per_part == 0 and line_idx != 0:
                    # regular part boundary
                    fout.close()
                    fout = open(data_dir + '/part-' + str(line_idx // lines_per_part), 'w')
                fout.write(line)
        finally:
            # close the current part even when reading or writing fails
            fout.close()
124 |
125 |
# Script entry point: build the statistic files when any is missing, then split train.txt.
if __name__ == '__main__':
    if not check_statfiles():
        print('create statstic files of Criteo...')
        create_statfiles()
    print('split train.txt...')
    split_data()
    print('done')
--------------------------------------------------------------------------------
/data/Criteo/forDCN/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuking/RecommendationSystem/79842abb12c7cb03454967ccaafd7a365678a8e3/data/Criteo/forDCN/__init__.py
--------------------------------------------------------------------------------
/data/Criteo/forDeepFM/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuking/RecommendationSystem/79842abb12c7cb03454967ccaafd7a365678a8e3/data/Criteo/forDeepFM/__init__.py
--------------------------------------------------------------------------------
/data/Criteo/forDeepFM/deepFM_dataProcess.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy
3 | import shutil
4 | import pickle
5 | import numpy as np
6 | from collections import Counter
7 | from torch.utils.data import Dataset
8 | import re
9 |
# Number of lines written to each raw_data/part-<k> file by get_raw_data.
EACH_FILE_DATA_NUM = 204800
11 |
12 | """
13 | [1] PaddlePaddle implemantation of DeepFM for CTR prediction
14 | https://github.com/PaddlePaddle/models/blob/develop/PaddleRec/ctr/deepfm/data/preprocess.py
15 | """
16 |
17 |
def get_raw_data():
    """Split ``../train.txt`` into ``raw_data/part-<k>`` files.

    Every part holds EACH_FILE_DATA_NUM consecutive lines (the last one may be
    shorter). The ``raw_data`` directory is created when missing and the
    current working directory is printed first, since all paths are relative.
    """
    if not os.path.isdir('raw_data'):
        os.mkdir('raw_data')
    print(os.getcwd())

    with open('../train.txt', 'r') as fin:
        fout = open('raw_data/part-0', 'w')
        try:
            for line_idx, line in enumerate(fin):
                # To build a small sample for quick experiments, stop here once
                # line_idx >= EACH_FILE_DATA_NUM * 10.

                if line_idx % EACH_FILE_DATA_NUM == 0 and line_idx != 0:
                    # part boundary: start the next file
                    fout.close()
                    cur_part_idx = line_idx // EACH_FILE_DATA_NUM
                    fout = open('raw_data/part-' + str(cur_part_idx), 'w')
                fout.write(line)
        finally:
            # close the current part even when reading or writing fails
            fout.close()
37 |
38 |
def split_data():
    """Move the ``raw_data/part-<k>`` files into ``train_data`` and ``test_data``.

    90% of the part files (rounded down) are chosen at random for training.
    The chosen indices are stored in ``aid_data/train_file_idx.txt`` and reused
    on later runs so the split stays the same.
    """
    import ast  # local import: only needed to parse the stored index list

    split_rate_ = 0.9
    dir_train_file_idx_ = 'aid_data/train_file_idx.txt'
    filelist_ = ['raw_data/part-%d' % x for x in range(len(os.listdir('raw_data')))]

    if not os.path.exists(dir_train_file_idx_):
        chosen = numpy.random.choice(
            len(filelist_), int(len(filelist_) * split_rate_), False)
        # Plain ints, so the stored text is a simple list literal ("[3, 0, 7]")
        train_file_idx = [int(i) for i in chosen]
        os.makedirs(os.path.dirname(dir_train_file_idx_), exist_ok=True)
        with open(dir_train_file_idx_, 'w') as fout:
            fout.write(str(train_file_idx))
    else:
        with open(dir_train_file_idx_, 'r') as fin:
            # NOTE: this used to be eval(), which executes whatever the file
            # contains. literal_eval only accepts a plain literal such as the
            # list of ints written above.
            train_file_idx = ast.literal_eval(fin.read())

    # Without existing target directories shutil.move would rename the first
    # part file to "train_data"/"test_data" instead of moving it into a directory.
    os.makedirs('train_data', exist_ok=True)
    os.makedirs('test_data', exist_ok=True)

    train_idx_set = set(train_file_idx)
    for idx, path in enumerate(filelist_):
        shutil.move(path, 'train_data' if idx in train_idx_set else 'test_data')
59 |
60 |
def get_feat_dict():
    """Build and pickle the feature dictionary of the Criteo data.

    Nothing is done when ``aid_data/feat_dict_10.pkl2`` already exists.
    Otherwise ``../train.txt`` is scanned once and the dictionary is built as:

    * continuous columns 1..13 map to indices 1..13;
    * every non-empty categorical value (columns 14..39) occurring at least
      ``freq_`` times gets the next free index, in order of first appearance
      in the file.

    Index 0 is left unassigned. The dictionary is pickled and
    ``len(feat_dict) + 1`` is printed.
    """
    freq_ = 10
    dir_feat_dict_ = 'aid_data/feat_dict_' + str(freq_) + '.pkl2'
    continuous_range_ = range(1, 14)
    categorical_range_ = range(14, 40)

    if os.path.exists(dir_feat_dict_):
        return

    # Count the number of occurrences of discrete features
    feat_cnt = Counter()
    with open('../train.txt', 'r') as fin:
        for line_idx, line in enumerate(fin):
            # To build a small sample for quick experiments, stop here once
            # line_idx >= EACH_FILE_DATA_NUM * 10.

            if line_idx % EACH_FILE_DATA_NUM == 0:
                print('generating feature dict', line_idx / 45000000)
            features = line.rstrip('\n').split('\t')
            feat_cnt.update(features[idx] for idx in categorical_range_ if features[idx] != '')

    # Create a dictionary for continuous and discrete features
    feat_dict = {}
    tc = 1
    # Continuous features
    for idx in continuous_range_:
        feat_dict[idx] = tc
        tc += 1
    # Discrete features: only retain those with high frequency. The Counter
    # keeps its keys in first-seen order, so iterating it assigns the same
    # indices as re-reading the file would, without a second pass.
    for feat, ot in feat_cnt.items():
        if ot >= freq_:
            feat_dict[feat] = tc
            tc += 1

    # Save dictionary (create the output directory when it is missing)
    os.makedirs(os.path.dirname(dir_feat_dict_), exist_ok=True)
    with open(dir_feat_dict_, 'wb') as fout:
        pickle.dump(feat_dict, fout)
    print('args.num_feat ', len(feat_dict) + 1)
118 |
119 |
120 | """ ************************************************************************************ """
121 | """ 下面两个方法已经弃用 """
122 | """ ************************************************************************************ """
def get_criteo_data(filelist):
    """Load every sample of the given Criteo shard files into memory.

    Marked as deprecated in this module.

    Each line is tab-separated: column 0 is the label, columns 1..13 are
    continuous features and columns 14..39 are categorical features. The
    feature dictionary is read from ``data/aid_data/feat_dict_10.pkl2``
    (relative to the current working directory, which is printed).

    :param filelist: iterable of shard file paths; surrounding whitespace
        (e.g. a trailing newline) is stripped from each path.
    :return: ``(features_idxs, features_values, labels)`` as NumPy arrays.
        Per sample there are 39 indices and 39 values; missing or unknown
        features get index 0 and value 0.0, continuous values are min-max
        scaled with hard-coded bounds, known categorical values get 1.0.
        ``labels`` holds one ``[label]`` row per sample, cast to int32.
    """
    continuous_range_ = range(1, 14)
    categorical_range_ = range(14, 40)
    cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    cont_max_ = [5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019, 46, 231, 4008, 7393]
    cont_diff_ = [hi - lo for lo, hi in zip(cont_min_, cont_max_)]
    print(os.getcwd())
    # NOTE(review): pickle can execute arbitrary code on load; only use a
    # feature dictionary produced locally by get_feat_dict().
    with open('data/aid_data/feat_dict_10.pkl2', 'rb') as fdict:
        feat_dict_ = pickle.load(fdict)

    features_idxs, features_values, labels = [], [], []
    for count, fname in enumerate(filelist, start=1):
        print(count)  # progress: number of files started so far

        with open(fname.strip(), 'r') as fin:
            for line in fin:
                features = line.rstrip('\n').split('\t')
                feat_idx = []
                feat_value = []

                # Min-max normalise the continuous features.
                for idx in continuous_range_:
                    if features[idx] == '':
                        feat_idx.append(0)
                        feat_value.append(0.0)
                    else:
                        feat_idx.append(feat_dict_[idx])
                        feat_value.append(
                            (float(features[idx]) - cont_min_[idx - 1]) / cont_diff_[idx - 1])

                # Categorical features: dictionary index with value 1.0, or
                # (0, 0.0) when the value is empty or not in the dictionary.
                for idx in categorical_range_:
                    if features[idx] == '' or features[idx] not in feat_dict_:
                        feat_idx.append(0)
                        feat_value.append(0.0)
                    else:
                        feat_idx.append(feat_dict_[features[idx]])
                        feat_value.append(1.0)

                features_idxs.append(feat_idx)
                features_values.append(feat_value)
                labels.append([int(features[0])])

    features_idxs = np.array(features_idxs)
    features_values = np.array(features_values)
    labels = np.array(labels).astype(np.int32)
    return features_idxs, features_values, labels
175 |
176 |
class CriteoDataset(Dataset):
    """Map-style dataset that reads Criteo samples lazily from shard files.

    Deprecated because it is too slow: every ``__getitem__`` call re-opens a
    shard and scans it line by line up to the requested sample.

    Global sample ``i`` is looked up as line ``i % EACH_FILE_DATA_NUM`` of
    shard ``i // EACH_FILE_DATA_NUM``, with shards ordered by the number
    embedded in their path.
    """

    def __init__(self, filelist):
        """Index the shards and load the feature dictionary.

        :param filelist: list of shard paths. The digits found in each path
            are read as one integer and used as the sort key, so every path
            must contain at least one digit.
        """
        # Raw string: '\D' in a plain literal is an invalid escape sequence.
        file_idxs = [int(re.sub(r'[\D]', '', x)) for x in filelist]
        self.filelist = [filelist[idx] for idx in np.argsort(file_idxs)]
        self.each_file_item_num = EACH_FILE_DATA_NUM

        # Total number of samples = total number of lines over all shards.
        item_count = 0
        for fname in filelist:
            with open(fname.strip(), 'r') as fin:
                item_count += sum(1 for _ in fin)
        self.item_count = item_count

        self.continuous_range_ = range(1, 14)
        self.categorical_range_ = range(14, 40)
        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.cont_max_ = [5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019, 46, 231, 4008, 7393]
        self.cont_diff_ = [hi - lo for lo, hi in zip(self.cont_min_, self.cont_max_)]
        # NOTE(review): pickle can execute arbitrary code on load; only use a
        # feature dictionary produced locally by get_feat_dict().
        with open('data/aid_data/feat_dict_10.pkl2', 'rb') as fdict:
            self.feat_dict_ = pickle.load(fdict)

    def __getitem__(self, idx):
        """Return ``(feat_idx, feat_value, label)`` arrays for sample ``idx``.

        ``feat_idx`` and ``feat_value`` have 39 entries each (13 continuous
        followed by 26 categorical features); missing or unknown features
        are encoded as index 0 with value 0.0. ``label`` is a one-element
        int32 array. If the shard has no line at the computed position, all
        three arrays are empty.
        """
        # Locate the shard and the line within it that hold this sample.
        file_idx = int(idx // self.each_file_item_num)
        item_idx = idx % self.each_file_item_num

        feat_idx = []
        feat_value = []
        label = []

        fname = self.filelist[file_idx]
        with open(fname.strip(), 'r') as fin:
            for line_idx, line in enumerate(fin):
                if line_idx != item_idx:
                    continue
                features = line.rstrip('\n').split('\t')

                # Min-max normalise the continuous features.
                for col in self.continuous_range_:
                    if features[col] == '':
                        feat_idx.append(0)
                        feat_value.append(0.0)
                    else:
                        feat_idx.append(self.feat_dict_[col])
                        feat_value.append(
                            (float(features[col]) - self.cont_min_[col - 1]) / self.cont_diff_[col - 1])

                # Categorical features: dictionary index with value 1.0, or
                # (0, 0.0) when the value is empty or not in the dictionary.
                for col in self.categorical_range_:
                    if features[col] == '' or features[col] not in self.feat_dict_:
                        feat_idx.append(0)
                        feat_value.append(0.0)
                    else:
                        feat_idx.append(self.feat_dict_[features[col]])
                        feat_value.append(1.0)

                label.append(int(features[0]))
                # Sample found: no need to read the remainder of the shard.
                break
        return np.array(feat_idx), np.array(feat_value), np.array(label).astype(np.int32)

    def __len__(self):
        """Return the total number of samples across all shards."""
        return self.item_count
239 |
240 |
if __name__ == '__main__':
    # Create the output directories the pipeline steps below write into.
    for out_dir in ('train_data', 'test_data', 'aid_data'):
        if not os.path.isdir(out_dir):
            os.mkdir(out_dir)

    # 1) shard ../train.txt, 2) split shards into train/test,
    # 3) build the feature dictionary.
    get_raw_data()
    split_data()
    get_feat_dict()

    print('Done!')
254 |
255 |
--------------------------------------------------------------------------------
/data/Movielens100K/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuking/RecommendationSystem/79842abb12c7cb03454967ccaafd7a365678a8e3/data/Movielens100K/__init__.py
--------------------------------------------------------------------------------
/data/Movielens100K/u.item:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuking/RecommendationSystem/79842abb12c7cb03454967ccaafd7a365678a8e3/data/Movielens100K/u.item
--------------------------------------------------------------------------------
/data/Movielens100K/u.user:
--------------------------------------------------------------------------------
1 | 1|24|M|technician|85711
2 | 2|53|F|other|94043
3 | 3|23|M|writer|32067
4 | 4|24|M|technician|43537
5 | 5|33|F|other|15213
6 | 6|42|M|executive|98101
7 | 7|57|M|administrator|91344
8 | 8|36|M|administrator|05201
9 | 9|29|M|student|01002
10 | 10|53|M|lawyer|90703
11 | 11|39|F|other|30329
12 | 12|28|F|other|06405
13 | 13|47|M|educator|29206
14 | 14|45|M|scientist|55106
15 | 15|49|F|educator|97301
16 | 16|21|M|entertainment|10309
17 | 17|30|M|programmer|06355
18 | 18|35|F|other|37212
19 | 19|40|M|librarian|02138
20 | 20|42|F|homemaker|95660
21 | 21|26|M|writer|30068
22 | 22|25|M|writer|40206
23 | 23|30|F|artist|48197
24 | 24|21|F|artist|94533
25 | 25|39|M|engineer|55107
26 | 26|49|M|engineer|21044
27 | 27|40|F|librarian|30030
28 | 28|32|M|writer|55369
29 | 29|41|M|programmer|94043
30 | 30|7|M|student|55436
31 | 31|24|M|artist|10003
32 | 32|28|F|student|78741
33 | 33|23|M|student|27510
34 | 34|38|F|administrator|42141
35 | 35|20|F|homemaker|42459
36 | 36|19|F|student|93117
37 | 37|23|M|student|55105
38 | 38|28|F|other|54467
39 | 39|41|M|entertainment|01040
40 | 40|38|M|scientist|27514
41 | 41|33|M|engineer|80525
42 | 42|30|M|administrator|17870
43 | 43|29|F|librarian|20854
44 | 44|26|M|technician|46260
45 | 45|29|M|programmer|50233
46 | 46|27|F|marketing|46538
47 | 47|53|M|marketing|07102
48 | 48|45|M|administrator|12550
49 | 49|23|F|student|76111
50 | 50|21|M|writer|52245
51 | 51|28|M|educator|16509
52 | 52|18|F|student|55105
53 | 53|26|M|programmer|55414
54 | 54|22|M|executive|66315
55 | 55|37|M|programmer|01331
56 | 56|25|M|librarian|46260
57 | 57|16|M|none|84010
58 | 58|27|M|programmer|52246
59 | 59|49|M|educator|08403
60 | 60|50|M|healthcare|06472
61 | 61|36|M|engineer|30040
62 | 62|27|F|administrator|97214
63 | 63|31|M|marketing|75240
64 | 64|32|M|educator|43202
65 | 65|51|F|educator|48118
66 | 66|23|M|student|80521
67 | 67|17|M|student|60402
68 | 68|19|M|student|22904
69 | 69|24|M|engineer|55337
70 | 70|27|M|engineer|60067
71 | 71|39|M|scientist|98034
72 | 72|48|F|administrator|73034
73 | 73|24|M|student|41850
74 | 74|39|M|scientist|T8H1N
75 | 75|24|M|entertainment|08816
76 | 76|20|M|student|02215
77 | 77|30|M|technician|29379
78 | 78|26|M|administrator|61801
79 | 79|39|F|administrator|03755
80 | 80|34|F|administrator|52241
81 | 81|21|M|student|21218
82 | 82|50|M|programmer|22902
83 | 83|40|M|other|44133
84 | 84|32|M|executive|55369
85 | 85|51|M|educator|20003
86 | 86|26|M|administrator|46005
87 | 87|47|M|administrator|89503
88 | 88|49|F|librarian|11701
89 | 89|43|F|administrator|68106
90 | 90|60|M|educator|78155
91 | 91|55|M|marketing|01913
92 | 92|32|M|entertainment|80525
93 | 93|48|M|executive|23112
94 | 94|26|M|student|71457
95 | 95|31|M|administrator|10707
96 | 96|25|F|artist|75206
97 | 97|43|M|artist|98006
98 | 98|49|F|executive|90291
99 | 99|20|M|student|63129
100 | 100|36|M|executive|90254
101 | 101|15|M|student|05146
102 | 102|38|M|programmer|30220
103 | 103|26|M|student|55108
104 | 104|27|M|student|55108
105 | 105|24|M|engineer|94043
106 | 106|61|M|retired|55125
107 | 107|39|M|scientist|60466
108 | 108|44|M|educator|63130
109 | 109|29|M|other|55423
110 | 110|19|M|student|77840
111 | 111|57|M|engineer|90630
112 | 112|30|M|salesman|60613
113 | 113|47|M|executive|95032
114 | 114|27|M|programmer|75013
115 | 115|31|M|engineer|17110
116 | 116|40|M|healthcare|97232
117 | 117|20|M|student|16125
118 | 118|21|M|administrator|90210
119 | 119|32|M|programmer|67401
120 | 120|47|F|other|06260
121 | 121|54|M|librarian|99603
122 | 122|32|F|writer|22206
123 | 123|48|F|artist|20008
124 | 124|34|M|student|60615
125 | 125|30|M|lawyer|22202
126 | 126|28|F|lawyer|20015
127 | 127|33|M|none|73439
128 | 128|24|F|marketing|20009
129 | 129|36|F|marketing|07039
130 | 130|20|M|none|60115
131 | 131|59|F|administrator|15237
132 | 132|24|M|other|94612
133 | 133|53|M|engineer|78602
134 | 134|31|M|programmer|80236
135 | 135|23|M|student|38401
136 | 136|51|M|other|97365
137 | 137|50|M|educator|84408
138 | 138|46|M|doctor|53211
139 | 139|20|M|student|08904
140 | 140|30|F|student|32250
141 | 141|49|M|programmer|36117
142 | 142|13|M|other|48118
143 | 143|42|M|technician|08832
144 | 144|53|M|programmer|20910
145 | 145|31|M|entertainment|V3N4P
146 | 146|45|M|artist|83814
147 | 147|40|F|librarian|02143
148 | 148|33|M|engineer|97006
149 | 149|35|F|marketing|17325
150 | 150|20|F|artist|02139
151 | 151|38|F|administrator|48103
152 | 152|33|F|educator|68767
153 | 153|25|M|student|60641
154 | 154|25|M|student|53703
155 | 155|32|F|other|11217
156 | 156|25|M|educator|08360
157 | 157|57|M|engineer|70808
158 | 158|50|M|educator|27606
159 | 159|23|F|student|55346
160 | 160|27|M|programmer|66215
161 | 161|50|M|lawyer|55104
162 | 162|25|M|artist|15610
163 | 163|49|M|administrator|97212
164 | 164|47|M|healthcare|80123
165 | 165|20|F|other|53715
166 | 166|47|M|educator|55113
167 | 167|37|M|other|L9G2B
168 | 168|48|M|other|80127
169 | 169|52|F|other|53705
170 | 170|53|F|healthcare|30067
171 | 171|48|F|educator|78750
172 | 172|55|M|marketing|22207
173 | 173|56|M|other|22306
174 | 174|30|F|administrator|52302
175 | 175|26|F|scientist|21911
176 | 176|28|M|scientist|07030
177 | 177|20|M|programmer|19104
178 | 178|26|M|other|49512
179 | 179|15|M|entertainment|20755
180 | 180|22|F|administrator|60202
181 | 181|26|M|executive|21218
182 | 182|36|M|programmer|33884
183 | 183|33|M|scientist|27708
184 | 184|37|M|librarian|76013
185 | 185|53|F|librarian|97403
186 | 186|39|F|executive|00000
187 | 187|26|M|educator|16801
188 | 188|42|M|student|29440
189 | 189|32|M|artist|95014
190 | 190|30|M|administrator|95938
191 | 191|33|M|administrator|95161
192 | 192|42|M|educator|90840
193 | 193|29|M|student|49931
194 | 194|38|M|administrator|02154
195 | 195|42|M|scientist|93555
196 | 196|49|M|writer|55105
197 | 197|55|M|technician|75094
198 | 198|21|F|student|55414
199 | 199|30|M|writer|17604
200 | 200|40|M|programmer|93402
201 | 201|27|M|writer|E2A4H
202 | 202|41|F|educator|60201
203 | 203|25|F|student|32301
204 | 204|52|F|librarian|10960
205 | 205|47|M|lawyer|06371
206 | 206|14|F|student|53115
207 | 207|39|M|marketing|92037
208 | 208|43|M|engineer|01720
209 | 209|33|F|educator|85710
210 | 210|39|M|engineer|03060
211 | 211|66|M|salesman|32605
212 | 212|49|F|educator|61401
213 | 213|33|M|executive|55345
214 | 214|26|F|librarian|11231
215 | 215|35|M|programmer|63033
216 | 216|22|M|engineer|02215
217 | 217|22|M|other|11727
218 | 218|37|M|administrator|06513
219 | 219|32|M|programmer|43212
220 | 220|30|M|librarian|78205
221 | 221|19|M|student|20685
222 | 222|29|M|programmer|27502
223 | 223|19|F|student|47906
224 | 224|31|F|educator|43512
225 | 225|51|F|administrator|58202
226 | 226|28|M|student|92103
227 | 227|46|M|executive|60659
228 | 228|21|F|student|22003
229 | 229|29|F|librarian|22903
230 | 230|28|F|student|14476
231 | 231|48|M|librarian|01080
232 | 232|45|M|scientist|99709
233 | 233|38|M|engineer|98682
234 | 234|60|M|retired|94702
235 | 235|37|M|educator|22973
236 | 236|44|F|writer|53214
237 | 237|49|M|administrator|63146
238 | 238|42|F|administrator|44124
239 | 239|39|M|artist|95628
240 | 240|23|F|educator|20784
241 | 241|26|F|student|20001
242 | 242|33|M|educator|31404
243 | 243|33|M|educator|60201
244 | 244|28|M|technician|80525
245 | 245|22|M|student|55109
246 | 246|19|M|student|28734
247 | 247|28|M|engineer|20770
248 | 248|25|M|student|37235
249 | 249|25|M|student|84103
250 | 250|29|M|executive|95110
251 | 251|28|M|doctor|85032
252 | 252|42|M|engineer|07733
253 | 253|26|F|librarian|22903
254 | 254|44|M|educator|42647
255 | 255|23|M|entertainment|07029
256 | 256|35|F|none|39042
257 | 257|17|M|student|77005
258 | 258|19|F|student|77801
259 | 259|21|M|student|48823
260 | 260|40|F|artist|89801
261 | 261|28|M|administrator|85202
262 | 262|19|F|student|78264
263 | 263|41|M|programmer|55346
264 | 264|36|F|writer|90064
265 | 265|26|M|executive|84601
266 | 266|62|F|administrator|78756
267 | 267|23|M|engineer|83716
268 | 268|24|M|engineer|19422
269 | 269|31|F|librarian|43201
270 | 270|18|F|student|63119
271 | 271|51|M|engineer|22932
272 | 272|33|M|scientist|53706
273 | 273|50|F|other|10016
274 | 274|20|F|student|55414
275 | 275|38|M|engineer|92064
276 | 276|21|M|student|95064
277 | 277|35|F|administrator|55406
278 | 278|37|F|librarian|30033
279 | 279|33|M|programmer|85251
280 | 280|30|F|librarian|22903
281 | 281|15|F|student|06059
282 | 282|22|M|administrator|20057
283 | 283|28|M|programmer|55305
284 | 284|40|M|executive|92629
285 | 285|25|M|programmer|53713
286 | 286|27|M|student|15217
287 | 287|21|M|salesman|31211
288 | 288|34|M|marketing|23226
289 | 289|11|M|none|94619
290 | 290|40|M|engineer|93550
291 | 291|19|M|student|44106
292 | 292|35|F|programmer|94703
293 | 293|24|M|writer|60804
294 | 294|34|M|technician|92110
295 | 295|31|M|educator|50325
296 | 296|43|F|administrator|16803
297 | 297|29|F|educator|98103
298 | 298|44|M|executive|01581
299 | 299|29|M|doctor|63108
300 | 300|26|F|programmer|55106
301 | 301|24|M|student|55439
302 | 302|42|M|educator|77904
303 | 303|19|M|student|14853
304 | 304|22|F|student|71701
305 | 305|23|M|programmer|94086
306 | 306|45|M|other|73132
307 | 307|25|M|student|55454
308 | 308|60|M|retired|95076
309 | 309|40|M|scientist|70802
310 | 310|37|M|educator|91711
311 | 311|32|M|technician|73071
312 | 312|48|M|other|02110
313 | 313|41|M|marketing|60035
314 | 314|20|F|student|08043
315 | 315|31|M|educator|18301
316 | 316|43|F|other|77009
317 | 317|22|M|administrator|13210
318 | 318|65|M|retired|06518
319 | 319|38|M|programmer|22030
320 | 320|19|M|student|24060
321 | 321|49|F|educator|55413
322 | 322|20|M|student|50613
323 | 323|21|M|student|19149
324 | 324|21|F|student|02176
325 | 325|48|M|technician|02139
326 | 326|41|M|administrator|15235
327 | 327|22|M|student|11101
328 | 328|51|M|administrator|06779
329 | 329|48|M|educator|01720
330 | 330|35|F|educator|33884
331 | 331|33|M|entertainment|91344
332 | 332|20|M|student|40504
333 | 333|47|M|other|V0R2M
334 | 334|32|M|librarian|30002
335 | 335|45|M|executive|33775
336 | 336|23|M|salesman|42101
337 | 337|37|M|scientist|10522
338 | 338|39|F|librarian|59717
339 | 339|35|M|lawyer|37901
340 | 340|46|M|engineer|80123
341 | 341|17|F|student|44405
342 | 342|25|F|other|98006
343 | 343|43|M|engineer|30093
344 | 344|30|F|librarian|94117
345 | 345|28|F|librarian|94143
346 | 346|34|M|other|76059
347 | 347|18|M|student|90210
348 | 348|24|F|student|45660
349 | 349|68|M|retired|61455
350 | 350|32|M|student|97301
351 | 351|61|M|educator|49938
352 | 352|37|F|programmer|55105
353 | 353|25|M|scientist|28480
354 | 354|29|F|librarian|48197
355 | 355|25|M|student|60135
356 | 356|32|F|homemaker|92688
357 | 357|26|M|executive|98133
358 | 358|40|M|educator|10022
359 | 359|22|M|student|61801
360 | 360|51|M|other|98027
361 | 361|22|M|student|44074
362 | 362|35|F|homemaker|85233
363 | 363|20|M|student|87501
364 | 364|63|M|engineer|01810
365 | 365|29|M|lawyer|20009
366 | 366|20|F|student|50670
367 | 367|17|M|student|37411
368 | 368|18|M|student|92113
369 | 369|24|M|student|91335
370 | 370|52|M|writer|08534
371 | 371|36|M|engineer|99206
372 | 372|25|F|student|66046
373 | 373|24|F|other|55116
374 | 374|36|M|executive|78746
375 | 375|17|M|entertainment|37777
376 | 376|28|F|other|10010
377 | 377|22|M|student|18015
378 | 378|35|M|student|02859
379 | 379|44|M|programmer|98117
380 | 380|32|M|engineer|55117
381 | 381|33|M|artist|94608
382 | 382|45|M|engineer|01824
383 | 383|42|M|administrator|75204
384 | 384|52|M|programmer|45218
385 | 385|36|M|writer|10003
386 | 386|36|M|salesman|43221
387 | 387|33|M|entertainment|37412
388 | 388|31|M|other|36106
389 | 389|44|F|writer|83702
390 | 390|42|F|writer|85016
391 | 391|23|M|student|84604
392 | 392|52|M|writer|59801
393 | 393|19|M|student|83686
394 | 394|25|M|administrator|96819
395 | 395|43|M|other|44092
396 | 396|57|M|engineer|94551
397 | 397|17|M|student|27514
398 | 398|40|M|other|60008
399 | 399|25|M|other|92374
400 | 400|33|F|administrator|78213
401 | 401|46|F|healthcare|84107
402 | 402|30|M|engineer|95129
403 | 403|37|M|other|06811
404 | 404|29|F|programmer|55108
405 | 405|22|F|healthcare|10019
406 | 406|52|M|educator|93109
407 | 407|29|M|engineer|03261
408 | 408|23|M|student|61755
409 | 409|48|M|administrator|98225
410 | 410|30|F|artist|94025
411 | 411|34|M|educator|44691
412 | 412|25|M|educator|15222
413 | 413|55|M|educator|78212
414 | 414|24|M|programmer|38115
415 | 415|39|M|educator|85711
416 | 416|20|F|student|92626
417 | 417|27|F|other|48103
418 | 418|55|F|none|21206
419 | 419|37|M|lawyer|43215
420 | 420|53|M|educator|02140
421 | 421|38|F|programmer|55105
422 | 422|26|M|entertainment|94533
423 | 423|64|M|other|91606
424 | 424|36|F|marketing|55422
425 | 425|19|M|student|58644
426 | 426|55|M|educator|01602
427 | 427|51|M|doctor|85258
428 | 428|28|M|student|55414
429 | 429|27|M|student|29205
430 | 430|38|M|scientist|98199
431 | 431|24|M|marketing|92629
432 | 432|22|M|entertainment|50311
433 | 433|27|M|artist|11211
434 | 434|16|F|student|49705
435 | 435|24|M|engineer|60007
436 | 436|30|F|administrator|17345
437 | 437|27|F|other|20009
438 | 438|51|F|administrator|43204
439 | 439|23|F|administrator|20817
440 | 440|30|M|other|48076
441 | 441|50|M|technician|55013
442 | 442|22|M|student|85282
443 | 443|35|M|salesman|33308
444 | 444|51|F|lawyer|53202
445 | 445|21|M|writer|92653
446 | 446|57|M|educator|60201
447 | 447|30|M|administrator|55113
448 | 448|23|M|entertainment|10021
449 | 449|23|M|librarian|55021
450 | 450|35|F|educator|11758
451 | 451|16|M|student|48446
452 | 452|35|M|administrator|28018
453 | 453|18|M|student|06333
454 | 454|57|M|other|97330
455 | 455|48|M|administrator|83709
456 | 456|24|M|technician|31820
457 | 457|33|F|salesman|30011
458 | 458|47|M|technician|Y1A6B
459 | 459|22|M|student|29201
460 | 460|44|F|other|60630
461 | 461|15|M|student|98102
462 | 462|19|F|student|02918
463 | 463|48|F|healthcare|75218
464 | 464|60|M|writer|94583
465 | 465|32|M|other|05001
466 | 466|22|M|student|90804
467 | 467|29|M|engineer|91201
468 | 468|28|M|engineer|02341
469 | 469|60|M|educator|78628
470 | 470|24|M|programmer|10021
471 | 471|10|M|student|77459
472 | 472|24|M|student|87544
473 | 473|29|M|student|94708
474 | 474|51|M|executive|93711
475 | 475|30|M|programmer|75230
476 | 476|28|M|student|60440
477 | 477|23|F|student|02125
478 | 478|29|M|other|10019
479 | 479|30|M|educator|55409
480 | 480|57|M|retired|98257
481 | 481|73|M|retired|37771
482 | 482|18|F|student|40256
483 | 483|29|M|scientist|43212
484 | 484|27|M|student|21208
485 | 485|44|F|educator|95821
486 | 486|39|M|educator|93101
487 | 487|22|M|engineer|92121
488 | 488|48|M|technician|21012
489 | 489|55|M|other|45218
490 | 490|29|F|artist|V5A2B
491 | 491|43|F|writer|53711
492 | 492|57|M|educator|94618
493 | 493|22|M|engineer|60090
494 | 494|38|F|administrator|49428
495 | 495|29|M|engineer|03052
496 | 496|21|F|student|55414
497 | 497|20|M|student|50112
498 | 498|26|M|writer|55408
499 | 499|42|M|programmer|75006
500 | 500|28|M|administrator|94305
501 | 501|22|M|student|10025
502 | 502|22|M|student|23092
503 | 503|50|F|writer|27514
504 | 504|40|F|writer|92115
505 | 505|27|F|other|20657
506 | 506|46|M|programmer|03869
507 | 507|18|F|writer|28450
508 | 508|27|M|marketing|19382
509 | 509|23|M|administrator|10011
510 | 510|34|M|other|98038
511 | 511|22|M|student|21250
512 | 512|29|M|other|20090
513 | 513|43|M|administrator|26241
514 | 514|27|M|programmer|20707
515 | 515|53|M|marketing|49508
516 | 516|53|F|librarian|10021
517 | 517|24|M|student|55454
518 | 518|49|F|writer|99709
519 | 519|22|M|other|55320
520 | 520|62|M|healthcare|12603
521 | 521|19|M|student|02146
522 | 522|36|M|engineer|55443
523 | 523|50|F|administrator|04102
524 | 524|56|M|educator|02159
525 | 525|27|F|administrator|19711
526 | 526|30|M|marketing|97124
527 | 527|33|M|librarian|12180
528 | 528|18|M|student|55104
529 | 529|47|F|administrator|44224
530 | 530|29|M|engineer|94040
531 | 531|30|F|salesman|97408
532 | 532|20|M|student|92705
533 | 533|43|M|librarian|02324
534 | 534|20|M|student|05464
535 | 535|45|F|educator|80302
536 | 536|38|M|engineer|30078
537 | 537|36|M|engineer|22902
538 | 538|31|M|scientist|21010
539 | 539|53|F|administrator|80303
540 | 540|28|M|engineer|91201
541 | 541|19|F|student|84302
542 | 542|21|M|student|60515
543 | 543|33|M|scientist|95123
544 | 544|44|F|other|29464
545 | 545|27|M|technician|08052
546 | 546|36|M|executive|22911
547 | 547|50|M|educator|14534
548 | 548|51|M|writer|95468
549 | 549|42|M|scientist|45680
550 | 550|16|F|student|95453
551 | 551|25|M|programmer|55414
552 | 552|45|M|other|68147
553 | 553|58|M|educator|62901
554 | 554|32|M|scientist|62901
555 | 555|29|F|educator|23227
556 | 556|35|F|educator|30606
557 | 557|30|F|writer|11217
558 | 558|56|F|writer|63132
559 | 559|69|M|executive|10022
560 | 560|32|M|student|10003
561 | 561|23|M|engineer|60005
562 | 562|54|F|administrator|20879
563 | 563|39|F|librarian|32707
564 | 564|65|M|retired|94591
565 | 565|40|M|student|55422
566 | 566|20|M|student|14627
567 | 567|24|M|entertainment|10003
568 | 568|39|M|educator|01915
569 | 569|34|M|educator|91903
570 | 570|26|M|educator|14627
571 | 571|34|M|artist|01945
572 | 572|51|M|educator|20003
573 | 573|68|M|retired|48911
574 | 574|56|M|educator|53188
575 | 575|33|M|marketing|46032
576 | 576|48|M|executive|98281
577 | 577|36|F|student|77845
578 | 578|31|M|administrator|M7A1A
579 | 579|32|M|educator|48103
580 | 580|16|M|student|17961
581 | 581|37|M|other|94131
582 | 582|17|M|student|93003
583 | 583|44|M|engineer|29631
584 | 584|25|M|student|27511
585 | 585|69|M|librarian|98501
586 | 586|20|M|student|79508
587 | 587|26|M|other|14216
588 | 588|18|F|student|93063
589 | 589|21|M|lawyer|90034
590 | 590|50|M|educator|82435
591 | 591|57|F|librarian|92093
592 | 592|18|M|student|97520
593 | 593|31|F|educator|68767
594 | 594|46|M|educator|M4J2K
595 | 595|25|M|programmer|31909
596 | 596|20|M|artist|77073
597 | 597|23|M|other|84116
598 | 598|40|F|marketing|43085
599 | 599|22|F|student|R3T5K
600 | 600|34|M|programmer|02320
601 | 601|19|F|artist|99687
602 | 602|47|F|other|34656
603 | 603|21|M|programmer|47905
604 | 604|39|M|educator|11787
605 | 605|33|M|engineer|33716
606 | 606|28|M|programmer|63044
607 | 607|49|F|healthcare|02154
608 | 608|22|M|other|10003
609 | 609|13|F|student|55106
610 | 610|22|M|student|21227
611 | 611|46|M|librarian|77008
612 | 612|36|M|educator|79070
613 | 613|37|F|marketing|29678
614 | 614|54|M|educator|80227
615 | 615|38|M|educator|27705
616 | 616|55|M|scientist|50613
617 | 617|27|F|writer|11201
618 | 618|15|F|student|44212
619 | 619|17|M|student|44134
620 | 620|18|F|writer|81648
621 | 621|17|M|student|60402
622 | 622|25|M|programmer|14850
623 | 623|50|F|educator|60187
624 | 624|19|M|student|30067
625 | 625|27|M|programmer|20723
626 | 626|23|M|scientist|19807
627 | 627|24|M|engineer|08034
628 | 628|13|M|none|94306
629 | 629|46|F|other|44224
630 | 630|26|F|healthcare|55408
631 | 631|18|F|student|38866
632 | 632|18|M|student|55454
633 | 633|35|M|programmer|55414
634 | 634|39|M|engineer|T8H1N
635 | 635|22|M|other|23237
636 | 636|47|M|educator|48043
637 | 637|30|M|other|74101
638 | 638|45|M|engineer|01940
639 | 639|42|F|librarian|12065
640 | 640|20|M|student|61801
641 | 641|24|M|student|60626
642 | 642|18|F|student|95521
643 | 643|39|M|scientist|55122
644 | 644|51|M|retired|63645
645 | 645|27|M|programmer|53211
646 | 646|17|F|student|51250
647 | 647|40|M|educator|45810
648 | 648|43|M|engineer|91351
649 | 649|20|M|student|39762
650 | 650|42|M|engineer|83814
651 | 651|65|M|retired|02903
652 | 652|35|M|other|22911
653 | 653|31|M|executive|55105
654 | 654|27|F|student|78739
655 | 655|50|F|healthcare|60657
656 | 656|48|M|educator|10314
657 | 657|26|F|none|78704
658 | 658|33|M|programmer|92626
659 | 659|31|M|educator|54248
660 | 660|26|M|student|77380
661 | 661|28|M|programmer|98121
662 | 662|55|M|librarian|19102
663 | 663|26|M|other|19341
664 | 664|30|M|engineer|94115
665 | 665|25|M|administrator|55412
666 | 666|44|M|administrator|61820
667 | 667|35|M|librarian|01970
668 | 668|29|F|writer|10016
669 | 669|37|M|other|20009
670 | 670|30|M|technician|21114
671 | 671|21|M|programmer|91919
672 | 672|54|F|administrator|90095
673 | 673|51|M|educator|22906
674 | 674|13|F|student|55337
675 | 675|34|M|other|28814
676 | 676|30|M|programmer|32712
677 | 677|20|M|other|99835
678 | 678|50|M|educator|61462
679 | 679|20|F|student|54302
680 | 680|33|M|lawyer|90405
681 | 681|44|F|marketing|97208
682 | 682|23|M|programmer|55128
683 | 683|42|M|librarian|23509
684 | 684|28|M|student|55414
685 | 685|32|F|librarian|55409
686 | 686|32|M|educator|26506
687 | 687|31|F|healthcare|27713
688 | 688|37|F|administrator|60476
689 | 689|25|M|other|45439
690 | 690|35|M|salesman|63304
691 | 691|34|M|educator|60089
692 | 692|34|M|engineer|18053
693 | 693|43|F|healthcare|85210
694 | 694|60|M|programmer|06365
695 | 695|26|M|writer|38115
696 | 696|55|M|other|94920
697 | 697|25|M|other|77042
698 | 698|28|F|programmer|06906
699 | 699|44|M|other|96754
700 | 700|17|M|student|76309
701 | 701|51|F|librarian|56321
702 | 702|37|M|other|89104
703 | 703|26|M|educator|49512
704 | 704|51|F|librarian|91105
705 | 705|21|F|student|54494
706 | 706|23|M|student|55454
707 | 707|56|F|librarian|19146
708 | 708|26|F|homemaker|96349
709 | 709|21|M|other|N4T1A
710 | 710|19|M|student|92020
711 | 711|22|F|student|15203
712 | 712|22|F|student|54901
713 | 713|42|F|other|07204
714 | 714|26|M|engineer|55343
715 | 715|21|M|technician|91206
716 | 716|36|F|administrator|44265
717 | 717|24|M|technician|84105
718 | 718|42|M|technician|64118
719 | 719|37|F|other|V0R2H
720 | 720|49|F|administrator|16506
721 | 721|24|F|entertainment|11238
722 | 722|50|F|homemaker|17331
723 | 723|26|M|executive|94403
724 | 724|31|M|executive|40243
725 | 725|21|M|student|91711
726 | 726|25|F|administrator|80538
727 | 727|25|M|student|78741
728 | 728|58|M|executive|94306
729 | 729|19|M|student|56567
730 | 730|31|F|scientist|32114
731 | 731|41|F|educator|70403
732 | 732|28|F|other|98405
733 | 733|44|F|other|60630
734 | 734|25|F|other|63108
735 | 735|29|F|healthcare|85719
736 | 736|48|F|writer|94618
737 | 737|30|M|programmer|98072
738 | 738|35|M|technician|95403
739 | 739|35|M|technician|73162
740 | 740|25|F|educator|22206
741 | 741|25|M|writer|63108
742 | 742|35|M|student|29210
743 | 743|31|M|programmer|92660
744 | 744|35|M|marketing|47024
745 | 745|42|M|writer|55113
746 | 746|25|M|engineer|19047
747 | 747|19|M|other|93612
748 | 748|28|M|administrator|94720
749 | 749|33|M|other|80919
750 | 750|28|M|administrator|32303
751 | 751|24|F|other|90034
752 | 752|60|M|retired|21201
753 | 753|56|M|salesman|91206
754 | 754|59|F|librarian|62901
755 | 755|44|F|educator|97007
756 | 756|30|F|none|90247
757 | 757|26|M|student|55104
758 | 758|27|M|student|53706
759 | 759|20|F|student|68503
760 | 760|35|F|other|14211
761 | 761|17|M|student|97302
762 | 762|32|M|administrator|95050
763 | 763|27|M|scientist|02113
764 | 764|27|F|educator|62903
765 | 765|31|M|student|33066
766 | 766|42|M|other|10960
767 | 767|70|M|engineer|00000
768 | 768|29|M|administrator|12866
769 | 769|39|M|executive|06927
770 | 770|28|M|student|14216
771 | 771|26|M|student|15232
772 | 772|50|M|writer|27105
773 | 773|20|M|student|55414
774 | 774|30|M|student|80027
775 | 775|46|M|executive|90036
776 | 776|30|M|librarian|51157
777 | 777|63|M|programmer|01810
778 | 778|34|M|student|01960
779 | 779|31|M|student|K7L5J
780 | 780|49|M|programmer|94560
781 | 781|20|M|student|48825
782 | 782|21|F|artist|33205
783 | 783|30|M|marketing|77081
784 | 784|47|M|administrator|91040
785 | 785|32|M|engineer|23322
786 | 786|36|F|engineer|01754
787 | 787|18|F|student|98620
788 | 788|51|M|administrator|05779
789 | 789|29|M|other|55420
790 | 790|27|M|technician|80913
791 | 791|31|M|educator|20064
792 | 792|40|M|programmer|12205
793 | 793|22|M|student|85281
794 | 794|32|M|educator|57197
795 | 795|30|M|programmer|08610
796 | 796|32|F|writer|33755
797 | 797|44|F|other|62522
798 | 798|40|F|writer|64131
799 | 799|49|F|administrator|19716
800 | 800|25|M|programmer|55337
801 | 801|22|M|writer|92154
802 | 802|35|M|administrator|34105
803 | 803|70|M|administrator|78212
804 | 804|39|M|educator|61820
805 | 805|27|F|other|20009
806 | 806|27|M|marketing|11217
807 | 807|41|F|healthcare|93555
808 | 808|45|M|salesman|90016
809 | 809|50|F|marketing|30803
810 | 810|55|F|other|80526
811 | 811|40|F|educator|73013
812 | 812|22|M|technician|76234
813 | 813|14|F|student|02136
814 | 814|30|M|other|12345
815 | 815|32|M|other|28806
816 | 816|34|M|other|20755
817 | 817|19|M|student|60152
818 | 818|28|M|librarian|27514
819 | 819|59|M|administrator|40205
820 | 820|22|M|student|37725
821 | 821|37|M|engineer|77845
822 | 822|29|F|librarian|53144
823 | 823|27|M|artist|50322
824 | 824|31|M|other|15017
825 | 825|44|M|engineer|05452
826 | 826|28|M|artist|77048
827 | 827|23|F|engineer|80228
828 | 828|28|M|librarian|85282
829 | 829|48|M|writer|80209
830 | 830|46|M|programmer|53066
831 | 831|21|M|other|33765
832 | 832|24|M|technician|77042
833 | 833|34|M|writer|90019
834 | 834|26|M|other|64153
835 | 835|44|F|executive|11577
836 | 836|44|M|artist|10018
837 | 837|36|F|artist|55409
838 | 838|23|M|student|01375
839 | 839|38|F|entertainment|90814
840 | 840|39|M|artist|55406
841 | 841|45|M|doctor|47401
842 | 842|40|M|writer|93055
843 | 843|35|M|librarian|44212
844 | 844|22|M|engineer|95662
845 | 845|64|M|doctor|97405
846 | 846|27|M|lawyer|47130
847 | 847|29|M|student|55417
848 | 848|46|M|engineer|02146
849 | 849|15|F|student|25652
850 | 850|34|M|technician|78390
851 | 851|18|M|other|29646
852 | 852|46|M|administrator|94086
853 | 853|49|M|writer|40515
854 | 854|29|F|student|55408
855 | 855|53|M|librarian|04988
856 | 856|43|F|marketing|97215
857 | 857|35|F|administrator|V1G4L
858 | 858|63|M|educator|09645
859 | 859|18|F|other|06492
860 | 860|70|F|retired|48322
861 | 861|38|F|student|14085
862 | 862|25|M|executive|13820
863 | 863|17|M|student|60089
864 | 864|27|M|programmer|63021
865 | 865|25|M|artist|11231
866 | 866|45|M|other|60302
867 | 867|24|M|scientist|92507
868 | 868|21|M|programmer|55303
869 | 869|30|M|student|10025
870 | 870|22|M|student|65203
871 | 871|31|M|executive|44648
872 | 872|19|F|student|74078
873 | 873|48|F|administrator|33763
874 | 874|36|M|scientist|37076
875 | 875|24|F|student|35802
876 | 876|41|M|other|20902
877 | 877|30|M|other|77504
878 | 878|50|F|educator|98027
879 | 879|33|F|administrator|55337
880 | 880|13|M|student|83702
881 | 881|39|M|marketing|43017
882 | 882|35|M|engineer|40503
883 | 883|49|M|librarian|50266
884 | 884|44|M|engineer|55337
885 | 885|30|F|other|95316
886 | 886|20|M|student|61820
887 | 887|14|F|student|27249
888 | 888|41|M|scientist|17036
889 | 889|24|M|technician|78704
890 | 890|32|M|student|97301
891 | 891|51|F|administrator|03062
892 | 892|36|M|other|45243
893 | 893|25|M|student|95823
894 | 894|47|M|educator|74075
895 | 895|31|F|librarian|32301
896 | 896|28|M|writer|91505
897 | 897|30|M|other|33484
898 | 898|23|M|homemaker|61755
899 | 899|32|M|other|55116
900 | 900|60|M|retired|18505
901 | 901|38|M|executive|L1V3W
902 | 902|45|F|artist|97203
903 | 903|28|M|educator|20850
904 | 904|17|F|student|61073
905 | 905|27|M|other|30350
906 | 906|45|M|librarian|70124
907 | 907|25|F|other|80526
908 | 908|44|F|librarian|68504
909 | 909|50|F|educator|53171
910 | 910|28|M|healthcare|29301
911 | 911|37|F|writer|53210
912 | 912|51|M|other|06512
913 | 913|27|M|student|76201
914 | 914|44|F|other|08105
915 | 915|50|M|entertainment|60614
916 | 916|27|M|engineer|N2L5N
917 | 917|22|F|student|20006
918 | 918|40|M|scientist|70116
919 | 919|25|M|other|14216
920 | 920|30|F|artist|90008
921 | 921|20|F|student|98801
922 | 922|29|F|administrator|21114
923 | 923|21|M|student|E2E3R
924 | 924|29|M|other|11753
925 | 925|18|F|salesman|49036
926 | 926|49|M|entertainment|01701
927 | 927|23|M|programmer|55428
928 | 928|21|M|student|55408
929 | 929|44|M|scientist|53711
930 | 930|28|F|scientist|07310
931 | 931|60|M|educator|33556
932 | 932|58|M|educator|06437
933 | 933|28|M|student|48105
934 | 934|61|M|engineer|22902
935 | 935|42|M|doctor|66221
936 | 936|24|M|other|32789
937 | 937|48|M|educator|98072
938 | 938|38|F|technician|55038
939 | 939|26|F|student|33319
940 | 940|32|M|administrator|02215
941 | 941|20|M|student|97229
942 | 942|48|F|librarian|78209
943 | 943|22|M|student|77841
944 |
--------------------------------------------------------------------------------
/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuking/RecommendationSystem/79842abb12c7cb03454967ccaafd7a365678a8e3/data/__init__.py
--------------------------------------------------------------------------------
/util/load_data_util.py:
--------------------------------------------------------------------------------
1 | import torch.utils.data as data
2 |
3 |
class _FeatureLabelDataset(data.Dataset):
    """Map-style dataset that pairs ``features[i]`` with ``labels[i]``.

    Defined at module level (rather than inside ``get_batch_loader``) so the
    class is created once instead of on every call, and so instances can be
    pickled by reference.
    """

    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __getitem__(self, index):
        # Items are returned exactly as stored: they are tensors only if the
        # underlying containers hold tensors. Batching/collation is left to
        # the DataLoader.
        return self.features[index], self.labels[index]

    def __len__(self):
        return len(self.features)


def get_batch_loader(features, labels, batch_size, shuffle=True):
    """Build a ``DataLoader`` yielding ``(features, labels)`` mini-batches.

    Args:
        features: Indexable, sized container of samples (sample ``i`` is
            ``features[i]``).
        labels: Indexable, sized container of targets aligned with
            ``features`` (target ``i`` is ``labels[i]``).
        batch_size: Number of samples per batch, forwarded to ``DataLoader``.
        shuffle: Whether the loader reshuffles the samples; forwarded to
            ``DataLoader``. Defaults to ``True``.

    Returns:
        A ``torch.utils.data.DataLoader`` over the paired samples.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """
    # The dataset length is taken from `features` alone, so a mismatch would
    # otherwise surface as an IndexError in the middle of an epoch (labels too
    # short) or silently ignore the surplus labels (labels too long).
    if len(features) != len(labels):
        raise ValueError(
            "features and labels must have the same length, got "
            "{} and {}".format(len(features), len(labels))
        )

    dataset = _FeatureLabelDataset(features, labels)
    return data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
19 |
20 |
--------------------------------------------------------------------------------