├── dataloader
│   ├── __init__.py
│   └── kuairec.py
├── model
│   ├── __init__.py
│   ├── tpm.py
│   ├── d2q.py
│   ├── wd.py
│   ├── cread.py
│   └── layers.py
├── dataset
│   └── kuairec
│       └── kuairec_process.py
├── README.md
├── run_vr.py
├── run_egmn.py
├── run_tpm.py
├── run_d2q.py
├── run_d2co.py
├── run_cread.py
└── utils.py
/dataloader/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from dataloader.kuairec import KUAIRECDataLoader
3 |
--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from model.wd import WideAndDeep
3 | from model.cread import Cread
4 | from model.d2q import D2Q
5 | from model.tpm import TPM
6 | from model.egmn import EGMN
7 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # EGMN
2 |
3 | ================================================
4 |
5 |
6 | Overview
7 | --------
8 |
9 | PyTorch implementation of our paper "Multi-Granularity Distribution Modeling for Video Watch Time Prediction via Exponential-Gaussian Mixture Network".
10 |
11 | This repository also contains the implementations of all baselines and the preprocessing of the KuaiRec dataset mentioned in our paper.
12 |
13 | _**Other datasets** and **more in-depth details** mentioned in our experiment section will be synchronized in this repository if the manuscript is accepted._
14 |
15 | Dependencies
16 | ------------
17 |
18 | Installing PyTorch 2.1.0 with pip or conda should resolve all remaining dependencies (pandas, numpy, sklearn).
19 |
20 | Tested with Python 3.10.12, but it should work with other 3.x versions as well.
21 |
22 | Tested on both CPU and GPU.
23 |
24 | Public Dataset
25 | -------
26 |
27 | You can download the public dataset from the following links:
28 | * [KuaiRec](https://kuairec.com/)
29 |
30 |
31 | The raw data needs to be preprocessed before use. The preprocessing script is provided in `dataset/kuairec/kuairec_process.py`; run it from `dataset/kuairec/` after placing the downloaded files under `dataset/kuairec/row_data/`, which produces `kuairec_data.pkl`.
32 |
33 | How to Use
34 | ----------
35 | `model/*`: Implementation of various models.
36 |
37 | `run_*.py`: The starting point for running each method.
38 |
39 | You can run the program with these command examples:
40 |
41 |
42 | `python run_cread.py --dataset_name kuairec`: evaluate **CREAD** on the KuaiRec dataset
43 |
44 |
45 | `python run_egmn.py --dataset_name kuairec`: evaluate the proposed **EGMN** on the KuaiRec dataset
46 |
47 |
48 | The program will print the MAE, XAUC and KL-Divergence of the evaluation.
49 | Other configurable parameters can be found in the corresponding `./run_*.py` files.
50 |
--------------------------------------------------------------------------------
/model/tpm.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from model.layers import MultiLayerPerceptronTPM
4 |
5 | class TPM(torch.nn.Module):
6 |
7 | def __init__(self, description, class_num, embed_dim, mlp_dims, dropout):
8 | super().__init__()
9 | self.features = {name: (size, type) for name, size, type in description if (type in ["ctn", 'seq', 'spr'])}
10 | self.build(embed_dim, mlp_dims, dropout,class_num)
11 |
12 | def build(self, embed_dim, mlp_dims, dropout,class_num):
13 | self.emb_layer = torch.nn.ModuleDict()
14 | self.ctn_emb_layer = torch.nn.ParameterDict()
15 | self.ctn_linear_layer = torch.nn.ModuleDict()
16 | embed_output_dim = 0
17 | for name, (size, type) in self.features.items():
18 | if type == 'spr':
19 | self.emb_layer[name] = torch.nn.Embedding(size, embed_dim)
20 | embed_output_dim += embed_dim
21 | elif type == 'ctn':
22 | self.ctn_linear_layer[name] = torch.nn.Linear(1, 1, bias=False)
23 | elif type == 'seq':
24 | self.emb_layer[name] = torch.nn.Embedding(size, embed_dim)
25 | embed_output_dim += embed_dim
26 | else:
27 |                 raise ValueError('unknown feature type: {}'.format(type))
28 | self.mlp = MultiLayerPerceptronTPM(embed_output_dim, mlp_dims, dropout, class_num)
29 | return
30 |
31 | def init(self):
32 | for param in self.parameters():
33 | torch.nn.init.uniform_(param, -0.01, 0.01)
34 |
35 | def forward(self, x_dict):
36 | linears = []
37 | embs = []
38 | for name, (_, type) in self.features.items():
39 | x = x_dict[name]
40 | if type == 'spr':
41 | embs.append(self.emb_layer[name](x).squeeze(1))
42 | elif type == 'ctn':
43 | linears.append(self.ctn_linear_layer[name](x))
44 | elif type == 'seq':
45 | seq_emb = self.emb_layer[name](x)
46 | seq_mask = torch.unsqueeze(x_dict["{}mask".format(name)], dim=2)
47 | embs.append(torch.sum(seq_emb * seq_mask, dim=1) / torch.sum(seq_mask, dim=1))
48 | else:
49 |                 raise ValueError('unknown feature: {}'.format(name))
50 | emb = torch.cat(embs, dim=1)
51 | res = self.mlp(emb)
52 | return torch.sigmoid(res)
--------------------------------------------------------------------------------
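The model classes in `model/*` share the same constructor contract: `description` is a list of `(name, size, type)` tuples emitted by the preprocessing script, where `type` is `'spr'` (sparse id), `'seq'` (id sequence, accompanied by a `'seqm'` mask feature named `<name>mask`), `'ctn'` (continuous scalar), or `'label'`. A minimal sketch of instantiating TPM on a toy batch, run from the repository root (the vocabulary sizes are made up, and the output width assumes MultiLayerPerceptronTPM in `model/layers.py` maps to `class_num` outputs):

    import torch
    from model import TPM

    description = [
        ("play_time", -1, "label"), ("duration", -1, "ctn"),
        ("user_id", 1000, "spr"), ("video_id", 5000, "spr"),
        ("feat", 50, "seq"), ("featmask", -1, "seqm"),
    ]
    model = TPM(description, class_num=31, embed_dim=16, mlp_dims=(128, 64, 32), dropout=0.0)

    batch = {
        "duration": torch.rand(4, 1),
        "user_id": torch.randint(0, 1000, (4, 1)),
        "video_id": torch.randint(0, 5000, (4, 1)),
        "feat": torch.randint(0, 50, (4, 10)),
        "featmask": torch.ones(4, 10),
    }
    out = model(batch)   # sigmoid activations, expected shape [4, 31] under the assumption above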
/model/d2q.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from model.layers import MultiLayerPerceptronD2Q
4 |
5 | class D2Q(torch.nn.Module):
6 |
7 | def __init__(self, description, embed_dim, mlp_dims, dropout):
8 | super().__init__()
9 | self.features = {name: (size, type) for name, size, type in description if (type in ["ctn", 'seq', 'spr'])}
10 | self.build(embed_dim, mlp_dims, dropout)
11 |
12 | def build(self, embed_dim, mlp_dims, dropout):
13 | self.emb_layer = torch.nn.ModuleDict()
14 | self.ctn_emb_layer = torch.nn.ParameterDict()
15 | self.ctn_linear_layer = torch.nn.ModuleDict()
16 | embed_output_dim = 0
17 | for name, (size, type) in self.features.items():
18 | if type == 'spr':
19 | self.emb_layer[name] = torch.nn.Embedding(size, embed_dim)
20 | embed_output_dim += embed_dim
21 | elif type == 'ctn':
22 | self.ctn_linear_layer[name] = torch.nn.Linear(1, 1, bias=False)
23 | embed_output_dim += 1
24 | elif type == 'seq':
25 | self.emb_layer[name] = torch.nn.Embedding(size, embed_dim)
26 | embed_output_dim += embed_dim
27 | else:
28 |                 raise ValueError('unknown feature type: {}'.format(type))
29 | self.mlp = MultiLayerPerceptronD2Q(embed_output_dim, mlp_dims, dropout)
30 | return
31 |
32 | # def init(self):
33 | # for param in self.parameters():
34 | # torch.nn.init.uniform_(param, -0.01, 0.01)
35 |
36 | def forward(self, x_dict):
37 | linears = []
38 | embs = []
39 | for name, (_, type) in self.features.items():
40 | x = x_dict[name]
41 | if type == 'spr':
42 | embs.append(self.emb_layer[name](x).squeeze(1))
43 | elif type == 'ctn':
44 | linears.append(self.ctn_linear_layer[name](x))
45 | elif type == 'seq':
46 | seq_emb = self.emb_layer[name](x)
47 | seq_mask = torch.unsqueeze(x_dict["{}mask".format(name)], dim=2)
48 | embs.append(torch.sum(seq_emb * seq_mask, dim=1) / torch.sum(seq_mask, dim=1))
49 | else:
50 |                 raise ValueError('unknown feature: {}'.format(name))
51 | emb = torch.concat(embs + linears, dim=1)
52 | res = self.mlp(emb)
53 | res = res.squeeze(1)
54 | return torch.sigmoid(res)
--------------------------------------------------------------------------------
/model/wd.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from model.layers import FactorizationMachine, MultiLayerPerceptron
4 |
5 | class WideAndDeep(torch.nn.Module):
6 |
7 | def __init__(self, description, embed_dim, mlp_dims, dropout):
8 | super().__init__()
9 | self.features = {name: (size, type) for name, size, type in description if (type in ["ctn", 'seq', 'spr'])}
10 | self.build(embed_dim, mlp_dims, dropout)
11 |
12 | def build(self, embed_dim, mlp_dims, dropout):
13 | self.emb_layer = torch.nn.ModuleDict()
14 | self.ctn_emb_layer = torch.nn.ParameterDict()
15 | self.ctn_linear_layer = torch.nn.ModuleDict()
16 | embed_output_dim = 0
17 | for name, (size, type) in self.features.items():
18 | if type == 'spr':
19 | self.emb_layer[name] = torch.nn.Embedding(size, embed_dim)
20 | embed_output_dim += embed_dim
21 | elif type == 'ctn':
22 | self.ctn_linear_layer[name] = torch.nn.Linear(1, 1, bias=False)
23 | elif type == 'seq':
24 | self.emb_layer[name] = torch.nn.Embedding(size, embed_dim)
25 | embed_output_dim += embed_dim
26 | else:
27 |                 raise ValueError('unknown feature type: {}'.format(type))
28 | self.mlp = MultiLayerPerceptron(embed_output_dim, mlp_dims, dropout)
29 | return
30 |
31 | def init(self):
32 | for param in self.parameters():
33 | torch.nn.init.uniform_(param, -0.01, 0.01)
34 |
35 | def forward(self, x_dict):
36 | linears = []
37 | embs = []
38 | for name, (_, type) in self.features.items():
39 | x = x_dict[name]
40 | if type == 'spr':
41 | embs.append(self.emb_layer[name](x).squeeze(1))
42 | elif type == 'ctn':
43 | linears.append(self.ctn_linear_layer[name](x))
44 | elif type == 'seq':
45 | seq_emb = self.emb_layer[name](x)
46 | seq_mask = torch.unsqueeze(x_dict["{}mask".format(name)], dim=2)
47 | embs.append(torch.sum(seq_emb * seq_mask, dim=1) / torch.sum(seq_mask, dim=1))
48 | elif type == 'other':
49 | pass
50 | else:
51 |                 raise ValueError('unknown feature: {}'.format(name))
52 | emb = torch.concat(embs, dim=1)
53 | res = self.mlp(emb)
54 | if len(linears) > 0:
55 | linear_part = torch.concat(linears, dim=1).sum(dim=1, keepdims=True)
56 | res += linear_part
57 | res = res.squeeze(1)
58 | return torch.sigmoid(res)
--------------------------------------------------------------------------------
/dataloader/kuairec.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pickle
4 | import os
5 | import torch
6 | from torch.utils.data import Dataset, DataLoader
7 |
8 |
9 | class KUAIRECDataset(Dataset):
10 | """
11 | Load a KuaiRec Dataset
12 | """
13 | def __init__(self, dataset_name, df, description, device):
14 | super(KUAIRECDataset, self).__init__()
15 | self.dataset_name = dataset_name
16 | self.df = df
17 | self.length = len(df)
18 | self.name2array = {name: torch.from_numpy(np.array(list(df[name])).reshape([self.length, -1])).to(device) \
19 | for name in df.columns}
20 | self.format(description, device)
21 | self.features = [name for name, size, type in description if type != 'label']
22 | self.label = 'play_time'
23 |
24 | def format(self, description, device):
25 | for name, size, type in description:
26 | if type == 'spr' or type == 'seq':
27 | self.name2array[name] = self.name2array[name].to(torch.long)
28 | elif type == 'ctn' or type == 'seqm' or type == 'other':
29 | self.name2array[name] = self.name2array[name].to(torch.float32)
30 | elif type == 'label':
31 | pass
32 | else:
33 |             raise ValueError('unknown type {}'.format(type))
34 |
35 | def __getitem__(self, index):
36 | return {name: self.name2array[name][index] for name in self.features}, \
37 | self.name2array[self.label][index].squeeze()
38 |
39 | def __len__(self):
40 | return self.length
41 |
42 |
43 | class KUAIRECDataLoader(object):
44 | """
45 |     Load the KuaiRec dataset for torch train/eval
46 |
47 | :param dataset_path: dataset path
48 | """
49 |
50 | def __init__(self, dataset_name, dataset_path, device, bsz=32):
51 | assert os.path.exists(dataset_path), '{} does not exist'.format(dataset_path)
52 | with open(dataset_path, 'rb+') as f:
53 | data = pickle.load(f)
54 | self.dataset_name = dataset_name
55 | self.dataloaders = {}
56 | self.description = data['description']
57 | for key, df in data.items():
58 | if key == 'description':
59 | continue
60 | self.dataloaders[key] = DataLoader(KUAIRECDataset(dataset_name, df, self.description, device), batch_size=bsz, shuffle=False)
61 | self.keys = list(self.dataloaders.keys())
62 |
63 |
64 | def __getitem__(self, name):
65 | assert name in self.keys, '{} not in keys of datasets'.format(name)
66 | return self.dataloaders[name]
--------------------------------------------------------------------------------
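A minimal sketch of how the loader is consumed by the run scripts (it assumes the preprocessed pickle already exists at `./dataset/kuairec/kuairec_data.pkl`):

    import torch
    from dataloader import KUAIRECDataLoader

    device = torch.device("cpu")
    loaders = KUAIRECDataLoader("kuairec", "./dataset/kuairec/kuairec_data.pkl", device, bsz=2048)

    print(loaders.description)   # list of (name, size, type) tuples
    for features, label in loaders["train"]:
        # features is a dict of tensors keyed by feature name, label is the normalized play_time
        print({k: v.shape for k, v in features.items()}, label.shape)
        break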
/model/cread.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from model.layers import FactorizationMachine, MultiLayerPerceptron
4 |
5 | class Cread(torch.nn.Module):
6 |
7 | def __init__(self, description, embed_dim, share_mlp_dims, output_mlp_dims, head_num, dropout):
8 | super().__init__()
9 | self.features = {name: (size, type) for name, size, type in description if (type in ["ctn", 'seq', 'spr'])}
10 | self.build(embed_dim, share_mlp_dims, output_mlp_dims, head_num, dropout)
11 |
12 | def build(self, embed_dim, share_mlp_dims, output_mlp_dims, head_num, dropout):
13 | self.emb_layer = torch.nn.ModuleDict()
14 | self.ctn_emb_layer = torch.nn.ParameterDict()
15 | self.ctn_linear_layer = torch.nn.ModuleDict()
16 | embed_output_dim = 0
17 | for name, (size, type) in self.features.items():
18 | if type == 'spr':
19 | self.emb_layer[name] = torch.nn.Embedding(size, embed_dim)
20 | embed_output_dim += embed_dim
21 | elif type == 'ctn':
22 | self.ctn_linear_layer[name] = torch.nn.Linear(1, 1, bias=False)
23 | elif type == 'seq':
24 | self.emb_layer[name] = torch.nn.Embedding(size, embed_dim)
25 | embed_output_dim += embed_dim
26 | else:
27 |                 raise ValueError('unknown feature type: {}'.format(type))
28 | self.share_mlp = MultiLayerPerceptron(embed_output_dim, share_mlp_dims, dropout, output_layer=False)
29 | self.output_mlps = torch.nn.ModuleList()
30 | for idx_head in range(head_num):
31 | self.output_mlps.append(MultiLayerPerceptron(share_mlp_dims[-1], output_mlp_dims, dropout, output_layer=True))
32 | return
33 |
34 | def init(self):
35 | for param in self.parameters():
36 | torch.nn.init.uniform_(param, -0.01, 0.01)
37 |
38 | def forward(self, x_dict):
39 | linears = []
40 | embs = []
41 | for name, (_, type) in self.features.items():
42 | x = x_dict[name]
43 | if type == 'spr':
44 | embs.append(self.emb_layer[name](x).squeeze(1))
45 | elif type == 'ctn':
46 | linears.append(self.ctn_linear_layer[name](x))
47 | elif type == 'seq':
48 | seq_emb = self.emb_layer[name](x)
49 | seq_mask = torch.unsqueeze(x_dict["{}mask".format(name)], dim=2)
50 | embs.append(torch.sum(seq_emb * seq_mask, dim=1) / torch.sum(seq_mask, dim=1))
51 | else:
52 |                 raise ValueError('unknown feature: {}'.format(name))
53 | emb = torch.concat(embs, dim=1)
54 | share_vec = self.share_mlp(emb)
55 | output_list = []
56 | for output_mlp in self.output_mlps:
57 | output_list.append(output_mlp(share_vec))
58 | return torch.sigmoid(torch.concat(output_list, dim=1))
--------------------------------------------------------------------------------
/run_vr.py:
--------------------------------------------------------------------------------
1 | import os
2 | import copy
3 | import torch
4 | import random
5 | import numpy as np
6 | import argparse
7 | from dataloader import KUAIRECDataLoader
8 | from model import WideAndDeep
9 | from utils import eval_mae, eval_xauc, eval_kl
10 |
11 | def get_args():
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('--dataset_name', default='kuairec')
14 | parser.add_argument('--dataset_path', default='./dataset/')
15 | parser.add_argument('--device', default='cuda:0')
16 | parser.add_argument('--bsz', type=int, default=2048)
17 | parser.add_argument('--log_interval', type=int, default=10)
18 |
19 | parser.add_argument('--epoch', type=int, default=10)
20 | parser.add_argument('--lr', type=float, default=0.1)
21 | parser.add_argument('--weight_decay', type=float, default=1e-6)
22 | parser.add_argument('--seed', type=int, default=42)
23 |
24 | args = parser.parse_args()
25 | return args
26 |
27 | def get_loaders(name, dataset_path, device, bsz):
28 | path = os.path.join(dataset_path, name, "{}_data.pkl".format(name))
29 | if name == 'kuairec':
30 | dataloaders = KUAIRECDataLoader(name, path, device, bsz=bsz)
31 | else:
32 |         raise ValueError('unknown dataset name: {}'.format(name))
33 | return dataloaders
34 |
35 | def mae_rescale_to_second(dataset, mae):
36 | if dataset == 'kuairec':
37 | return mae * 999639 / 1000
38 | elif dataset == 'wechat':
39 | return mae * 20840
40 | elif dataset == 'cikm16':
41 | return (mae *(6000-31) + 31) / 1000
42 | else:
43 |         raise ValueError('unknown dataset name: {}'.format(dataset))
44 |
45 | def test(args, model, dataloaders):
46 | model.eval()
47 | labels, scores, predicts = list(), list(), list()
48 | with torch.no_grad():
49 | for _, (features, label) in enumerate(dataloaders['test']):
50 |                 y = model(features) / 20  # undo the 20x label scaling applied during training
51 | labels.extend(label.tolist())
52 | scores.extend(y.tolist())
53 | labels, scores = np.array(labels), np.array(scores)
54 | mae, xauc, kl = eval_mae(labels, scores), eval_xauc(labels, scores), eval_kl(labels, scores)
55 | mae = mae_rescale_to_second(args.dataset_name, mae)
56 | print("test result | MAE: {:.7f} | XAUC: {:.7f} | KL: {:.7f}".format(mae, xauc, kl))
57 |
58 | if __name__ == '__main__':
59 | args = get_args()
60 | if args.seed > -1:
61 | np.random.seed(args.seed)
62 | torch.manual_seed(args.seed)
63 | torch.cuda.manual_seed(args.seed)
64 | res = {}
65 | torch.cuda.empty_cache()
66 |
67 | device = torch.device(args.device)
68 |
69 | dataloaders = get_loaders(args.dataset_name, args.dataset_path, device, args.bsz)
70 | model = WideAndDeep(dataloaders.description, embed_dim=16, mlp_dims=(512, 256, 128, 64, 32), dropout=0.0)
71 | model = model.to(device)
72 |
73 | # train
74 | dataloader_train = dataloaders['train']
75 | model.train()
76 | # criterion = torch.nn.BCELoss()
77 | criterion = torch.nn.MSELoss()
78 | optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
79 | for epoch_i in range(1, args.epoch + 1):
80 | model.train()
81 | epoch_loss = 0.0
82 | total_loss = 0
83 | total_iters = len(dataloader_train)
84 | for i, (features, label) in enumerate(dataloader_train):
85 | y = model(features)
86 | loss = criterion(y,20 * label.float())
87 | model.zero_grad()
88 | loss.backward()
89 | optimizer.step()
90 | epoch_loss += loss.item()
91 | total_loss += loss.item()
92 |             if (i + 1) % args.log_interval == 0:
93 |                 print("    Iter {}/{} loss: {:.7f}".format(i + 1, total_iters, total_loss/args.log_interval), end='\r')
94 | total_loss = 0
95 | print("Epoch {}/{} average Loss: {:.7f}".format(epoch_i, args.epoch, epoch_loss/total_iters))
96 | test(args, model, dataloaders)
97 |
98 |
--------------------------------------------------------------------------------
/run_egmn.py:
--------------------------------------------------------------------------------
1 | import os
2 | import copy
3 | import torch
4 | import random
5 | import numpy as np
6 | import argparse
7 | from dataloader import KUAIRECDataLoader
8 | from model import EGMN
9 | from utils import eval_mae, eval_xauc, eval_kl
10 | from sklearn.metrics import roc_auc_score
11 |
12 | def get_args():
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('--dataset_name', default='kuairec')
15 | parser.add_argument('--dataset_path', default='./dataset/')
16 | parser.add_argument('--device', default='cuda:0')
17 | parser.add_argument('--bsz', type=int, default=2048)
18 | parser.add_argument('--log_interval', type=int, default=10)
19 |
20 | parser.add_argument('--alpha', type=float, default=0.1)
21 | parser.add_argument('--beta', type=float, default=1.0)
22 | parser.add_argument('--epoch', type=int, default=10)
23 | parser.add_argument('--lr', type=float, default=0.1)
24 | parser.add_argument('--weight_decay', type=float, default=1e-6)
25 | parser.add_argument('--runs', type=int, default=1, help = 'number of executions to compute the average metrics')
26 | parser.add_argument('--seed', type=int, default=42)
27 |
28 | args = parser.parse_args()
29 | return args
30 |
31 | def get_loaders(name, dataset_path, device, bsz):
32 | path = os.path.join(dataset_path, name, "{}_data.pkl".format(name))
33 | if name == 'kuairec':
34 | dataloaders = KUAIRECDataLoader(name, path, device, bsz=bsz)
35 | else:
36 |         raise ValueError('unknown dataset name: {}'.format(name))
37 | return dataloaders
38 |
39 | def mae_rescale_to_second(dataset, mae):
40 | if dataset == 'kuairec':
41 | return mae * 999639 / 1000
42 | elif dataset == 'wechat':
43 | return mae * 20840
44 | elif dataset == 'cikm16':
45 | return (mae *(6000-31) + 31) / 1000
46 | else:
47 |         raise ValueError('unknown dataset name: {}'.format(dataset))
48 |
49 | def test(args, model, dataloaders):
50 | model.eval()
51 | labels, scores, predicts = list(), list(), list()
52 | with torch.no_grad():
53 | for _, (features, label) in enumerate(dataloaders['test']):
54 | y = model.predict(features)
55 | duration = features['duration'].squeeze()
56 | pred = y.squeeze()
57 | labels.extend(label.tolist())
58 | scores.extend(pred.tolist())
59 | labels, scores = np.array(labels), np.array(scores)
60 | mae, xauc, kl = eval_mae(labels, scores), eval_xauc(labels, scores), eval_kl(labels, scores)
61 | mae = mae_rescale_to_second(args.dataset_name, mae)
62 | print("test result | MAE: {:.7f} | XAUC: {:.7f} | KL: {:.7f}".format(mae, xauc, kl))
63 |
64 | if __name__ == '__main__':
65 | args = get_args()
66 | if args.seed > -1:
67 | np.random.seed(args.seed)
68 | torch.manual_seed(args.seed)
69 | torch.cuda.manual_seed(args.seed)
70 | res = {}
71 | torch.cuda.empty_cache()
72 |
73 | device = torch.device(args.device)
74 |
75 | dataloaders = get_loaders(args.dataset_name, args.dataset_path, device, args.bsz)
76 | model = EGMN(dataloaders.description, embed_dim=16, share_mlp_dims=(256, 128, 64), output_mlp_dims=(32, 16), dropout=0.2)
77 | model = model.to(device)
78 | model.train()
79 |
80 | # train
81 | dataloader_train = dataloaders['train']
82 | # optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
83 | optimizer = torch.optim.Adagrad(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
84 | for epoch_i in range(1, args.epoch + 1):
85 | model.train()
86 | epoch_loss, epoch_nll, epoch_reg, epoch_entropy = 0.0, 0.0, 0.0, 0.0
87 | total_loss, total_nll, total_reg, total_entropy = 0.0, 0.0, 0.0, 0.0
88 | total_iters = len(dataloader_train)
89 | for i, (features, label) in enumerate(dataloader_train):
90 | pi, lambda_, mu, sigma = model(features)
91 | duration = features['duration'].squeeze()
92 | nll_loss, reg_loss, entropy_loss = model.loss(label.float(), pi, lambda_, mu, sigma, features['duration'].view(-1, 1))
93 | loss = nll_loss + args.alpha * entropy_loss + args.beta * reg_loss
94 |
95 | model.zero_grad()
96 | loss.backward()
97 | optimizer.step()
98 | epoch_loss += loss.item(); epoch_nll += nll_loss.item(); epoch_reg += reg_loss.item(); epoch_entropy += entropy_loss.item()
99 | total_loss += loss.item(); total_nll += nll_loss.item(); total_reg += reg_loss.item(); total_entropy += entropy_loss.item()
100 |             if (i + 1) % args.log_interval == 0:
101 |                 print("    Iter {}/{} loss: {:.7f}, nll: {:.7f}, reg: {:.7f}, entropy: {:.7f}".format(i + 1, total_iters, total_loss/args.log_interval, total_nll/args.log_interval, total_reg/args.log_interval, total_entropy/args.log_interval), end='\r')
102 |                 total_loss, total_nll, total_reg, total_entropy = 0.0, 0.0, 0.0, 0.0
103 | print("Epoch {}/{} average Loss: {:.7f}, nll: {:.7f}, reg: {:.7f}, entropy: {:.7f}".format(epoch_i, args.epoch, epoch_loss/total_iters, epoch_nll/total_iters, epoch_reg/total_iters, epoch_entropy/total_iters))
104 | test(args, model, dataloaders)
105 |
106 |
--------------------------------------------------------------------------------
/run_tpm.py:
--------------------------------------------------------------------------------
1 | import os
2 | import copy
3 | import torch
4 | import random
5 | import numpy as np
6 | import argparse
7 | from dataloader import KUAIRECDataLoader
8 | from model import TPM
9 | from utils import eval_mae, eval_xauc, eval_kl, get_playtime_percentiles_range, get_tree_encoded_value, get_tree_encoded_label, get_tree_classify_loss
10 |
11 |
12 | def get_args():
13 | parser = argparse.ArgumentParser()
14 |     parser.add_argument('--dataset_name', default='kuairec')
15 | parser.add_argument('--dataset_path', default='./dataset/')
16 | parser.add_argument('--device', default='cuda:0')
17 | parser.add_argument('--bsz', type=int, default=2048)
18 | parser.add_argument('--log_interval', type=int, default=10)
19 |
20 | parser.add_argument('--wr_bucknum', type=int, default=32)
21 | parser.add_argument('--epoch', type=int, default=10)
22 | parser.add_argument('--lr', type=float, default=0.1)
23 | parser.add_argument('--weight_decay', type=float, default=1e-6)
24 | parser.add_argument('--seed', type=int, default=42)
25 | parser.add_argument('--variance_weight', type=float, default=0.0001)
26 | parser.add_argument('--mse_weight', type=float, default=1)
27 | parser.add_argument('--tree_cla_weight', type=float, default=1)
28 | args = parser.parse_args()
29 | return args
30 |
31 |
32 | def get_loaders(name, dataset_path, device, bsz):
33 | path = os.path.join(dataset_path, name, "{}_data.pkl".format(name))
34 | if name == 'kuairec':
35 | dataloaders = KUAIRECDataLoader(name, path, device, bsz=bsz)
36 | else:
37 |         raise ValueError('unknown dataset name: {}'.format(name))
38 | return dataloaders
39 |
40 | def mae_rescale_to_second(dataset, mae):
41 | if dataset == 'kuairec':
42 | return mae * 999639 / 1000
43 | elif dataset == 'wechat':
44 | return mae * 20840
45 | elif dataset == 'cikm16':
46 | return (mae *(6000-31) + 31) / 1000
47 | else:
48 |         raise ValueError('unknown dataset name: {}'.format(dataset))
49 |
50 | def test(args, model, dataloaders):
51 | model.eval()
52 | labels, scores, predicts = list(), list(), list()
53 | with torch.no_grad():
54 | for _, (features, label) in enumerate(dataloaders['test']):
55 | y = model(features)
56 | encoded_y, variance = get_tree_encoded_value(y, args.wr_bucknum, bucket_begins, bucket_ends)
57 | labels.extend(label.tolist())
58 | scores.extend(encoded_y.flatten().tolist())
59 | labels, scores = np.array(labels), np.array(scores)
60 | mae, xauc, kl = eval_mae(labels, scores), eval_xauc(labels, scores), eval_kl(labels, scores)
61 | mae = mae_rescale_to_second(args.dataset_name, mae)
62 | print("test result | MAE: {:.7f} | XAUC: {:.7f} | KL: {:.7f}".format(mae, xauc, kl))
63 |
64 | if __name__ == '__main__':
65 | args = get_args()
66 | if args.seed > -1:
67 | np.random.seed(args.seed)
68 | torch.manual_seed(args.seed)
69 | torch.cuda.manual_seed(args.seed)
70 | res = {}
71 | torch.cuda.empty_cache()
72 |
73 | device = torch.device(args.device)
74 |
75 | dataloaders = get_loaders(args.dataset_name, args.dataset_path, device, args.bsz)
76 | model = TPM(dataloaders.description, class_num=args.wr_bucknum-1 ,embed_dim=16, mlp_dims=(128, 64, 32), dropout=0.0)
77 | model = model.to(device)
78 |
79 |     # get the playtime bucket ranges
80 | dataloader_train = dataloaders['train']
81 | bucket_begins, bucket_ends = get_playtime_percentiles_range(dataloader_train, args.wr_bucknum ,args.device)
82 |
83 | # train
84 | model.train()
85 | optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
86 | for epoch_i in range(1, args.epoch + 1):
87 | model.train()
88 | epoch_loss = 0.0
89 | total_loss = 0
90 | print_tc_loss = 0
91 | print_mse_loss = 0
92 | print_var_loss = 0
93 | total_iters = len(dataloader_train)
94 | for i, (features, label) in enumerate(dataloader_train):
95 | y = model(features)
96 | encoded_y, variance = get_tree_encoded_value(y, args.wr_bucknum, bucket_begins, bucket_ends)
97 | encoded_label, bucket_weights = get_tree_encoded_label(label, args.wr_bucknum, bucket_begins, bucket_ends)
98 | tree_classify_loss = get_tree_classify_loss(encoded_label, bucket_weights, y, args.wr_bucknum)
99 | mse_loss_fn = torch.nn.MSELoss(reduction='mean')
100 | mse_loss = mse_loss_fn(encoded_y, label.view(-1,1).float())
101 | loss = tree_classify_loss * args.tree_cla_weight + mse_loss * args.mse_weight + variance * args.variance_weight
102 | model.zero_grad()
103 | loss.backward()
104 | optimizer.step()
105 | epoch_loss += loss.item()
106 | total_loss += loss.item()
107 | print_tc_loss += tree_classify_loss.item()
108 | print_mse_loss += mse_loss.item()
109 | print_var_loss += variance.item()
110 |             if (i + 1) % args.log_interval == 0:
111 |                 print("    Iter {}/{} total_loss: {:.7f}, tc_loss: {:.7f}, mse_loss: {:.7f}, var_loss: {:.7f}".format(i + 1, total_iters, total_loss/args.log_interval, print_tc_loss/args.log_interval, print_mse_loss/args.log_interval, print_var_loss/args.log_interval), end='\r')
112 | total_loss = 0
113 | print_tc_loss = 0
114 | print_mse_loss = 0
115 | print_var_loss = 0
116 | print("Epoch {}/{} average Loss: {:.7f}".format(epoch_i, args.epoch, epoch_loss/total_iters))
117 | test(args, model, dataloaders)
118 |
119 |
--------------------------------------------------------------------------------
/model/egmn.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.distributions as D
4 | import torch.nn.functional as F
5 | import numpy as np
6 |
7 | from model.layers import FactorizationMachine, MultiLayerPerceptron, DurationMultiLayerPerceptron
8 |
9 | class EGMN(torch.nn.Module):
10 |
11 | def __init__(self, description, embed_dim, share_mlp_dims, output_mlp_dims, dropout):
12 | super().__init__()
13 | self.features = {name: (size, type) for name, size, type in description if (type in ["ctn", 'seq', 'spr'])}
14 | self.build(embed_dim, share_mlp_dims, output_mlp_dims, dropout)
15 |
16 | def build(self, embed_dim, share_mlp_dims, output_mlp_dims, dropout):
17 | self.emb_layer = torch.nn.ModuleDict()
18 | self.ctn_emb_layer = torch.nn.ParameterDict()
19 | self.ctn_linear_layer = torch.nn.ModuleDict()
20 | embed_output_dim = 0
21 | for name, (size, type) in self.features.items():
22 | if type == 'spr':
23 | self.emb_layer[name] = torch.nn.Embedding(size, embed_dim)
24 | embed_output_dim += embed_dim
25 | elif type == 'ctn':
26 | self.ctn_linear_layer[name] = torch.nn.Linear(1, 1, bias=False)
27 | embed_output_dim += 1
28 | elif type == 'seq':
29 | self.emb_layer[name] = torch.nn.Embedding(size, embed_dim)
30 | embed_output_dim += embed_dim
31 | else:
32 |                 raise ValueError('unknown feature type: {}'.format(type))
33 |
34 |
35 | self.share_mlp = MultiLayerPerceptron(embed_output_dim, share_mlp_dims, dropout, output_layer=False)
36 |
37 | hidden_dim = share_mlp_dims[-1]# + 1
38 |
39 |         # exponential-distribution parameter branch (quick-swipe peak at short watch times)
40 | self.lambda_layer = nn.Sequential(
41 | nn.Linear(hidden_dim, 1),
42 | nn.Softplus(beta=0.5)
43 | )
44 |
45 | comp_num = 10
46 | self.mixture_logits = nn.Linear(hidden_dim, comp_num+1)
47 | self.gauss_mu = nn.Sequential(
48 | nn.Linear(hidden_dim, comp_num),
49 | # MultiLayerPerceptron(hidden_dim, output_mlp_dims, dropout, output_layer=True),
50 | nn.Softplus()
51 | )
52 | self.gauss_sigma = nn.Sequential(
53 | nn.Linear(hidden_dim, comp_num),
54 | # MultiLayerPerceptron(hidden_dim, output_mlp_dims, dropout, output_layer=True),
55 | nn.Softplus()
56 | )
57 | return
58 |
59 | def init(self):
60 | for param in self.parameters():
61 | torch.nn.init.uniform_(param, -0.01, 0.01)
62 |
63 | def forward(self, x_dict):
64 | linears = []
65 | embs = []
66 | for name, (_, type) in self.features.items():
67 | x = x_dict[name]
68 | if type == 'spr':
69 | embs.append(self.emb_layer[name](x).squeeze(1))
70 | elif type == 'ctn':
71 | linears.append(self.ctn_linear_layer[name](x))
72 | elif type == 'seq':
73 | seq_emb = self.emb_layer[name](x)
74 | seq_mask = torch.unsqueeze(x_dict["{}mask".format(name)], dim=2)
75 | embs.append(torch.sum(seq_emb * seq_mask, dim=1) / torch.sum(seq_mask, dim=1))
76 | else:
77 |                 raise ValueError('unknown feature: {}'.format(name))
78 | emb = torch.concat(embs + linears, dim=1)
79 |
80 | hidden = self.share_mlp(emb)
81 | # hidden = torch.concat([hidden, engage_pred.view(-1, 1)], dim=1)
82 |
83 |         lambda_ = self.lambda_layer(hidden) + 1e-6
84 |         pi = self.mixture_logits(hidden)  # [batch, comp_num + 1]
85 |
86 |         mu = self.gauss_mu(hidden) + 1/lambda_  # [batch, comp_num]
87 |         # mu = torch.cumsum(mu, dim=1) + 1/lambda_
88 |         sigma = self.gauss_sigma(hidden) + 1e-6  # [batch, comp_num]
89 |
90 | return pi, lambda_, mu, sigma
91 |
92 | def loss(self, y_true, pi, lambda_, mu, sigma, duration):
93 | batch_size = y_true.shape[0]
94 | y_true = y_true.view(-1, 1)
95 |
96 |         # exponential component (quick-swipe peak)
97 | exp_dist = D.Exponential(rate=lambda_.view(-1))
98 | log_prob_short = exp_dist.log_prob(y_true.view(-1)).view(batch_size, 1)
99 |
100 |         # Gaussian components
101 | log_prob_all = []
102 | for comp_idx in range(mu.shape[1]):
103 | normal_dist = D.Normal(loc=mu[:, comp_idx], scale=sigma[:, comp_idx])
104 | trunc_min = torch.zeros_like(mu[:, comp_idx])
105 |             prob_long = 1.0 - normal_dist.cdf(trunc_min)  # probability mass above 0 (left-truncate at 0)
106 |             log_prob = normal_dist.log_prob(y_true.view(-1)) - torch.log(prob_long + 1e-6)  # [batch]
107 | log_prob_all.append(log_prob.view(-1, 1))
108 | log_prob_all = torch.concat([log_prob_short] + log_prob_all, dim=1)
109 |
110 |         # mixture weights
111 | mix_probs = torch.softmax(pi, dim=1)
112 |
113 | # sample_w = (1 + y_true * duration)
114 | # sample_w = torch.where(y_true * video_durations.view(-1, 1) < 0.005, 0.5 * torch.ones_like(y_true), torch.ones_like(y_true) )
115 | # nll loss
116 | log_mix_probs = torch.log_softmax(pi, dim=1)
117 | total_log_prob = torch.logsumexp(
118 | log_mix_probs + log_prob_all,
119 | dim=1, keepdim=True
120 | )
121 | nll_loss = -torch.mean(total_log_prob)
122 |
123 | # reconstruction loss
124 | pi = torch.softmax(pi, dim=1)
125 | pred = torch.sum(pi * torch.concat([1/lambda_, mu], dim=1), dim=1, keepdim=True)
126 | reg_loss = F.l1_loss(pred, y_true.float())
127 |
128 | # mixture entropy loss
129 | entropy_loss = torch.sum(mix_probs * torch.log(mix_probs + 1e-6), dim=1).mean()
130 |
131 | return nll_loss, reg_loss, entropy_loss
132 |
133 | def get_quantile(self, pi, lambda_, mu, sigma, tau=0.5):
134 | exp_dist = D.Exponential(rate=lambda_.view(-1, 1))
135 | norm_dist_list = []
136 | for comp_idx in range(mu.shape[1]):
137 | normal_dist = D.Normal(loc=mu[:, comp_idx:comp_idx+1], scale=sigma[:, comp_idx:comp_idx+1])
138 | norm_dist_list.append(normal_dist)
139 | try_list = torch.arange(0, 1, 0.0001).view(1, -1).to(pi.device)
140 | cdf = exp_dist.cdf(try_list.view(1, -1))
141 | for norm_dist in norm_dist_list:
142 | cdf += norm_dist.cdf(try_list.view(1, -1))
143 | try_list = try_list.view(-1)
144 | idx = (cdf < tau).to(torch.int8).sum(dim=1)
145 | return try_list[idx]
146 |
147 | def predict(self, x):
148 | with torch.no_grad():
149 | pi, lambda_, mu, sigma = self.forward(x)
150 | pi = torch.softmax(pi, dim=1)
151 | return torch.sum(pi * torch.concat([1/lambda_, mu], dim=1), dim=1)
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
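`EGMN.predict` collapses the mixture to its mean: with softmaxed weights `pi`, the point estimate is `pi[0] * (1/lambda) + sum_k pi[k] * mu[k]` (the exponential component first, then the Gaussian components). A small numeric sketch of that reduction with made-up parameter values:

    import torch

    # toy outputs for a batch of 2 samples with 3 Gaussian components
    pi_logits = torch.tensor([[0.2, 1.0, 0.5, 0.1],
                              [1.5, 0.3, 0.2, 0.4]])   # [batch, comp_num + 1]
    lambda_ = torch.tensor([[4.0], [2.5]])             # exponential rates, [batch, 1]
    mu = torch.tensor([[0.30, 0.55, 0.80],
                       [0.25, 0.50, 0.75]])            # Gaussian means, [batch, comp_num]

    pi = torch.softmax(pi_logits, dim=1)
    pred = torch.sum(pi * torch.concat([1 / lambda_, mu], dim=1), dim=1)
    print(pred)   # expected (normalized) watch time per sample, same reduction as EGMN.predict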
/dataset/kuairec/kuairec_process.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import pickle
4 |
5 | with open("row_data/kuairec_caption_category.csv", "r", encoding="utf-8", errors="ignore") as f:
6 | lines = f.readlines()
7 |
8 | with open("row_data/cleaned_kuairec_caption_category.csv", "w", encoding="utf-8") as f:
9 | f.writelines(lines)
10 |
11 | df5 = pd.read_csv("row_data/cleaned_kuairec_caption_category.csv", encoding="utf-8")
12 | df5["video_id"].astype(int)
13 | df5.to_csv("row_data/cleaned_kuairec_caption_category.csv", index=False, encoding="utf-8")
14 |
15 | # join the feature files into one dataframe
16 | df1 = pd.read_csv("row_data/big_matrix.csv")
17 | df2 = pd.read_csv("row_data/user_features.csv")
18 | df3 = pd.read_csv("row_data/item_daily_features.csv").loc[:, ['video_id', 'video_type','music_id','video_tag_id']]
19 | df3 = df3.drop_duplicates(subset=['video_id'])
20 | df4 = pd.read_csv("row_data/item_categories.csv")
21 | df5 = pd.read_csv("row_data/cleaned_kuairec_caption_category.csv", encoding="utf-8").loc[:,['video_id', 'first_level_category_id', 'second_level_category_id', 'third_level_category_id']]
22 |
23 |
24 | df_merged_1_2 = pd.merge(df1, df2, on="user_id", how="left")
25 | df_merged_1_2_3 = pd.merge(df_merged_1_2, df3, on="video_id", how="left")
26 | df_merged_1_2_3_4 = pd.merge(df_merged_1_2_3, df4, on="video_id", how="left")
27 | df_final = pd.merge(df_merged_1_2_3_4, df5, on="video_id", how="left")
28 |
29 | df = df_final.fillna('12345')
30 |
31 | # filter out AD videos and non-positive durations
32 | df = df[df['video_type'] != 'AD']
33 | df = df[df['play_duration'] > 0]
34 | df = df[df['video_duration'] > 0]
35 |
36 | df = df.sample(frac=0.1, random_state=312).reset_index(drop=True)
37 |
38 | #delete unnecessary features
39 | df = df.drop(columns=['video_type','time','date','watch_ratio','follow_user_num','fans_user_num','friend_user_num','register_days'])
40 |
41 |
42 | # identify the different feature types
43 | label="play_duration"
44 | sequence_feature="feat"
45 | sparse_feature_list=['feat','user_id','video_id','music_id','video_tag_id','user_active_degree','is_lowactive_period','is_live_streamer','is_video_author','follow_user_num_range','fans_user_num_range','friend_user_num_range','register_days_range','onehot_feat0','onehot_feat1','onehot_feat2','onehot_feat3','onehot_feat4','onehot_feat5','onehot_feat6','onehot_feat7','onehot_feat8','onehot_feat9','onehot_feat10','onehot_feat11','onehot_feat12','onehot_feat13','onehot_feat14','onehot_feat15','onehot_feat16','onehot_feat17','first_level_category_id','second_level_category_id','third_level_category_id']
46 | dense_feature_list=['video_duration','timestamp']
47 |
48 | # create the feature description and normalize play_time / duration
49 | desc=[('play_time', -1, 'label'), ('duration', -1, 'ctn')]
50 | df['play_time'] = df['play_duration'] / df['play_duration'].max()
51 | df['duration'] = df['video_duration'].clip(upper=df['play_duration'].max()) / df['play_duration'].max()
52 | df = df[df['play_time'] < df['duration']*10]
53 | df['play_time'] = df.apply(lambda row:row['play_time'] if row['play_time'] < row['duration']*10 else row['duration']*10,axis=1)
54 | df['play_time'] = df['play_time'].astype(float)
55 | df['duration'] = df['duration'].astype(float)
56 |
57 | # preprocess sparse features:
58 | for sparse_feature_name in sparse_feature_list:
59 | sparse_feature_set= set()
60 | sparse_feature_col = []
61 | for index, row in df.iterrows():
62 | if (sparse_feature_name == sequence_feature):
63 | sparse_feature = [str(i) for i in row[sparse_feature_name].split(",")]
64 | sparse_feature_set.update(sparse_feature)
65 | sparse_feature_col.append(sparse_feature)
66 | else:
67 | sparse_feature = str(row[sparse_feature_name])
68 | sparse_feature_set.add(sparse_feature)
69 | sparse_feature_col.append(sparse_feature)
70 |
71 |     # generate the vocabulary for this sparse feature
72 | sparse_feature_voc = {word: idx for idx, word in enumerate(sparse_feature_set)}
73 | print("sparse feature {} size: {}".format(sparse_feature_name,len(sparse_feature_voc)))
74 |
75 |     # re-index the feature values with the new vocabulary
76 | for idx, ori_feature in enumerate(sparse_feature_col):
77 | if (sparse_feature_name == sequence_feature):
78 | sparse_feature_col[idx] = [sparse_feature_voc[word] for word in ori_feature]
79 | else:
80 | sparse_feature_col[idx] = sparse_feature_voc[ori_feature]
81 |
82 | if(sparse_feature_name == sequence_feature):
83 | sparse_mask_col= []
84 | for idx, ori_feature in enumerate(sparse_feature_col):
85 | feature_len = len(ori_feature[:10])
86 | new_feature = ori_feature[:10] + [0]*(10-feature_len)
87 | feature_mask = [1.0]*feature_len +[0.0]*(10-feature_len)
88 | sparse_feature_col[idx] = new_feature
89 | sparse_mask_col.append(feature_mask)
90 | df['featmask'] = sparse_mask_col
91 | df[sparse_feature_name] = sparse_feature_col
92 |
93 |     # drop features that only have one value, otherwise add a description entry
94 | if len(sparse_feature_voc) == 1:
95 | df = df.drop(sparse_feature_name, axis=1)
96 | print("remove the {} feature !!!!!".format(sparse_feature_name))
97 | else:
98 | if (sparse_feature_name == sequence_feature):
99 | desc.append((sparse_feature_name, len(sparse_feature_voc), 'seq'))
100 | desc.append(('featmask', -1, 'seqm'))
101 | else:
102 | desc.append((sparse_feature_name, len(sparse_feature_voc), 'spr'))
103 |
104 | # generate precomputed duration buckets for D2Q
105 | n_bins = 50
106 | df['duration_bucket'], bins = pd.qcut(df['duration'],q=n_bins,labels=False,retbins=True,duplicates='drop' )
107 | bucket_ranges = pd.DataFrame({'bucket_index': range(len(bins)-1),'min_duration': bins[:-1],'max_duration': bins[1:]})
108 | bucket_ranges.to_csv('d2q_duration_bucket_ranges.csv', index=False)
109 | desc.append(('duration_bucket', len(bins) - 1, 'spr'))
110 |
111 | # split train, test set
112 | df = df.sample(frac=1, random_state=1234).reset_index(drop=True)
113 | df_train = df[:int(0.8*len(df))]
114 | df_test = df[int(0.8*len(df)):]
115 |
116 | # generate precomputed play-time quantiles in each duration bucket for D2Q
117 | quantile_num= 100
118 | quantiles = np.linspace(0, 1, quantile_num+1)
119 | quantile_df = (df_train.groupby('duration_bucket')['play_time'].quantile(quantiles).reset_index().rename(columns={'level_1': 'quantile'}))
120 | quantile_pivot = quantile_df.pivot(index='duration_bucket',columns='quantile',values='play_time')
121 | quantile_pivot.to_csv('d2q_duration_bucket_playtime_quantiles.csv')
122 |
123 | # save to pickle
124 | data = {
125 | "train": df_train,
126 | "test": df_test,
127 | "description": desc
128 | }
129 | with open('./kuairec_data.pkl', 'wb+') as f:
130 | pickle.dump(data, f)
--------------------------------------------------------------------------------
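The script saves everything the dataloader needs into a single pickle; a quick sketch for inspecting the result (run from `dataset/kuairec/` after preprocessing):

    import pickle

    with open("./kuairec_data.pkl", "rb") as f:
        data = pickle.load(f)

    print(data["description"])    # list of (name, size, type) tuples
    print(len(data["train"]), len(data["test"]))
    print(data["train"][["play_time", "duration", "duration_bucket"]].head())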
/run_d2q.py:
--------------------------------------------------------------------------------
1 | import os
2 | import copy
3 | import torch
4 | import random
5 | import numpy as np
6 | import argparse
7 | from dataloader import KUAIRECDataLoader
8 | from model import D2Q,WideAndDeep
9 | from utils import eval_mae, eval_xauc, eval_kl
10 | import pandas as pd
11 | import pickle as pkl
12 | from torch.distributions import Normal
13 |
14 | def get_args():
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('--dataset_name', default='kuairec')
17 | parser.add_argument('--dataset_path', default='./dataset/')
18 | parser.add_argument('--device', default='cuda:0')
19 | parser.add_argument('--quantile_max', type=float, default=100)
20 | parser.add_argument('--bsz', type=int, default=2048)
21 | parser.add_argument('--log_interval', type=int, default=10)
22 |
23 | parser.add_argument('--epoch', type=int, default=10)
24 | parser.add_argument('--lr', type=float, default=0.1)
25 | parser.add_argument('--weight_decay', type=float, default=1e-6)
26 | parser.add_argument('--seed', type=int, default=42)
27 |
28 | args = parser.parse_args()
29 | return args
30 |
31 | def get_loaders(name, dataset_path, device, bsz):
32 | path = os.path.join(dataset_path, name, "{}_data.pkl".format(name))
33 | if name == 'kuairec':
34 | dataloaders = KUAIRECDataLoader(name, path, device, bsz=bsz)
35 | else:
36 |         raise ValueError('unknown dataset name: {}'.format(name))
37 | return dataloaders
38 |
39 | def label_norm(label, max_value):
40 | return torch.clamp(label, max=max_value)/max_value
41 |
42 | def from_value_to_quantile(bucket_quantiles, bucket_index, value):
43 | quantile = bucket_quantiles[bucket_index]
44 | if len(quantile) < 2:
45 | return 0.0
46 | if value <= quantile[0]:
47 | return 0.0
48 | if value >= quantile[-1]:
49 | return len(quantile) - 1
50 | idx = np.searchsorted(quantile, value) - 1
51 | left_val = quantile[idx]
52 | right_val = quantile[idx + 1]
53 | if right_val == left_val:
54 | return float(idx)
55 | fraction = (value - left_val) / (right_val - left_val)
56 | quantile = idx + fraction
57 | return quantile
58 |
59 | def from_quantile_to_value(bucket_quantiles, bucket_index, quantile):
60 | quantiles = bucket_quantiles[bucket_index]
61 | num_quantiles = len(quantiles)
62 | quantile_steps = np.linspace(0.0, 1.0, num_quantiles)
63 | if quantile <= 0.0:
64 | return quantiles[0]
65 | elif quantile >= 1.0:
66 | return quantiles[-1]
67 | idx = np.searchsorted(quantile_steps, quantile) - 1
68 | if quantile_steps[idx+1] == quantile:
69 | return quantiles[idx+1]
70 | q1, q2 = quantile_steps[idx], quantile_steps[idx + 1]
71 | v1, v2 = quantiles[idx], quantiles[idx + 1]
72 | interpolated_value = v1 + (v2 - v1) * (quantile - q1) / (q2 - q1)
73 | return interpolated_value
74 |
75 | def get_buckets_infor(buckets_quantiles_path):
76 | df = pd.read_csv(buckets_quantiles_path)
77 | bucket_quantiles = {}
78 |
79 | for idx, row in df.iterrows():
80 | bucket_index = int(row[0])
81 | quantiles = row[1:].values.tolist()
82 | bucket_quantiles[bucket_index] = quantiles
83 |
84 | return bucket_quantiles
85 |
86 | def mae_rescale_to_second(dataset, mae):
87 | if dataset == 'kuairec':
88 | return mae * 999639 / 1000
89 | elif dataset == 'wechat':
90 | return mae * 20840
91 | elif dataset == 'cikm16':
92 | return (mae *(6000-31) + 31) / 1000
93 | else:
94 |         raise ValueError('unknown dataset name: {}'.format(dataset))
95 |
96 | def test(args, model, dataloaders):
97 | model.eval()
98 | labels, scores, predicts, durs = list(), list(), list(), list()
99 | with torch.no_grad():
100 | for _, (features, label) in enumerate(dataloaders['test']):
101 | bucket_index = features['duration_bucket']
102 | y = model(features)
103 | mapped_y,mapped_label = [],[]
104 | for idx, quantile in zip(bucket_index, y.tolist()):
105 | mapped_y_value = from_quantile_to_value(buckets_quantiles, idx.item(), quantile)
106 | mapped_y.append(mapped_y_value)
107 | labels.extend(label.tolist())
108 | scores.extend(mapped_y)
109 | durs.extend(features['duration'].squeeze().tolist())
110 | labels, scores = np.array(labels), np.array(scores)
111 | mae, xauc, kl = eval_mae(labels, scores), eval_xauc(labels, scores), eval_kl(labels, scores)
112 | mae = mae_rescale_to_second(args.dataset_name, mae)
113 | print("test result | MAE: {:.7f} | XAUC: {:.7f} | KL: {:.7f}".format(mae, xauc, kl))
114 |
115 | if __name__ == '__main__':
116 | args = get_args()
117 | if args.seed > -1:
118 | np.random.seed(args.seed)
119 | torch.manual_seed(args.seed)
120 | torch.cuda.manual_seed(args.seed)
121 | res = {}
122 | torch.cuda.empty_cache()
123 |
124 | device = torch.device(args.device)
125 |
126 | dataloaders = get_loaders(args.dataset_name, args.dataset_path, device, args.bsz)
127 | buckets_quantiles_path = os.path.join(args.dataset_path, args.dataset_name, "d2q_duration_bucket_playtime_quantiles.csv")
128 | buckets_quantiles = get_buckets_infor(buckets_quantiles_path)
129 | model = D2Q(dataloaders.description, embed_dim=16, mlp_dims=(512, 256, 128, 64), dropout=0.0)
130 | model = model.to(device)
131 |
132 | # train
133 | dataloader_train = dataloaders['train']
134 | model.train()
135 | criterion = torch.nn.MSELoss()
136 | # criterion = torch.nn.BCELoss()
137 | optimizer = torch.optim.Adagrad(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
138 | for epoch_i in range(1, args.epoch + 1):
139 | model.train()
140 | epoch_loss = 0.0
141 | total_loss = 0
142 | total_iters = len(dataloader_train)
143 | for i, (features, label) in enumerate(dataloader_train):
144 | y = model(features)
145 | bucket_index = features['duration_bucket']
146 | mapped_label = []
147 | for idx, label_origin in zip(bucket_index, label.tolist()):
148 | # print("{}, {}".format(idx.item(), label_origin))
149 | mapped_label_value = from_value_to_quantile(buckets_quantiles, idx.item(), label_origin)
150 | mapped_label.append(mapped_label_value)
151 | # print("train orig mapped_label:", mapped_label)
152 | mapped_label = torch.tensor(mapped_label, device=device)
153 | mapped_label = label_norm(mapped_label, args.quantile_max)
154 | # print("y:", y)
155 | loss = criterion(y, mapped_label.float())
156 | model.zero_grad()
157 | loss.backward()
158 | optimizer.step()
159 | epoch_loss += loss.item()
160 | total_loss += loss.item()
161 |             if (i + 1) % args.log_interval == 0:
162 |                 print("    Iter {}/{} loss: {:.7f}".format(i + 1, total_iters, total_loss/args.log_interval), end='\r')
163 | total_loss = 0
164 | print("Epoch {}/{} average Loss: {:.7f}".format(epoch_i, args.epoch, epoch_loss/total_iters))
165 | test(args, model, dataloaders)
166 |
167 |
--------------------------------------------------------------------------------
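The D2Q label/prediction mapping above is a piecewise-linear interpolation between play-time values and per-bucket quantile positions. A self-contained sketch of the same idea using np.interp (the script's own helpers return the quantile as a fractional index, which `label_norm` then scales into [0, 1]; the quantile table below is made up):

    import numpy as np

    quantiles = np.array([0.00, 0.10, 0.40])        # toy play-time quantiles of one duration bucket
    steps = np.linspace(0.0, 1.0, len(quantiles))   # corresponding quantile positions

    value = 0.25
    q = np.interp(value, quantiles, steps)          # value -> quantile, used as the training target
    v = np.interp(q, steps, quantiles)              # quantile -> value, used to map predictions back
    print(q, v)                                     # 0.75 0.25 -- the round trip is lossless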
/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 |
4 | import numpy as np
5 | import torch
6 | import torch.nn.functional as F
7 | import matplotlib.pyplot as plt
8 | from sklearn.metrics import roc_auc_score
9 |
10 |
11 |
12 |
13 | def get_playtime_percentiles_range(dataloader, wr_bucknum, _device):
14 | all_play_time = []
15 | for _, (_, label) in enumerate(dataloader):
16 | play_time = label
17 | all_play_time.append(play_time)
18 | all_play_time = torch.cat(all_play_time, dim=0)
19 | play_time_np = all_play_time.cpu().numpy()
20 | percen_value = np.percentile(play_time_np, np.linspace(0.0, 100.0, num=wr_bucknum + 1).astype(np.float32)).tolist()
21 | bucket_begins = torch.tensor(percen_value[:-1], dtype=torch.float32, device=_device).unsqueeze(0)
22 | bucket_ends = torch.tensor(percen_value[1:], dtype=torch.float32, device=_device).unsqueeze(0)
23 | return bucket_begins, bucket_ends
24 |
25 |
26 | def get_tree_classify_loss(label_dict, weight_dict, label_encoding_predict, tree_num_intervals=32):
27 | auxiliary_loss_ = 0.0
28 | height = int(math.log2(tree_num_intervals))
29 | for i in range(height):
30 | for j in range(2**i):
31 | interval_label = label_dict[1000*i + j].reshape(-1, 1)
32 | interval_weight = weight_dict[1000*i + j].reshape(-1, 1)
33 | interval_preds = label_encoding_predict[:, 2**i - 1 + j].view(-1,1)
34 | interval_loss = F.binary_cross_entropy_with_logits(interval_preds, interval_label, weight=interval_weight)
35 | auxiliary_loss_ += interval_loss
36 | final_loss = auxiliary_loss_ / (tree_num_intervals - 1.0)
37 | return final_loss.float()
38 |
39 | def get_tree_encoded_label(label,tree_num_intervals, begins, ends, name="label_encoding"):
40 | label_dict = {}
41 | weight_dict = {}
42 | height = int(math.log2(tree_num_intervals))
43 | for i in range(height):
44 | for j in range(2**i):
45 | temp_ind = max(int(tree_num_intervals * 1.0 / (2**i) * j) - 1, 0)
46 |
47 | if j == 0:
48 | weight_temp = torch.where(label < begins[:, temp_ind].reshape(-1, 1), torch.zeros_like(label), torch.ones_like(label))
49 | else:
50 | weight_temp = torch.where(label < ends[:, temp_ind].reshape(-1, 1), torch.zeros_like(label), torch.ones_like(label))
51 |
52 | temp_ind = max(int(tree_num_intervals * 1.0 / (2**i) * (j + 1)) - 1, 0)
53 | weight_temp = torch.where(label < ends[:, temp_ind].reshape(-1, 1), weight_temp, torch.zeros_like(label))
54 |
55 | temp_ind = max(int(tree_num_intervals * (1.0 / (2**i) * j + 1.0 / (2**(i + 1)))) - 1, 0)
56 | label_temp = torch.where(label < ends[:, temp_ind].reshape(-1, 1), torch.zeros_like(label), torch.ones_like(label))
57 |
58 | label_dict[1000 * i + j] = label_temp
59 | weight_dict[1000 * i + j] = weight_temp
60 |
61 | return label_dict, weight_dict
62 |
63 |
64 | def get_tree_encoded_value(label_encoding_predict, tree_num_intervals, begins, ends, name="encoded_playtime"):
65 | height = int(math.log2(tree_num_intervals))
66 | encoded_prob_list = []
67 |
68 | temp_encoded_playtime = (begins + ends) / 2.0
69 | encoded_playtime = temp_encoded_playtime
70 |
71 | batch_size = label_encoding_predict.size(0)
72 | device = label_encoding_predict.device
73 |
74 | for i in range(tree_num_intervals):
75 | temp = torch.zeros(batch_size, dtype=torch.float32, device=device)
76 | cur_code = 2 ** height - 1 + i
77 |
78 | for j in range(1, height + 1):
79 | classifier_branch = cur_code % 2
80 | classifier_idx = (cur_code - 1) // 2
81 |
82 | probs = label_encoding_predict[:, classifier_idx]
83 | condition = torch.tensor(classifier_branch == 1, dtype=torch.bool, device=device)
84 | log_p = torch.where(condition, torch.log(1.0 - probs + 0.00001), torch.log(probs + 0.00001))
85 | temp += log_p
86 |
87 | cur_code = classifier_idx
88 | encoded_prob_list.append(temp)
89 | encoded_prob = torch.exp(torch.stack(encoded_prob_list, dim=1))
90 | encoded_playtime = torch.sum(temp_encoded_playtime * encoded_prob, dim=-1, keepdim=True)
91 |
92 |     e_x2 = torch.sum((temp_encoded_playtime ** 2) * encoded_prob, dim=-1, keepdim=True)  # E[X^2] over bucket midpoints
93 | square_of_e_x = encoded_playtime ** 2
94 | var = torch.sqrt(torch.abs(e_x2 - square_of_e_x) + 1e-8)
95 |
96 | return encoded_playtime.float(), torch.sum(var).float()
97 |
98 |
99 | class InversePairsCalc:
100 | def InversePairs(self, data):
101 | if not data :
102 | return False
103 | if len(data)==1 :
104 | return 0
105 | def merge(tuple_fir,tuple_sec):
106 | array_before = tuple_fir[0]
107 | cnt_before = tuple_fir[1]
108 | array_after = tuple_sec[0]
109 | cnt_after = tuple_sec[1]
110 | cnt = cnt_before+cnt_after
111 | flag = len(array_after)-1
112 | array_merge = []
113 | for i in range(len(array_before)-1,-1,-1):
114 | while array_before[i]<=array_after[flag] and flag>=0 :
115 | array_merge.append(array_after[flag])
116 | flag -= 1
117 | if flag == -1 :
118 | break
119 | else:
120 | array_merge.append(array_before[i])
121 | cnt += (flag+1)
122 | if flag == -1 :
123 | for j in range(i,-1,-1):
124 | array_merge.append(array_before[j])
125 | else:
126 | for j in range(flag ,-1,-1):
127 | array_merge.append(array_after[j])
128 | return array_merge[::-1],cnt
129 |
130 | def mergesort(array):
131 | if len(array)==1:
132 | return (array,0)
133 | cut = math.floor(len(array)/2)
134 | tuple_fir=mergesort(array[:cut])
135 | tuple_sec=mergesort(array[cut:])
136 | return merge(tuple_fir, tuple_sec)
137 | return mergesort(data)[1]
138 |
139 | def eval_xauc(labels, pres):
140 | label_preds = zip(labels.reshape(-1), pres.reshape(-1))
141 | sorted_label_preds = sorted(
142 | label_preds, key=lambda lc: lc[1], reverse=True)
143 | label_preds_len = len(sorted_label_preds)
144 | pairs_cnt = label_preds_len * (label_preds_len-1) / 2
145 |
146 | labels_sort = [ele[0] for ele in sorted_label_preds]
147 | S=InversePairsCalc()
148 | total_positive = S.InversePairs(labels_sort)
149 | xauc = total_positive / pairs_cnt
150 | return xauc
151 |
152 | def eval_auc(labels, pres):
153 | auc = roc_auc_score(labels, pres)
154 | return auc
155 |
156 | def eval_mae(labels, scores):
157 | return np.mean(np.abs(labels - scores))
158 |
159 | def eval_kl(samples_p, samples_q, bins=100, epsilon=1e-10):
160 |     # histogram the two sample sets on shared bins
161 |     hist_p, bin_edges = np.histogram(samples_p, bins=bins, density=True)
162 |     hist_q, _ = np.histogram(samples_q, bins=bin_edges, density=True)
163 |
164 |     # bin widths (used to convert densities to probability mass)
165 |     bin_width = np.diff(bin_edges)
166 |     hist_p = hist_p * bin_width  # convert to probability mass
167 |     hist_q = hist_q * bin_width
168 |
169 |     # avoid zero probabilities
170 |     hist_p = np.clip(hist_p, epsilon, None)
171 |     hist_q = np.clip(hist_q, epsilon, None)
172 |
173 |     # KL divergence
174 | kl = np.sum(hist_p * np.log(hist_p / hist_q))
175 | return kl
176 |
--------------------------------------------------------------------------------
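A quick sketch of the evaluation helpers on toy arrays (values are illustrative; the run scripts pass the full test-set labels and predictions):

    import numpy as np
    from utils import eval_mae, eval_xauc, eval_kl

    labels = np.array([0.10, 0.40, 0.25, 0.80, 0.05])
    scores = np.array([0.12, 0.35, 0.30, 0.70, 0.10])

    print("MAE :", eval_mae(labels, scores))
    print("XAUC:", eval_xauc(labels, scores))          # pairwise ranking consistency
    print("KL  :", eval_kl(labels, scores, bins=5))    # histogram-based KL divergence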
/run_d2co.py:
--------------------------------------------------------------------------------
1 | import os
2 | import copy
3 | import torch
4 | import random
5 | import numpy as np
6 | import argparse
7 | from dataloader import KUAIRECDataLoader
8 | from sklearn.mixture import GaussianMixture
9 | from collections import Counter
10 | from model import D2Q,WideAndDeep
11 | from utils import eval_mae, eval_xauc, eval_kl
12 | import pandas as pd
13 | import pickle as pkl
14 | from torch.distributions import Normal
15 |
16 | def get_args():
17 | parser = argparse.ArgumentParser()
18 | parser.add_argument('--dataset_name', default='kuairec')
19 | parser.add_argument('--dataset_path', default='./dataset/')
20 | parser.add_argument('--device', default='cuda:0')
21 | parser.add_argument('--bsz', type=int, default=2048)
22 | parser.add_argument('--log_interval', type=int, default=10)
23 |
24 | parser.add_argument('--epoch', type=int, default=10)
25 | parser.add_argument('--lr', type=float, default=0.1)
26 | parser.add_argument('--weight_decay', type=float, default=1e-6)
27 | parser.add_argument('--seed', type=int, default=42)
28 |
29 | args = parser.parse_args()
30 | return args
31 |
32 | def get_loaders(name, dataset_path, device, bsz):
33 | path = os.path.join(dataset_path, name, "{}_data.pkl".format(name))
34 | if name == 'kuairec':
35 | dataloaders = KUAIRECDataLoader(name, path, device, bsz=bsz)
36 | else:
37 |         raise ValueError('unknown dataset name: {}'.format(name))
38 | return dataloaders
39 |
40 |
41 | def get_gmm_mean(df_train,n_bins):
42 | # using GMM to calculate the mean values of each distribution
43 | gmmMeanList=[]
44 | durationBucketList= []
45 | playTimeList = []
46 | for _, (features, label) in enumerate(df_train):
47 | durationBucketList.extend(features['duration_bucket'].cpu().numpy().flatten())
48 | playTimeList.extend(label.cpu().numpy().flatten())
49 | durationBucketList = np.array(durationBucketList)
50 | playTimeList= np.array(playTimeList)
51 |
52 | for d in range(n_bins):
53 | playTimeInBucket = playTimeList[durationBucketList == d].reshape(-1, 1)
54 | gm = GaussianMixture(n_components=2, init_params='kmeans',covariance_type='spherical', max_iter=500, random_state=61).fit(playTimeInBucket)
55 | means = np.sort(gm.means_.T[0])
56 | gmmMeanList.append([d,means[0],means[1]])
57 |
58 | # count the number of samples in each duration bucket
59 | numInEachBucket = Counter(durationBucketList)
60 | numInEachBucket = dict(sorted(numInEachBucket.items()))
61 | numInEachBucket = list(numInEachBucket.values())
62 |
63 | def freq_moving_ave(ls_v, ls_w, windows_size=5):
64 | ls_mul = np.array(ls_v) * np.array(ls_w)
65 | amount = pd.Series(ls_mul)
66 | amount_sum = amount.rolling(2*windows_size-1, min_periods=1, center=True).agg(lambda x: np.sum(x))
67 |
68 | weight = pd.Series(ls_w)
69 | weight_sum = weight.rolling(2*windows_size-1, min_periods=1, center=True).agg(lambda x: np.sum(x))
70 |
71 | return amount_sum/weight_sum
72 |
73 | # smooth the per-bucket GMM means with a frequency-weighted moving average
74 | gmmMeanList = np.array(gmmMeanList)
75 | nega_GMM_mean = dict(zip(gmmMeanList[:,0],freq_moving_ave(gmmMeanList[:,1], numInEachBucket, windows_size=5)))
76 | posi_GMM_mean = dict(zip(gmmMeanList[:,0],freq_moving_ave(gmmMeanList[:,2], numInEachBucket, windows_size=5)))
77 | return nega_GMM_mean, posi_GMM_mean
78 |
79 | def get_gmm_label(label, idx, nega_GMM_mean, posi_GMM_mean, alpha=1.0):
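# Map a raw watch-time label into [0, 1] relative to the smoothed negative/positive
# GMM means of its duration bucket via the exponential transform below;
# get_real_value applies the inverse mapping to model outputs at test time.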
80 | p = nega_GMM_mean[idx]
81 | q = posi_GMM_mean[idx]
82 | gmm_label = (np.exp(alpha * label) - np.exp(alpha * q)) / (np.exp(alpha * p)- np.exp(alpha * q))
83 | return np.clip(gmm_label,0,1)
84 |
85 | def get_real_value(y, idx, nega_GMM_mean, posi_GMM_mean, alpha=1.0):
86 | p = nega_GMM_mean[idx]
87 | q = posi_GMM_mean[idx]
88 | real_y = np.log(y * (np.exp(alpha * p)- np.exp(alpha * q)) + np.exp(alpha * q)) / alpha
89 | return real_y
90 |
91 | def mae_rescale_to_second(dataset, mae):
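# Undo the label normalization so that MAE is reported in seconds; the constants
# here are assumed to mirror the scaling applied in the dataset preprocessing scripts.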
92 | if dataset == 'kuairec':
93 | return mae * 999639 / 1000
94 | elif dataset == 'wechat':
95 | return mae * 20840
96 | elif dataset == 'cikm16':
97 | return (mae *(6000-31) + 31) / 1000
98 | else:
99 | raise ValueError('unknown dataset name: {}'.format(dataset))
100 |
101 | def test(args, model, dataloaders, nega_GMM_mean, posi_GMM_mean):
102 | model.eval()
103 | labels, scores, predicts, durs = list(), list(), list(), list()
104 | with torch.no_grad():
105 | for _, (features, label) in enumerate(dataloaders['test']):
106 | bucket_index = features['duration_bucket']
107 | y = model(features)
108 | mapped_y = []
109 | for idx, gmm_y in zip(bucket_index, y.tolist()):
110 | mapped_y_value = get_real_value(gmm_y, idx.item(), nega_GMM_mean, posi_GMM_mean)
111 | mapped_y.append(mapped_y_value)
112 | labels.extend(label.tolist())
113 | scores.extend(mapped_y)
114 | durs.extend(features['duration'].squeeze().tolist())
115 | labels, scores = np.array(labels), np.array(scores)
116 | mae, xauc, kl = eval_mae(labels, scores), eval_xauc(labels, scores), eval_kl(labels, scores)
117 | mae = mae_rescale_to_second(args.dataset_name, mae)
118 | print("test result | MAE: {:.7f} | XAUC: {:.7f} | KL: {:.7f}".format(mae, xauc, kl))
119 |
120 | if __name__ == '__main__':
121 | args = get_args()
122 | if args.seed > -1:
123 | np.random.seed(args.seed)
124 | torch.manual_seed(args.seed)
125 | torch.cuda.manual_seed(args.seed)
126 | res = {}
127 | torch.cuda.empty_cache()
128 |
129 | device = torch.device(args.device)
130 |
131 | dataloaders = get_loaders(args.dataset_name, args.dataset_path, device, args.bsz)
132 | model = WideAndDeep(dataloaders.description, embed_dim=16, mlp_dims=(512, 256, 128, 64, 32), dropout=0.0)
133 | model = model.to(device)
134 |
135 | # train
136 | dataloader_train = dataloaders['train']
137 | bin_nums = 50
138 | if args.dataset_name == "wechat":
139 | bin_nums = 17
140 | nega_GMM_mean, posi_GMM_mean = get_gmm_mean(dataloader_train, bin_nums)
141 |
142 | model.train()
143 | criterion = torch.nn.MSELoss()
144 | optimizer = torch.optim.Adagrad(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
145 | for epoch_i in range(1, args.epoch + 1):
146 | model.train()
147 | epoch_loss = 0.0
148 | total_loss = 0
149 | total_iters = len(dataloader_train)
150 | for i, (features, label) in enumerate(dataloader_train):
151 | y = model(features)
152 | bucket_index = features['duration_bucket']
153 | mapped_label = []
154 | for idx, label_origin in zip(bucket_index, label.tolist()):
155 | mapped_label_value = get_gmm_label(label_origin, idx.item(), nega_GMM_mean, posi_GMM_mean)
156 | mapped_label.append(mapped_label_value)
157 | mapped_label = torch.tensor(mapped_label, device=device)
158 | loss = criterion(y, mapped_label.float())
159 | model.zero_grad()
160 | loss.backward()
161 | optimizer.step()
162 | epoch_loss += loss.item()
163 | total_loss += loss.item()
164 | if (i + 1) % args.log_interval == 0:
165 | print(" Iter {}/{} loss: {:.7f}".format(i + 1, total_iters, total_loss/args.log_interval), end='\r')
166 | total_loss = 0
167 | print("Epoch {}/{} average Loss: {:.7f}".format(epoch_i, args.epoch, epoch_loss/total_iters))
168 | test(args, model, dataloaders, nega_GMM_mean, posi_GMM_mean)
169 |
170 |
--------------------------------------------------------------------------------
/model/layers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 |
5 | class FeaturesLinear(torch.nn.Module):
6 |
7 | def __init__(self, field_dims, output_dim=1):
8 | super().__init__()
9 | self.fc = torch.nn.Embedding(sum(field_dims), output_dim)
10 | self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))
11 | self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)
12 |
13 | def forward(self, x):
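# offsets shift each field's raw index into its own slice of the shared embedding
# table, so a single nn.Embedding covers every categorical field (same scheme is
# used by the embedding classes below).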
14 | x = x + x.new_tensor(self.offsets).unsqueeze(0)
15 | return torch.sum(self.fc(x), dim=1) + self.bias
16 |
17 |
18 | class FeaturesEmbedding(torch.nn.Module):
19 |
20 | def __init__(self, field_dims, embed_dim):
21 | """
22 | :param x: Long tensor of size ``(batch_size, num_fields)``
23 | """
24 | super().__init__()
25 | self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
26 | self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)
27 | torch.nn.init.xavier_uniform_(self.embedding.weight.data)
28 |
29 | def forward(self, x):
30 | x = x + x.new_tensor(self.offsets).unsqueeze(0)
31 | return self.embedding(x)
32 |
33 |
34 | class SeqFeatureEmbedding(torch.nn.Module):
35 |
36 | def __init__(self, vocab_sizes, embed_dim):
37 | super().__init__()
38 | self.embedding = torch.nn.Embedding(sum(vocab_sizes), embed_dim)
39 | self.offsets = np.array((0, *np.cumsum(vocab_sizes)[:-1]), dtype=np.int64)
40 | torch.nn.init.xavier_uniform_(self.embedding.weight.data)
41 |
42 | def forward(self, x_list):
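# x_list holds one index tensor per sequence field; token embeddings are sum-pooled
# over the sequence dimension and the pooled vectors are concatenated along dim 1.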
43 | embs = list()
44 | for i, x in enumerate(x_list):
45 | embs.append(self.embedding(x + self.offsets[i]).sum(dim=1, keepdims=True))
46 | emb = torch.concat(embs, dim=1)
47 | return emb
48 |
49 |
50 | class FactorizationMachine(torch.nn.Module):
51 |
52 | def __init__(self, reduce_sum=True):
53 | super().__init__()
54 | self.reduce_sum = reduce_sum
55 |
56 | def forward(self, x):
57 | """
58 | :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
59 | """
60 | square_of_sum = torch.sum(x, dim=1) ** 2
61 | sum_of_square = torch.sum(x ** 2, dim=1)
62 | ix = square_of_sum - sum_of_square
63 | if self.reduce_sum:
64 | ix = torch.sum(ix, dim=1, keepdim=True)
65 | return 0.5 * ix
66 |
67 |
68 | class MultiLayerPerceptron(torch.nn.Module):
69 |
70 | def __init__(self, input_dim, embed_dims, dropout, output_layer=True):
71 | super().__init__()
72 | layers = list()
73 | for embed_dim in embed_dims:
74 | layers.append(torch.nn.Linear(input_dim, embed_dim))
75 | layers.append(torch.nn.BatchNorm1d(embed_dim))
76 | layers.append(torch.nn.ReLU())
77 | layers.append(torch.nn.Dropout(p=dropout))
78 | input_dim = embed_dim
79 | if output_layer:
80 | layers.append(torch.nn.Linear(input_dim, 1))
81 | self.mlp = torch.nn.Sequential(*layers)
82 |
83 | def forward(self, x):
84 | """
85 | :param x: Float tensor of size ``(batch_size, embed_dim)``
86 | """
87 | return self.mlp(x)
88 |
89 |
90 |
91 | class DurationMultiLayerPerceptron(torch.nn.Module):
92 |
93 | def __init__(self, input_dim, embed_dims, dropout, output_layer=True):
94 | super().__init__()
95 | self.mlps = torch.nn.ModuleList()
96 | for embed_dim in embed_dims:
97 | layers = list()
98 | layers.append(torch.nn.Linear(input_dim + 1, embed_dim))
99 | layers.append(torch.nn.BatchNorm1d(embed_dim))
100 | layers.append(torch.nn.ReLU())
101 | layers.append(torch.nn.Dropout(p=dropout))
102 | self.mlps.append(torch.nn.Sequential(*layers))
103 | input_dim = embed_dim
104 |
105 | def forward(self, x, duration):
106 | """
107 | :param x: Float tensor of size ``(batch_size, embed_dim)``
108 | """
109 | for mlp in self.mlps:
110 | x = mlp(torch.concat([x, duration], dim=1))
111 | return x
112 |
113 |
114 | class Swish(torch.nn.Module):
115 | def forward(self, x):
116 | return x * torch.sigmoid(x)
117 |
118 | class MultiLayerPerceptronD2Q(torch.nn.Module):
119 |
120 | def __init__(self, input_dim, embed_dims, dropout, output_layer=True):
121 | super().__init__()
122 | layers = list()
123 | for embed_dim in embed_dims:
124 | layers.append(torch.nn.Linear(input_dim, embed_dim))
125 | layers.append(torch.nn.BatchNorm1d(embed_dim))
126 | layers.append(Swish())
127 | layers.append(torch.nn.Dropout(p=dropout))
128 | input_dim = embed_dim
129 | if output_layer:
130 | layers.append(torch.nn.Linear(input_dim, 1))
131 | self.mlp = torch.nn.Sequential(*layers)
132 |
133 | def forward(self, x):
134 | """
135 | :param x: Float tensor of size ``(batch_size, embed_dim)``
136 | """
137 | return self.mlp(x)
138 |
139 | class MultiLayerPerceptronTPM(torch.nn.Module):
140 |
141 | def __init__(self, input_dim, embed_dims, dropout, class_num, output_layer=True):
142 | super().__init__()
143 | layers = list()
144 | for embed_dim in embed_dims:
145 | layers.append(torch.nn.Linear(input_dim, embed_dim))
146 | layers.append(torch.nn.BatchNorm1d(embed_dim))
147 | layers.append(Swish())
148 | layers.append(torch.nn.Dropout(p=dropout))
149 | input_dim = embed_dim
150 | if output_layer:
151 | layers.append(torch.nn.Linear(input_dim, class_num))
152 | self.mlp = torch.nn.Sequential(*layers)
153 |
154 | def forward(self, x):
155 | """
156 | :param x: Float tensor of size ``(batch_size, embed_dim)``
157 | """
158 | return self.mlp(x)
159 |
160 | class AttentionalFactorizationMachine(torch.nn.Module):
161 |
162 | def __init__(self, embed_dim, attn_size, dropouts):
163 | super().__init__()
164 | self.attention = torch.nn.Linear(embed_dim, attn_size)
165 | self.projection = torch.nn.Linear(attn_size, 1)
166 | self.fc = torch.nn.Linear(embed_dim, 1)
167 | self.dropouts = dropouts
168 |
169 | def forward(self, x):
170 | """
171 | :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
172 | """
173 | num_fields = x.shape[1]
174 | row, col = list(), list()
175 | for i in range(num_fields - 1):
176 | for j in range(i + 1, num_fields):
177 | row.append(i), col.append(j)
178 | p, q = x[:, row], x[:, col]
179 | inner_product = p * q
180 | attn_scores = F.relu(self.attention(inner_product))
181 | attn_scores = F.softmax(self.projection(attn_scores), dim=1)
182 | attn_scores = F.dropout(attn_scores, p=self.dropouts[0], training=self.training)
183 | attn_output = torch.sum(attn_scores * inner_product, dim=1)
184 | attn_output = F.dropout(attn_output, p=self.dropouts[1], training=self.training)
185 | return self.fc(attn_output)
186 |
187 | class CrossNetwork(torch.nn.Module):
188 |
189 | def __init__(self, input_dim, num_layers):
190 | super().__init__()
191 | self.num_layers = num_layers
192 | self.w = torch.nn.ModuleList([
193 | torch.nn.Linear(input_dim, 1, bias=False) for _ in range(num_layers)
194 | ])
195 | self.b = torch.nn.ParameterList([
196 | torch.nn.Parameter(torch.zeros((input_dim,))) for _ in range(num_layers)
197 | ])
198 |
199 | def forward(self, x):
200 | """
201 | :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
202 | """
203 | x0 = x
204 | for i in range(self.num_layers):
205 | xw = self.w[i](x)
206 | x = x0 * xw + self.b[i] + x
207 | return x
--------------------------------------------------------------------------------
/run_cread.py:
--------------------------------------------------------------------------------
1 | import os
2 | import copy
3 | import torch
4 | import random
5 | import numpy as np
6 | import argparse
7 | from dataloader import KUAIRECDataLoader
8 | from model import Cread
9 | from utils import eval_mae, eval_xauc, eval_kl
10 |
11 | def get_args():
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument('--dataset_name', default='kuairec')
14 | parser.add_argument('--dataset_path', default='./dataset/')
15 | parser.add_argument('--device', default='cuda:0')
16 | parser.add_argument('--bsz', type=int, default=2048)
17 | parser.add_argument('--log_interval', type=int, default=10)
18 |
19 | parser.add_argument('--epoch', type=int, default=10)
20 | parser.add_argument('--restore_w', type=float, default=1.0)
21 | parser.add_argument('--ord_w', type=float, default=0.00002)
22 | parser.add_argument('--lr', type=float, default=0.1)
23 | parser.add_argument('--weight_decay', type=float, default=1e-6)
24 | parser.add_argument('--bkt_num', type=int, default=50)
25 | parser.add_argument('--seed', type=int, default=42)
26 |
27 | args = parser.parse_args()
28 | return args
29 |
30 | def get_loaders(name, dataset_path, device, bsz):
31 | path = os.path.join(dataset_path, name, "{}_data.pkl".format(name))
32 | if name == 'kuairec':
33 | dataloaders = KUAIRECDataLoader(name, path, device, bsz=bsz)
34 | else:
35 | raise ValueError('unknown dataset name: {}'.format(name))
36 | return dataloaders
37 |
38 | def discretize_time_label(playtime, split_nodes):
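# Turn each scalar watch time into M binary labels, one per split node: label m is 1
# iff the watch time exceeds the m-th threshold.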
39 | playtime = playtime.reshape([-1, 1, 1])
40 | split_nodes = split_nodes.reshape([1, 1, -1])
41 | cmp_tensor = playtime > split_nodes
42 | binary_labels = torch.where(cmp_tensor, torch.ones_like(cmp_tensor), torch.zeros_like(cmp_tensor)) # [bsz, 1, M]
43 | return torch.squeeze(binary_labels)
44 |
45 | def restore_time_label(preds, split_nodes):
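# Restore an expected watch time from the M per-threshold probabilities: each
# probability is weighted by the width of its bucket and the weighted sum is returned.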
46 | append_split_nodes = torch.concat((torch.tensor([0]).to(torch.float32).to(split_nodes.device), split_nodes))
47 | left_split_nodes, right_split_nodes = append_split_nodes[:-1], append_split_nodes[1:]
48 | bkt_size_list = right_split_nodes - left_split_nodes
49 | return torch.sum(preds * bkt_size_list.view([1, -1]), dim=1) # [bsz]
50 |
51 | def get_ord_criterion(preds):
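# Ordinal regularizer: the per-threshold probabilities should be non-increasing from
# left to right, so any positive difference preds[:, m+1] - preds[:, m] is penalized.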
52 | left_preds, right_preds = preds[:,:-1], preds[:,1:]
53 | return torch.sum(torch.clamp(right_preds - left_preds, min=0.0))
54 |
55 | def get_split_nodes(all_labels, M, alpha):
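# Build M split nodes as label quantiles whose levels follow
# gamma(z) = (1 - exp(-alpha*z)) / (1 - exp(-alpha)) at z = m/M; alpha controls the
# spacing of the quantile levels (alpha -> 0 recovers equal-frequency buckets).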
56 | split_nodes = []
57 | cdf_list = []
58 | for m in range(1, M+1):
59 | z = m / M
60 | gamma = (1 - np.exp(-alpha*z)) / (1 - np.exp(-alpha))
61 | split_nodes.append(torch.quantile(all_labels, gamma))
62 | cdf_list.append(gamma)
63 | return torch.tensor(split_nodes), torch.tensor(cdf_list)
64 |
65 | def cread_grid_search(dataloader_train, M):
66 | all_labels = []
67 | for (_, label) in dataloader_train:
68 | all_labels.append(label)
69 | all_labels = torch.concat(all_labels).to(torch.float32)
70 | alpha_search_space = list(np.arange(0.001, 5.0, 0.1))
71 | beta_search_space = [50]
72 | best_loss, best_alpha, best_beta, best_split = None, None, None, None
73 | print("Strat Cread Split Nodes Search....l")
74 | for alpha in alpha_search_space:
75 | for beta in beta_search_space:
76 | split_nodes, cdf_list = get_split_nodes(all_labels, M, alpha)
77 | split_nodes_left, split_nodes_right = torch.cat([torch.tensor([0]), split_nodes[:-1]]), split_nodes
78 | cdf_list_left, cdf_list_right = torch.cat([torch.tensor([0]), cdf_list[:-1]]), cdf_list
79 | A_w = torch.sum(torch.pow(cdf_list_right - cdf_list_left, 2)) * torch.sum(torch.pow(split_nodes_right - split_nodes_left, 2)/(cdf_list_right - cdf_list_left))
80 | A_b = torch.sum(torch.pow(cdf_list_right - cdf_list_left, 2)) * torch.sum(torch.pow(split_nodes_right - split_nodes_left, 2))
81 | A_loss = A_w + beta * A_b
82 | print("Searching | alpha={:.7f}, beta={:.7f}: A_loss={:.7f},A_w={:.7f},A_b={:.7f} ".format(alpha, beta, A_loss, A_w, A_b))
83 | if (best_loss is None) or (best_loss > A_loss):
84 | best_loss, best_alpha, best_beta, best_split = A_loss, alpha, beta, split_nodes
85 | print("Cread Search Complete! Best Loss is {:.7f}, Best Alpha is {:.7f}, Best Beta is {:.7f}.".format(best_loss, best_alpha, best_beta))
86 | return best_split
87 |
88 | def mae_rescale_to_second(dataset, mae):
89 | if dataset == 'kuairec':
90 | return mae * 999639 / 1000
91 | elif dataset == 'wechat':
92 | return mae * 20840
93 | elif dataset == 'cikm16':
94 | return (mae *(6000-31) + 31) / 1000
95 | else:
96 | raise ValueError('unknown dataset name: {}'.format(dataset))
97 |
98 | def test(args, model, dataloaders):
99 | model.eval()
100 | labels, scores, predicts = list(), list(), list()
101 | with torch.no_grad():
102 | for _, (features, label) in enumerate(dataloaders['test']):
103 | preds = model(features)
104 | y = restore_time_label(preds, split_nodes)
105 | labels.extend(label.tolist())
106 | scores.extend(y.tolist())
107 | labels, scores = np.array(labels), np.array(scores)
108 | mae, xauc, kl = eval_mae(labels, scores), eval_xauc(labels, scores), eval_kl(labels, scores)
109 | mae = mae_rescale_to_second(args.dataset_name, mae)
110 | print("test result | MAE: {:.7f} | XAUC: {:.7f} | KL: {:.7f}".format(mae, xauc, kl))
111 |
112 | if __name__ == '__main__':
113 | args = get_args()
114 | if args.seed > -1:
115 | np.random.seed(args.seed)
116 | torch.manual_seed(args.seed)
117 | torch.cuda.manual_seed(args.seed)
118 | res = {}
119 | torch.cuda.empty_cache()
120 |
121 | device = torch.device(args.device)
122 |
123 | # construct the DataLoader
124 | dataloaders = get_loaders(args.dataset_name, args.dataset_path, device, args.bsz)
125 |
126 | # discretization strategy
127 | split_nodes = cread_grid_search(dataloaders['train'], args.bkt_num).to(device)
128 | print("split node list: ", split_nodes)
129 | M = split_nodes.shape[0]
130 |
131 | # construct model
132 | model = Cread(dataloaders.description, embed_dim=16, share_mlp_dims=(512, 256, 128), output_mlp_dims=(64, 32), head_num=M, dropout=0.0)
133 | model = model.to(device)
134 |
135 | # train
136 | dataloader_train = dataloaders['train']
137 | model.train()
138 | bce_criterion = torch.nn.BCELoss()
139 | huber_criterion = torch.nn.HuberLoss()
140 | optimizer = torch.optim.Adam(params=model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
141 | for epoch_i in range(1, args.epoch + 1):
142 | model.train()
143 | epoch_loss, epoch_loss_bce, epoch_loss_restore, epoch_loss_ord = 0.0, 0.0, 0.0, 0.0
144 | total_loss, total_loss_bce, total_loss_restore, total_loss_ord = 0.0, 0.0, 0.0, 0.0
145 | total_iters = len(dataloader_train)
146 | for i, (features, label) in enumerate(dataloader_train):
147 | preds = model(features) # [bsz, M]
148 | binary_labels = discretize_time_label(label, split_nodes)
149 | restore_pred = restore_time_label(preds, split_nodes)
150 | loss_bce = bce_criterion(preds, binary_labels.float())
151 | loss_restore = huber_criterion(restore_pred, label.float())
152 | loss_ord = get_ord_criterion(preds)
153 | loss = loss_bce + args.restore_w * loss_restore + args.ord_w * loss_ord
154 | model.zero_grad()
155 | loss.backward()
156 | optimizer.step()
157 | epoch_loss += loss.item(); epoch_loss_bce += loss_bce.item(); epoch_loss_restore += args.restore_w * loss_restore.item(); epoch_loss_ord += args.ord_w * loss_ord.item()
158 | total_loss += loss.item(); total_loss_bce += loss_bce.item(); total_loss_restore += args.restore_w * loss_restore.item(); total_loss_ord += args.ord_w * loss_ord.item()
159 | if (i + 1) % args.log_interval == 0:
160 | print(" Iter {}/{} loss: {:.7f}, loss_bce: {:.7f}, loss_restore: {:.7f}, loss_ord: {:.7f} ".format(i + 1, total_iters, total_loss/args.log_interval, total_loss_bce/args.log_interval, total_loss_restore/args.log_interval, total_loss_ord/args.log_interval ), end='\r')
161 | total_loss, total_loss_bce, total_loss_restore, total_loss_ord = 0.0, 0.0, 0.0, 0.0
162 | print("Epoch {}/{} average Loss: {:.7f}, loss_bce: {:.7f}, loss_restore: {:.7f}, loss_ord: {:.7f}".format(epoch_i, args.epoch, epoch_loss/total_iters, epoch_loss_bce/total_iters, epoch_loss_restore/total_iters, epoch_loss_ord/total_iters))
163 | test(args, model, dataloaders)
164 |
--------------------------------------------------------------------------------