├── .gitignore
├── LICENSE
├── README.md
├── config.yaml
├── main.py
├── model.py
├── testset_demo.csv
├── trainset_demo.csv
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 tao-shen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# EdgeRec
This repository is an implementation of EdgeRec using the [Mobile Intelligent Dataset](https://tianchi.aliyun.com/dataset/dataDetail?dataId=109858) (part 1).

### Prerequisites
- A basic PyTorch installation; the code is developed against version **1.7**.
- Python packages you might not have: `jsonargparse`, `tqdm`, `scikit-learn` (imported as `sklearn`).

### Installation
1. Clone the repository
```Shell
git clone https://github.com/tao-shen/EdgeRec
```
### Usage
1. Set up the dataset

   Two dataset sizes are supported: `full` refers to the full dataset, and `demo` refers to the first 10000 samples of the full dataset.

   The demo dataset is included in this repository; the detailed description and the full dataset download are available [here](https://tianchi.aliyun.com/dataset/dataDetail?dataId=109858) (part 1).
```Yaml
# demo dataset
datasize: demo
device: cuda:0
lr: 0.01
batchsize: 100
# full dataset
datasize: full
device: cuda:0
lr: 0.001
batchsize: 10000
```
2. Run `main.py`

   To run on the demo dataset, for example:
```Shell
python main.py --device=cuda:0 --datasize=demo --lr=0.01 --batchsize=100
```
   To run on the full dataset, for example:
```Shell
python main.py --device=cuda:0 --datasize=full --lr=0.001 --batchsize=10000
```
   **Note:** Arguments parsed from the command line overwrite the corresponding entries in `config.yaml`.

### Results

| Setting       | AUC    |
| ------------- | ------ |
| scores only   | 0.7277 |
| trained model | 0.7310 |

On the full dataset, the baseline `auc: 0.7277` is evaluated using only the scores attached to the test samples, which are provided by the cloud-side model. Training the [DIN](https://arxiv.org/abs/1706.06978) model with `batchsize=10000` and `lr=0.001` yields `auc: 0.7310`, as shown in the table. A minimal sketch of how the scores-only baseline can be reproduced is shown below.
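This is not the authors' evaluation script, just a hypothetical sketch of that baseline: it assumes the `score` and `label` columns listed in `config.yaml` and uses the demo split shipped with the repository (the reported 0.7277 was measured on the full test set, so the demo number will differ).
```Python
# Hypothetical sketch: the "scores only" baseline ranks test samples by the cloud score alone.
import pandas as pd
from sklearn.metrics import roc_auc_score

test = pd.read_csv('testset_demo.csv', usecols=['score', 'label'])
print('scores-only AUC:', roc_auc_score(test['label'], test['score']))
```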
## License

MIT License

Copyright (c) 2021 tao-shen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
datasize: demo
device: cuda:0
lr: 0.01
batchsize: 100

# datasize: full
# device: cuda:0
# lr: 0.001
# batchsize: 10000

embedding:
  { 'user_os': {num: 3, size: 2},
    'user_gender': {num: 3, size: 2},
    'user_age_level': {num: 9, size: 3},
    'user_purchase_level': {num: 9, size: 3},
    'user_hour': {num: 25, size: 5},
    'item_pos': {num: 51, size: 5},
    'item_pagenum': {num: 51, size: 5},
    'item_sex': {num: 5, size: 2},
    'item_price_level': {num: 9, size: 3},
    'item_age_level': {num: 8, size: 3},
    'item_bc_type': {num: 3, size: 2},}

features:
  cat_feats: ['user_os',
              'user_gender',
              'user_age_level',
              'user_purchase_level',
              'user_hour',
              'cand_item_pos',
              'cand_item_pagenum',
              'cand_item_sex',
              'cand_item_price_level',
              'cand_item_age_level',
              'cand_item_bc_type',]
  seq_feats: ['exp_item_pos_seq',
              'exp_item_pagenum_seq',
              'exp_item_sex_seq',
              'exp_item_price_level_seq',
              'exp_item_age_level_seq',
              'exp_item_bc_type_seq',
              'ipv_item_pos_seq',
              'ipv_item_pagenum_seq',
              'ipv_item_sex_seq',
              'ipv_item_price_level_seq',
              'ipv_item_age_level_seq',
              'ipv_item_bc_type_seq',]
  num_feats: ['score']
  label: ['label']
--------------------------------------------------------------------------------
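A note on how this configuration is consumed (an illustration, not part of the repository): each `embedding` entry supplies a vocabulary size (`num`) and an embedding dimension (`size`); `init_args` in `utils.py` inverts the `features` groups into a `{feature_name: group_name}` lookup, and `DIN.__init__` in `model.py` builds one `nn.Embedding` per entry that is shared by every feature whose name contains that key. A minimal sketch of that wiring, trimmed to one key:
```Python
# Illustrative sketch of how config.yaml is consumed (not project code).
import torch.nn as nn

embedding_cfg = {'item_pos': {'num': 51, 'size': 5}}
features = {'cat_feats': ['cand_item_pos'],
            'seq_feats': ['exp_item_pos_seq', 'ipv_item_pos_seq']}

# utils.init_args() inverts the feature groups into {feature_name: group_name}
use_feats = {name: group for group, names in features.items() for name in names}

# model.DIN.__init__ builds one table per key and shares it across matching features
tables = {key: nn.Embedding(cfg['num'], cfg['size']) for key, cfg in embedding_cfg.items()}
shared = {name: tables[key] for key in embedding_cfg for name in use_feats if key in name}
assert shared['cand_item_pos'] is shared['exp_item_pos_seq'] is shared['ipv_item_pos_seq']
```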
/main.py:
--------------------------------------------------------------------------------
from utils import *
from model import *
# random seed
setup_seed(2021)
# initialize hyperparameters
args = init_args()
# initialize model
model = DIN(args)
# record loss & auc
recorder = {'loss': [], 'auc': []}
# load CSV-format datasets
train_set = MCC_Dataset('trainset_{}.csv'.format(args.datasize), args)
test_set = MCC_Dataset('testset_{}.csv'.format(args.datasize), args)
# dataloader & optimizer
train_loader = data_loader(train_set, batchsize=args.batchsize)
test_loader = data_loader(test_set, batchsize=args.batchsize)
optimizer = torch.optim.Adam(
    model.parameters(), lr=args.lr, weight_decay=args.weight_decay)

if __name__ == '__main__':
    # start training & testing
    for _ in range(10):
        model.fit(train_loader, optimizer)
        model.evaluate(test_loader, recorder)
        print(recorder['auc'][-1])
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch, torch.nn as nn
from tqdm import tqdm
from sklearn.metrics import roc_curve, auc


class Model_fn():
    """Mixin providing the training and evaluation loops."""

    def __init__(self, args):
        self.device = args.device
        self.loss_fun = torch.nn.BCELoss()
        self.train_num = 0

    def fit(self, train_loader, optimizer):
        train_loader.dataset.reset()
        self.train()
        self.to(self.device)
        description = "Training (batch {:d}): train_loss = {:.4f}"
        loss_total, avg_loss = 0.0, 0.0
        epochs = tqdm(train_loader, leave=False, desc='local_update')
        for idx, batch in enumerate(epochs):
            optimizer.zero_grad()
            batch = to_device(batch, self.device)
            output = self(batch)
            label = batch['label'].float()
            loss = self.loss_fun(output.squeeze(-1), label)
            loss.backward()
            optimizer.step()
            loss_total += loss.item()
            avg_loss = loss_total / (idx + 1)
            epochs.set_description(description.format(idx + 1, avg_loss))
        self.train_num = len(train_loader.dataset)

    def evaluate(self, test_loader, recorder=None):
        test_loader.dataset.reset()
        self.eval()
        self.to(self.device)
        loss_total = 0.0
        label, pred = [], []
        with torch.no_grad():
            with tqdm(test_loader) as epochs:
                for idx, batch in enumerate(epochs):
                    batch = to_device(batch, self.device)
                    output = self(batch)
                    pred += output.squeeze(-1).tolist()
                    label += batch['label'].tolist()
                    loss = self.loss_fun(
                        output.squeeze(-1), batch['label'].float())
                    loss_total += loss.item()
        loss_avg = loss_total / len(test_loader)
        fpr, tpr, _ = roc_curve(label, pred)
        auc_score = auc(fpr, tpr)
        recorder['loss'].append(loss_avg)
        recorder['auc'].append(auc_score)


class DIN(nn.Module, Model_fn):
    def __init__(self, args):
        super(DIN, self).__init__()

        self.args = args
        self.features = args.features
        self._estimator_type = 'classifier'
        self.num_inputs = nn.ModuleDict()
        self.embeddings = nn.ModuleDict()
        self.cat_embeddings = nn.ModuleDict()
        self.seq_embeddings = nn.ModuleDict()
        cat_size = 0

        # one embedding table per key in config.yaml, shared by every feature
        # (cand_*, exp_* and ipv_*) whose name contains that key
        for embed_key in args.embedding.keys():
            self.embeddings[embed_key] = nn.Embedding(
                args.embedding[embed_key]['num'], args.embedding[embed_key]['size'])
            for feats_key, feats_value in args.use_feats.items():
                if embed_key in feats_key:
                    if feats_value == 'cat_feats':
                        self.cat_embeddings[feats_key] = self.embeddings[embed_key]
                    if feats_value == 'seq_feats':
                        self.seq_embeddings[feats_key] = self.embeddings[embed_key]
                    cat_size += args.embedding[embed_key]['size']
        args.item_embed_size = sum(
            [v['size'] for k, v in args.embedding.items() if 'item' in k])

        for key in self.features['num_feats']:
            self.num_inputs[key] = nn.Identity()
            cat_size += 1

        self.pooling = Pooling('attention', dim=1, args=args)
        self.mlp = MLP(cat_size, self.args)

        Model_fn.__init__(self, args)

    def forward(self, inputs):

        embedded = {}
        for key, module in self.num_inputs.items():
            out = module(inputs[key]).unsqueeze(-1)
            embedded[key] = out
        can_embedded, exp_embedded, ipv_embedded = [], [], []
        for key, module in self.cat_embeddings.items():
            out = module(inputs[key])
            if 'cand_item' in key:
                can_embedded.append(out)
            else:
                embedded[key] = out
        embedded['cand_item'] = torch.cat(can_embedded, dim=1)
        for key, module in self.seq_embeddings.items():
            seq_out = module(inputs[key])
            if 'exp_item' in key:
                exp_embedded.append(seq_out)
            elif 'ipv_item' in key:
                ipv_embedded.append(seq_out)

        # attention-pool the exp_* and ipv_* behavior sequences against the candidate item
        exp_seq = torch.cat(exp_embedded, dim=-1)
        exp_out = self.pooling(exp_seq, embedded['cand_item'])
        embedded['exp_item'] = exp_out

        ipv_seq = torch.cat(ipv_embedded, dim=-1)
        ipv_out = self.pooling(ipv_seq, embedded['cand_item'])
        embedded['ipv_item'] = ipv_out

        emb_cat = torch.cat(list(embedded.values()), dim=1)
        # inverse sigmoid of the cloud score, so the MLP learns a correction on top of it
        score_logits = -torch.log(1 / inputs['score'].unsqueeze(-1) - 1)
        output = torch.sigmoid(self.mlp(emb_cat) + score_logits)
        return output
class MLP(nn.Module):
    def __init__(self, input_size, args):
        super(MLP, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(input_size, args.hidden_size[0]),
            nn.BatchNorm1d(args.hidden_size[0])
        )
        self.fc2 = nn.Sequential(
            nn.Linear(args.hidden_size[0], args.hidden_size[1]),
            nn.BatchNorm1d(args.hidden_size[1])
        )
        self.fc3 = nn.Linear(args.hidden_size[1], 1)
        self.relu = torch.nn.ReLU()
        self.dropout = nn.Dropout(args.dropout)

    def forward(self, input):
        x = self.dropout(self.relu(self.fc1(input)))
        x = self.dropout(self.relu(self.fc2(x)))
        output = self.fc3(x)
        return output


class Pooling(nn.Module):
    def __init__(self, pooling_type, dim=1, **kwargs):
        super(Pooling, self).__init__()
        self.dim = dim
        self.pooling_type = pooling_type
        if self.pooling_type == 'mean':
            self.pooling = torch.mean
        if self.pooling_type == 'sum':
            self.pooling = torch.sum
        if self.pooling_type == 'attention':
            self.pooling = Attention_Pooling(kwargs['args'])

    def forward(self, x, target_item=None):
        if self.pooling_type != 'attention':
            output = self.pooling(x, self.dim)
        else:
            output = self.pooling(x, target_item, self.dim)
        return output


class Attention_Pooling(nn.Module):
    def __init__(self, args):
        super(Attention_Pooling, self).__init__()
        self.attention_unit = Attention_Unit(args)

    def forward(self, seq, target_item, dim):
        # broadcast the candidate item over the sequence, score each position,
        # then return the softmax-weighted sum of the sequence embeddings
        target_items = target_item.unsqueeze(-2).expand_as(seq)
        weights = self.attention_unit(target_items, seq)
        weights = torch.softmax(weights, dim=1)
        out = weights * seq
        return out.sum(dim=dim)


class Attention_Unit(nn.Module):

    def __init__(self, args):
        super(Attention_Unit, self).__init__()
        self.fc1 = nn.Linear(args.item_embed_size * 4, args.item_embed_size)
        self.fc2 = nn.Linear(args.item_embed_size, 1)
        self.activation = torch.nn.ReLU()

    def forward(self, seq, target_item):
        # DIN-style interaction features: both inputs, their difference and their product
        emb_cat = torch.cat(
            (target_item, seq, target_item - seq, target_item * seq), dim=-1)
        x = self.activation(self.fc1(emb_cat))
        weight = self.fc2(x)
        return weight


def to_device(x, device):
    for key, value in x.items():
        x[key] = value.to(device)
    return x
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
from jsonargparse import ArgumentParser
from torch.utils.data import Dataset, Sampler, DataLoader
import torch
import pandas as pd

def setup_seed(seed):
    import numpy as np
    import random
    from torch.backends import cudnn
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    cudnn.deterministic = True
    cudnn.benchmark = False
def init_args():
    parser = ArgumentParser(default_config_files=['config.yaml'])
    parser.add_argument('--seed', type=int, default=2021)
    parser.add_argument('--datasize', type=str, default='demo',
                        help='size: {demo, full}')
    parser.add_argument('--batchsize', type=int, default=1024)
    parser.add_argument('--device', type=str, default='cuda:0',
                        help='device: {cuda, cpu}')
    parser.add_argument('--lr', type=float, default=1e-2,
                        help='learning rate')
    parser.add_argument('--weight-decay', type=float, default=1e-5,
                        help='weight of the L2 regularization')
    parser.add_argument('--dropout', type=float, default=0.3,
                        help='dropout probability')
    parser.add_argument('--hidden_size', type=list, default=[32, 16],
                        help='hidden sizes of the MLP')
    parser.add_argument('--features')
    parser.add_argument('--embedding')
    args = parser.parse_args()
    # invert the feature groups from config.yaml into {feature_name: group_name}
    args.use_feats = {vs: k for k, v in args.features.items() for vs in v}
    return args


class MCC_Dataset(Dataset):
    def __init__(self, file, args):
        self.args = args
        self.file = file
        self.cols = list(args.use_feats.keys())
        # sequence features are stored as comma-separated strings in the CSV
        self.convert = {key: lambda x: list(map(int, x.split(',')))
                        for key in args.use_feats.keys() if 'seq' in key}

    def reset(self):
        # (re)open the CSV as a chunked iterator so every pass streams from the start
        self.data = pd.read_csv(self.file, usecols=self.cols,
                                converters=self.convert, iterator=True)

    def __getitem__(self, batchsize):
        # the "index" passed in is really a chunk size: read the next batch of rows
        x = self.data.get_chunk(batchsize)
        x = x.to_dict(orient='list')
        for k in self.args.use_feats.keys():
            x[k] = torch.tensor(x[k])
        return x

    def __len__(self):
        if self.args.datasize == 'demo':
            return 10000
        elif 'train' in self.file:
            return 9526571
        elif 'test' in self.file:
            return 3555419


def data_loader(dataset, batchsize=None):
    sampler = CSV_Sampler(dataset)
    batch_sampler = CSV_Batch_Sampler(sampler, batchsize)
    return DataLoader(dataset, batch_sampler=batch_sampler, collate_fn=lambda x: x[0])


class CSV_Sampler(Sampler):
    def __init__(self, data_source):
        self.data_source = data_source

    def __iter__(self):
        return self.data_source.data

    def __len__(self):
        return len(self.data_source)


class CSV_Batch_Sampler(Sampler):
    def __init__(self, csv_sampler, batchsize):
        self.sampler = csv_sampler
        self.batch_size = batchsize

    def __iter__(self):
        return self

    def __next__(self):
        # always request one more chunk of batch_size rows; iteration stops
        # when the underlying pandas CSV iterator is exhausted
        return [self.batch_size]

    def __len__(self):
        return (len(self.sampler) + self.batch_size - 1) // self.batch_size
--------------------------------------------------------------------------------
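A closing note, not part of the repository: the pipeline above streams each CSV in fixed-size chunks instead of indexing individual rows, so `MCC_Dataset.reset()` must be called before every pass over the data, which is exactly what `Model_fn.fit` and `Model_fn.evaluate` do. A minimal sketch of driving it by hand, assuming `config.yaml` and `trainset_demo.csv` sit in the working directory:
```Python
# Hypothetical driver for the chunked-CSV pipeline in utils.py (illustration only).
from utils import init_args, MCC_Dataset, data_loader

args = init_args()                                  # reads config.yaml; CLI flags override it
train_set = MCC_Dataset('trainset_demo.csv', args)  # demo split shipped with the repo
loader = data_loader(train_set, batchsize=args.batchsize)

train_set.reset()                                   # (re)open the CSV iterator
batch = next(iter(loader))                          # dict of feature name -> tensor
print({k: tuple(v.shape) for k, v in batch.items()})
```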