# Directory structure (repository dump header):
#
# ├── HRec
#     ├── __init__.py
#     ├── datasets
#     │   ├── __init__.py
#     │   ├── dataset.py
#     │   ├── enum_type.py
#     │   └── hdataset.py
#     ├── models
#     │   ├── __init__.py
#     │   ├── afm.py
#     │   ├── autoint.py
#     │   ├── base.py
#     │   ├── cccf.py
#     │   ├── dcn.py
#     │   ├── ddtcdr.py
#     │   ├── deepfm.py
#     │   ├── deepmf.py
#     │   ├── dssm.py
#     │   ├── duration.py
#     │   ├── fism.py
#     │   ├── layers.py
#     │   ├── model_map.py
#     │   ├── nais.py
#     │   ├── ncf.py
#     │   ├── nfm.py
#     │   ├── utils.py
#     │   ├── widedeep.py
#     │   └── xdeepfm.py
#     └── pipeline
#     │   ├── __init__.py
#     │   ├── configure.py
#     │   ├── dprocess.py
#     │   ├── evaluator.py
#     │   ├── hprocess.py
#     │   ├── metrics.py
#     │   ├── optimizer.py
#     │   ├── process.py
#     │   └── utils.py
# ├── LICENSE
# ├── README.md
# ├── configs
#     ├── afm.json
#     ├── autoint.json
#     ├── cccf.json
#     ├── ddtcdr.json
#     ├── deepfm.json
#     ├── deepmf.json
#     ├── dssm.json
#     ├── duration.json
#     ├── fism.json
#     ├── nais.json
#     ├── widedeep.json
#     └── xdeepfm.json
# ├── data.tar.xz
# ├── framework.png
# ├── requirements.txt
# ├── train_hete.py
# └── train_homo.py
#
# /HRec/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/geekinglcq/HRec/f13a685dd593154d4887ed18bd444e588484d014/HRec/__init__.py
# --------------------------------------------------------------------------------
# /HRec/datasets/__init__.py:
# --------------------------------------------------------------------------------
from .dataset import DataSet, SubSet
from .hdataset import HDataSet
from .enum_type import FeatureSource

# --------------------------------------------------------------------------------
# /HRec/datasets/enum_type.py:
# --------------------------------------------------------------------------------
from enum import Enum


class FeatureSource(Enum):
    """Source of features.

    - ``INTERACTION``: Features from ``.inter``
    - ``USER``: Features from ``.user`` (other than ``user_id``).
    - ``ITEM``: Features from ``.item`` (other than ``item_id``).
    - ``USER_ID``: ``user_id`` feature in ``inter_feat`` and ``user_feat``.
    - ``ITEM_ID``: ``item_id`` feature in ``inter_feat`` and ``item_feat``.
    """

    INTERACTION = 'inter'
    USER = 'user'
    ITEM = 'item'
    USER_ID = 'user_id'
    ITEM_ID = 'item_id'


# Numeric code stored in the interaction table's type column for each item kind.
item_type_dict = {'book': 0.0, 'music': 1.0, 'movie': 2.0}

# --------------------------------------------------------------------------------
# /HRec/datasets/hdataset.py:
# --------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
# ###########################
# File Name: hdataset.py
# Author: geekinglcq
# Mail: lcqgeek@live.com
# Created Time: 2020-12-28 20:17:47
# ###########################

import pandas as pd

import os
import logging
from collections import defaultdict
from torch.utils.data import DataLoader, Dataset
from .enum_type import FeatureSource as FS
from .enum_type import item_type_dict
from .dataset import DataSet, SubSet


class HDataSet(DataSet):
    """Dataset used for heterogeneous items.

    Item features are kept in one DataFrame per item type
    (``self.item_feat[item_type]``), while users and interactions each live in
    a single DataFrame. Helpers such as ``_load_meta_feats``, ``_normalize``,
    ``split_by_ratio`` and ``split_by_ratio_sampled`` are not defined in this
    class; presumably they come from ``DataSet`` -- verify against dataset.py.
    """
    def __init__(self, config, restore_path=None):
        """Read settings from *config*, load the feature tables and preprocess.

        Args:
            config: mapping with the dataset settings (field names, feature
                descriptions, file paths, selected item types).
            restore_path: restoring from a path is not implemented yet; when a
                value is given no feature table is loaded before preprocessing.
        """
        self.config = config
        self._init_setting()
        if restore_path is None:
            self._load_feats()
        else:
            # TODO
            pass
        self._preprocessing()

    def _load_feats(self):
        """Load user, item and interaction tables and record their sizes."""
        self.user_feat = self._load_meta_feats(self.config["user_feat_path"],
                                               FS.USER, "user_id")
        self.item_feat = self._load_item_feats(self.config["item_feat_path"],
                                               FS.ITEM)
        # Shuffle once with a fixed seed so later splits are reproducible.
        self.inter_feat = pd.read_csv(self.config["inter_feat_path"]).sample(
            frac=1, random_state=28)
        if len(self.types) < 3:
            # Only a subset of item types is used: keep the interactions whose
            # item id appears in at least one of the loaded item tables.
            mask = None
            for item_feat in self.item_feat.values():
                new_mask = self.inter_feat[self.iid_field].isin(
                    item_feat[self.iid_field])
                mask = new_mask if mask is None else (mask | new_mask)
            # Guard: with no loaded item table there is nothing to filter by
            # (indexing a DataFrame with ``None`` would raise).
            if mask is not None:
                self.inter_feat = self.inter_feat[mask]
        self.h_inter_feat = {}
        self.user_num = len(self.user_feat)
        self.item_num = sum(len(i) for i in self.item_feat.values())
        self.item_nums = {k: len(v) for k, v in self.item_feat.items()}
        print(f'user num: {self.user_num}')
        print(f'item num: {self.item_num}')
        print(f'item nums: {self.item_nums}')

    def _preprocessing(self):
        """Normalize features and re-index ids when a type subset is used."""
        self._normalize()
        if len(self.types) < 3:
            self._reID(self.iid_field)
            # NOTE(review): the original indentation was lost in the dump; it
            # is assumed the user re-ID belongs to the same branch as the item
            # re-ID -- confirm against the upstream file.
            self._reID(self.uid_field)

    def _load_item_feats(self, paths, source):
        """Read one item-feature CSV per selected item type.

        Args:
            paths: mapping ``item_type -> csv path``.
            source: unused; kept for signature compatibility.

        Returns:
            dict mapping each item type listed in ``self.types`` to its
            DataFrame.

        Raises:
            ValueError: if the path of a selected item type is not a file.
        """
        item_feat = {}
        for item_type, item_path in paths.items():
            if item_type not in self.types:
                continue
            if not os.path.isfile(item_path):
                raise ValueError(f"Dataset file not found: {item_path}")
            item_feat[item_type] = pd.read_csv(item_path)
        return item_feat

    def _init_setting(self):
        """Copy field names from the config and index features by source."""
        self.logger = logging.getLogger()
        self.name = self.config['name']
        print(self.config)
        self.uid_field = self.config["USER_ID_FIELD"]
        self.iid_field = self.config["ITEM_ID_FIELD"]
        self.label_field = self.config["LABEL_FIELD"]
        self.itype_field = self.config["TYPE_FIELD"]
        self.types = self.config["type"]
        self.field2type = {}
        self.field2source = {}
        self.field2id_token = defaultdict(dict)
        self.field2token_id = defaultdict(dict)
        self.user_feat_fields = []
        self.item_feat_fields = defaultdict(list)

        for feat_name, feat_value in self.config['feat'].items():
            source = feat_value['source']
            self.field2type[feat_name] = feat_value['type']
            self.field2source[feat_name] = feat_value['source']
            if source == 'user' and feat_name != self.uid_field:
                self.user_feat_fields.append(feat_name)
            if source.startswith('item') and feat_name != self.iid_field:
                # Item feature sources are spelled "item_<type>".
                item_type = source.split("_")[1]
                if item_type in self.types:
                    self.item_feat_fields[item_type].append(feat_name)

    def num(self, field):
        """Return the number of distinct values of *field*.

        User and item id fields report the table sizes recorded at load time.
        Other fields use the re-ID map when one exists, otherwise the number
        of unique values in the table that owns the field. A known field that
        belongs to neither the user nor any item feature list yields 0.

        Raises:
            ValueError: if *field* is not described in the config.
        """
        if field == self.uid_field:
            return self.user_num
        if field == self.iid_field:
            return self.item_num
        if field not in self.field2type:
            raise ValueError('field {} not in dataset'.format(field))
        if len(self.field2token_id[field]) == 0:
            if field in self.user_feat_fields:
                return len(self.user_feat[field].unique())
            for item_type, item_feat_fields in self.item_feat_fields.items():
                if field in item_feat_fields:
                    return len(self.item_feat[item_type][field].unique())
        return len(self.field2token_id[field])

    def _reID(self, field):
        """
        Re-ID the token-type feature, save the id map in self.field2token_id
        """
        self.logger.info(f'ReID field {field}.')
        ftype = self.field2type.get(field)
        assert ftype == 'token'
        source = self.field2source.get(field)
        if isinstance(source, str) and source.startswith("item_"):
            item_type = source.split("_")[1]
            dataframe = self.item_feat[item_type]
        elif source is FS.ITEM_ID or source == "item":
            # The id space is shared by all item types, so build the map from
            # the concatenation of every item table.
            dataframe = pd.concat(list(self.item_feat.values()), join='inner')
        elif source == 'user' or source is FS.USER_ID:
            dataframe = self.user_feat
        else:
            dataframe = self.inter_feat
        id_map = {v: k for k, v in enumerate(dataframe[field].unique())}
        self.field2token_id[field].update(id_map)
        dataframe[field] = dataframe[field].map(id_map)
        if source in ['item', 'user', FS.ITEM_ID, FS.USER_ID]:
            # Id fields also appear in the interaction table (and item ids in
            # every per-type item table); remap those copies as well.
            if field in self.inter_feat:
                self.inter_feat[field] = self.inter_feat[field].map(id_map)
            for item_type, item_feat in self.item_feat.items():
                if field in item_feat:
                    item_feat[field] = item_feat[field].map(id_map)

    def join(self, df):
        """
        Join user/item features to interactions.
        """
        if self.user_feat is not None and self.uid_field in df:
            df = pd.merge(df,
                          self.user_feat,
                          on=self.uid_field,
                          how='left',
                          suffixes=('_inter', '_user'))
        if self.item_feat is not None and self.iid_field in df:
            for item_type, item_feat in self.item_feat.items():
                df = pd.merge(df,
                              item_feat,
                              on=self.iid_field,
                              how='left',
                              suffixes=(f'_{item_type}', '_inter'))
            # Collapse all type columns produced by the merges into one.
            # ``DataFrame.sum`` is what ``agg(sum, axis=1)`` resolved to;
            # calling it directly avoids the deprecated builtin-callable path.
            # NOTE(review): nesting under the item branch is assumed; the
            # original indentation was lost in the dump.
            type_c = [i for i in df.columns if i.startswith(self.itype_field)]
            df[self.itype_field] = df[type_c].sum(axis=1)
        return df

    def join_interaction(self):
        """Join features onto the interactions and optionally down-sample.

        When the config has a ``sample`` entry, each item type keeps a
        fraction ``ratio * 0.7 + 0.3`` of its interactions (``ratio`` defaults
        to 1.0 for types missing from the entry).
        """
        self.inter_feat = self.join(self.inter_feat)
        if 'sample' in self.config:
            sample_ratio = self.config['sample']
            sampled = []
            for kind in self.types:
                ratio = sample_ratio.get(kind, 1.0)
                kind_id = item_type_dict[kind]
                # preserve the data for val & test
                # Use the configured type column (written by ``join``) rather
                # than a hard-coded 'type' name.
                of_kind = self.inter_feat[self.inter_feat[self.itype_field] ==
                                          kind_id]
                new_df = of_kind.sample(frac=ratio * 0.7 + 0.3,
                                        random_state=16)
                print(kind, kind_id, ratio, new_df.shape)
                sampled.append(new_df)
            self.inter_feat = pd.concat(sampled, ignore_index=True)
            # NOTE(review): assumed to belong to the sampling branch (the
            # concat above groups rows by type); indentation was lost.
            self.inter_feat = self.inter_feat.sample(frac=1.).reset_index(
                drop=True)

    def train_val_test_split(self, ratios=None, group_by=None, **kwargs):
        """Split interactions into train/val/test and build per-type subsets.

        Args:
            ratios: three split ratios; defaults to ``[0.7, 0.2, 0.1]``.
            group_by: forwarded to ``split_by_ratio`` when no sampling is
                configured.
            **kwargs: accepted for compatibility and ignored.

        After the call ``self.inter_feat`` holds the training split and the
        full table is available as ``self.all_inter_feat``.
        """
        if ratios is None:
            # Fresh list per call instead of a shared mutable default.
            ratios = [0.7, 0.2, 0.1]
        assert len(ratios) == 3
        if 'sample' in self.config:
            train, val, test = self.split_by_ratio_sampled(
                ratios, create_new_dataset=False)
        else:
            train, val, test = self.split_by_ratio(ratios,
                                                   group_by=group_by,
                                                   create_new_dataset=False)
        user_fs = self.user_feat_fields
        item_fs = self.item_feat_fields
        type_field = self.itype_field

        def subset(frame, item_type):
            # Rows of *frame* that belong to *item_type*, wrapped as a SubSet.
            item_type_id = item_type_dict[item_type]
            return SubSet(frame[frame[type_field] == item_type_id],
                          self.uid_field, self.iid_field, self.itype_field,
                          self.label_field, user_fs, item_fs[item_type])

        self.train_inter_subset = {}
        self.val_inter_subset = {}
        self.test_inter_subset = {}
        for item_type in self.types:
            self.train_inter_subset[item_type] = subset(train, item_type)
            self.val_inter_subset[item_type] = subset(val, item_type)
            self.test_inter_subset[item_type] = subset(test, item_type)
        self.all_inter_feat = self.inter_feat
        self.logger.info(
            "Replace interaction features with train interaction features.")
        self.logger.info(
            "Interaction features are stored in self.all_inter_feat")
        self.inter_feat = train

    def init_data_loader(self, batch_size=256, num_workers=1):
        """Create one train/val/test ``DataLoader`` per item type.

        Requires ``train_val_test_split`` to have been called first, since it
        reads the per-type subsets created there.
        """
        self.train_data_loader = {}
        self.val_data_loader = {}
        self.test_data_loader = {}
        for item_type in self.types:
            self.train_data_loader[item_type] = DataLoader(
                self.train_inter_subset[item_type],
                batch_size=batch_size,
                # pin_memory=True,
                num_workers=num_workers)
            self.val_data_loader[item_type] = DataLoader(
                self.val_inter_subset[item_type],
                batch_size=batch_size,
                num_workers=num_workers)
            self.test_data_loader[item_type] = DataLoader(
                self.test_inter_subset[item_type],
                batch_size=batch_size,
                num_workers=num_workers)


class HSubSet(Dataset):
    """Container for one DataFrame per item type.

    Only ``__len__`` is implemented here; no ``__getitem__`` is visible in
    this file.
    """
    def __init__(self, dataframes, uid_field, iid_field, label_field,
                 u_feat_fields, i_feat_fields):
        """Store the per-type frames and the id/label field names.

        ``u_feat_fields`` and ``i_feat_fields`` are accepted but not used.
        """
        self.types = dataframes.keys()
        self.dfs = dataframes
        self.uid = uid_field
        self.iid = iid_field
        self.label = label_field

    def __len__(self):
        """Return the row count of the smallest frame (0 when there is none)."""
        # ``self.dfs`` is a mapping (``__init__`` calls ``.keys()`` on it), so
        # iterate its values; iterating the mapping itself yields type names.
        return min((len(df.index) for df in self.dfs.values()), default=0)
# --------------------------------------------------------------------------------
# /HRec/models/__init__.py:
# --------------------------------------------------------------------------------
from .deepfm import DeepFM
from .deepmf import DMF
from .fism import FISM
from .ncf import NCF
from .xdeepfm import xDeepFM
from .dssm import DSSM
from .afm import AFM
from .dcn import DCN
from .widedeep import WideDeep
from .nais import NAIS
from .cccf import CCCFNet
from .ddtcdr import DDTCDR
from .autoint import AutoInt
from .duration import DURation

from .utils import ModelType

# Lookup table from the model name used in configs to its class.
model_map = {
    # General Model
    "DMF": DMF,
    "FISM": FISM,
    "NCF": NCF,
    # Context Model
    "DeepFM": DeepFM,
    "xDeepFM": xDeepFM,
    "DCN": DCN,
    "AFM": AFM,
    "DSSM": DSSM,
    "WideDeep": WideDeep,
    "NAIS": NAIS,
    "AutoInt": AutoInt,
    # Heterogeneous Model
    "CCCF": CCCFNet,
    "DDTCDR": DDTCDR,
    "DURation": DURation,
}

# --------------------------------------------------------------------------------
# /HRec/models/afm.py:
# --------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
# ###########################
# File Name: afm.py
# Author: geekinglcq
# Mail: lcqgeek@live.com
# Created Time: 2020-12-31 17:31:43
# ###########################
r"""
AFM
################################################
Reference:
    Jun Xiao et al. "Attentional Factorization Machines: Learning the Weight of Feature Interactions via
    Attention Networks" in IJCAI 2017.
"""

import torch
import torch.nn as nn
from torch.nn.init import xavier_normal_, constant_

from .layers import AttLayer
from .base import ContextModel


class AFM(ContextModel):
    """ AFM is an attention based FM model that predicts the final score with the attention of input features.

    """
    def __init__(self, config, dataset):
        super(AFM, self).__init__(config, dataset)

        # load parameters info
        self.attention_size = config['attention_size']
        self.dropout_prob = config['dropout_prob']
        self.reg_weight = config['reg_weight']
        # Number of unordered field pairs; integer division keeps it an int.
        self.num_pair = self.num_feature_field * (self.num_feature_field -
                                                  1) // 2

        # define layers and loss
        self.attlayer = AttLayer(self.embedding_size, self.attention_size)
        self.p = nn.Parameter(torch.randn(self.embedding_size),
                              requires_grad=True)
        self.dropout_layer = nn.Dropout(p=self.dropout_prob)
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()

        # parameters initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-normal init for embeddings/linear weights, zero biases."""
        if isinstance(module, nn.Embedding):
            xavier_normal_(module.weight.data)
        elif isinstance(module, nn.Linear):
            xavier_normal_(module.weight.data)
            if module.bias is not None:
                constant_(module.bias.data, 0)

    def build_cross(self, feat_emb):
        """ Build the cross feature columns of feature columns

        Args:
            feat_emb (torch.FloatTensor): input feature embedding tensor. shape of [batch_size, field_size, embed_dim].

        Returns:
            tuple:
                - torch.FloatTensor: Left part of the cross feature. shape of [batch_size, num_pairs, emb_dim].
                - torch.FloatTensor: Right part of the cross feature. shape of [batch_size, num_pairs, emb_dim].
        """
        # num_pairs = num_feature_field * (num_feature_field-1) / 2
        row = []
        col = []
        for i in range(self.num_feature_field - 1):
            for j in range(i + 1, self.num_feature_field):
                row.append(i)
                col.append(j)
        p = feat_emb[:, row]  # [batch_size, num_pairs, emb_dim]
        q = feat_emb[:, col]  # [batch_size, num_pairs, emb_dim]
        return p, q

    def afm_layer(self, infeature):
        """ Get the attention-based feature interaction score

        Args:
            infeature (torch.FloatTensor): input feature embedding tensor. shape of [batch_size, field_size, embed_dim].

        Returns:
            torch.FloatTensor: Result of score. shape of [batch_size, 1].
        """
        p, q = self.build_cross(infeature)
        pair_wise_inter = torch.mul(p, q)  # [batch_size, num_pairs, emb_dim]

        # [batch_size, num_pairs, 1]
        att_signal = self.attlayer(pair_wise_inter).unsqueeze(dim=2)

        att_inter = torch.mul(
            att_signal, pair_wise_inter)  # [batch_size, num_pairs, emb_dim]
        att_pooling = torch.sum(att_inter, dim=1)  # [batch_size, emb_dim]
        att_pooling = self.dropout_layer(att_pooling)  # [batch_size, emb_dim]

        att_pooling = torch.mul(att_pooling, self.p)  # [batch_size, emb_dim]
        att_pooling = torch.sum(att_pooling, dim=1,
                                keepdim=True)  # [batch_size, 1]

        return att_pooling

    def forward(self, interaction):
        """Return the predicted probability for every row of *interaction*."""
        # sparse_embedding shape: [batch_size, num_token_seq_field+num_token_field, embed_dim] or None
        # dense_embedding shape: [batch_size, num_float_field] or [batch_size, num_float_field, embed_dim] or None
        sparse_embedding, dense_embedding = self.embed_input_fields(
            interaction)
        all_embeddings = []
        if sparse_embedding is not None:
            all_embeddings.append(sparse_embedding)
        if dense_embedding is not None and len(dense_embedding.shape) == 3:
            all_embeddings.append(dense_embedding)
        afm_all_embeddings = torch.cat(
            all_embeddings, dim=1)  # [batch_size, num_field, embed_dim]

        output = self.sigmoid(
            self.first_order_linear(interaction) +
            self.afm_layer(afm_all_embeddings))
        # Drop only the trailing score axis. A bare ``squeeze()`` would also
        # drop the batch axis for a batch of one, giving a 0-d tensor that no
        # longer matches the label shape in the loss.
        return output.squeeze(-1)

    def calculate_loss(self, interaction):
        """BCE loss plus an L2 penalty on the attention layer weight."""
        label = interaction[self.LABEL].float()

        output = self.forward(interaction)
        l2_loss = self.reg_weight * torch.norm(self.attlayer.w.weight, p=2)
        return self.loss(output, label) + l2_loss

    def predict(self, interaction):
        return self.forward(interaction)

# --------------------------------------------------------------------------------
# /HRec/models/autoint.py:
# --------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
# ###########################
# File Name: autoint.py
# Author: geekinglcq
# Mail: lcqgeek@live.com
# Created Time: 2021-01-26 21:01:14
# ###########################
r"""
AutoInt
################################################
Reference:
    Weiping Song et al. "AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks"
    in CIKM 2018.
"""

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn.init import xavier_normal_, constant_

from .layers import MLPLayers
from .base import ContextModel


class AutoInt(ContextModel):
    """ AutoInt is a novel CTR prediction model based on self-attention mechanism,
    which can automatically learn high-order feature interactions in an explicit fashion.

    """
    def __init__(self, config, dataset):
        super(AutoInt, self).__init__(config, dataset)

        # load parameters info
        self.attention_size = config['attention_size']
        self.dropout_probs = config['dropout_probs']
        self.n_layers = config['n_layers']
        self.num_heads = config['num_heads']
        self.mlp_hidden_size = config['mlp_hidden_size']

        # define layers and loss
        self.att_embedding = nn.Linear(self.embedding_size,
                                       self.attention_size)
        self.embed_output_dim = self.num_feature_field * self.embedding_size
        self.atten_output_dim = self.num_feature_field * self.attention_size
        size_list = [self.embed_output_dim] + self.mlp_hidden_size
        self.mlp_layers = MLPLayers(size_list, dropout=self.dropout_probs[1])
        # multi-head self-attention network
        self.self_attns = nn.ModuleList([
            nn.MultiheadAttention(self.attention_size,
                                  self.num_heads,
                                  dropout=self.dropout_probs[0])
            for _ in range(self.n_layers)
        ])
        self.attn_fc = torch.nn.Linear(self.atten_output_dim, 1)
        self.deep_predict_layer = nn.Linear(self.mlp_hidden_size[-1], 1)

        self.dropout_layer = nn.Dropout(p=self.dropout_probs[2])
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()

        # parameters initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-normal init for embeddings/linear weights, zero biases."""
        if isinstance(module, nn.Embedding):
            xavier_normal_(module.weight.data)
        elif isinstance(module, nn.Linear):
            xavier_normal_(module.weight.data)
            if module.bias is not None:
                constant_(module.bias.data, 0)

    def autoint_layer(self, infeature):
        """ Get the attention-based feature interaction score

        Args:
            infeature (torch.FloatTensor): input feature embedding tensor. shape of[batch_size,field_size,embed_dim].

        Returns:
            torch.FloatTensor: Result of score. shape of [batch_size,1] .
        """

        att_infeature = self.att_embedding(infeature)
        # nn.MultiheadAttention is built here without batch_first, so move the
        # field axis in front of the batch axis for the attention stack.
        cross_term = att_infeature.transpose(0, 1)
        for self_attn in self.self_attns:
            cross_term, _ = self_attn(cross_term, cross_term, cross_term)
        cross_term = cross_term.transpose(0, 1)
        # Interacting layer
        cross_term = F.relu(cross_term).contiguous().view(
            -1, self.atten_output_dim)
        batch_size = infeature.shape[0]
        att_output = self.attn_fc(cross_term) + self.deep_predict_layer(
            self.mlp_layers(infeature.view(batch_size, -1)))
        return att_output

    def forward(self, interaction):
        """Return the predicted probability for every row of *interaction*."""
        # sparse_embedding shape: [batch_size, num_token_seq_field+num_token_field, embed_dim] or None
        # dense_embedding shape: [batch_size, num_float_field] or [batch_size, num_float_field, embed_dim] or None
        sparse_embedding, dense_embedding = self.embed_input_fields(
            interaction)
        all_embeddings = []
        if sparse_embedding is not None:
            all_embeddings.append(sparse_embedding)
        if dense_embedding is not None and len(dense_embedding.shape) == 3:
            all_embeddings.append(dense_embedding)
        autoint_all_embeddings = torch.cat(
            all_embeddings, dim=1)  # [batch_size, num_field, embed_dim]
        output = self.first_order_linear(interaction) + self.autoint_layer(
            autoint_all_embeddings)
        return self.sigmoid(output.squeeze(1))

    def calculate_loss(self, interaction):
        label = interaction[self.LABEL].float()
        output = self.forward(interaction)
        return self.loss(output, label)

    def predict(self, interaction):
        return self.forward(interaction)

# --------------------------------------------------------------------------------
# /HRec/models/cccf.py:
# --------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
# ###########################
# File Name: cccf.py
# Author: geekinglcq
# Mail: lcqgeek@live.com
# Created Time:
2021-01-01 16:10:50 7 | # ########################### 8 | 9 | import torch 10 | import logging 11 | import numpy as np 12 | import torch.nn as nn 13 | from .layers import MLPLayers 14 | from .base import HModel 15 | from torch.nn.init import normal_ 16 | from collections import Counter, defaultdict 17 | 18 | 19 | class CCCFNet(HModel): 20 | """CCCFNet 21 | CCCFNet: A Content-Boosted Collaborative Filtering Neural Network for Cross Domain Recommender Systems 22 | """ 23 | def __init__(self, config, dataset): 24 | super().__init__(config, dataset) 25 | self.logger = logging.getLogger() 26 | 27 | self.LABEL = dataset.config['LABEL_FIELD'] 28 | 29 | self.user_emb_size = config['user_emb_size'] 30 | self.item_emb_size = config['item_emb_size'] 31 | self.token_emb_size = config['token_emb_size'] 32 | 33 | self.user_cf_embedding = nn.Embedding(self.n_users, self.user_emb_size) 34 | self.item_cf_embedding = nn.Embedding(self.n_items, self.item_emb_size) 35 | 36 | self.P = len(dataset.config['item_feat_path']) 37 | self.item_size = dataset.item_nums 38 | 39 | self.device = config['device'] 40 | self.user_hidden_size_list = config['user_hidden_size_list'] 41 | self.item_hidden_size_list = config['item_hidden_size_list'] 42 | 43 | assert self.user_hidden_size_list[-1] == self.item_hidden_size_list[-1] 44 | 45 | self.item_nn_dict = nn.ModuleDict() 46 | 47 | for item_type, item_feats in dataset.item_feat_fields.items(): 48 | item_feat_type_count = Counter( 49 | [dataset.field2type[i] for i in item_feats]) 50 | input_dim = (item_feat_type_count['token'] + 1) * self.token_emb_size + \ 51 | item_feat_type_count['float'] 52 | self.item_nn_dict[item_type] = MLPLayers( 53 | [input_dim + self.user_emb_size] + self.item_hidden_size_list, 54 | activation='tanh').to(self.device) 55 | 56 | self.user_fc_layers = MLPLayers([self.user_emb_size] + 57 | self.user_hidden_size_list).to( 58 | self.device) 59 | 60 | self.bce_loss = nn.BCELoss() 61 | self.sigmoid = nn.Sigmoid() 62 | # Save the item 
embedding before dot product layer to speed up evaluation 63 | self.i_embedding = None 64 | 65 | # parameters initialization 66 | self.apply(self._init_weights) 67 | 68 | def _init_weights(self, module): 69 | # We just initialize the module with normal distribution as the paper said 70 | if isinstance(module, nn.Linear): 71 | normal_(module.weight.data, 0, 0.01) 72 | if module.bias is not None: 73 | module.bias.data.fill_(0.0) 74 | elif isinstance(module, nn.Embedding): 75 | normal_(module.weight.data, 0, 0.01) 76 | 77 | def agg_item_feature(self, item_type, item_data): 78 | 79 | token_embeddings = [] 80 | float_feats = [] 81 | for feat_name, feat_value in item_data.items(): 82 | if feat_name in self.token_embedding_table and feat_name != self.USER_ID: 83 | emb = self.token_embedding_table[feat_name](feat_value.long()) 84 | token_embeddings.append(emb) 85 | if feat_name in self.float_field_names: 86 | float_feat = feat_value.float() 87 | if float_feat.dim() == 1: 88 | float_feat = float_feat.unsqueeze(-1) 89 | float_feats.append(float_feat) 90 | all_emb = torch.cat(token_embeddings + float_feats, dim=-1) 91 | return all_emb 92 | 93 | def forward(self, item_type, data): 94 | 95 | user = data[self.USER_ID] 96 | item_id = data[self.ITEM_ID] 97 | user_emb = self.user_cf_embedding(user) 98 | item_cf_emb = self.item_cf_embedding(item_id) 99 | 100 | item_layer = self.item_nn_dict[item_type] 101 | item_content_emb = self.agg_item_feature(item_type, data) 102 | item_emb = torch.cat([item_cf_emb, item_content_emb], dim=-1) 103 | item_emb = item_layer(item_emb) 104 | 105 | user_emb = self.user_fc_layers(user_emb) 106 | 107 | vector = torch.mul(user_emb, item_emb).sum(dim=1) 108 | vector = self.sigmoid(vector) 109 | return vector 110 | 111 | def calculate_loss(self, data): 112 | losses = [] 113 | losses_dict = defaultdict(int) 114 | 115 | for item_type, item_data in data.items(): 116 | 117 | output = self.forward(item_type, item_data) 118 | 119 | label = 
item_data[self.LABEL].float() 120 | tmp_loss = self.bce_loss(output, label) 121 | losses.append(tmp_loss) 122 | losses_dict['total'] += tmp_loss.item() 123 | 124 | loss = torch.sum(torch.stack(losses)) 125 | return loss, losses_dict 126 | 127 | def predict(self, h, data): 128 | return self.forward(h, data) 129 | -------------------------------------------------------------------------------- /HRec/models/dcn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # ########################### 3 | # File Name: dcn.py 4 | # Author: geekinglcq 5 | # Mail: lcqgeek@live.com 6 | # Created Time: 2020-12-31 17:32:58 7 | # ########################### 8 | r""" 9 | ################################################ 10 | Reference: 11 | Ruoxi Wang at al. "Deep & Cross Network for Ad Click Predictions." in ADKDD 2017. 12 | """ 13 | 14 | import torch 15 | import torch.nn as nn 16 | from torch.nn.init import xavier_normal_, constant_ 17 | 18 | from .layers import MLPLayers, RegLoss 19 | from .base import ContextModel 20 | 21 | 22 | class DCN(ContextModel): 23 | """Deep & Cross Network replaces the wide part in Wide&Deep with cross network, 24 | automatically construct limited high-degree cross features, and learns the corresponding weights. 
25 | 26 | """ 27 | def __init__(self, config, dataset): 28 | super(DCN, self).__init__(config, dataset) 29 | 30 | # load parameters info 31 | self.mlp_hidden_size = config['mlp_hidden_size'] 32 | self.cross_layer_num = config['cross_layer_num'] 33 | self.reg_weight = config['reg_weight'] 34 | self.dropout_prob = config['dropout_prob'] 35 | 36 | # define layers and loss 37 | # init weight and bias of each cross layer 38 | self.cross_layer_parameter = [ 39 | nn.Parameter( 40 | torch.empty(self.embedding_size * len(self.token_field_names) + 41 | len(self.float_field_names), 42 | device=self.device)) 43 | for _ in range(self.cross_layer_num * 2) 44 | ] 45 | self.cross_layer_w = nn.ParameterList( 46 | self.cross_layer_parameter[:self.cross_layer_num]) 47 | self.cross_layer_b = nn.ParameterList( 48 | self.cross_layer_parameter[self.cross_layer_num:]) 49 | 50 | # size of mlp hidden layer 51 | size_list = [ 52 | self.embedding_size * len(self.token_field_names) + 53 | len(self.float_field_names) 54 | ] + self.mlp_hidden_size 55 | # size of cross network output 56 | in_feature_num = self.mlp_hidden_size[-1] + \ 57 | self.embedding_size * len(self.token_field_names) + \ 58 | len(self.float_field_names) 59 | 60 | self.mlp_layers = MLPLayers(size_list, 61 | dropout=self.dropout_prob, 62 | bn=True) 63 | self.predict_layer = nn.Linear(in_feature_num, 1) 64 | self.reg_loss = RegLoss() 65 | self.sigmoid = nn.Sigmoid() 66 | self.loss = nn.BCELoss() 67 | 68 | # parameters initialization 69 | self.apply(self._init_weights) 70 | 71 | def _init_weights(self, module): 72 | if isinstance(module, nn.Embedding): 73 | xavier_normal_(module.weight.data) 74 | elif isinstance(module, nn.Linear): 75 | xavier_normal_(module.weight.data) 76 | if module.bias is not None: 77 | constant_(module.bias.data, 0) 78 | 79 | def cross_network(self, x_0): 80 | r"""Cross network is composed of cross layers, with each layer having the following formula. 81 | 82 | .. 
math:: x_{l+1} = x_0 {x_l^T} w_l + b_l + x_l 83 | 84 | :math:`x_l`, :math:`x_{l+1}` are column vectors denoting the outputs from the l -th and 85 | (l + 1)-th cross layers, respectively. 86 | :math:`w_l`, :math:`b_l` are the weight and bias parameters of the l -th layer. 87 | 88 | Args: 89 | x_0(torch.Tensor): Embedding vectors of all features, input of cross network. 90 | 91 | Returns: 92 | torch.Tensor:output of cross network, [batch_size, num_feature_field * embedding_size] 93 | 94 | """ 95 | x_l = x_0 96 | for i in range(self.cross_layer_num): 97 | xl_w = torch.tensordot(x_l, self.cross_layer_w[i], dims=([1], [0])) 98 | xl_dot = (x_0.transpose(0, 1) * xl_w).transpose(0, 1) 99 | x_l = xl_dot + self.cross_layer_b[i] + x_l 100 | return x_l 101 | 102 | def forward(self, interaction): 103 | # sparse_embedding shape: [batch_size, num_token_seq_field+num_token_field, embed_dim] or None 104 | # dense_embedding shape: [batch_size, num_float_field] or [batch_size, num_float_field, embed_dim] or None 105 | sparse_embedding, dense_embedding = self.embed_input_fields( 106 | interaction) 107 | batch_size = sparse_embedding.shape[0] 108 | all_embeddings = [] 109 | if sparse_embedding is not None: 110 | all_embeddings.append(sparse_embedding.view(batch_size, -1)) 111 | if dense_embedding is not None and len(dense_embedding.shape) == 3: 112 | all_embeddings.append(dense_embedding.view(batch_size, -1)) 113 | 114 | dcn_all_embeddings = torch.cat( 115 | all_embeddings, dim=1) # [batch_size, num_field, embed_dim] 116 | dcn_all_embeddings = dcn_all_embeddings.view(batch_size, -1) 117 | 118 | # DNN 119 | deep_output = self.mlp_layers(dcn_all_embeddings) 120 | # Cross Network 121 | cross_output = self.cross_network(dcn_all_embeddings) 122 | stack = torch.cat([cross_output, deep_output], dim=-1) 123 | output = self.sigmoid(self.predict_layer(stack)) 124 | 125 | return output.squeeze(1) 126 | 127 | def calculate_loss(self, interaction): 128 | label = interaction[self.LABEL] 129 | output 
class DDTCDR(HModel):
    """DDTCDR: Deep Dual Transfer Cross Domain Recommendation.

    An instance is built for one ``item_type``: the width of the first
    fully connected layer is derived from that type's item feature fields.
    A sample is scored by concatenating the user embedding, the item
    collaborative-filtering embedding and the item's content features, then
    passing the vector through ``fc_layers`` and a sigmoid output unit.
    ``bridge`` is an orthogonally initialised linear map applied to the user
    embedding when ``forward`` is called with ``dual=True``.

    NOTE(review): ``n_users``, ``n_items``, ``USER_ID``, ``ITEM_ID``,
    ``token_embedding_table`` and ``float_field_names`` are not assigned in
    this class; presumably they are provided by ``HModel`` -- confirm.
    """

    def __init__(self, config, dataset, item_type):
        """Build the embeddings and the MLP for ``item_type``.

        Args:
            config: Mapping providing ``latent_dim``, ``layers`` (hidden
                layer sizes) and ``token_emb_size``.
            dataset: Dataset exposing ``config['LABEL_FIELD']``,
                ``item_feat_fields`` and ``field2type``.
            item_type: Key into ``dataset.item_feat_fields`` selecting the
                feature fields of the items this model scores.
        """
        super().__init__(config, dataset)
        self.logger = logging.getLogger()

        self.LABEL = dataset.config['LABEL_FIELD']

        self.user_emb_size = config['latent_dim']
        self.item_emb_size = config['latent_dim']
        self.token_emb_size = config['token_emb_size']

        self.user_cf_embedding = nn.Embedding(self.n_users, self.user_emb_size)
        self.item_cf_embedding = nn.Embedding(self.n_items, self.item_emb_size)

        self.latent_dim = config['latent_dim']

        item_feats = dataset.item_feat_fields[item_type]
        item_feat_type_count = Counter(
            dataset.field2type[field] for field in item_feats)
        # One embedding per token field plus one extra token embedding
        # (presumably the item id -- TODO confirm), one scalar per float
        # field, plus the user and item CF embeddings.
        input_dim = (item_feat_type_count['token'] + 1) * self.token_emb_size + \
            item_feat_type_count['float'] + self.user_emb_size + self.item_emb_size

        # Build a fresh list instead of inserting into config['layers']:
        # the config is shared, so mutating it in place made every further
        # model instance start with an additional, unintended layer.
        self.layers = [input_dim] + list(config['layers'])

        self.fc_layers = torch.nn.ModuleList()
        for in_size, out_size in zip(self.layers[:-1], self.layers[1:]):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))

        self.affine_output = torch.nn.Linear(in_features=self.layers[-1],
                                             out_features=1)
        self.logistic = torch.nn.Sigmoid()
        self.bridge = torch.nn.Linear(config['latent_dim'],
                                      config['latent_dim'])
        torch.nn.init.orthogonal_(self.bridge.weight)

    def agg_item_feature(self, item_type, item_data):
        """Concatenate the content features found in ``item_data``.

        Every entry whose name is in ``self.token_embedding_table`` (except
        the user id field) is embedded; every entry whose name is in
        ``self.float_field_names`` is used as a float column.

        Args:
            item_type: Unused; kept for interface compatibility.
            item_data: Mapping from field name to tensor.

        Returns:
            torch.Tensor: Token embeddings followed by float features,
            concatenated along the last dimension.
        """
        token_embeddings = []
        float_feats = []
        for feat_name, feat_value in item_data.items():
            if feat_name in self.token_embedding_table and feat_name != self.USER_ID:
                emb = self.token_embedding_table[feat_name](feat_value.long())
                token_embeddings.append(emb)
            if feat_name in self.float_field_names:
                float_feat = feat_value.float()
                # Promote a flat column to [batch, 1] so it can be concatenated.
                if float_feat.dim() == 1:
                    float_feat = float_feat.unsqueeze(-1)
                float_feats.append(float_feat)
        return torch.cat(token_embeddings + float_feats, dim=-1)

    def forward(self, item_type, data, dual=False):
        """Predict a score for every user/item pair in ``data``.

        Args:
            item_type: Forwarded to ``agg_item_feature``.
            data: Mapping holding the user id, item id and item features.
            dual: When true, the user embedding is first mapped through
                ``self.bridge``.

        Returns:
            torch.Tensor: Sigmoid output of ``affine_output``.
        """
        user = data[self.USER_ID]
        item_id = data[self.ITEM_ID]

        user_emb = self.user_cf_embedding(user)
        if dual:
            user_emb = self.bridge(user_emb)
        item_cf_emb = self.item_cf_embedding(item_id)

        item_content_emb = self.agg_item_feature(item_type, data)
        item_emb = torch.cat([item_cf_emb, item_content_emb], dim=-1)
        vector = torch.cat([user_emb, item_emb], dim=-1).float()

        for fc in self.fc_layers:
            vector = fc(vector)
            # Functional dropout tied to self.training.  Constructing a new
            # nn.Dropout on each call left it permanently in training mode,
            # so units were also dropped while evaluating / predicting.
            vector = nn.functional.dropout(vector, p=0.1,
                                           training=self.training)
            vector = torch.relu(vector)

        rating = self.affine_output(vector)
        return self.logistic(rating)

    def calculate_loss(self):
        """Placeholder: no loss is computed here and ``None`` is returned."""
        pass

    def predict(self, h, data):
        """Return ``forward(h, data)`` (non-dual scoring)."""
        return self.forward(h, data)
class DeepFM(ContextModel):
    """DeepFM is a DNN enhanced FM which uses both a DNN and an FM to model feature interaction.

    DeepFM can also be seen as a combination of FNN and FM.

    Reference:
        Huifeng Guo et al. "DeepFM: A Factorization-Machine based Neural
        Network for CTR Prediction." in IJCAI 2017.
    """

    def __init__(self, config, dataset):
        """Create the FM component, the deep component and the loss.

        Args:
            config: Mapping providing ``mlp_hidden_size`` (list of hidden
                layer widths) and ``dropout_prob``.
            dataset: Passed to the ``ContextModel`` constructor.
        """
        super(DeepFM, self).__init__(config, dataset)

        # load parameters info
        self.mlp_hidden_size = config['mlp_hidden_size']
        self.dropout_prob = config['dropout_prob']

        # define layers and loss
        self.fm = BaseFactorizationMachine(reduce_sum=True)
        size_list = [
            self.embedding_size * len(self.token_field_names) +
            len(self.float_field_names)
        ] + self.mlp_hidden_size
        self.mlp_layers = MLPLayers(size_list, self.dropout_prob)
        # Linear projection of the last hidden layer to the final deep score.
        self.deep_predict_layer = nn.Linear(self.mlp_hidden_size[-1], 1)
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()

        # parameters initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-normal init for embeddings and linear weights; zero linear biases."""
        if isinstance(module, nn.Embedding):
            xavier_normal_(module.weight.data)
        elif isinstance(module, nn.Linear):
            xavier_normal_(module.weight.data)
            if module.bias is not None:
                constant_(module.bias.data, 0)

    def forward(self, interaction):
        """Return the predicted probability for every sample of the batch.

        Args:
            interaction: Batch passed to ``self.embed_input_fields`` and
                ``self.first_order_linear``.

        Returns:
            torch.Tensor: Sigmoid of the summed FM and deep scores with the
            trailing singleton dimension removed.

        Raises:
            ValueError: If no usable embedding is available (no sparse
                embedding and no 3-D dense embedding).
        """
        # sparse_embedding shape: [batch_size, num_token_seq_field + num_token_field, embed_dim] or None
        # dense_embedding shape: [batch_size, num_float_field] or
        #                        [batch_size, num_float_field, embed_dim] or None
        sparse_embedding, dense_embedding = self.embed_input_fields(interaction)

        # Flatten per sample.  The batch size comes from each tensor itself,
        # so a missing sparse embedding no longer fails before the None check.
        all_embeddings = []
        if sparse_embedding is not None:
            all_embeddings.append(sparse_embedding.flatten(start_dim=1))
        if dense_embedding is not None and dense_embedding.dim() == 3:
            all_embeddings.append(dense_embedding.flatten(start_dim=1))
        if not all_embeddings:
            raise ValueError(
                'forward() needs a sparse embedding or a 3-D dense embedding')

        # [batch_size, total_flattened_features]
        deepfm_all_embeddings = torch.cat(all_embeddings, dim=1)

        y_fm = self.first_order_linear(interaction)
        # The second-order FM term is defined on the sparse field embeddings.
        if sparse_embedding is not None:
            y_fm = y_fm + self.fm(sparse_embedding)

        y_deep = self.deep_predict_layer(self.mlp_layers(deepfm_all_embeddings))
        y = self.sigmoid(y_fm + y_deep)
        # squeeze(-1) instead of squeeze(): a batch of one must stay 1-D so
        # that it still matches the label shape expected by BCELoss.
        return y.squeeze(-1)

    def calculate_loss(self, interaction):
        """Binary cross-entropy between ``forward`` and the float label."""
        label = interaction[self.LABEL]
        output = self.forward(interaction)
        return self.loss(output, label.float())

    def predict(self, interaction):
        """Return ``forward(interaction)``."""
        return self.forward(interaction)
def __init__(self, config, dataset):
    """Build the Deep Matrix Factorization model.

    Args:
        config: Mapping providing ``user_emb_size``, ``item_emb_size``,
            ``device``, ``user_hidden_size_list`` and
            ``item_hidden_size_list``.
        dataset: Dataset exposing ``config['LABEL_FIELD']``,
            ``config['inter_matrix_type']`` (``'01'`` or ``'rating'``),
            ``history_user_matrix``, ``history_item_matrix`` and
            ``inter_matrix``.

    Raises:
        ValueError: If the two towers end in different widths, or if
            ``inter_matrix_type`` is not ``'01'`` or ``'rating'``.
    """
    super().__init__(config, dataset)
    self.logger = logging.getLogger()

    self.LABEL = dataset.config['LABEL_FIELD']
    # NOTE(review): self.RATING is read in 'rating' mode but is not assigned
    # in this constructor; presumably the base class provides it -- confirm.

    self.user_emb_size = config['user_emb_size']
    self.item_emb_size = config['item_emb_size']

    self.device = config['device']
    self.user_hidden_size_list = config['user_hidden_size_list']
    self.item_hidden_size_list = config['item_hidden_size_list']

    # Both towers must end in the same width: their outputs are multiplied
    # element-wise.  Raised explicitly because `assert` is stripped under -O.
    if self.user_hidden_size_list[-1] != self.item_hidden_size_list[-1]:
        raise ValueError(
            'user_hidden_size_list and item_hidden_size_list must end '
            'with the same size')

    self.inter_matrix_type = dataset.config['inter_matrix_type']

    # generate intermediate data
    if self.inter_matrix_type == '01':
        value_kwargs = {}
    elif self.inter_matrix_type == 'rating':
        value_kwargs = {'value_field': self.RATING}
    else:
        # Previously this fell through and failed later with an unrelated
        # AttributeError on self.history_user_value.
        raise ValueError(
            "inter_matrix_type must be '01' or 'rating', got {!r}".format(
                self.inter_matrix_type))

    self.history_user_id, self.history_user_value, _ = \
        dataset.history_user_matrix(**value_kwargs)
    self.history_item_id, self.history_item_value, _ = \
        dataset.history_item_matrix(**value_kwargs)
    self.interaction_matrix = dataset.inter_matrix(
        form='csr', **value_kwargs).astype(np.float32)

    self.max_rating = self.history_user_value.max()

    # tensor of shape [n_items, H] where H is max length of history interaction.
    self.history_user_id = self.history_user_id.to(self.device)
    self.history_user_value = self.history_user_value.to(self.device)
    self.history_item_id = self.history_item_id.to(self.device)
    self.history_item_value = self.history_item_value.to(self.device)

    # define layers
    self.user_linear = nn.Linear(in_features=self.n_items,
                                 out_features=self.user_emb_size,
                                 bias=False)
    self.item_linear = nn.Linear(in_features=self.n_users,
                                 out_features=self.item_emb_size,
                                 bias=False)
    self.user_fc_layers = MLPLayers([self.user_emb_size] +
                                    self.user_hidden_size_list)
    self.item_fc_layers = MLPLayers([self.item_emb_size] +
                                    self.item_hidden_size_list)
    self.sigmoid = nn.Sigmoid()
    self.bce_loss = nn.BCELoss()

    # Save the item embedding before dot product layer to speed up evaluation
    self.i_embedding = None

    # parameters initialization
    self.apply(self._init_weights)
def calculate_loss(self, interaction):
    """Binary cross-entropy between the prediction and the normalised label.

    Args:
        interaction: Batch indexed by ``self.USER_ID``, ``self.ITEM_ID``,
            ``self.LABEL`` and, in ``'rating'`` mode, ``self.RATING``.

    Returns:
        torch.Tensor: The scalar BCE loss.

    Raises:
        ValueError: If ``self.inter_matrix_type`` is neither ``'01'`` nor
            ``'rating'``.
    """
    # when starting a new epoch, the item embedding we saved must be cleared.
    if self.training:
        self.i_embedding = None

    user = interaction[self.USER_ID]
    item = interaction[self.ITEM_ID]

    if self.inter_matrix_type == '01':
        label = interaction[self.LABEL]
    elif self.inter_matrix_type == 'rating':
        label = interaction[self.RATING] * interaction[self.LABEL]
    else:
        # Previously `label` stayed unbound and an UnboundLocalError surfaced.
        raise ValueError(
            "inter_matrix_type must be '01' or 'rating', got {!r}".format(
                self.inter_matrix_type))

    output = self.forward(user, item)

    # Cast in both modes: BCELoss needs a float32 target, and a rating column
    # stored as float64 would otherwise fail with a dtype mismatch.
    # Normalize the label to [0, 1] to calculate BCE loss.
    label = label.float() / self.max_rating
    return self.bce_loss(output, label)
def get_item_embedding(self):
    r"""Get all items' embeddings from the history interaction matrix.

    Considering the RAM of the device, a sparse matrix product is used
    instead of materialising the dense interaction matrix.

    Returns:
        torch.FloatTensor: ``self.item_fc_layers`` applied to the product of
        the transposed interaction matrix and ``self.item_linear.weight.t()``;
        one row per column of ``self.interaction_matrix``.
    """
    interaction_matrix = self.interaction_matrix.tocoo()
    n_rows, n_cols = interaction_matrix.shape

    # Build the transposed matrix directly by swapping the coordinate rows,
    # which is equivalent to constructing it and calling transpose(0, 1).
    # torch.as_tensor converts each NumPy array in one step, unlike
    # torch.LongTensor([row, col]), which walks a Python list of arrays.
    indices = torch.stack([
        torch.as_tensor(interaction_matrix.col, dtype=torch.long),
        torch.as_tensor(interaction_matrix.row, dtype=torch.long),
    ])
    values = torch.as_tensor(interaction_matrix.data, dtype=torch.float32)

    # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor.
    item_matrix = torch.sparse_coo_tensor(
        indices, values, (n_cols, n_rows)).to(self.device)

    item = torch.sparse.mm(item_matrix, self.item_linear.weight.t())
    return self.item_fc_layers(item)
class DSSM(ContextModel):
    """DSSM expresses user and item as low dimensional vectors with MLP layers
    and uses cosine similarity between the two semantic vectors as the score.

    Reference:
        PS Huang et al. "Learning Deep Structured Semantic Models for Web
        Search using Clickthrough Data" in CIKM 2013.
    """

    def __init__(self, config, dataset):
        """Create the user tower, the item tower and the loss.

        Args:
            config: Mapping providing ``mlp_hidden_size`` (list of hidden
                layer widths) and ``dropout_prob``.
            dataset: Passed to the ``ContextModel`` constructor.
        """
        super(DSSM, self).__init__(config, dataset)

        # load parameters info
        self.mlp_hidden_size = config['mlp_hidden_size']
        self.dropout_prob = config['dropout_prob']

        self.user_feature_num = (self.user_token_field_num +
                                 self.user_float_field_num +
                                 self.user_token_seq_field_num)
        self.item_feature_num = (self.item_token_field_num +
                                 self.item_float_field_num +
                                 self.item_token_seq_field_num)
        user_size_list = [self.embedding_size * self.user_feature_num
                          ] + self.mlp_hidden_size
        item_size_list = [self.embedding_size * self.item_feature_num
                          ] + self.mlp_hidden_size

        # define layers and loss
        self.user_mlp_layers = MLPLayers(user_size_list,
                                         self.dropout_prob,
                                         activation='tanh',
                                         bn=True)
        self.item_mlp_layers = MLPLayers(item_size_list,
                                         self.dropout_prob,
                                         activation='tanh',
                                         bn=True)

        self.loss = nn.BCELoss()
        # The attribute name (sic) is kept so external references keep working.
        self.sigmod = nn.Sigmoid()

        # parameters initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-normal init for embeddings and linear weights; zero linear biases."""
        if isinstance(module, nn.Embedding):
            xavier_normal_(module.weight.data)
        elif isinstance(module, nn.Linear):
            xavier_normal_(module.weight.data)
            if module.bias is not None:
                constant_(module.bias.data, 0)

    @staticmethod
    def _stack_fields(sparse_embedding, dense_embedding):
        """Concatenate one tower's available field embeddings along dim 1.

        A dense embedding is only used when it is 3-D (already embedded).
        """
        fields = []
        if sparse_embedding is not None:
            fields.append(sparse_embedding)
        if dense_embedding is not None and dense_embedding.dim() == 3:
            fields.append(dense_embedding)
        return torch.cat(fields, dim=1)

    def forward(self, interaction):
        """Return the sigmoid of the user/item cosine similarity.

        Args:
            interaction: Batch passed to
                ``self.double_tower_embed_input_fields``, whose result is
                read as (user_sparse, user_dense, item_sparse, item_dense).

        Returns:
            torch.Tensor: One score per sample.
        """
        embed_result = self.double_tower_embed_input_fields(interaction)
        user_sparse_embedding, user_dense_embedding = embed_result[:2]
        item_sparse_embedding, item_dense_embedding = embed_result[2:]

        embed_user = self._stack_fields(user_sparse_embedding,
                                        user_dense_embedding)
        embed_item = self._stack_fields(item_sparse_embedding,
                                        item_dense_embedding)

        batch_size = embed_item.shape[0]
        user_dnn_out = self.user_mlp_layers(embed_user.view(batch_size, -1))
        item_dnn_out = self.item_mlp_layers(embed_item.view(batch_size, -1))
        score = torch.cosine_similarity(user_dnn_out, item_dnn_out, dim=1)

        # cosine_similarity already reduced dim 1, so no squeeze is needed.
        # The former bare squeeze() turned a batch of one into a 0-d tensor
        # that no longer matched the label shape in BCELoss.
        return self.sigmod(score)

    def calculate_loss(self, interaction):
        """Binary cross-entropy between ``forward`` and the float label."""
        label = interaction[self.LABEL]
        output = self.forward(interaction)
        return self.loss(output, label.float())

    def predict(self, interaction):
        """Return ``forward(interaction)``."""
        return self.forward(interaction)
33 | 34 | # The number of item types 35 | self.P = len(dataset.config['item_feat_path']) 36 | self.item_size = dataset.item_nums 37 | 38 | self.device = config['device'] 39 | self.user_hidden_size_list = config['user_hidden_size_list'] 40 | self.item_hidden_size_list = config['item_hidden_size_list'] 41 | self.item_map_hidden_size_list = config['item_map_hidden_size_list'] 42 | self.kernel = set_kernel_layer(config.get('kernel', 'gaussian')) 43 | 44 | assert self.user_hidden_size_list[-1] == self.item_hidden_size_list[-1] 45 | self.inter_matrix_type = dataset.config['inter_matrix_type'] 46 | 47 | # generate intermediate data 48 | if self.inter_matrix_type == '01': 49 | self.history_user_id, self.history_user_value, _ = dataset.history_user_matrix( 50 | ) 51 | self.history_item_id, self.history_item_value, _ = dataset.history_item_matrix( 52 | ) 53 | self.interaction_matrix = dataset.inter_matrix(form='csr').astype( 54 | np.float32) 55 | elif self.inter_matrix_type == 'rating': 56 | self.history_user_id, self.history_user_value, _ = dataset.history_user_matrix( 57 | value_field=self.RATING) 58 | self.history_item_id, self.history_item_value, _ = dataset.history_item_matrix( 59 | value_field=self.RATING) 60 | self.interaction_matrix = dataset.inter_matrix( 61 | form='csr', value_field=self.RATING).astype(np.float32) 62 | 63 | self.max_rating = self.history_user_value.max() 64 | # tensor of shape [n_items, H] where H is max length of history interaction. 
65 | 66 | # Keep the user matrix in cpu to save gpu mem 67 | # self.history_user_id = self.history_user_id.to(self.device) 68 | # self.history_user_value = self.history_user_value.to(self.device) 69 | 70 | self.history_item_id = self.history_item_id.to(self.device) 71 | self.history_item_value = self.history_item_value.to(self.device) 72 | 73 | # define layers 74 | self.user_linear = nn.Linear(in_features=self.n_items, 75 | out_features=self.user_emb_size, 76 | bias=False) 77 | 78 | self.map_func_dict = nn.ModuleDict() 79 | 80 | self.pdist = nn.PairwiseDistance(p=2) 81 | 82 | for item_type, item_feats in dataset.item_feat_fields.items(): 83 | 84 | item_feat_type_count = Counter( 85 | [dataset.field2type[i] for i in item_feats]) 86 | input_dim = (item_feat_type_count['token'] + 1) * self.token_emb_size + \ 87 | item_feat_type_count['float'] 88 | self.map_func_dict[item_type] = MLPLayers( 89 | [input_dim] + self.item_map_hidden_size_list).to(self.device) 90 | 91 | self.item_linear = nn.Linear(in_features=self.n_users, 92 | out_features=self.item_emb_size, 93 | bias=False) 94 | self.user_fc_layers = MLPLayers([self.user_emb_size] + 95 | self.user_hidden_size_list).to( 96 | self.device) 97 | self.item_fc_layers = MLPLayers( 98 | [self.item_map_hidden_size_list[-1] + self.item_emb_size] + 99 | self.item_hidden_size_list).to(self.device) 100 | self.sigmoid = nn.Sigmoid() 101 | self.bce_loss = nn.BCELoss() 102 | 103 | # Save the item embedding before dot product layer to speed up evaluation 104 | self.i_embedding = None 105 | 106 | # parameters initialization 107 | self.apply(self._init_weights) 108 | 109 | def _init_weights(self, module): 110 | # We just initialize the module with normal distribution as the paper said 111 | if isinstance(module, nn.Linear): 112 | normal_(module.weight.data, 0, 0.01) 113 | if module.bias is not None: 114 | module.bias.data.fill_(0.0) 115 | elif isinstance(module, nn.Embedding): 116 | normal_(module.weight.data, 0, 0.01) 117 | 118 | def 
def forward(self, item_type, data, return_item_emb=False):
    """Score every user/item pair of one item type.

    Args:
        item_type: Item type of the batch, forwarded to
            ``self.get_item_embedding``.
        data: Mapping holding the user ids under ``self.USER_ID`` plus
            whatever ``self.get_item_embedding`` reads.
        return_item_emb: When true, also return the raw and the mapped item
            feature embeddings.

    Returns:
        torch.Tensor, or a tuple ``(scores, item_emb, item_transformed_emb)``
        when ``return_item_emb`` is true.  ``scores`` is the sigmoid of the
        dot product of the user and item tower outputs.
    """
    # Interaction-related user features
    user = self.get_user_embedding(data[self.USER_ID])

    # Interaction- and context-related item features.  This block used to
    # be a line-for-line copy of get_item_embedding(); delegating keeps the
    # two code paths from drifting apart.
    item_emb, item_transformed_emb, item_feat = self.get_item_embedding(
        item_type, data)

    user = self.user_fc_layers(user)
    item = self.item_fc_layers(item_feat)

    vector = self.sigmoid(torch.mul(user, item).sum(dim=1))

    if return_item_emb:
        return vector, item_emb, item_transformed_emb
    return vector
def calculate_loss(self, data):
    """Total training loss over a heterogeneous batch.

    For every item type the classification (BCE) loss and the weighted
    topology loss are accumulated; the weighted alignment loss across item
    types is added once, when ``calculate_align_loss`` provides one.

    Args:
        data: Mapping ``item_type -> batch``; each batch is indexed by
            ``self.LABEL`` (and ``self.RATING`` in ``'rating'`` mode) and is
            passed to ``self.forward``.

    Returns:
        tuple: ``(loss, losses_dict)`` where ``loss`` is the summed tensor
        and ``losses_dict`` holds the float totals under ``'topo'``,
        ``'cls'`` and, when present, ``'align'``.

    Raises:
        ValueError: If ``self.inter_matrix_type`` is neither ``'01'`` nor
            ``'rating'``.
    """
    topo_weight = 0.001
    align_weight = 5e8

    # when starting a new epoch, the item embedding we saved must be cleared.
    if self.training:
        self.i_embedding = None

    losses = []
    losses_dict = defaultdict(int)
    item_emb_dict = {}

    for item_type, item_data in data.items():
        if self.inter_matrix_type == '01':
            label = item_data[self.LABEL].float()
        elif self.inter_matrix_type == 'rating':
            label = item_data[self.RATING] * item_data[self.LABEL]
        else:
            raise ValueError(
                "inter_matrix_type must be '01' or 'rating', got {!r}".format(
                    self.inter_matrix_type))

        output, item_raw_emb, item_emb = self.forward(item_type,
                                                      item_data,
                                                      return_item_emb=True)
        item_emb_dict[item_type] = item_emb

        topo_loss = topo_weight * self.calculate_topo_loss(
            item_raw_emb, item_emb)
        losses_dict['topo'] += topo_loss.item()
        losses.append(topo_loss)

        # normalize the label to calculate BCE loss.
        label = label / self.max_rating
        cls_loss = self.bce_loss(output, label)
        losses_dict['cls'] += cls_loss.item()
        losses.append(cls_loss)

    # calculate_align_loss() returns None when a batch is too small to
    # sample from.  Check before scaling: `5e8 * None` used to raise a
    # TypeError ahead of the None test.
    align_loss = self.calculate_align_loss(item_emb_dict)
    if align_loss is not None:
        align_loss = align_weight * align_loss
        losses.append(align_loss)
        losses_dict['align'] += align_loss.item()

    loss = torch.sum(torch.stack(losses))
    return loss, losses_dict
304 | 305 | Args: 306 | user (torch.LongTensor): The input tensor that contains user's id, shape: [batch_size, ] 307 | 308 | Returns: 309 | torch.FloatTensor: The embedding tensor of a batch of user, shape: [batch_size, emb_size] 310 | """ 311 | # Following lines construct tensor of shape [B,n_items] using the tensor of shape [B,H] 312 | col_indices = self.history_item_id[user].flatten() 313 | row_indices = torch.arange(user.shape[0], 314 | device=self.device).repeat_interleave( 315 | self.history_item_id.shape[1], dim=0) 316 | matrix_01 = torch.zeros(1, device=self.device).repeat( 317 | user.shape[0], self.n_items) 318 | matrix_01.index_put_((row_indices, col_indices), 319 | self.history_item_value[user].flatten()) 320 | user = self.user_linear(matrix_01) 321 | 322 | return user 323 | 324 | 325 | if __name__ == '__main__': 326 | 327 | model = DURation() 328 | -------------------------------------------------------------------------------- /HRec/models/fism.py: -------------------------------------------------------------------------------- 1 | # -*- codi:utf-8 -*- 2 | # ########################### 3 | # File Name: fism.py 4 | # Author: geekinglcq 5 | # Mail: lcqgeek@live.com 6 | # Created Time: 2020-12-25 18:34:53 7 | # ########################### 8 | 9 | import torch 10 | import logging 11 | import torch.nn as nn 12 | from .base import GeneralModel 13 | from torch.nn.init import normal_ 14 | """ 15 | FISM 16 | ####################################### 17 | Reference: 18 | S. Kabbur et al. "FISM: Factored item similarity models for top-n recommender systems" in KDD 2013 19 | """ 20 | 21 | 22 | class FISM(GeneralModel): 23 | """FISM is an item-based model for generating top-N recommendations that learns the 24 | item-item similarity matrix as the product of two low dimensional latent factor matrices. 25 | These matrices are learned using a structural equation modeling approach, where in the 26 | value being estimated is not used for its own estimation. 
def __init__(self, config, dataset):
    """Build the FISM model.

    Args:
        config: Mapping providing ``embedding_size``, ``reg_weights``,
            ``alpha`` and ``split_to``.
        dataset: Training dataset exposing ``config['LABEL_FIELD']`` and
            ``history_item_matrix`` (used by ``get_history_info``).
    """
    super(FISM, self).__init__(config, dataset)

    # load dataset info
    self.LABEL = dataset.config['LABEL_FIELD']
    self.logger = logging.getLogger()

    # get all users' history interaction information. the history item
    # matrix is padded to the maximum number of a user's interactions
    self.history_item_matrix, self.history_lens, self.mask_mat = \
        self.get_history_info(dataset)

    # load parameters info
    self.embedding_size = config['embedding_size']
    self.reg_weights = config['reg_weights']
    self.alpha = config['alpha']
    self.split_to = config['split_to']

    # split the too large dataset into the specified pieces
    if self.split_to > 0:
        self.group = torch.chunk(
            torch.arange(self.n_items).to(self.device), self.split_to)
    else:
        # Always define the attribute so later reads cannot fail with an
        # AttributeError when splitting is disabled.
        self.group = None
        # Message fixed: "OMM" -> "OOM" (out of memory), "Attetion" ->
        # "Attention"; this branch also covers negative values.
        self.logger.warning(
            'Pay Attention!! `split_to` is not positive, so items are not split. '
            'If you hit an OOM error in this case, you need to increase it '
            '\n\t\t\tuntil the error disappears. For example, '
            'you can append it in the command line such as `--split_to=5`')

    # define layers and loss
    # construct source and destination item embedding matrix
    self.item_src_embedding = nn.Embedding(self.n_items,
                                           self.embedding_size,
                                           padding_idx=0)
    self.item_dst_embedding = nn.Embedding(self.n_items,
                                           self.embedding_size,
                                           padding_idx=0)
    self.user_bias = nn.Parameter(torch.zeros(self.n_users))
    self.item_bias = nn.Parameter(torch.zeros(self.n_items))
    self.bceloss = nn.BCELoss()

    # parameters initialization
    self.apply(self._init_weights)
normal_(module.weight.data, 0, 0.01) 115 | 116 | def inter_forward(self, user, item): 117 | """forward the model by interaction 118 | 119 | """ 120 | user_inter = self.history_item_matrix[user] 121 | item_num = self.history_lens[user].unsqueeze(1) 122 | batch_mask_mat = self.mask_mat[user] 123 | user_history = self.item_src_embedding( 124 | user_inter) # batch_size x max_len x embedding_size 125 | target = self.item_dst_embedding(item) # batch_size x embedding_size 126 | user_bias = self.user_bias[user] # batch_size x 1 127 | item_bias = self.item_bias[item] 128 | similarity = torch.bmm(user_history, target.unsqueeze(2)).squeeze( 129 | 2) # batch_size x max_len 130 | similarity = batch_mask_mat * similarity 131 | coeff = torch.pow(item_num.squeeze(1), -self.alpha) 132 | scores = torch.sigmoid(coeff.float() * torch.sum(similarity, dim=1) + 133 | user_bias + item_bias) 134 | return scores 135 | 136 | def user_forward(self, 137 | user_input, 138 | item_num, 139 | user_bias, 140 | repeats=None, 141 | pred_slc=None): 142 | """forward the model by user 143 | 144 | Args: 145 | user_input (torch.Tensor): user input tensor 146 | item_num (torch.Tensor): user hitory interaction lens 147 | repeats (int, optional): the number of items to be evaluated 148 | pred_slc (torch.Tensor, optional): continuous index which controls the current evaluation items, 149 | if pred_slc is None, it will evaluate all items 150 | 151 | Returns: 152 | torch.Tensor: result 153 | 154 | """ 155 | item_num = item_num.repeat(repeats, 1) 156 | user_history = self.item_src_embedding( 157 | user_input) # inter_num x embedding_size 158 | user_history = user_history.repeat( 159 | repeats, 1, 1) # target_items x inter_num x embedding_size 160 | if pred_slc is None: 161 | targets = self.item_dst_embedding.weight # target_items x embedding_size 162 | item_bias = self.item_bias 163 | else: 164 | targets = self.item_dst_embedding(pred_slc) 165 | item_bias = self.item_bias[pred_slc] 166 | similarity = 
torch.bmm(user_history, targets.unsqueeze(2)).squeeze( 167 | 2) # inter_num x target_items 168 | coeff = torch.pow(item_num.squeeze(1), -self.alpha) 169 | scores = torch.sigmoid(coeff.float() * torch.sum(similarity, dim=1) + 170 | user_bias + item_bias) 171 | return scores 172 | 173 | def forward(self, user, item): 174 | user = user.long() 175 | item = item.long() 176 | return self.inter_forward(user, item) 177 | 178 | def calculate_loss(self, interaction): 179 | user = interaction[self.USER_ID].long() 180 | item = interaction[self.ITEM_ID].long() 181 | label = interaction[self.LABEL].float() 182 | output = self.forward(user, item) 183 | loss = self.bceloss(output, label) + self.reg_loss() 184 | return loss 185 | 186 | def full_sort_predict(self, interaction): 187 | user = interaction[self.USER_ID] 188 | batch_user_bias = self.user_bias[user] 189 | user_inters = self.history_item_matrix[user] 190 | item_nums = self.history_lens[user] 191 | scores = [] 192 | 193 | # test users one by one, if the number of items is too large, we will split it to some pieces 194 | for user_input, item_num, user_bias in zip(user_inters, 195 | item_nums.unsqueeze(1), 196 | batch_user_bias): 197 | if self.split_to <= 0: 198 | output = self.user_forward(user_input[:item_num], 199 | item_num, 200 | user_bias, 201 | repeats=self.n_items) 202 | else: 203 | output = [] 204 | for mask in self.group: 205 | tmp_output = self.user_forward(user_input[:item_num], 206 | item_num, 207 | user_bias, 208 | repeats=len(mask), 209 | pred_slc=mask) 210 | output.append(tmp_output) 211 | output = torch.cat(output, dim=0) 212 | scores.append(output) 213 | result = torch.cat(scores, dim=0) 214 | return result 215 | 216 | def predict(self, interaction): 217 | user = interaction[self.USER_ID] 218 | item = interaction[self.ITEM_ID] 219 | output = self.forward(user, item) 220 | return output 221 | -------------------------------------------------------------------------------- /HRec/models/layers.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import torch 4 | import numpy as np 5 | import torch.nn as nn 6 | import torch.nn.functional as fn 7 | 8 | from torch.nn.init import normal_ 9 | 10 | 11 | class MLPLayers(nn.Module): 12 | r""" MLPLayers 13 | Args: 14 | - layers(list): a list contains the size of each layer in mlp layers 15 | - dropout(float): probability of an element to be zeroed. Default: 0 16 | - activation(str): activation function after each layer in mlp layers. Default: 'relu' 17 | candidates: 'sigmoid', 'tanh', 'relu', 'leekyrelu', 'none' 18 | Shape: 19 | 20 | - Input: (:math:`N`, \*, :math:`H_{in}`) where \* means any number of additional dimensions 21 | :math:`H_{in}` must equal to the first value in `layers` 22 | - Output: (:math:`N`, \*, :math:`H_{out}`) where :math:`H_{out}` equals to the last value in `layers` 23 | 24 | Examples:: 25 | 26 | >>> m = MLPLayers([64, 32, 16], 0.2, 'relu') 27 | >>> input = torch.randn(128, 64) 28 | >>> output = m(input) 29 | >>> print(output.size()) 30 | >>> torch.Size([128, 16]) 31 | """ 32 | def __init__(self, 33 | layers, 34 | dropout=0, 35 | activation='relu', 36 | bn=False, 37 | init_method=None): 38 | super(MLPLayers, self).__init__() 39 | self.layers = layers 40 | self.dropout = dropout 41 | self.activation = activation 42 | self.use_bn = bn 43 | self.init_method = init_method 44 | 45 | mlp_modules = [] 46 | for idx, (input_size, output_size) in enumerate( 47 | zip(self.layers[:-1], self.layers[1:])): 48 | mlp_modules.append(nn.Dropout(p=self.dropout)) 49 | mlp_modules.append(nn.Linear(input_size, output_size)) 50 | if self.use_bn: 51 | mlp_modules.append(nn.BatchNorm1d(num_features=output_size)) 52 | activation_func = activation_layer(self.activation, output_size) 53 | if activation_func is not None: 54 | mlp_modules.append(activation_func) 55 | 56 | self.mlp_layers = nn.Sequential(*mlp_modules) 57 | if self.init_method is not None: 58 
def activation_layer(activation_name='relu', emb_dim=None):
    """Construct an activation layer from its name or class.

    Args:
        activation_name: ``None``, a case-insensitive name among
            'sigmoid', 'tanh', 'relu', 'leakyrelu', 'none', or a subclass
            of ``nn.Module`` which is instantiated without arguments.
        emb_dim: int, currently unused by every supported activation;
            kept for interface compatibility.

    Return:
        activation: the activation module, or ``None`` when no activation
        is requested (``None`` / 'none').

    Raises:
        NotImplementedError: for an unknown name or an unsupported argument
            type. Unknown names used to surface as ``UnboundLocalError`` and
            non-class objects as ``TypeError`` from ``issubclass``.
    """
    if activation_name is None:
        return None

    if isinstance(activation_name, str):
        builders = {
            'sigmoid': nn.Sigmoid,
            'tanh': nn.Tanh,
            'relu': nn.ReLU,
            'leakyrelu': nn.LeakyReLU,
        }
        key = activation_name.lower()
        if key == 'none':
            return None
        if key in builders:
            return builders[key]()
    elif isinstance(activation_name, type) and issubclass(
            activation_name, nn.Module):
        # guard with isinstance(..., type): issubclass raises TypeError
        # when its first argument is not a class
        return activation_name()

    raise NotImplementedError(
        "activation function {} is not implemented".format(activation_name))
class FMFirstOrderLinear(nn.Module):
    """Calculate the first order (linear) score of the input features.

    Fields reported by ``dataset.fields()`` are split by
    ``dataset.field2type`` into "token", "token_seq" and everything else
    (treated as float). Each group gets its own embedding table of width
    ``output_dim``; the per-group results are summed and a learnable bias
    is added. The label field is skipped.

    Args:
        config: mapping providing ``config['device']``.
        dataset: object providing ``fields()``, ``config['LABEL_FIELD']``,
            ``field2type`` and ``num(field_name)``.
        output_dim (int): width of the first order score. Default: 1
        embed (bool): when False, 2-D float features are passed through
            unembedded (see ``embed_float_fields``) and ``forward`` combines
            the groups along the last dimension instead. Default: True
    """
    def __init__(self, config, dataset, output_dim=1, embed=True):

        super(FMFirstOrderLinear, self).__init__()
        self.field_names = dataset.fields()
        self.LABEL = dataset.config['LABEL_FIELD']
        self.device = config['device']
        self.embed = embed
        self.token_field_names = []
        self.token_field_dims = []
        self.float_field_names = []
        self.float_field_dims = []
        self.token_seq_field_names = []
        self.token_seq_field_dims = []
        for field_name in self.field_names:
            if field_name == self.LABEL:
                continue
            field_type = dataset.field2type[field_name]
            if field_type == "token":
                self.token_field_names.append(field_name)
                self.token_field_dims.append(dataset.num(field_name))
            elif field_type == "token_seq":
                self.token_seq_field_names.append(field_name)
                self.token_seq_field_dims.append(dataset.num(field_name))
            else:
                self.float_field_names.append(field_name)
                self.float_field_dims.append(dataset.num(field_name))
        if len(self.token_field_dims) > 0:
            # start offset of every token field inside the shared table;
            # np.int64 replaces the alias np.long removed in NumPy 1.24
            self.token_field_offsets = np.array(
                (0, *np.cumsum(self.token_field_dims)[:-1]), dtype=np.int64)
            self.token_embedding_table = FMEmbedding(self.token_field_dims,
                                                     self.token_field_offsets,
                                                     output_dim)
        if len(self.float_field_dims) > 0:
            self.float_embedding_table = nn.Embedding(
                int(np.sum(self.float_field_dims)), output_dim)
        if len(self.token_seq_field_dims) > 0:
            self.token_seq_embedding_table = nn.ModuleList()
            for token_seq_field_dim in self.token_seq_field_dims:
                self.token_seq_embedding_table.append(
                    nn.Embedding(token_seq_field_dim, output_dim))

        self.bias = nn.Parameter(torch.zeros((output_dim, )),
                                 requires_grad=True)

    def embed_float_fields(self, float_fields, embed=True):
        """Calculate the first order score of float feature columns

        Args:
            float_fields (torch.FloatTensor): The input tensor. shape of [batch_size, num_float_field]
            embed (bool): when False and the input is 2-D, the raw values
                are returned with shape [batch_size, 1, num_float_field].
                Any other input falls through to the embedding path.

        Returns:
            torch.FloatTensor: The first order score of float feature columns,
            or None when ``float_fields`` is None
        """
        # input Tensor shape : [batch_size, num_float_field]
        if float_fields is None:
            return float_fields
        if not embed:
            if float_fields.dim() == 2:
                return float_fields.unsqueeze(1)

        num_float_field = float_fields.shape[1]
        # [batch_size, num_float_field]: row k of the table belongs to field k
        index = torch.arange(
            0, num_float_field).unsqueeze(0).expand_as(float_fields).long().to(
                self.device)

        # [batch_size, num_float_field, output_dim], scaled by the field value
        float_embedding = self.float_embedding_table(index)
        float_embedding = torch.mul(float_embedding, float_fields.unsqueeze(2))

        # [batch_size, 1, output_dim]
        float_embedding = torch.sum(float_embedding, dim=1, keepdim=True)

        return float_embedding

    def embed_token_fields(self, token_fields):
        """Calculate the first order score of token feature columns

        Args:
            token_fields (torch.LongTensor): The input tensor. shape of [batch_size, num_token_field]

        Returns:
            torch.FloatTensor: The first order score of token feature columns,
            or None when ``token_fields`` is None
        """
        # input Tensor shape : [batch_size, num_token_field]
        if token_fields is None:
            return None
        # [batch_size, num_token_field, output_dim]
        token_embedding = self.token_embedding_table(token_fields)
        # [batch_size, 1, output_dim]
        token_embedding = torch.sum(token_embedding, dim=1, keepdim=True)

        return token_embedding

    def embed_token_seq_fields(self, token_seq_fields):
        """Calculate the first order score of token sequence feature columns

        Args:
            token_seq_fields (list of torch.LongTensor): one tensor of shape
                [batch_size, seq_len] per token sequence field; id 0 is
                treated as padding and contributes nothing.

        Returns:
            torch.FloatTensor: The first order score of token sequence feature
            columns, shape [batch_size, 1, output_dim], or None when the
            list is empty
        """
        fields_result = []
        for i, token_seq_field in enumerate(token_seq_fields):
            embedding_table = self.token_seq_embedding_table[i]
            # [batch_size, seq_len, 1]; broadcasts over output_dim
            mask = (token_seq_field != 0).float().unsqueeze(2)

            token_seq_embedding = embedding_table(
                token_seq_field)  # [batch_size, seq_len, output_dim]

            # a stray `pdb.set_trace()` used to sit here; `pdb` is not
            # imported, so every call with a sequence field raised NameError
            masked_token_seq_embedding = token_seq_embedding * mask
            result = torch.sum(masked_token_seq_embedding, dim=1,
                               keepdim=True)  # [batch_size, 1, output_dim]

            fields_result.append(result)
        if len(fields_result) == 0:
            return None
        else:
            return torch.sum(torch.cat(fields_result, dim=1),
                             dim=1,
                             keepdim=True)  # [batch_size, 1, output_dim]

    def forward(self, interaction):
        """Sum the first order scores of all field groups and add the bias.

        Args:
            interaction: mapping from field name to a batch tensor.

        Returns:
            torch.Tensor: [batch_size, output_dim] when ``self.embed`` is
            True; otherwise the groups are concatenated and summed along
            dim 2 before the bias is added.
        """
        total_fields_embedding = []
        float_fields = []
        for field_name in self.float_field_names:
            # make every float feature 2-D so they can be concatenated
            float_fields.append(interaction[field_name]
                                if len(interaction[field_name].shape) ==
                                2 else interaction[field_name].unsqueeze(1))

        if len(float_fields) > 0:
            float_fields = torch.cat(float_fields,
                                     dim=1)  # [batch_size, num_float_field]
        else:
            float_fields = None

        # [batch_size, 1, output_dim] or None
        float_fields_embedding = self.embed_float_fields(float_fields,
                                                         embed=self.embed)

        if float_fields_embedding is not None:
            total_fields_embedding.append(float_fields_embedding.float())

        token_fields = []
        for field_name in self.token_field_names:
            token_fields.append(interaction[field_name].unsqueeze(1))
        if len(token_fields) > 0:
            token_fields = torch.cat(token_fields,
                                     dim=1)  # [batch_size, num_token_field]
        else:
            token_fields = None
        # [batch_size, 1, output_dim] or None
        token_fields_embedding = self.embed_token_fields(token_fields)
        if token_fields_embedding is not None:
            total_fields_embedding.append(token_fields_embedding)

        token_seq_fields = []
        for field_name in self.token_seq_field_names:
            token_seq_fields.append(interaction[field_name])
        # [batch_size, 1, output_dim] or None
        token_seq_fields_embedding = self.embed_token_seq_fields(
            token_seq_fields)
        if token_seq_fields_embedding is not None:
            total_fields_embedding.append(token_seq_fields_embedding)

        if self.embed:
            return torch.sum(torch.cat(total_fields_embedding, dim=1),
                             dim=1) + self.bias  # [batch_size, output_dim]
        else:
            return torch.sum(torch.cat(total_fields_embedding, dim=2),
                             dim=2) + self.bias
def meshgrid(x, y=None):
    """Build two aligned 2-D coordinate grids from 1-D inputs.

    Args:
        x: 1-D tensor-like with ``m`` entries, laid out along the columns.
        y: 1-D tensor-like with ``n`` entries, laid out along the rows.
            Defaults to ``x``.

    Returns:
        tuple: ``(grid_x, grid_y)``, both of shape ``(n, m)``, where
        ``grid_x[i, j] == x[j]`` and ``grid_y[i, j] == y[i]``.
    """
    cols = torch.as_tensor(x)
    rows = cols if y is None else torch.as_tensor(y)
    shape = (rows.size(0), cols.size(0))
    # expand() only creates broadcast views, no data is copied
    return cols.unsqueeze(0).expand(*shape), rows.unsqueeze(1).expand(*shape)
def combinations(x, y, dim, all=True, n=None):
    """Sample pairs from the Cartesian product of slices of ``x`` and ``y``.

    Every pair consists of one slice of ``x`` and one slice of ``y`` taken
    along ``dim``. Pairs are drawn without replacement, in random order.

    Input:
        x,y: tensors shared the same dim
        dim: sample from which dimension
        all: currently ignored (TODO in the original code); kept for
            interface compatibility
        n: the number of pairs to return. ``None`` returns every pair of
            the Cartesian product (shuffled).

    Returns:
        tuple: ``(x_samples, y_samples)`` where the k-th slices along
        ``dim`` form the k-th sampled pair.
    """
    num_x = x.shape[dim]
    num_y = y.shape[dim]
    # select n random elements from the cartesian product: a flat index k
    # into the (num_y, num_x) grid of pairs denotes the pair
    # (k % num_x, k // num_x), which avoids materialising the two grids
    sampled = torch.randperm(num_x * num_y)[:n]
    width = max(num_x, 1)  # `sampled` is empty when num_x == 0
    # index_select needs its index on the same device as the input; the
    # indices used to stay on CPU regardless of where x and y lived
    indices_x = (sampled % width).to(x.device)
    indices_y = (sampled // width).to(y.device)
    # get from the indices
    return x.index_select(dim, indices_x), y.index_select(dim, indices_y)
Author: geekinglcq 5 | # Mail: lcqgeek@live.com 6 | # Created Time: 2021-01-25 19:18:52 7 | # ########################### 8 | """ 9 | NAIS 10 | ###################################### 11 | Reference: 12 | Xiangnan He et al. "NAIS: Neural Attentive Item Similarity Model for Recommendation." in TKDE 2018. 13 | """ 14 | 15 | import torch 16 | import logging 17 | import torch.nn as nn 18 | from .layers import MLPLayers 19 | from .base import GeneralModel 20 | from torch.nn.init import constant_, normal_, xavier_normal_ 21 | 22 | 23 | class NAIS(GeneralModel): 24 | """NAIS is an attention network, which is capable of distinguishing which historical items 25 | in a user profile are more important for a prediction. We just implement the model following 26 | the original author with a pointwise training mode. 27 | Note: 28 | instead of forming a minibatch as all training instances of a randomly sampled user which is 29 | mentioned in the original paper, we still train the model by a randomly sampled interactions. 
30 | 31 | """ 32 | def __init__(self, config, dataset): 33 | super(NAIS, self).__init__(config, dataset) 34 | 35 | # load dataset info 36 | self.LABEL = config['LABEL_FIELD'] 37 | self.logger = logging.getLogger() 38 | 39 | # get all users's history interaction information.the history item 40 | # matrix is padding by the maximum number of a user's interactions 41 | self.history_item_matrix, self.history_lens, self.mask_mat = self.get_history_info( 42 | dataset) 43 | 44 | # load parameters info 45 | self.embedding_size = config['embedding_size'] 46 | self.weight_size = config['weight_size'] 47 | self.algorithm = config['algorithm'] 48 | self.reg_weights = config['reg_weights'] 49 | self.alpha = config['alpha'] 50 | self.beta = config['beta'] 51 | self.split_to = config['split_to'] 52 | self.pretrain_path = config['pretrain_path'] 53 | 54 | # split the too large dataset into the specified pieces 55 | if self.split_to > 0: 56 | self.logger.info('split the n_items to {} pieces'.format( 57 | self.split_to)) 58 | self.group = torch.chunk( 59 | torch.arange(self.n_items).to(self.device), self.split_to) 60 | else: 61 | self.logger.warning( 62 | 'Pay Attetion!! the `split_to` is set to 0. If you catch a OMM error in this case, ' 63 | + 64 | 'you need to increase it \n\t\t\tuntil the error disappears. 
For example, ' 65 | + 66 | 'you can append it in the command line such as `--split_to=5`') 67 | 68 | # define layers and loss 69 | # construct source and destination item embedding matrix 70 | self.item_src_embedding = nn.Embedding(self.n_items, 71 | self.embedding_size, 72 | padding_idx=0) 73 | self.item_dst_embedding = nn.Embedding(self.n_items, 74 | self.embedding_size, 75 | padding_idx=0) 76 | self.bias = nn.Parameter(torch.zeros(self.n_items)) 77 | if self.algorithm == 'concat': 78 | self.mlp_layers = MLPLayers( 79 | [self.embedding_size * 2, self.weight_size]) 80 | elif self.algorithm == 'prod': 81 | self.mlp_layers = MLPLayers( 82 | [self.embedding_size, self.weight_size]) 83 | else: 84 | raise ValueError( 85 | "NAIS just support attention type in ['concat', 'prod'] but get {}" 86 | .format(self.algorithm)) 87 | self.weight_layer = nn.Parameter(torch.ones(self.weight_size, 1)) 88 | self.bceloss = nn.BCELoss() 89 | 90 | # parameters initialization 91 | if self.pretrain_path is not None: 92 | self.logger.info('use pretrain from [{}]...'.format( 93 | self.pretrain_path)) 94 | self._load_pretrain() 95 | else: 96 | self.logger.info('unuse pretrain...') 97 | self.apply(self._init_weights) 98 | 99 | def _init_weights(self, module): 100 | """Initialize the module's parameters 101 | 102 | Note: 103 | It's a little different from the source code, because pytorch has no function to initialize 104 | the parameters by truncated normal distribution, so we replace it with xavier normal distribution 105 | 106 | """ 107 | if isinstance(module, nn.Embedding): 108 | normal_(module.weight.data, 0, 0.01) 109 | elif isinstance(module, nn.Linear): 110 | xavier_normal_(module.weight.data) 111 | if module.bias is not None: 112 | constant_(module.bias.data, 0) 113 | 114 | def _load_pretrain(self): 115 | """A simple implementation of loading pretrained parameters. 
116 | 117 | """ 118 | fism = torch.load(self.pretrain_path)['state_dict'] 119 | self.item_src_embedding.weight.data.copy_( 120 | fism['item_src_embedding.weight']) 121 | self.item_dst_embedding.weight.data.copy_( 122 | fism['item_dst_embedding.weight']) 123 | for name, parm in self.mlp_layers.named_parameters(): 124 | if name.endswith('weight'): 125 | xavier_normal_(parm.data) 126 | elif name.endswith('bias'): 127 | constant_(parm.data, 0) 128 | 129 | def get_history_info(self, dataset): 130 | """get the user history interaction information 131 | 132 | Args: 133 | dataset (DataSet): train dataset 134 | 135 | Returns: 136 | tuple: (history_item_matrix, history_lens, mask_mat) 137 | 138 | """ 139 | history_item_matrix, _, history_lens = dataset.history_item_matrix() 140 | history_item_matrix = history_item_matrix.to(self.device) 141 | history_lens = history_lens.to(self.device) 142 | arange_tensor = torch.arange(history_item_matrix.shape[1]).to( 143 | self.device) 144 | mask_mat = (arange_tensor < history_lens.unsqueeze(1)).float() 145 | return history_item_matrix, history_lens, mask_mat 146 | 147 | def reg_loss(self): 148 | """calculate the reg loss for embedding layers and mlp layers 149 | 150 | Returns: 151 | torch.Tensor: reg loss 152 | 153 | """ 154 | reg_1, reg_2, reg_3 = self.reg_weights 155 | loss_1 = reg_1 * self.item_src_embedding.weight.norm(2) 156 | loss_2 = reg_2 * self.item_dst_embedding.weight.norm(2) 157 | loss_3 = 0 158 | for name, parm in self.mlp_layers.named_parameters(): 159 | if name.endswith('weight'): 160 | loss_3 = loss_3 + reg_3 * parm.norm(2) 161 | return loss_1 + loss_2 + loss_3 162 | 163 | def attention_mlp(self, inter, target): 164 | """layers of attention which support `prod` and `concat` 165 | 166 | Args: 167 | inter (torch.Tensor): the embedding of history items 168 | target (torch.Tensor): the embedding of target items 169 | 170 | Returns: 171 | torch.Tensor: the result of attention 172 | 173 | """ 174 | if self.algorithm == 'prod': 
175 | mlp_input = inter * target.unsqueeze( 176 | 1) # batch_size x max_len x embedding_size 177 | else: 178 | mlp_input = torch.cat( 179 | [inter, target.unsqueeze(1).expand_as(inter)], 180 | dim=2) # batch_size x max_len x embedding_size*2 181 | mlp_output = self.mlp_layers( 182 | mlp_input) # batch_size x max_len x weight_size 183 | 184 | logits = torch.matmul(mlp_output, self.weight_layer).squeeze( 185 | 2) # batch_size x max_len 186 | return logits 187 | 188 | def mask_softmax(self, similarity, logits, bias, item_num, batch_mask_mat): 189 | """softmax the unmasked user history items and get the final output 190 | 191 | Args: 192 | similarity (torch.Tensor): the similarity between the histoy items and target items 193 | logits (torch.Tensor): the initial weights of the history items 194 | item_num (torch.Tensor): user hitory interaction lengths 195 | bias (torch.Tensor): bias 196 | batch_mask_mat (torch.Tensor): the mask of user history interactions 197 | 198 | Returns: 199 | torch.Tensor: final output 200 | 201 | """ 202 | exp_logits = torch.exp(logits) # batch_size x max_len 203 | 204 | exp_logits = batch_mask_mat * exp_logits # batch_size x max_len 205 | exp_sum = torch.sum(exp_logits, dim=1, keepdim=True) 206 | exp_sum = torch.pow(exp_sum, self.beta) 207 | weights = torch.div(exp_logits, exp_sum) 208 | 209 | coeff = torch.pow(item_num.squeeze(1), -self.alpha) 210 | output = torch.sigmoid(coeff.float() * 211 | torch.sum(weights * similarity, dim=1) + bias) 212 | 213 | return output 214 | 215 | def softmax(self, similarity, logits, item_num, bias): 216 | """softmax the user history features and get the final output 217 | 218 | Args: 219 | similarity (torch.Tensor): the similarity between the histoy items and target items 220 | logits (torch.Tensor): the initial weights of the history items 221 | item_num (torch.Tensor): user hitory interaction lengths 222 | bias (torch.Tensor): bias 223 | 224 | Returns: 225 | torch.Tensor: final output 226 | 227 | """ 228 | 
exp_logits = torch.exp(logits) # batch_size x max_len 229 | exp_sum = torch.sum(exp_logits, dim=1, keepdim=True) 230 | exp_sum = torch.pow(exp_sum, self.beta) 231 | weights = torch.div(exp_logits, exp_sum) 232 | coeff = torch.pow(item_num.squeeze(1), -self.alpha) 233 | output = torch.sigmoid(coeff.float() * 234 | torch.sum(weights * similarity, dim=1) + bias) 235 | 236 | return output 237 | 238 | def inter_forward(self, user, item): 239 | """forward the model by interaction 240 | 241 | """ 242 | user_inter = self.history_item_matrix[user] 243 | item_num = self.history_lens[user].unsqueeze(1) 244 | batch_mask_mat = self.mask_mat[user] 245 | user_history = self.item_src_embedding( 246 | user_inter) # batch_size x max_len x embedding_size 247 | target = self.item_dst_embedding(item) # batch_size x embedding_size 248 | bias = self.bias[item] # batch_size x 1 249 | similarity = torch.bmm(user_history, target.unsqueeze(2)).squeeze( 250 | 2) # batch_size x max_len 251 | logits = self.attention_mlp(user_history, target) 252 | scores = self.mask_softmax(similarity, logits, bias, item_num, 253 | batch_mask_mat) 254 | return scores 255 | 256 | def user_forward(self, user_input, item_num, repeats=None, pred_slc=None): 257 | """forward the model by user 258 | 259 | Args: 260 | user_input (torch.Tensor): user input tensor 261 | item_num (torch.Tensor): user hitory interaction lens 262 | repeats (int, optional): the number of items to be evaluated 263 | pred_slc (torch.Tensor, optional): continuous index which controls the current evaluation items, 264 | if pred_slc is None, it will evaluate all items 265 | 266 | Returns: 267 | torch.Tensor: result 268 | 269 | """ 270 | item_num = item_num.repeat(repeats, 1) 271 | user_history = self.item_src_embedding( 272 | user_input) # inter_num x embedding_size 273 | user_history = user_history.repeat( 274 | repeats, 1, 1) # target_items x inter_num x embedding_size 275 | if pred_slc is None: 276 | targets = self.item_dst_embedding.weight # 
def full_sort_predict(self, interaction):
    """Score every candidate item for each user in ``interaction``.

    Users are processed one at a time. When ``self.split_to`` is positive the
    candidate items are evaluated slice by slice (one ``user_forward`` call
    per entry of ``self.group``) and the partial results are concatenated;
    otherwise a single call covers all ``self.n_items`` items.

    Returns:
        torch.Tensor: per-user outputs concatenated along dim 0.
    """
    user = interaction[self.USER_ID]
    histories = self.history_item_matrix[user]
    lengths = self.history_lens[user].unsqueeze(1)

    per_user = []
    for history_row, length in zip(histories, lengths):
        # keep only the real (non-padded) part of the history row
        valid_history = history_row[:length]
        if self.split_to <= 0:
            user_scores = self.user_forward(valid_history,
                                            length,
                                            repeats=self.n_items)
        else:
            pieces = [
                self.user_forward(valid_history,
                                  length,
                                  repeats=len(item_slice),
                                  pred_slc=item_slice)
                for item_slice in self.group
            ]
            user_scores = torch.cat(pieces, dim=0)
        per_user.append(user_scores)
    return torch.cat(per_user, dim=0)
class NCF(GeneralModel):
    """Neural Collaborative Filtering (He et al., WWW 2017).

    The model has two optional branches that share nothing but the final
    prediction layer:

    - an MF branch: element-wise product of user/item MF embeddings;
    - an MLP branch: an MLP over the concatenated user/item MLP embeddings.

    ``config['mf_train']`` / ``config['mlp_train']`` select which branches
    are active; at least one must be enabled for ``forward`` to work.
    """

    def __init__(self, config, dataset):
        super().__init__(config, dataset)

        # load dataset info
        self.LABEL = dataset.config['LABEL_FIELD']
        self.logger = logging.getLogger()

        # load parameters info
        self.mf_embedding_size = config['mf_embedding_size']
        self.mlp_embedding_size = config['mlp_embedding_size']
        self.mlp_hidden_size = config['mlp_hidden_size']
        self.dropout_prob = config['dropout_prob']
        self.mf_train = config['mf_train']
        self.mlp_train = config['mlp_train']
        self.use_pretrain = config['use_pretrain']
        self.mf_pretrain_path = config['mf_pretrain_path']
        self.mlp_pretrain_path = config['mlp_pretrain_path']

        # define layers and loss
        self.user_mf_embedding = nn.Embedding(self.n_users,
                                              self.mf_embedding_size)
        self.item_mf_embedding = nn.Embedding(self.n_items,
                                              self.mf_embedding_size)
        self.user_mlp_embedding = nn.Embedding(self.n_users,
                                               self.mlp_embedding_size)
        self.item_mlp_embedding = nn.Embedding(self.n_items,
                                               self.mlp_embedding_size)
        # The dropout probability belongs to the MLP. It used to be passed
        # as the third positional argument of nn.Linear below, where it was
        # interpreted as the ``bias`` flag (so dropout_prob == 0 silently
        # removed the bias of the prediction layer).
        self.mlp_layers = MLPLayers([2 * self.mlp_embedding_size] +
                                    self.mlp_hidden_size, self.dropout_prob)
        self.mlp_layers.logger = None  # remove logger to use torch.save()
        if self.mf_train and self.mlp_train:
            self.predict_layer = nn.Linear(
                self.mf_embedding_size + self.mlp_hidden_size[-1], 1)
        elif self.mf_train:
            self.predict_layer = nn.Linear(self.mf_embedding_size, 1)
        elif self.mlp_train:
            self.predict_layer = nn.Linear(self.mlp_hidden_size[-1], 1)
        # NOTE: when both flags are False no predict_layer is created and
        # forward() raises RuntimeError.
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()

        # parameters initialization
        if self.use_pretrain:
            self.load_pretrain()
        else:
            self.apply(self._init_weights)

    def load_pretrain(self):
        r"""A simple implementation of loading pretrained parameters.

        Loads one MF-only and one MLP-only model (as written by
        ``dump_parameters``) and copies their embeddings, MLP layers and the
        averaged prediction layer into this model. The concatenation of the
        two prediction layers only matches ``self.predict_layer`` when both
        branches are enabled.
        """
        # NOTE(review): dump_parameters pickles whole model objects; recent
        # PyTorch releases may need torch.load(..., weights_only=False) to
        # read them back - confirm against the PyTorch version in use.
        mf = torch.load(self.mf_pretrain_path)
        mlp = torch.load(self.mlp_pretrain_path)
        self.user_mf_embedding.weight.data.copy_(mf.user_mf_embedding.weight)
        self.item_mf_embedding.weight.data.copy_(mf.item_mf_embedding.weight)
        self.user_mlp_embedding.weight.data.copy_(
            mlp.user_mlp_embedding.weight)
        self.item_mlp_embedding.weight.data.copy_(
            mlp.item_mlp_embedding.weight)

        for (m1, m2) in zip(self.mlp_layers.mlp_layers,
                            mlp.mlp_layers.mlp_layers):
            if isinstance(m1, nn.Linear) and isinstance(m2, nn.Linear):
                m1.weight.data.copy_(m2.weight)
                m1.bias.data.copy_(m2.bias)

        predict_weight = torch.cat(
            [mf.predict_layer.weight, mlp.predict_layer.weight], dim=1)
        predict_bias = mf.predict_layer.bias + mlp.predict_layer.bias

        self.predict_layer.weight.data.copy_(0.5 * predict_weight)
        # The bias must go into .bias; it was previously copied into .weight,
        # overwriting the weights loaded on the line above.
        self.predict_layer.bias.data.copy_(0.5 * predict_bias)

    def _init_weights(self, module):
        """Initialise embedding tables from N(0, 0.01^2); other modules are left as is."""
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.01)

    def forward(self, user, item):
        """Return the predicted interaction probability for each (user, item) pair.

        Raises:
            RuntimeError: if both ``mf_train`` and ``mlp_train`` are False.
        """
        user = user.long()
        item = item.long()
        user_mf_e = self.user_mf_embedding(user)
        item_mf_e = self.item_mf_embedding(item)
        user_mlp_e = self.user_mlp_embedding(user)
        item_mlp_e = self.item_mlp_embedding(item)
        if self.mf_train:
            mf_output = torch.mul(user_mf_e,
                                  item_mf_e)  # [batch_size, embedding_size]
        if self.mlp_train:
            mlp_output = self.mlp_layers(
                torch.cat((user_mlp_e, item_mlp_e),
                          -1))  # [batch_size, layers[-1]]
        if self.mf_train and self.mlp_train:
            output = self.sigmoid(
                self.predict_layer(torch.cat((mf_output, mlp_output), -1)))
        elif self.mf_train:
            output = self.sigmoid(self.predict_layer(mf_output))
        elif self.mlp_train:
            output = self.sigmoid(self.predict_layer(mlp_output))
        else:
            raise RuntimeError(
                'mf_train and mlp_train can not be False at the same time')
        return output.squeeze()

    def calculate_loss(self, interaction):
        """Binary cross-entropy between the predictions and the label field."""
        user = interaction[self.USER_ID]
        item = interaction[self.ITEM_ID]
        label = interaction[self.LABEL].float()

        output = self.forward(user, item)
        return self.loss(output, label)

    def predict(self, interaction):
        """Predict scores for the user/item pairs contained in ``interaction``."""
        user = interaction[self.USER_ID]
        item = interaction[self.ITEM_ID]
        return self.forward(user, item)

    def dump_parameters(self):
        r"""A simple implementation of dumping model parameters for pretrain.

        The whole model object is saved, and only when exactly one branch is
        enabled: to ``mf_pretrain_path`` for an MF-only model, to
        ``mlp_pretrain_path`` for an MLP-only model. Nothing is written when
        both branches (or neither) are enabled.
        """
        if self.mf_train and not self.mlp_train:
            save_path = self.mf_pretrain_path
            torch.save(self, save_path)
        elif self.mlp_train and not self.mf_train:
            save_path = self.mlp_pretrain_path
            torch.save(self, save_path)
"Neural factorization machines for sparse predictive analytics" in SIGIR 2017 13 | """ 14 | 15 | import torch 16 | import torch.nn as nn 17 | from torch.nn.init import xavier_normal_, constant_ 18 | 19 | from .layers import BaseFactorizationMachine, MLPLayers 20 | from .base import ContextModel 21 | 22 | 23 | class NFM(ContextModel): 24 | """ NFM replace the fm part as a mlp to model the feature interaction. 25 | 26 | """ 27 | def __init__(self, config, dataset): 28 | super(NFM, self).__init__(config, dataset) 29 | 30 | # load parameters info 31 | self.mlp_hidden_size = config['mlp_hidden_size'] 32 | self.dropout_prob = config['dropout_prob'] 33 | 34 | # define layers and loss 35 | size_list = [self.embedding_size] + self.mlp_hidden_size 36 | self.fm = BaseFactorizationMachine(reduce_sum=False) 37 | self.bn = nn.BatchNorm1d(num_features=self.embedding_size) 38 | self.mlp_layers = MLPLayers(size_list, 39 | self.dropout_prob, 40 | activation='sigmoid', 41 | bn=True) 42 | self.predict_layer = nn.Linear(self.mlp_hidden_size[-1], 1, bias=False) 43 | self.sigmoid = nn.Sigmoid() 44 | self.loss = nn.BCELoss() 45 | 46 | # parameters initialization 47 | self.apply(self._init_weights) 48 | 49 | def _init_weights(self, module): 50 | if isinstance(module, nn.Embedding): 51 | xavier_normal_(module.weight.data) 52 | elif isinstance(module, nn.Linear): 53 | xavier_normal_(module.weight.data) 54 | if module.bias is not None: 55 | constant_(module.bias.data, 0) 56 | 57 | def forward(self, interaction): 58 | # sparse_embedding shape: [batch_size, num_token_seq_field+num_token_field, embed_dim] or None 59 | # dense_embedding shape: [batch_size, num_float_field] or [batch_size, num_float_field, embed_dim] or None 60 | sparse_embedding, dense_embedding = self.embed_input_fields( 61 | interaction) 62 | all_embeddings = [] 63 | if sparse_embedding is not None: 64 | all_embeddings.append(sparse_embedding) 65 | if dense_embedding is not None and len(dense_embedding.shape) == 3: 66 | 
def calculate_loss(self, interaction):
    """Return the BCE loss between ``forward(interaction)`` and the label field.

    The label is cast to float, as the NCF and WideDeep models in this
    package already do, because ``nn.BCELoss`` rejects integer targets.
    """
    label = interaction[self.LABEL].float()
    output = self.forward(interaction)
    return self.loss(output, label)
def mem_report():
    '''Report the memory usage of the tensor.storage in pytorch
    Both on CPUs and GPUs are reported

    All live tensors reachable through ``gc.get_objects()`` are split into
    CUDA and host tensors and one table is printed for each group.
    Returns None; the report goes to stdout.
    '''
    LEN = 65  # width of the separator rules in the printed report

    def _mem_report(tensors, mem_type):
        '''Print the selected tensors of type

        There are two major storage types in our major concern:
            - GPU: tensors transferred to CUDA devices
            - CPU: tensors remaining on the system memory (usually unimportant)

        Args:
            - tensors: the tensors of specified type
            - mem_type: 'CPU' or 'GPU' in current implementation '''
        print('Storage on %s' % (mem_type))
        print('-' * LEN)
        total_numel = 0
        total_mem = 0
        # a set gives O(1) "already counted" checks
        visited_data = set()
        for tensor in tensors:
            if tensor.is_sparse:
                continue
            # NOTE: a leftover ``pdb.set_trace()`` used to sit here and
            # stopped the program on every tensor; it has been removed.
            storage = tensor.storage()
            # a data_ptr indicates a memory block allocated; tensors sharing
            # one storage are reported only once
            data_ptr = storage.data_ptr()
            if data_ptr in visited_data:
                continue
            visited_data.add(data_ptr)

            numel = storage.size()
            total_numel += numel
            element_size = storage.element_size()
            mem = numel * element_size / 1024 / 1024  # bytes -> MBytes
            total_mem += mem
            element_type = type(tensor).__name__
            size = tuple(tensor.size())

            print('%s\t\t%s\t\t%.2f' % (element_type, size, mem))
        print('-' * LEN)
        print('Total Tensors: %d \tUsed Memory Space: %.2f MBytes' %
              (total_numel, total_mem))
        print('-' * LEN)

    print('=' * LEN)
    objects = gc.get_objects()
    print('%s\t%s\t\t\t%s' % ('Element type', 'Size', 'Used MEM(MBytes)'))
    tensors = [obj for obj in objects if torch.is_tensor(obj)]
    cuda_tensors = [t for t in tensors if t.is_cuda]
    host_tensors = [t for t in tensors if not t.is_cuda]
    _mem_report(cuda_tensors, 'GPU')
    _mem_report(host_tensors, 'CPU')
    print('=' * LEN)
-------------------------------------------------------------------------------- /HRec/models/widedeep.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # ########################### 3 | # File Name: widedeep.py 4 | # Author: geekinglcq 5 | # Mail: lcqgeek@live.com 6 | # Created Time: 2020-12-31 17:31:01 7 | # ########################### 8 | 9 | # -*- coding: utf-8 -*- 10 | # @Time : 2020/08/30 11 | # @Author : Xinyan Fan 12 | # @Email : xinyan.fan@ruc.edu.cn 13 | # @File : widedeep.py 14 | r""" 15 | WideDeep 16 | ##################################################### 17 | Reference: 18 | Heng-Tze Cheng et al. "Wide & Deep Learning for Recommender Systems." in RecSys 2016. 19 | """ 20 | 21 | import torch 22 | import torch.nn as nn 23 | from torch.nn.init import xavier_normal_, constant_ 24 | 25 | from .layers import MLPLayers 26 | from .base import ContextModel 27 | 28 | 29 | class WideDeep(ContextModel): 30 | r"""WideDeep is a context-based recommendation model. 31 | It jointly trains wide linear models and deep neural networks to combine the benefits 32 | of memorization and generalization for recommender systems. The wide component is a generalized linear model 33 | of the form :math:`y = w^Tx + b`. The deep component is a feed-forward neural network. The wide component 34 | and deep component are combined using a weighted sum of their output log odds as the prediction, 35 | which is then fed to one common logistic loss function for joint training. 
def forward(self, interaction):
    """Compute the Wide&Deep prediction for a batch.

    The deep part consumes the flattened sparse embeddings and, when they are
    3-D, the flattened dense embeddings. A 2-D dense embedding is not
    appended, exactly as before. The wide part is ``first_order_linear``.

    Either embedding may be ``None``. The batch size is now taken from the
    tensor being flattened instead of unconditionally from
    ``sparse_embedding``, which used to raise ``AttributeError`` when only
    dense features were present.

    Returns:
        torch.Tensor: sigmoid(wide + deep), squeezed.
    """
    # sparse_embedding shape: [batch_size, num_token_seq_field+num_token_field, embed_dim] or None
    # dense_embedding shape: [batch_size, num_float_field] or [batch_size, num_float_field, embed_dim] or None
    sparse_embedding, dense_embedding = self.embed_input_fields(
        interaction)
    all_embeddings = []
    if sparse_embedding is not None:
        all_embeddings.append(
            sparse_embedding.view(sparse_embedding.shape[0], -1))
    if dense_embedding is not None and len(dense_embedding.shape) == 3:
        all_embeddings.append(
            dense_embedding.view(dense_embedding.shape[0], -1))
    widedeep_all_embeddings = torch.cat(
        all_embeddings, dim=1)  # [batch_size, num_field * embed_dim]
    fm_output = self.first_order_linear(interaction)

    deep_output = self.deep_predict_layer(
        self.mlp_layers(widedeep_all_embeddings))
    output = self.sigmoid(fm_output + deep_output)
    return output.squeeze()
def __init__(self, config, dataset):
    """Build the CIN convolutions, the DNN and the output layers of xDeepFM.

    Config keys read here: ``mlp_hidden_size``, ``reg_weight``,
    ``dropout_prob``, ``direct`` and ``cin_layer_size``.
    """
    super(xDeepFM, self).__init__(config, dataset)

    # load parameters info
    self.mlp_hidden_size = config['mlp_hidden_size']
    self.reg_weight = config['reg_weight']
    self.dropout_prob = config['dropout_prob']
    self.direct = config['direct']
    # This used to be assigned to a misspelled attribute (``self.eize``),
    # which left ``cin_layer_size`` undefined whenever ``direct`` was true.
    requested_cin_size = list(config['cin_layer_size'])
    self.cin_layer_size = requested_cin_size

    # Check whether the size of the CIN layer is legal.
    # Without direct connections every layer output is split in half, so
    # the sizes are rounded down to even numbers.
    if not self.direct:
        self.cin_layer_size = [
            int(size // 2 * 2) for size in requested_cin_size
        ]
        if self.cin_layer_size[:-1] != requested_cin_size[:-1]:
            logger = getLogger()
            logger.warning(
                'Layer size of CIN should be even except for the last layer when direct is False. '
                'It is changed to {}'.format(self.cin_layer_size))

    # Create a convolutional layer for each CIN layer.
    # nn.ModuleList (not a plain list) registers the convolutions as
    # sub-modules, so their weights show up in parameters()/state_dict().
    # NOTE: checkpoints written before this change lack the conv1d_list.* keys.
    self.conv1d_list = nn.ModuleList()
    self.field_nums = [self.num_feature_field]
    for layer_size in self.cin_layer_size:
        conv1d = nn.Conv1d(self.field_nums[-1] * self.field_nums[0],
                           layer_size, 1).to(self.device)
        self.conv1d_list.append(conv1d)
        if self.direct:
            self.field_nums.append(layer_size)
        else:
            self.field_nums.append(layer_size // 2)

    # Create MLP layer
    size_list = [
        self.embedding_size * len(self.token_field_names) +
        len(self.float_field_names)
    ] + self.mlp_hidden_size + [1]
    self.mlp_layers = MLPLayers(size_list, dropout=self.dropout_prob)

    # Get the output size of CIN
    if self.direct:
        self.final_len = sum(self.cin_layer_size)
    else:
        self.final_len = sum(
            self.cin_layer_size[:-1]) // 2 + self.cin_layer_size[-1]

    self.cin_linear = nn.Linear(self.final_len, 1, bias=False)
    self.sigmoid = nn.Sigmoid()
    self.loss = nn.BCELoss()
    self.apply(self._init_weights)
def compressed_interaction_network(self,
                                   input_features,
                                   activation='identity'):
    r"""Run the Compressed Interaction Network over the field embeddings.

    For the k-th CIN layer the feature maps are

    .. math::
        x_{h,*}^{k} = \sum_{i=1}^{H_{k-1}} \sum_{j=1}^{m}
            W_{i,j}^{k,h}(X_{i,*}^{k-1} \circ x_{j,*}^0)

    where :math:`\circ` is the Hadamard product. It is implemented here as
    an outer product over the field axis followed by a kernel-size-1
    ``Conv1d``. The maps kept from every layer are concatenated along the
    channel axis and sum-pooled over the embedding axis.

    Args:
        input_features(torch.Tensor): [batch_size, field_num, embed_dim].
        activation(str): name of the activation applied after each
            convolution; ``'identity'`` skips it.

    Returns:
        torch.Tensor: [batch_size, total number of kept feature maps].
    """
    batch_size, _, embedding_size = input_features.shape
    base_layer = input_features
    previous_layer = input_features
    kept_maps = []
    last_index = len(self.cin_layer_size) - 1

    for idx, layer_size in enumerate(self.cin_layer_size):
        pairwise = torch.einsum('bmd,bhd->bhmd', base_layer, previous_layer)
        pairwise = pairwise.view(
            batch_size, self.field_nums[0] * self.field_nums[idx],
            embedding_size)
        layer_out = self.conv1d_list[idx](pairwise)

        # Pass the CIN intermediate result through the activation function.
        if activation.lower() != 'identity':
            activate_func = activation_layer(activation)
            if activate_func is not None:
                layer_out = activate_func(layer_out)

        # Decide what is exported and what feeds the next layer.
        if self.direct or idx == last_index:
            exported = layer_out
            # the last non-direct layer has no successor
            previous_layer = layer_out if self.direct else 0
        else:
            previous_layer, exported = torch.split(
                layer_out, 2 * [layer_size // 2], 1)

        kept_maps.append(exported)

    return torch.sum(torch.cat(kept_maps, dim=1), -1)
class Config(object):
    """Config class that control all config in the experiment.

    The JSON file at ``config_path`` is loaded into ``self.dict``. The
    contents of its ``data``, ``model`` and ``opt`` sections are additionally
    merged into the top level, so their keys can be looked up directly
    (``config['epochs']``) as well as through the section
    (``config['opt']['epochs']``). All three sections must be present.
    """
    def __init__(self, config_path):
        # close the file deterministically instead of leaking the handle
        with open(config_path) as config_file:
            self.dict = json.load(config_file)
        for key in ['data', 'model', 'opt']:
            self.dict.update(self.dict[key])

    def get(self, arg, default=None):
        """Return the value stored under ``arg`` or ``default`` if it is absent."""
        return self.dict.get(arg, default)

    def __getitem__(self, arg):
        """Return the value stored under ``arg``.

        Raises:
            ValueError: if ``arg`` is not a known config key.
        """
        # This used to call ``self.get``, which did not exist, so every
        # successful lookup failed with AttributeError.
        if arg in self.dict:
            return self.dict[arg]
        raise ValueError(f'No [{arg}] value in this config.')
def train_one_batch(self, hdata):
    """Run one optimisation step for every per-type model.

    Args:
        hdata (dict): maps each item type in ``self.types`` to its batch;
            every batch must contain the ``self.LABEL`` field.

    Returns:
        float: sum of the weighted prediction losses and of the
        orthogonality penalties of all models.
    """
    for opt in self.opts.values():
        opt.zero_grad()

    preds = defaultdict(dict)
    losses = defaultdict(dict)

    # Every model scores every item type; foreign types go through the
    # model's ``dual`` path.
    for item_model_type in self.types:
        for item_type in self.types:
            if item_type == item_model_type:
                preds[item_model_type][item_type] = self.models[
                    item_model_type](item_type, hdata[item_type])
            else:
                preds[item_model_type][item_type] = self.models[
                    item_model_type](item_type,
                                     hdata[item_type],
                                     dual=True)
            label = hdata[item_type][self.LABEL].reshape((-1, 1)).float()
            losses[item_model_type][item_type] = self.crit(
                preds[item_model_type][item_type], label)

    # weighted loss: the own-model term carries gradients, the cross-model
    # term is detached and only contributes its value
    w_loss = defaultdict(list)
    for item_type in self.types:
        for item_model_type in self.types:
            if item_type == item_model_type:
                loss = (1 -
                        self.alpha) * losses[item_model_type][item_type]
            else:
                loss = self.alpha * losses[item_model_type][
                    item_type].detach()
            w_loss[item_type].append(loss)
    t_loss = {}
    for k, v in w_loss.items():
        t_loss[k] = torch.sum(torch.stack(v, dim=0))
        t_loss[k].backward(retain_graph=True)

    # orthogonality penalty on the non-bias parameters of each bridge:
    # reg * sum(|W W^T - I|)
    reg = 1e-6
    orth_loss = {
        item_type: torch.zeros(1, device=self.device)
        for item_type in self.types
    }
    for item_type, model in self.models.items():
        for name, param in model.bridge.named_parameters():
            if 'bias' not in name:
                param_flat = param.view(param.shape[0], -1)
                sym = torch.mm(param_flat, torch.t(param_flat))
                sym -= torch.eye(param_flat.shape[0], device=self.device)
                orth_loss[item_type] = (orth_loss[item_type] +
                                        reg * sym.abs().sum())
        # a bridge without weight parameters leaves nothing to differentiate
        if orth_loss[item_type].requires_grad:
            orth_loss[item_type].backward()
    for item_type in self.types:
        self.opts[item_type].step()

    r_loss = 0
    for loss in t_loss.values():
        r_loss += loss.item()
    # This loop used to re-add the last prediction loss instead of the
    # orthogonality penalties.
    for penalty in orth_loss.values():
        r_loss += penalty.item()

    return r_loss
def save_checkpoint(self, epoch, name='last', path=None):
    """Write the state of every per-type model and optimizer to disk.

    Args:
        epoch (int): epoch number stored in the checkpoint and, for
            ``name == 'last'``, embedded in the file name.
        name (str): checkpoint tag; ``'last'`` and ``'best'`` also update
            the corresponding path attributes.
        path (str, optional): target directory; defaults to
            ``self.ckp_path``. This argument used to be ignored.
    """
    if path is None:
        path = self.ckp_path
    model_dict = {}
    opt_dict = {}
    for item_type in self.models:
        model_dict[item_type] = self.models[item_type].state_dict()
        opt_dict[item_type] = self.opts[item_type].opt.state_dict()
    state = {
        'epoch': epoch,
        'state_dict': model_dict,
        'optimizer': opt_dict
    }
    if name == 'last':
        file_name = os.path.join(path, f'{name}-{epoch}-model.pth')
    else:
        file_name = os.path.join(path, f'{name}-model.pth')
    # most recently written checkpoint, whatever its tag
    self.last_model_path = file_name
    torch.save(state, file_name)
    if name == 'best':
        self.best_ckp_path = file_name
        # load_checkpoint(mode='best') reads this attribute
        self.best_model_path = file_name
    elif name == 'last':
        self.last_ckp_path = file_name
class Evaluator(object):
    r"""Evaluator for rating / click-through-rate style predictions.

    Batches are collected as ``(label, prediction, item type)`` rows and, at
    the end of an epoch, scored with the configured metrics. The accepted
    metric names are the keys of ``loss_metrics``
    (``'AUC', 'RMSE', 'MAE', 'LOGLOSS'``, case-insensitive).
    """

    def __init__(self, config):
        super().__init__()

        self.metrics = config['metrics']

        data_config = config['data']
        self.label_field = data_config['LABEL_FIELD']
        self.type_field = data_config['TYPE_FIELD']
        self._check_args()

    def collect(self, interaction, pred_scores):
        """Pack one batch into a matrix of labels, predictions and types.

        Args:
            interaction: batch mapping that holds the label tensor under
                ``self.label_field`` and the item-type tensor under
                ``self.type_field``.
            pred_scores (tensor): model output with a size of `(N, )`

        Returns:
            tensor: float matrix with a size of `(N, 3)`; the columns are
            true score, predicted score (detached) and item type.
        """
        target_device = pred_scores.device
        labels = interaction[self.label_field].to(target_device)
        item_types = interaction[self.type_field].to(target_device).float()
        assert len(labels) == len(pred_scores)
        columns = (labels.float(), pred_scores.detach(), item_types)
        return torch.stack(columns, dim=1)

    def evaluate(self, batch_matrix_list, groupby=False, *args):
        """Score all collected batches. Called at the end of each epoch.

        Args:
            batch_matrix_list (list): matrices returned by :meth:`collect`
            groupby (bool): when true, every metric is additionally reported
                per item type under the key ``"<type>-<metric>"``.

        Returns:
            dict: metric name -> value rounded to 4 digits,
            such as ``{'auc': 0.83}``
        """
        matrix = torch.cat(batch_matrix_list, dim=0).cpu().numpy()

        metric_dict = {}
        if groupby:
            type_column = matrix[:, 2]
            for t in np.unique(type_column):
                rows = matrix[type_column == t]
                values = self._calculate_metrics(rows[:, 0], rows[:, 1])
                for metric, value in zip(self.metrics, values):
                    metric_dict[str(t) + "-" + str(metric)] = round(value, 4)

        # Overall scores are always reported, grouped or not.
        overall = self._calculate_metrics(matrix[:, 0], matrix[:, 1])
        for metric, value in zip(self.metrics, overall):
            metric_dict[str(metric)] = round(value, 4)

        return metric_dict

    def _check_args(self):
        """Normalize ``self.metrics`` to a list of lower-case metric names.

        Raises:
            TypeError: ``metrics`` is neither a str nor a list.
            ValueError: a metric name is not a key of ``loss_metrics``.
        """
        if isinstance(self.metrics, str):
            self.metrics = [self.metrics]
        elif not isinstance(self.metrics, list):
            raise TypeError('metrics must be str or list')

        for m in self.metrics:
            if m.lower() not in loss_metrics:
                raise ValueError("There is no loss metric named {}!".format(m))
        self.metrics = [metric.lower() for metric in self.metrics]

    def metrics_info(self, trues, preds):
        """Apply every configured metric function to the given scores.

        Args:
            trues (np.ndarray): the true scores
            preds (np.ndarray): the predicted scores

        Returns:
            list: one result per entry of ``self.metrics``, in order
        """
        return [
            metrics_dict[metric.lower()](trues, preds)
            for metric in self.metrics
        ]

    def _calculate_metrics(self, trues, preds):
        return self.metrics_info(trues, preds)

    def __str__(self):
        names = [loss_metrics[metric.lower()] for metric in self.metrics]
        return ('The Loss Evaluator Info:\n' + '\tMetrics:[' +
                ', '.join(names) + ']')
def get_item_embeddings(self, item_kind):
    """Collect the embeddings of every item of one kind.

    The item features of ``item_kind`` are wrapped in a ``SubSet``, iterated
    with a default ``DataLoader`` and passed through
    ``self.model.get_item_embedding``.

    Args:
        item_kind: key into ``self.dataset.item_feat`` and
            ``self.dataset.item_feat_fields``.

    Returns:
        tuple: three dicts keyed by the batch's ``'item_id'`` values, in the
        order ``(id2emb, id2mapemb, id2rawemb)``, each holding numpy arrays.
    """
    dataset = self.dataset
    item_set = SubSet(dataset.item_feat[item_kind], None, dataset.iid_field,
                      dataset.itype_field, None, None,
                      dataset.item_feat_fields[item_kind])

    id2mapemb, id2emb, id2rawemb = {}, {}, {}
    for batch in DataLoader(item_set):
        if type(batch) is dict:
            for field, tensor in batch.items():
                batch[field] = tensor.to(self.device)

        # The model returns the embeddings as (raw, mapped, final).
        raw, mapped, final = self.model.get_item_embedding(item_kind, batch)

        item_ids = batch['item_id'].cpu().detach().numpy()
        final = final.cpu().detach().numpy()
        mapped = mapped.cpu().detach().numpy()
        raw = raw.cpu().detach().numpy()

        for item_id, map_vec, vec, raw_vec in zip(item_ids, mapped, final,
                                                  raw):
            id2mapemb[item_id] = map_vec
            id2emb[item_id] = vec
            id2rawemb[item_id] = raw_vec

    return id2emb, id2mapemb, id2rawemb
161 | 162 | for i in range(1, len(mods)): 163 | m = mods[i] 164 | if isinstance(m, nn.ReLU): 165 | if m.inplace: 166 | continue 167 | out = m(input_) 168 | out_sizes.append(np.array(out.size())) 169 | input_ = out 170 | 171 | total_nums = 0 172 | for i in range(len(out_sizes)): 173 | s = out_sizes[i] 174 | nums = np.prod(np.array(s)) 175 | total_nums += nums 176 | print('Model {} : intermedite variables: {:3f} M (without backward)'. 177 | format(model._get_name(), total_nums * type_size / 1000 / 1000)) 178 | print( 179 | 'Model {} : intermedite variables: {:3f} M (with backward)'.format( 180 | model._get_name(), total_nums * type_size * 2 / 1000 / 1000)) 181 | -------------------------------------------------------------------------------- /HRec/pipeline/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # ########################### 3 | # File Name: 4 | # Author: geekinglcq 5 | # Mail: lcqgeek@live.com 6 | # Created Time: 2020-12-18 15:25:19 7 | # ########################### 8 | 9 | import numpy as np 10 | from sklearn.metrics import auc as sk_auc 11 | from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error 12 | 13 | # TopK Metrics # 14 | 15 | 16 | def hit_(pos_index, pos_len): 17 | r"""Hit_ (also known as hit ratio at :math:`N`) is a way of calculating how many 'hits' you have 18 | in an n-sized list of ranked items. 19 | 20 | .. _Hit: https://medium.com/@rishabhbhatia315/recommendation-system-evaluation-metrics-3f6739288870 21 | 22 | .. math:: 23 | \mathrm {HR@K} =\frac{Number \space of \space Hits @K}{|GT|} 24 | 25 | :math:`HR` is the number of users with a positive sample in the recommendation list. 26 | :math:`GT` is the total number of samples in the test set. 
def mrr_(pos_index, pos_len):
    r"""The MRR_ (also known as mean reciprocal rank) is a statistic measure for evaluating any process
    that produces a list of possible responses to a sample of queries, ordered by probability of correctness.

    .. _MRR: https://en.wikipedia.org/wiki/Mean_reciprocal_rank

    .. math::
        \mathrm {MRR} = \frac{1}{|{U}|} \sum_{i=1}^{|{U}|} \frac{1}{rank_i}

    :math:`U` is the number of users, :math:`rank_i` is the rank of the first item in the recommendation list
    in the test set results for user :math:`i`.

    Args:
        pos_index (np.ndarray): 2-D hit matrix; entry ``[row, k]`` is
            non-zero when the item ranked ``k`` is a hit for that row.
        pos_len (np.ndarray): unused; kept so that all top-k metrics share
            one signature.

    Returns:
        np.ndarray: float64 matrix shaped like ``pos_index``. Each row is 0
        before its first hit and ``1 / rank`` of that hit from there on; rows
        without any hit are all 0.
    """
    first_hit = pos_index.argmax(axis=1)
    # ``np.float`` was removed in NumPy 1.24; use the explicit dtype.
    result = np.zeros_like(pos_index, dtype=np.float64)
    for row, idx in enumerate(first_hit):
        # argmax yields 0 for an all-zero row, so confirm it is a real hit.
        if pos_index[row, idx] > 0:
            result[row, idx:] = 1 / (idx + 1)
        else:
            result[row, idx:] = 0
    return result
def ndcg_(pos_index, pos_len):
    r"""NDCG_ (also known as normalized discounted cumulative gain) is a measure of ranking quality.
    Through normalizing the score, users and their recommendation list results in the whole test set can be evaluated.

    .. _NDCG: https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG

    .. math::
        \begin{gather}
            \mathrm {DCG@K}=\sum_{i=1}^{K} \frac{2^{rel_i}-1}{\log_{2}{(i+1)}}\\
            \mathrm {IDCG@K}=\sum_{i=1}^{K}\frac{1}{\log_{2}{(i+1)}}\\
            \mathrm {NDCG_u@K}=\frac{DCG_u@K}{IDCG_u@K}\\
            \mathrm {NDCG@K}=\frac{\sum \nolimits_{u \in u^{te}NDCG_u@K}}{|u^{te}|}
        \end{gather}

    :math:`K` stands for recommending :math:`K` items.
    And the :math:`rel_i` is the relevance of the item in position :math:`i` in the recommendation list.
    :math:`2^{rel_i}` equals to 1 if the item hits otherwise 0.
    :math:`U^{te}` is for all users in the test set.

    Args:
        pos_index (np.ndarray): 2-D hit matrix; entry ``[row, k]`` is truthy
            when the item ranked ``k`` is a hit for that row.
        pos_len (np.ndarray): number of relevant items per row.

    Returns:
        np.ndarray: float64 matrix shaped like ``pos_index`` whose column
        ``k`` holds NDCG@(k+1) of each row.
    """
    n_rows, n_ranks = pos_index.shape

    # Discount 1 / log2(rank + 1) for ranks 1..K. Built with an explicit
    # float64 dtype because ``np.float`` was removed in NumPy 1.24.
    discounts = 1.0 / np.log2(np.arange(2, n_ranks + 2, dtype=np.float64))

    # The ideal ranking places all relevant items first, so the ideal DCG
    # stops growing once a row's relevant items are exhausted.
    idcg = np.tile(np.cumsum(discounts), (n_rows, 1))
    idcg_len = np.minimum(pos_len, n_ranks)
    for row, idx in enumerate(idcg_len):
        idcg[row, idx:] = idcg[row, idx - 1]

    dcg = np.cumsum(np.where(pos_index, discounts, 0), axis=1)

    return dcg / idcg
def auc_(trues, preds):
    r"""AUC_ (Area Under Curve) evaluates a two-class model as the area
    under its ROC curve.

    .. _AUC: https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve

    Note:
        This is a global AUC over all predictions: it is neither averaged
        per user nor restricted to a top-k list.

    .. math::
        \mathrm {AUC} = \frac{\sum\limits_{i=1}^M rank_{i}
        - {{M} \times {(M+1)}}} {{M} \times {N}}

    :math:`M` is the number of positive samples.
    :math:`N` is the number of negative samples.
    :math:`rank_i` is the rank of the ith positive sample.

    Args:
        trues (np.ndarray): true labels; entries equal to 1 are positives
        preds (np.ndarray): predicted scores

    Returns:
        The area computed by ``sklearn.metrics.auc`` over the (fpr, tpr)
        points. When there are no negatives (or no positives) the
        corresponding rate is filled with NaN.
    """
    fps, tps = _binary_clf_curve(trues, preds)

    if len(fps) > 2:
        # Keep the end points plus every point where either curve bends;
        # collinear interior points do not change the area.
        bends = np.logical_or(np.diff(fps, 2), np.diff(tps, 2))
        optimal_idxs = np.where(np.r_[True, bends, True])[0]
        fps, tps = fps[optimal_idxs], tps[optimal_idxs]

    # Start both curves at the origin.
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]

    fpr = np.repeat(np.nan, fps.shape) if fps[-1] <= 0 else fps / fps[-1]
    tpr = np.repeat(np.nan, tps.shape) if tps[-1] <= 0 else tps / tps[-1]

    return sk_auc(fpr, tpr)
def log_loss_(trues, preds):
    r"""`Log loss`__, aka logistic loss or cross-entropy loss

    .. __: http://wiki.fast.ai/index.php/Log_Loss

    .. math::
        -\log {P(y_t|y_p)} = -(({y_t}\ \log{y_p}) + {(1-y_t)}\ \log{(1 - y_p)})

    For a single sample, :math:`y_t` is true label in :math:`\{0,1\}`.
    :math:`y_p` is the estimated probability that :math:`y_t = 1`.

    Args:
        trues (np.ndarray): true labels
        preds (np.ndarray): predicted probabilities; converted to float64
            and clipped to ``[1e-15, 1 - 1e-15]`` so the logarithms stay
            finite.

    Returns:
        The summed per-sample loss divided by the number of predictions.
    """
    eps = 1e-15
    probs = np.clip(np.float64(preds), eps, 1 - eps)

    positive_term = -trues * np.log(probs)
    negative_term = (1 - trues) * np.log(1 - probs)
    total = np.sum(positive_term - negative_term)

    return total / len(probs)
# Optimizer names accepted in the ``name`` field of the optimizer config.
opt_map = {
    "Adam": optim.Adam,
    "Adadelta": optim.Adadelta,
    "Adagrad": optim.Adagrad,
    "AdamW": optim.AdamW,
    "RMSprop": optim.RMSprop,
    "SGD": optim.SGD
}


class Optimizer(object):
    """Thin wrapper that builds a torch optimizer (and scheduler) from config.

    Attributes:
        opt: the underlying ``torch.optim`` optimizer.
        adjust_lr: whether a learning-rate scheduler was requested.
        scheduler: the scheduler; only present when ``adjust_lr`` is truthy.
    """

    def __init__(self, config, params):
        """Create the optimizer described by ``config`` for ``params``.

        Args:
            config: mapping with ``name`` (a key of ``opt_map``) and
                ``hyper_params`` (keyword arguments for the optimizer).
                Optional: ``adjust_lr`` and, when it is truthy,
                ``scheduler`` (see :meth:`get_scheduler`).
            params: parameters to optimize, as accepted by torch optimizers.

        Raises:
            KeyError: ``name`` is not in ``opt_map`` or ``hyper_params`` is
                missing.
            TypeError: ``hyper_params`` contains an argument the optimizer
                does not accept.
            ValueError: ``adjust_lr`` is set but no ``scheduler`` is given.
        """
        opt_fn = opt_map[config["name"]]
        try:
            self.opt = opt_fn(params, **config["hyper_params"])
        except TypeError as err:
            # Failing here is clearer than printing a note and crashing
            # later on the missing ``self.opt`` attribute.
            raise TypeError(
                f'Invalid hyper_params for optimizer {config["name"]!r}: '
                f'{err}') from err

        self.adjust_lr = config.get("adjust_lr", False)
        if self.adjust_lr:
            scheduler_config = config.get("scheduler")
            if scheduler_config is None:
                raise ValueError(
                    "'adjust_lr' is set but no 'scheduler' config is given")
            self.scheduler = self.get_scheduler(scheduler_config)

    def get_scheduler(self, config):
        """Build the learning-rate scheduler described by ``config``.

        Args:
            config: mapping with ``name`` and ``hyper_params``. Only
                ``"ReduceLROnPlateau"`` is supported.

        Raises:
            NotImplementedError: for any other scheduler name.
        """
        if config["name"] == "ReduceLROnPlateau":
            return optim.lr_scheduler.ReduceLROnPlateau(
                self.opt, **config["hyper_params"])
        else:
            # TODO: Other schedulers
            raise NotImplementedError

    def zero_grad(self):
        """Clear the gradients of all optimized parameters."""
        self.opt.zero_grad()

    def step(self):
        """Perform a single optimization step."""
        self.opt.step()
coding:utf-8 -*- 2 | import json 3 | import os 4 | import time 5 | import logging 6 | from datetime import datetime 7 | from collections import defaultdict 8 | 9 | import torch 10 | from tqdm import tqdm 11 | 12 | from ..datasets import DataSet 13 | from ..models import model_map, ModelType 14 | from .evaluator import Evaluator 15 | from .optimizer import Optimizer 16 | from .utils import get_free_gpu, EarlyStopping 17 | 18 | 19 | class Process(object): 20 | def __init__(self, config): 21 | self.config = config 22 | self._path_config(config['path']) 23 | self._logger_config() 24 | self._set_device(config) 25 | self._prepare_data(config['data']) 26 | self._prepare_model(config['model']) 27 | self._prepare_optimizer(config['opt'], self.model.parameters()) 28 | self._prepare_evaluator(config) 29 | 30 | def _set_device(self, config): 31 | device = config.get('device', None) 32 | if device is not None: 33 | self.device = device 34 | else: 35 | device_list = get_free_gpu(mode="memory", memory_need=5000) 36 | if len(device_list) < 1: 37 | raise ValueError("No GPU available now.") 38 | else: 39 | self.device = f'cuda:{device_list[0]}' 40 | self.logger.info(f'Use device {self.device}') 41 | config['data']['device'] = self.device 42 | config['model']['device'] = self.device 43 | 44 | def _logger_config(self): 45 | """ 46 | Set the logger 47 | """ 48 | model_name = self.config['model']['name'] 49 | data_name = self.config['data']['name'] 50 | if self.config['data'].get('single', False): 51 | single_type = self.config['data'].get('single_type') 52 | data_name = f'{data_name}_{single_type}' 53 | logfile_name = os.path.join(self.log_path, 54 | f'{model_name}-{data_name}.log') 55 | fmt = "%(asctime)-15s %(levelname)s %(message)s" 56 | filedatefmt = "%a %d %b %Y %H:%M:%S" 57 | fileformatter = logging.Formatter(fmt, filedatefmt) 58 | 59 | sdatefmt = "%d %b %H:%M" 60 | sformatter = logging.Formatter(fmt, sdatefmt) 61 | 62 | fh = logging.FileHandler(logfile_name) 63 | 
fh.setLevel(logging.INFO) 64 | fh.setFormatter(fileformatter) 65 | sh = logging.StreamHandler() 66 | sh.setLevel(logging.INFO) 67 | sh.setFormatter(sformatter) 68 | logging.basicConfig(level=logging.INFO, handlers=[fh, sh]) 69 | self.logger = logging.getLogger() 70 | 71 | def _prepare_model(self, model_config): 72 | name = model_config['name'] 73 | self.model = model_map[name](model_config, self.dataset) 74 | self.model.to(self.device) 75 | 76 | def _prepare_data(self, data_config): 77 | self.dataset = DataSet(data_config) 78 | self.LABEL = self.dataset.config['LABEL_FIELD'] 79 | self.single = self.dataset.single 80 | 81 | def _prepare_optimizer(self, opt_config, params): 82 | self.opt = Optimizer(opt_config, params) 83 | self.start_epoch = 0 84 | self.best_val_score = -1 85 | self.epochs = opt_config['epochs'] 86 | self.eval_step = opt_config['eval_step'] 87 | self.save_step = opt_config['save_step'] 88 | self.train_loss_dict = {} 89 | self.val_loss_dict = {} 90 | if 'early_stop' in opt_config.keys(): 91 | self.early_stop = True 92 | config = opt_config['early_stop'] 93 | self.eval_metric = config.get('metric', 'auc') 94 | self.eval_mode = config.get('mode', 'max') 95 | self.stop_step = config.get('stop_step', 5) 96 | else: 97 | self.early_stop = False 98 | 99 | def _prepare_evaluator(self, config): 100 | self.evaluator = Evaluator(config) 101 | 102 | def train_one_epoch(self, data_loader=None): 103 | """Train one epoch using given data""" 104 | if data_loader is None: 105 | data_loader = self.dataset.train_data_loader 106 | 107 | m = self.model 108 | loss_fn = self.model.calculate_loss 109 | 110 | m.train() 111 | losses = None 112 | opt = self.opt.opt 113 | if hasattr(self.opt, 'scheduler'): 114 | pass 115 | # TODO: scheduler 116 | # sch = self.opt.scheduler 117 | else: 118 | pass 119 | # sch = None 120 | for idx, data in tqdm(enumerate(data_loader), total=len(data_loader)): 121 | if type(data) is dict: 122 | for key, value in data.items(): 123 | data[key] = 
value.to(self.device) 124 | opt.zero_grad() 125 | loss = loss_fn(data) 126 | loss.backward() 127 | opt.step() 128 | losses = loss.item() if losses is None else losses + loss.item() 129 | return losses 130 | 131 | def validate(self, data_loader=None): 132 | """ 133 | Run model in validation dataset and calculate the 134 | score using evaluator. 135 | Return: 136 | result: a dict store metrics name-value pair. 137 | """ 138 | if data_loader is None: 139 | data_loader = self.dataset.val_data_loader 140 | m = self.model 141 | m.eval() 142 | 143 | batch_matrix_list = [] 144 | for idx, data in tqdm(enumerate(data_loader), total=len(data_loader)): 145 | if type(data) is dict: 146 | for key, value in data.items(): 147 | data[key] = value.to(self.device) 148 | pred = m.predict(data) 149 | batch_matrix = self.evaluator.collect(data, pred) 150 | batch_matrix_list.append(batch_matrix) 151 | 152 | if self.single: 153 | result = self.evaluator.evaluate(batch_matrix_list, groupby=False) 154 | else: 155 | result = self.evaluator.evaluate(batch_matrix_list, groupby=True) 156 | return result 157 | 158 | def test(self, data_loader=None): 159 | 160 | if data_loader is None: 161 | data_loader = self.dataset.test_data_loader 162 | return self.validate(data_loader=data_loader) 163 | 164 | def save_checkpoint(self, epoch, name='last', path=None): 165 | if path is None: 166 | path = self.ckp_path 167 | state = { 168 | 'epoch': epoch, 169 | 'state_dict': self.model.state_dict(), 170 | 'optimizer': self.opt.opt.state_dict() 171 | } 172 | if name == 'last': 173 | file_name = os.path.join(self.ckp_path, 174 | f'{name}-{epoch}-model.pth') 175 | else: 176 | file_name = os.path.join(self.ckp_path, f'{name}-model.pth') 177 | self.last_model_path = file_name 178 | torch.save(state, file_name) 179 | if name == 'best': 180 | self.best_ckp_path = file_name 181 | elif name == 'last': 182 | self.last_ckp_path = file_name 183 | 184 | def load_checkpoint(self, file_name=None, mode=None): 185 | if 
file_name is None: 186 | if mode == 'last': 187 | file_name = self.last_model_path 188 | elif mode == 'best': 189 | file_name = self.best_model_path 190 | else: 191 | raise ValueError("No checkpoint path provided.") 192 | ckp = torch.load(file_name) 193 | self.start_epoch = ckp['epoch'] + 1 194 | self.model.load_state_dict(ckp['state_dict']) 195 | self.opt.opt.load_state_dict(ckp['optimizer']) 196 | self.logger.info(f"Load ckp from {file_name}.") 197 | 198 | def fit(self, 199 | train_data=None, 200 | val_data=None, 201 | test_data=None, 202 | verbose=True): 203 | 204 | if self.model.model_type == ModelType.CONTEXT: 205 | self.dataset.join_interaction() 206 | self.dataset.train_val_test_split(context=True) 207 | elif self.model.model_type == ModelType.HETERO: 208 | self.dataset.join_interaction() 209 | self.dataset.train_val_test_split() 210 | else: 211 | self.dataset.train_val_test_split(context=False) 212 | 213 | batch_size = self.config['opt'].get('batch_size', 256) 214 | num_workers = self.dataset.config.get('num_workers', 2) 215 | self.dataset.init_data_loader(batch_size=batch_size, 216 | num_workers=num_workers) 217 | for epoch_idx in range(self.start_epoch, self.epochs): 218 | 219 | # Train 220 | st = time.time() 221 | train_loss = self.train_one_epoch(train_data) 222 | self.train_loss_dict[epoch_idx] = train_loss 223 | ed = time.time() 224 | 225 | if verbose: 226 | if type(train_loss) is dict or type(train_loss) is defaultdict: 227 | train_loss = '\t'.join( 228 | [f'{k}: {v}' for k, v in train_loss.items()]) 229 | self.logger.info( 230 | f'[TRAIN] Epoch: {epoch_idx} cost time: {ed - st:.1f}, train loss: {train_loss}' 231 | ) 232 | 233 | # Eval 234 | if not ((epoch_idx + 1) % self.eval_step): 235 | st = time.time() 236 | result = self.validate(val_data) 237 | ed = time.time() 238 | self.logger.info( 239 | f'[EVAL] Epoch: {epoch_idx} cost time: {ed - st:.1f}') 240 | result_str = '[EVAL] ' + '\t'.join( 241 | [f'{k}: {v} ' for k, v in result.items()]) 242 | 
stop_flag, better = EarlyStopping.update( 243 | result, epoch_idx, self.eval_metric, self.eval_mode, 244 | self.stop_step) 245 | self.logger.info(result_str) 246 | 247 | # Save the best model 248 | if better: 249 | self.save_checkpoint(epoch_idx, 'best') 250 | if self.early_stop and stop_flag: 251 | self.logger.info(f'Early Stop in {epoch_idx} epoch. ') 252 | break 253 | 254 | if not ((epoch_idx + 1) % self.save_step): 255 | self.save_checkpoint(epoch_idx, 'last') 256 | 257 | # Test 258 | self.logger.info( 259 | 'Finish training. Start to evaluate in the test set using the best model in val set.' 260 | ) 261 | if hasattr(self, 'best_ckp_path'): 262 | self.load_checkpoint(self.best_ckp_path) 263 | result = self.test(test_data) 264 | result_str = '[TEST] ' + '\t'.join( 265 | [f'{k}: {v:.3f} ' for k, v in result.items()]) 266 | self.logger.info(result_str) 267 | self.config['result'] = result 268 | # Save the result to config file 269 | json.dump(self.config, 270 | open(os.path.join(self.output_path, "config.json"), "w"), 271 | indent='\t') 272 | 273 | def _path_config(self, config): 274 | now = str(datetime.now()).replace(" ", "_").split(".")[0] 275 | model_name = self.config['model']['name'] 276 | data_name = self.config['data']['name'] 277 | output_path = os.path.join(config["output"], 278 | f'{model_name}-{data_name}-{now}') 279 | self.output_path = output_path 280 | if os.path.isdir(output_path): 281 | raise ValueError("Output dir already exist") 282 | else: 283 | os.makedirs(output_path) 284 | # Save config files 285 | json.dump(self.config, 286 | open(os.path.join(output_path, "config.json"), "w"), 287 | indent='\t') 288 | print(f"Config is saved in {output_path}.") 289 | for sub_dir in ["log", "ckp"]: 290 | path = os.path.join(output_path, sub_dir) 291 | if not os.path.exists(path): 292 | os.mkdir(path) 293 | setattr(self, f'{sub_dir}_path', path) 294 | print(f'{sub_dir} is saved in {path}') 295 | 
# -*- coding:utf-8 -*-
# ###########################
# File Name: utils.py
# Author: geekinglcq
# Mail: lcqgeek@live.com
# Created Time: 2020-12-20 21:32:39
# ###########################

import operator

try:
    from gpustat import GPUStatCollection
except ImportError:
    # gpustat is only needed by `get_free_gpu`; keep the rest of this module
    # (e.g. `EarlyStopping`) importable without it.
    GPUStatCollection = None


def get_free_gpu(mode="memory", memory_need=10000) -> list:
    r"""Get free gpu according to mode (process-free or memory-free).

    Args:
        mode (str, optional): memory-free or process-free. Defaults to "memory".
        memory_need (int): The memory you need, used if mode=='memory'.
            Defaults to 10000.

    Returns:
        list: free gpu ids sorted by free memory, largest first.

    Raises:
        AssertionError: if ``mode`` or ``memory_need`` is invalid.
        ImportError: if ``gpustat`` is not installed.
    """
    assert mode in ["memory", "process"], "mode must be 'memory' or 'process'"
    if mode == "memory":
        assert memory_need is not None, \
            "'memory_need' is None, 'memory' mode must give the free memory you want to apply for"
        memory_need = int(memory_need)
        assert memory_need > 0, "'memory_need' you want must be positive"
    if GPUStatCollection is None:
        raise ImportError(
            "gpustat is required to query GPUs; install it with `pip install gpustat`"
        )

    gpu_stats = GPUStatCollection.new_query()
    candidates = []
    for idx, gpu_stat in enumerate(gpu_stats):
        if gpu_check_condition(gpu_stat, mode, memory_need):
            candidates.append((idx, gpu_stat.memory_free))
            print("gpu[{}]: {}MB".format(idx, gpu_stat.memory_free))

    # Stable sort: GPUs with equal free memory keep their index order.
    candidates.sort(key=lambda item: item[1], reverse=True)
    return [idx for idx, _ in candidates]


def gpu_check_condition(gpu_stat, mode, memory_need) -> bool:
    r"""Check gpu is free or not.

    Args:
        gpu_stat (gpustat.core): gpustat to check
        mode (str): memory-free or process-free.
        memory_need (int): The memory you need, used if mode=='memory'

    Returns:
        bool: gpu is free or not. Unknown modes are reported as not free.
    """
    if mode == "memory":
        return gpu_stat.memory_free > memory_need
    if mode == "process":
        # NOTE(review): gpustat appears to report `processes` as None when
        # the driver cannot list them -- confirm; treated as "none reported".
        processes = gpu_stat.processes or ()
        # Match "python", "python3", "python3.7", ...: the reported command
        # presumably is the executable's base name, so an exact comparison
        # with "python" misses most interpreters.
        return not any(
            str(process["command"]).startswith("python")
            for process in processes)
    return False


class EarlyStopping(object):
    """
    The class control the info to decide whether to do earlystopping.

    The state lives on the class itself (not on instances), so it is shared
    by every caller in the process; call ``reset()`` before a new run.
    """
    best_score = None
    best_epoch = None
    steps = 0

    # "Is `a` better than `b`?" for each supported mode.
    _COMPARATORS = {"max": operator.gt, "min": operator.lt}

    @classmethod
    def reset(cls):
        """Forget the best score/epoch and the no-improvement counter."""
        EarlyStopping.best_score = None
        EarlyStopping.best_epoch = None
        EarlyStopping.steps = 0

    @classmethod
    def update(cls, scores, epoch, metric='auc', mode='max', stop_step=5):
        """
        Update current situation after each epoch.

        Args:
            scores: a dict store metric name-value pair
            epoch: index of the epoch that produced ``scores``
            metric: which metric to use in earlystopping
            mode: 'max' or 'min'
            stop_step: if after the given num of epochs, the
                model does not improve then stop the training
        Return:
            stop_flag: if or not stop
            better: if current version is the best
        """
        assert metric in scores
        assert mode in ['max', 'min']

        score = scores[metric]
        # State is written on EarlyStopping explicitly (not `cls`) so that
        # subclasses keep sharing the same counters, as before.
        improved = (EarlyStopping.best_score is None
                    or EarlyStopping._COMPARATORS[mode](
                        score, EarlyStopping.best_score))
        if improved:
            EarlyStopping.best_score = score
            EarlyStopping.best_epoch = epoch
            EarlyStopping.steps = 0
            return False, True

        EarlyStopping.steps += 1
        return EarlyStopping.steps >= stop_step, False


if __name__ == '__main__':
    get_free_gpu()
-------------------------------------------------------------------------------- 1 | Copyright 2021 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |
4 |
5 |

6 | 7 | # HR 8 | 9 | Official implementation of the paper [Deep Unified Representation for Heterogeneous Recommendation](https://arxiv.org/abs/2201.05861). 10 | Accepted by the ACM Web Conference 2022 (WWW '22) 11 | [中文版算法介绍](https://zhuanlan.zhihu.com/p/474148693) 12 | 13 | ## Dataset 14 | In this paper, we use the Douban dataset stored in `data.tar.xz`. 15 | Please uncompress it (`tar -xf data.tar.xz`) and put it in the working directory. 16 | 17 | ## Usage 18 | 19 | 20 | Before running the code, please make sure that you have installed the dependencies. You can install them with 21 | ``` 22 | pip install -r requirements.txt 23 | ``` 24 | Our code is tested on `python 3.7`. 25 | 26 | The next step is to prepare the configuration file. We provide the configurations of our proposed model (DURation) and the baselines in the `configs` folder as examples. To reproduce the results reported in our paper, you just need to change the paths in the configuration to your local paths. 27 | 28 | Then, you can run the program with a single command. Take the DURation model as an example: there is a `duration.json` file in `configs`. 29 | 30 | ``` 31 | python train_hete.py duration 32 | ``` 33 | 34 | It is worth noting that the configuration file must be put in `configs`. To test the homogeneous models, just replace `train_hete.py` with `train_homo.py`. The program will print the results on screen while saving the log under the output path set in the configuration. 35 | 36 | ## Models 37 | 38 | Currently, we support the following models: 39 | 40 | + **DeepMF**(2017): Deep Matrix Factorization Models for Recommender Systems 41 | + **FISM**(2013): FISM: factored item similarity models for top-n recommender systems. 42 | + **NAIS**(2018): NAIS: Neural attentive item similarity model for recommendation.
43 | + **DeepFM**(2017): DeepFM: a factorization-machine based neural network for CTR prediction 44 | + **xDeepFM**(2018): xDeepFM: Combining explicit and implicit feature interactions for recommender systems 45 | + **AFM**(2017): Attentional factorization machines: Learning the weight of feature interactions via attention networks 46 | + **DSSM**(2013): Learning deep structured semantic models for web search using clickthrough data 47 | + **Wide & Deep**(2016): Wide & deep learning for recommender systems 48 | + **AutoInt**(2019): AutoInt: Automatic feature interaction learning via self-attentive neural networks 49 | + **CCCFNet**(2012): Cross-domain collaboration recommendation 50 | + **DDTCDR**(2020): DDTCDR: Deep dual transfer cross domain recommendation 51 | 52 | ## Cite 53 | 54 | ``` 55 | @inproceedings{lu2022deep, 56 | title={Deep Unified Representation for Heterogeneous Recommendation}, 57 | author={Lu, Chengqiang and Yin, Mingyang and Shen, Shuheng and Ji, Luo and Liu, Qi and Yang, Hongxia}, 58 | booktitle={Proceedings of the ACM Web Conference 2022}, 59 | pages={2141--2152}, 60 | year={2022} 61 | } 62 | ``` 63 | -------------------------------------------------------------------------------- /configs/afm.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "data" : { 4 | "name" : "douban", 5 | "USER_ID_FIELD" : "user_id", 6 | "ITEM_ID_FIELD" : "item_id", 7 | "LABEL_FIELD" : "label", 8 | "TYPE_FIELD" : "type", 9 | "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv", 10 | "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv", 11 | "item_feat_path" : "/Your/Local/Path/data/douban/all_item_token.csv", 12 | "feat" : { 13 | 14 | "user_id" : {"type" : "token", "source" : "user"}, 15 | 16 | "item_id" : {"type" : "token", "source" : "item"}, 17 | "type" : {"type" : "token", "source" : "item"}, 18 | "douban_score" : {"type" : "token", "source" : "item"}, 19 | "douban_votes" : {"type" : "token",
"source" : "item"} 20 | }, 21 | "inter_matrix_type" : "01" 22 | }, 23 | "model" : { 24 | "name" : "AFM", 25 | "attention_size" : 25, 26 | "embedding_size" : 10, 27 | "dropout_prob" : 0.3, 28 | "reg_weight" : 2 29 | 30 | }, 31 | "opt" 32 | : { 33 | "name" : "Adam", 34 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 35 | "adjust_lr" : false, 36 | "scheduler" : 37 | {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}}, 38 | "epochs" : 100, 39 | "eval_step" : 2, 40 | "batch_size" : 128, 41 | "save_step" : 5, 42 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 43 | }, 44 | "path" : {"output" : "/Your/Local/Path/output/"}, 45 | "metrics" : ["AUC"] 46 | } 47 | -------------------------------------------------------------------------------- /configs/autoint.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "data" : { 4 | "name" : "douban", 5 | "USER_ID_FIELD" : "user_id", 6 | "ITEM_ID_FIELD" : "item_id", 7 | "LABEL_FIELD" : "label", 8 | "TYPE_FIELD" : "type", 9 | "user_feat_path" : 10 | "/Your/Local/Path/data/douban/all_users.csv", 11 | "inter_feat_path" : 12 | "/Your/Local/Path/data/douban/all_rate.csv", 13 | "item_feat_path" : 14 | "/Your/Local/Path/data/douban/all_item_token.csv", 15 | "feat" : { 16 | 17 | "user_id" : {"type" : "token", "source" : "user"}, 18 | 19 | "item_id" : {"type" : "token", "source" : "item"}, 20 | 21 | "type" : {"type" : "token", "source" : "item"}, 22 | "douban_score" : {"type" : "token", "source" : "item"}, 23 | 24 | "douban_votes" : {"type" : "token", "source" : "item"} 25 | 26 | }, 27 | "inter_matrix_type" : "01" 28 | }, 29 | "model" : { 30 | "name" : "AutoInt", 31 | "embedding_size" : 64, 32 | "attention_size" : 16, 33 | "n_layers": 3, 34 | "num_heads":2, 35 | "dropout_probs": [0.2, 0.2, 0.2], 36 | "mlp_hidden_size": [128,128] 37 | }, 38 | "opt" 39 | : { 40 | "name" : "Adam", 41 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 42 | 
"adjust_lr" : false, 43 | "scheduler" : 44 | {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}}, 45 | "epochs" : 100, 46 | "eval_step" : 2, 47 | "batch_size" : 1024, 48 | "save_step" : 5, 49 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 50 | }, 51 | "path" : {"output" : "/Your/Local/Path/output/"}, 52 | "metrics" : ["AUC"] 53 | } 54 | -------------------------------------------------------------------------------- /configs/cccf.json: -------------------------------------------------------------------------------- 1 | { 2 | "data" : { 3 | "name" : "douban", 4 | "USER_ID_FIELD" : "user_id", 5 | "ITEM_ID_FIELD" : "item_id", 6 | "LABEL_FIELD" : "label", 7 | "TYPE_FIELD" : "type", 8 | "pin_mem" : true, 9 | "type" : [ "book", "music", "movie" ], 10 | "user_feat_path" : 11 | "/Your/Local/Path/data/douban/all_users.csv", 12 | "inter_feat_path" : 13 | "/Your/Local/Path/data/douban/all_rate.csv", 14 | "item_feat_path" : { 15 | "book" : "/Your/Local/Path/data/douban/book_item.csv", 16 | "music" : "/Your/Local/Path/data/douban/music_item.csv", 17 | "movie" : "/Your/Local/Path/data/douban/movie_item.csv" 18 | }, 19 | "feat" : { 20 | 21 | "user_id" : {"type" : "token", "source" : "user"}, 22 | 23 | "item_id" : {"type" : "token", "source" : "item"}, 24 | 25 | "authors" : {"type" : "token", "source" : "item_book"}, 26 | "series" : {"type" : "token", "source" : "item_book"}, 27 | "publisher" : {"type" : "token", "source" : "item_book"}, 28 | "binding" : {"type" : "token", "source" : "item_book"}, 29 | "publish_year" : {"type" : "token", "source" : "item_book"}, 30 | "price" : {"type" : "float", "source" : "item_book"}, 31 | "pages" : {"type" : "float", "source" : "item_book"}, 32 | "book_douban_score" : {"type" : "token", "source" : "item_book"}, 33 | "book_douban_votes" : {"type" : "token", "source" : "item_book"}, 34 | "performer_ids" : {"type" : "token", "source" : "item_music"}, 35 | "style" : {"type" : "token", "source" : "item_music"}, 36 
| "medium" : {"type" : "token", "source" : "item_music"}, 37 | "music_douban_score" : {"type" : "token", "source" : "item_music"}, 38 | "music_douban_votes" : {"type" : "token", "source" : "item_music"}, 39 | "publish_time" : {"type" : "token", "source" : "item_music"}, 40 | "regions" : {"type" : "token", "source" : "item_movie"}, 41 | "genres" : {"type" : "token", "source" : "item_movie"}, 42 | "languages" : {"type" : "token", "source" : "item_movie"}, 43 | "directors" : {"type" : "token", "source" : "item_movie"}, 44 | "year" : {"type" : "token", "source" : "item_movie"}, 45 | "mins" : {"type" : "float", "source" : "item_movie"}, 46 | "movie_douban_score" : {"type" : "token", "source" : "item_movie"}, 47 | "movie_douban_votes" : {"type" : "token", "source" : "item_movie"} 48 | }, 49 | "inter_matrix_type" : "01" 50 | }, 51 | "model" : { 52 | "name" : "CCCF", 53 | "user_emb_size" : 64, 54 | "item_emb_size" : 64, 55 | "token_emb_size" : 32, 56 | "user_hidden_size_list" : [ 64, 64 ], 57 | "item_hidden_size_list" : [ 64, 64 ] 58 | 59 | }, 60 | "opt" 61 | : { 62 | "name" : "Adam", 63 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 64 | "adjust_lr" : false, 65 | "scheduler" : 66 | {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}}, 67 | "epochs" : 100, 68 | "eval_step" : 2, 69 | "batch_size" : 1024, 70 | "save_step" : 5, 71 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 72 | }, 73 | "path" : {"output" : "/Your/Local/Path/output/"}, 74 | "metrics" : ["AUC"] 75 | } 76 | -------------------------------------------------------------------------------- /configs/ddtcdr.json: -------------------------------------------------------------------------------- 1 | { 2 | "data" : { 3 | "name" : "douban", 4 | "USER_ID_FIELD" : "user_id", 5 | "ITEM_ID_FIELD" : "item_id", 6 | "LABEL_FIELD" : "label", 7 | "TYPE_FIELD" : "type", 8 | "pin_mem" : true, 9 | "type" : [ "book", "music", "movie" ], 10 | "user_feat_path" : 11 | 
"/Your/Local/Path/data/douban/all_users.csv", 12 | "inter_feat_path" : 13 | "/Your/Local/Path/data/douban/all_rate.csv", 14 | "item_feat_path" : { 15 | "book" : "/Your/Local/Path/data/douban/book_item.csv", 16 | "music" : "/Your/Local/Path/data/douban/music_item.csv", 17 | "movie" : "/Your/Local/Path/data/douban/movie_item.csv" 18 | }, 19 | "feat" : { 20 | 21 | "user_id" : {"type" : "token", "source" : "user"}, 22 | "item_id" : {"type" : "token", "source" : "item"}, 23 | "authors" : {"type" : "token", "source" : "item_book"}, 24 | "publish_year" : {"type" : "token", "source" : "item_book"}, 25 | "book_douban_score" : {"type" : "token", "source" : "item_book"}, 26 | "book_douban_votes" : {"type" : "token", "source" : "item_book"}, 27 | "performer_ids" : {"type" : "token", "source" : "item_music"}, 28 | "music_douban_score" : {"type" : "token", "source" : "item_music"}, 29 | "music_douban_votes" : {"type" : "token", "source" : "item_music"}, 30 | "publish_time" : {"type" : "token", "source" : "item_music"}, 31 | "directors" : {"type" : "token", "source" : "item_movie"}, 32 | "year" : {"type" : "token", "source" : "item_movie"}, 33 | "movie_douban_score" : {"type" : "token", "source" : "item_movie"}, 34 | "movie_douban_votes" : {"type" : "token", "source" : "item_movie"} 35 | }, 36 | "inter_matrix_type" : "01" 37 | }, 38 | "model" : { 39 | "name" : "DDTCDR", 40 | "latent_dim": 32, 41 | "token_emb_size" : 32, 42 | "layers": [64, 32], 43 | "alpha": 0.03 44 | }, 45 | "opt" 46 | : { 47 | "name" : "Adam", 48 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 49 | "adjust_lr" : false, 50 | "scheduler" : 51 | {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}}, 52 | "epochs" : 100, 53 | "eval_step" : 2, 54 | "batch_size" : 1024, 55 | "save_step" : 5, 56 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 57 | }, 58 | "path" : {"output" : "/Your/Local/Path/output/"}, 59 | "metrics" : ["AUC"] 60 | } 61 | 
-------------------------------------------------------------------------------- /configs/deepfm.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "data" : { 4 | "name" : "douban", 5 | "USER_ID_FIELD" : "user_id", 6 | "ITEM_ID_FIELD" : "item_id", 7 | "LABEL_FIELD" : "label", 8 | "TYPE_FIELD" : "type", 9 | "user_feat_path" : 10 | "/Your/Local/Path/data/douban/all_users.csv", 11 | "inter_feat_path" : 12 | "/Your/Local/Path/data/douban/all_rate.csv", 13 | "item_feat_path" : 14 | "/Your/Local/Path/data/douban/all_item_token.csv", 15 | "feat" : { 16 | 17 | "user_id" : {"type" : "token", "source" : "user"}, 18 | 19 | "item_id" : {"type" : "token", "source" : "item"}, 20 | 21 | "type" : {"type" : "token", "source" : "item"}, 22 | "douban_score" : {"type" : "token", "source" : "item"}, 23 | 24 | "douban_votes" : {"type" : "token", "source" : "item"} 25 | }, 26 | "inter_matrix_type" : "01" 27 | }, 28 | "model" : { 29 | "name" : "DeepFM", 30 | "user_emb_size" : 64, 31 | "item_emb_size" : 64, 32 | "embedding_size" : 64, 33 | "mlp_hidden_size" : [ 128, 128, 128 ], 34 | "dropout_prob" : 0.2 35 | 36 | }, 37 | "opt" 38 | : { 39 | "name" : "Adam", 40 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 41 | "adjust_lr" : false, 42 | "scheduler" : 43 | {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}}, 44 | "epochs" : 100, 45 | "eval_step" : 2, 46 | "batch_size" : 1024, 47 | "save_step" : 5, 48 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 49 | }, 50 | "path" : {"output" : "/Your/Local/Path/output/"}, 51 | "metrics" : ["AUC"] 52 | } 53 | -------------------------------------------------------------------------------- /configs/deepmf.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "data" : { 4 | "name" : "douban", 5 | "USER_ID_FIELD" : "user_id", 6 | "ITEM_ID_FIELD" : "item_id", 7 | "LABEL_FIELD" : "label", 8 | "TYPE_FIELD" : "type", 9 | 
"user_feat_path" : 10 | "/Your/Local/Path/data/douban/all_users.csv", 11 | "inter_feat_path" : 12 | "/Your/Local/Path/data/douban/all_rate.csv", 13 | "item_feat_path" : 14 | "/Your/Local/Path/data/douban/all_item.csv", 15 | "feat" : { 16 | 17 | "user_id" : {"type" : "token", "source" : "user"}, 18 | 19 | "item_id" : {"type" : "token", "source" : "item"}, 20 | 21 | "douban_score" : {"type" : "float", "source" : "item"}, 22 | 23 | "douban_votes" : {"type" : "float", "source" : "item"} 24 | }, 25 | "inter_matrix_type" : "01" 26 | }, 27 | "model" : { 28 | "name" : "DMF", 29 | "user_emb_size" : 64, 30 | "item_emb_size" : 64, 31 | "user_hidden_size_list" : [ 64, 64 ], 32 | "item_hidden_size_list" : [ 64, 64 ] 33 | 34 | }, 35 | "opt" 36 | : { 37 | "name" : "Adam", 38 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 39 | "adjust_lr" : false, 40 | "scheduler" : 41 | {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}}, 42 | "epochs" : 100, 43 | "eval_step" : 2, 44 | "batch_size" : 16, 45 | "save_step" : 5, 46 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 47 | }, 48 | "path" : {"output" : "/Your/Local/Path/output/"}, 49 | "metrics" : ["AUC"] 50 | } 51 | -------------------------------------------------------------------------------- /configs/dssm.json: -------------------------------------------------------------------------------- 1 | { 2 | "data" : { 3 | "name" : "douban", 4 | "USER_ID_FIELD" : "user_id", 5 | "ITEM_ID_FIELD" : "item_id", 6 | "LABEL_FIELD" : "label", 7 | "TYPE_FIELD" : "type", 8 | "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv", 9 | "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv", 10 | "item_feat_path" : "/Your/Local/Path/data/douban/all_item_token.csv", 11 | "feat" : { 12 | "user_id" : {"type" : "token", "source" : "user"}, 13 | "item_id" : {"type" : "token", "source" : "item"}, 14 | "type" : {"type" : "token", "source" : "item"}, 15 | "douban_score" : {"type" : "token", "source" 
: "item"}, 16 | "douban_votes" : {"type" : "token", "source" : "item"} 17 | }, 18 | "inter_matrix_type" : "01" 19 | }, 20 | "model" : { 21 | "name" : "DSSM", 22 | "user_emb_size" : 64, 23 | "item_emb_size" : 64, 24 | "embedding_size" : 10, 25 | "mlp_hidden_size" : [ 256, 256, 256 ], 26 | "dropout_prob" : 0.3, 27 | "double_tower" : true 28 | }, 29 | "opt" 30 | : { 31 | "name" : "Adam", 32 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 33 | "adjust_lr" : false, 34 | "scheduler" : 35 | {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}}, 36 | "epochs" : 100, 37 | "eval_step" : 2, 38 | "batch_size" : 128, 39 | "save_step" : 5, 40 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 41 | }, 42 | "path" : {"output" : "/Your/Local/Path/output/"}, 43 | "metrics" : ["AUC"] 44 | } 45 | -------------------------------------------------------------------------------- /configs/duration.json: -------------------------------------------------------------------------------- 1 | { 2 | "data" : { 3 | "name" : "douban", 4 | "USER_ID_FIELD" : "user_id", 5 | "ITEM_ID_FIELD" : "item_id", 6 | "LABEL_FIELD" : "label", 7 | "TYPE_FIELD" : "type", 8 | "pin_mem" : true, 9 | "type" : [ "book", "music", "movie" ], 10 | "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv", 11 | "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv", 12 | "item_feat_path" : { 13 | "book" : "/Your/Local/Path/data/douban/book_item.csv", 14 | "music" : "/Your/Local/Path/data/douban/music_item.csv", 15 | "movie" : "/Your/Local/Path/data/douban/movie_item.csv" 16 | }, 17 | "feat" : { 18 | 19 | "user_id" : {"type" : "token", "source" : "user"}, 20 | 21 | "item_id" : {"type" : "token", "source" : "item"}, 22 | 23 | "authors" : {"type" : "token", "source" : "item_book"}, 24 | "series" : {"type" : "token", "source" : "item_book"}, 25 | "publisher" : {"type" : "token", "source" : "item_book"}, 26 | "binding" : {"type" : "token", "source" : "item_book"}, 27 | 
"publish_year" : {"type" : "token", "source" : "item_book"}, 28 | "price" : {"type" : "float", "source" : "item_book"}, 29 | "pages" : {"type" : "float", "source" : "item_book"}, 30 | "book_douban_score" : {"type" : "token", "source" : "item_book"}, 31 | "book_douban_votes" : {"type" : "token", "source" : "item_book"}, 32 | "performer_ids" : {"type" : "token", "source" : "item_music"}, 33 | "style" : {"type" : "token", "source" : "item_music"}, 34 | "medium" : {"type" : "token", "source" : "item_music"}, 35 | "music_douban_score" : {"type" : "token", "source" : "item_music"}, 36 | "music_douban_votes" : {"type" : "token", "source" : "item_music"}, 37 | "publish_time" : {"type" : "token", "source" : "item_music"}, 38 | "regions" : {"type" : "token", "source" : "item_movie"}, 39 | "genres" : {"type" : "token", "source" : "item_movie"}, 40 | "languages" : {"type" : "token", "source" : "item_movie"}, 41 | "directors" : {"type" : "token", "source" : "item_movie"}, 42 | "year" : {"type" : "token", "source" : "item_movie"}, 43 | "mins" : {"type" : "float", "source" : "item_movie"}, 44 | "movie_douban_score" : {"type" : "token", "source" : "item_movie"}, 45 | "movie_douban_votes" : {"type" : "token", "source" : "item_movie"} 46 | }, 47 | "inter_matrix_type" : "01" 48 | }, 49 | "model" : { 50 | "name" : "DURation", 51 | "user_emb_size" : 64, 52 | "item_emb_size" : 64, 53 | "token_emb_size" : 32, 54 | "user_hidden_size_list" : [ 64, 64 ], 55 | "item_hidden_size_list" : [ 64, 64 ], 56 | "item_map_hidden_size_list" : [ 256, 64 ], 57 | "mlp_hidden_size" : [ 128, 128, 128 ], 58 | "dropout_prob" : 0.2, 59 | "kernel" : "gaussian", 60 | "align_sample_size" : 128 61 | 62 | }, 63 | "opt" 64 | : { 65 | "name" : "Adam", 66 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 67 | "adjust_lr" : false, 68 | "scheduler" : 69 | {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}}, 70 | "epochs" : 100, 71 | "eval_step" : 2, 72 | "batch_size" : 1024, 73 | "save_step" : 5, 
74 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 75 | }, 76 | "path" : {"output" : "/Your/Local/Path/output/"}, 77 | "metrics" : ["AUC"] 78 | } 79 | -------------------------------------------------------------------------------- /configs/fism.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "data" : { 4 | "name" : "douban", 5 | "USER_ID_FIELD" : "user_id", 6 | "ITEM_ID_FIELD" : "item_id", 7 | "LABEL_FIELD" : "label", 8 | "TYPE_FIELD" : "type", 9 | "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv", 10 | "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv", 11 | "item_feat_path" : "/Your/Local/Path/data/douban/all_item_token.csv", 12 | "feat" : { 13 | 14 | "user_id" : {"type" : "token", "source" : "user"}, 15 | 16 | "item_id" : {"type" : "token", "source" : "item"}, 17 | 18 | "type" : {"type" : "token", "source" : "item"}, 19 | "douban_score" : {"type" : "token", "source" : "item"}, 20 | 21 | "douban_votes" : {"type" : "token", "source" : "item"} 22 | }, 23 | "inter_matrix_type" : "01" 24 | }, 25 | "model" : { 26 | "name" : "FISM", 27 | "embedding_size" : 64, 28 | "split_to" : 0, 29 | "reg_weights" : [ 1e-2, 1e-2 ], 30 | "alpha" : 0 31 | 32 | }, 33 | "opt" 34 | : { 35 | "name" : "Adam", 36 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 37 | "adjust_lr" : false, 38 | "scheduler" : 39 | {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}}, 40 | "epochs" : 100, 41 | "eval_step" : 2, 42 | "batch_size" : 512, 43 | "save_step" : 5, 44 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 45 | }, 46 | "path" : {"output" : "/Your/Local/Path/output/"}, 47 | "metrics" : ["AUC"] 48 | } 49 | -------------------------------------------------------------------------------- /configs/nais.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "data" : { 4 | "name" : "douban", 5 | "USER_ID_FIELD" : 
"user_id", 6 | "ITEM_ID_FIELD" : "item_id", 7 | "LABEL_FIELD" : "label", 8 | "TYPE_FIELD" : "type", 9 | "user_feat_path" : 10 | "/Your/Local/Path/data/douban/all_users.csv", 11 | "inter_feat_path" : 12 | "/Your/Local/Path/data/douban/all_rate.csv", 13 | "item_feat_path" : 14 | "/Your/Local/Path/data/douban/all_item.csv", 15 | "feat" : { 16 | 17 | "user_id" : {"type" : "token", "source" : "user"}, 18 | 19 | "item_id" : {"type" : "token", "source" : "item"}, 20 | 21 | "douban_score" : {"type" : "token", "source" : "item"}, 22 | 23 | "douban_votes" : {"type" : "token", "source" : "item"} 24 | }, 25 | "inter_matrix_type" : "01" 26 | }, 27 | "model" : { 28 | "name" : "NAIS", 29 | "algorithm": "prod", 30 | 31 | "embedding_size" : 64, 32 | "weight_size": 64, 33 | "split_to": 0, 34 | "reg_weights": [1e-7, 1e-7, 1e-5], 35 | "alpha": 0, 36 | "beta": 0.5, 37 | "pretrain_path" : null 38 | }, 39 | "opt" 40 | : { 41 | "name" : "Adam", 42 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 43 | "adjust_lr" : false, 44 | "scheduler" : 45 | {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}}, 46 | "epochs" : 100, 47 | "eval_step" : 2, 48 | "batch_size" : 1024, 49 | "save_step" : 5, 50 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 51 | }, 52 | "path" : {"output" : "/Your/Local/Path/output/"}, 53 | "metrics" : ["AUC"] 54 | } 55 | -------------------------------------------------------------------------------- /configs/widedeep.json: -------------------------------------------------------------------------------- 1 | { 2 | "data" : { 3 | "name" : "douban", 4 | "USER_ID_FIELD" : "user_id", 5 | "ITEM_ID_FIELD" : "item_id", 6 | "LABEL_FIELD" : "label", 7 | "TYPE_FIELD" : "type", 8 | "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv", 9 | "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv", 10 | "item_feat_path" : "/Your/Local/Path/data/douban/all_item_token.csv", 11 | "feat" : { 12 | "user_id" : {"type" : "token", 
"source" : "user"}, 13 | "item_id" : {"type" : "token", "source" : "item"}, 14 | "type" : {"type" : "token", "source" : "item"}, 15 | "douban_score" : {"type" : "token", "source" : "item"}, 16 | "douban_votes" : {"type" : "token", "source" : "item"} 17 | }, 18 | "inter_matrix_type" : "01" 19 | }, 20 | "model" : { 21 | "name" : "WideDeep", 22 | "user_emb_size" : 64, 23 | "item_emb_size" : 64, 24 | "embedding_size" : 64, 25 | "mlp_hidden_size" : [ 256, 64, 8 ], 26 | "dropout_prob" : 0.1 27 | }, 28 | "opt" 29 | : { 30 | "name" : "Adam", 31 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 32 | "adjust_lr" : false, 33 | "scheduler" : 34 | {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}}, 35 | "epochs" : 100, 36 | "eval_step" : 2, 37 | "batch_size" : 1024, 38 | "save_step" : 5, 39 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 40 | }, 41 | "path" : {"output" : "/Your/Local/Path/output/"}, 42 | "metrics" : ["AUC"] 43 | } 44 | -------------------------------------------------------------------------------- /configs/xdeepfm.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "data" : { 4 | "name" : "douban", 5 | "USER_ID_FIELD" : "user_id", 6 | "ITEM_ID_FIELD" : "item_id", 7 | "LABEL_FIELD" : "label", 8 | "TYPE_FIELD" : "type", 9 | "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv", 10 | "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv", 11 | "item_feat_path" : "/Your/Local/Path/data/douban/all_item_token.csv", 12 | "feat" : { 13 | 14 | "user_id" : {"type" : "token", "source" : "user"}, 15 | 16 | "item_id" : {"type" : "token", "source" : "item"}, 17 | 18 | "type" : {"type" : "token", "source" : "item"}, 19 | "douban_score" : {"type" : "token", "source" : "item"}, 20 | 21 | "douban_votes" : {"type" : "token", "source" : "item"} 22 | 23 | }, 24 | "inter_matrix_type" : "01" 25 | }, 26 | "model" : { 27 | "name" : "xDeepFM", 28 | "user_emb_size" : 64, 29 | 
"item_emb_size" : 64, 30 | "embedding_size" : 10, 31 | "mlp_hidden_size" : [ 128, 128, 128 ], 32 | "reg_weight" : 0.0005, 33 | "dropout_prob" : 0.2, 34 | "direct" : false, 35 | "cin_layer_size" : [ 100, 100, 100 ] 36 | }, 37 | "opt" 38 | : { 39 | "name" : "Adam", 40 | "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01}, 41 | "adjust_lr" : false, 42 | "epochs" : 100, 43 | "eval_step" : 2, 44 | "batch_size" : 1024, 45 | "save_step" : 5, 46 | "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"} 47 | }, 48 | "path" : {"output" : "/Your/Local/Path/output/"}, 49 | "metrics" : ["AUC"] 50 | } 51 | -------------------------------------------------------------------------------- /data.tar.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekinglcq/HRec/f13a685dd593154d4887ed18bd444e588484d014/data.tar.xz -------------------------------------------------------------------------------- /framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geekinglcq/HRec/f13a685dd593154d4887ed18bd444e588484d014/framework.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Automatically generated by https://github.com/damnever/pigar. 
# -*- coding:utf-8 -*-
"""Train and evaluate a homogeneous model: ``python train_homo.py <model>``.

``<model>`` is the base name of a JSON configuration file in ``./configs``.
"""

import argparse
import json
import os


def load_config(model_name, config_dir="./configs"):
    """Return the parsed JSON configuration ``<config_dir>/<model_name>.json``.

    Args:
        model_name (str): base name of the configuration file, without
            the ``.json`` suffix.
        config_dir (str): directory that holds the configuration files.

    Returns:
        The object decoded from the JSON file.
    """
    config_path = os.path.join(config_dir, "%s.json" % model_name)
    # The context manager closes the handle once the file is parsed.
    with open(config_path) as config_file:
        return json.load(config_file)


def main(argv=None):
    """Parse the command line, build the pipeline and run ``fit``.

    Args:
        argv (list, optional): arguments to parse instead of ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('model', type=str)

    args = parser.parse_args(argv)
    model_name = args.model
    print(args.model)
    config = load_config(model_name)

    # Imported here so that argument errors and `--help` are reported before
    # the training package is loaded.
    from HRec import pipeline

    p = pipeline.Process(config)
    p.fit()
    # NOTE: the stray top-level `break` that followed `p.fit()` was removed;
    # it made the script fail to compile ("'break' outside loop").


if __name__ == '__main__':
    main()