├── HRec
    ├── __init__.py
    ├── datasets
    │   ├── __init__.py
    │   ├── dataset.py
    │   ├── enum_type.py
    │   └── hdataset.py
    ├── models
    │   ├── __init__.py
    │   ├── afm.py
    │   ├── autoint.py
    │   ├── base.py
    │   ├── cccf.py
    │   ├── dcn.py
    │   ├── ddtcdr.py
    │   ├── deepfm.py
    │   ├── deepmf.py
    │   ├── dssm.py
    │   ├── duration.py
    │   ├── fism.py
    │   ├── layers.py
    │   ├── model_map.py
    │   ├── nais.py
    │   ├── ncf.py
    │   ├── nfm.py
    │   ├── utils.py
    │   ├── widedeep.py
    │   └── xdeepfm.py
    └── pipeline
    │   ├── __init__.py
    │   ├── configure.py
    │   ├── dprocess.py
    │   ├── evaluator.py
    │   ├── hprocess.py
    │   ├── metrics.py
    │   ├── optimizer.py
    │   ├── process.py
    │   └── utils.py
├── LICENSE
├── README.md
├── configs
    ├── afm.json
    ├── autoint.json
    ├── cccf.json
    ├── ddtcdr.json
    ├── deepfm.json
    ├── deepmf.json
    ├── dssm.json
    ├── duration.json
    ├── fism.json
    ├── nais.json
    ├── widedeep.json
    └── xdeepfm.json
├── data.tar.xz
├── framework.png
├── requirements.txt
├── train_hete.py
└── train_homo.py


/HRec/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekinglcq/HRec/f13a685dd593154d4887ed18bd444e588484d014/HRec/__init__.py


--------------------------------------------------------------------------------
/HRec/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .dataset import DataSet, SubSet
2 | from .hdataset import HDataSet
3 | from .enum_type import FeatureSource
4 | 


--------------------------------------------------------------------------------
/HRec/datasets/enum_type.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | 
 4 | class FeatureSource(Enum):
 5 |     """Source of features.
 6 | 
 7 |     - ``INTERACTION``: Features from ``.inter``
 8 |     - ``USER``: Features from ``.user`` (other than ``user_id``).
 9 |     - ``ITEM``: Features from ``.item`` (other than ``item_id``).
10 |     - ``USER_ID``: ``user_id`` feature in ``inter_feat`` and ``user_feat``.
11 |     - ``ITEM_ID``: ``item_id`` feature in ``inter_feat`` and ``item_feat``.
12 |     """
13 | 
14 |     INTERACTION = 'inter'
15 |     USER = 'user'
16 |     ITEM = 'item'
17 |     USER_ID = 'user_id'
18 |     ITEM_ID = 'item_id'
19 | 
20 | 
# Numeric (float) code assigned to each supported item type name.
item_type_dict = {'book': 0.0, 'music': 1.0, 'movie': 2.0}
22 | 


--------------------------------------------------------------------------------
/HRec/datasets/hdataset.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: hdataset.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2020-12-28 20:17:47
  7 | # ###########################
  8 | 
  9 | import pandas as pd
 10 | 
 11 | import os
 12 | import logging
 13 | from collections import defaultdict
 14 | from torch.utils.data import DataLoader, Dataset
 15 | from .enum_type import FeatureSource as FS
 16 | from .enum_type import item_type_dict
 17 | from .dataset import DataSet, SubSet
 18 | 
 19 | 
 20 | class HDataSet(DataSet):
 21 |     """
 22 |     Dataset used for heterogenous items
 23 |     """
 24 |     def __init__(self, config, restore_path=None):
 25 |         self.config = config
 26 |         self._init_setting()
 27 |         if restore_path is None:
 28 |             self._load_feats()
 29 |         else:
 30 |             # TODO
 31 |             pass
 32 |         self._preprocessing()
 33 | 
 34 |     def _load_feats(self):
 35 |         self.user_feat = self._load_meta_feats(self.config["user_feat_path"],
 36 |                                                FS.USER, "user_id")
 37 |         self.item_feat = self._load_item_feats(self.config["item_feat_path"],
 38 |                                                FS.ITEM)
 39 |         self.inter_feat = pd.read_csv(self.config["inter_feat_path"]).sample(
 40 |             frac=1, random_state=28)
 41 |         mask = None
 42 |         if len(self.types) < 3:
 43 |             for item_type, item_feat in self.item_feat.items():
 44 |                 new_mask = self.inter_feat[self.iid_field].isin(
 45 |                     item_feat[self.iid_field])
 46 |                 if mask is not None:
 47 |                     mask = mask | new_mask
 48 |                 else:
 49 |                     mask = new_mask
 50 |             self.inter_feat = self.inter_feat[mask]
 51 |         self.h_inter_feat = {}
 52 |         self.user_num = len(self.user_feat)
 53 |         self.item_num = sum([len(i) for i in self.item_feat.values()])
 54 |         self.item_nums = {k: len(v) for k, v in self.item_feat.items()}
 55 |         print(f'user num: {self.user_num}')
 56 |         print(f'item num: {self.item_num}')
 57 |         print(f'item nums: {self.item_nums}')
 58 | 
 59 |     def _preprocessing(self):
 60 |         self._normalize()
 61 |         if len(self.types) < 3:
 62 |             self._reID(self.iid_field)
 63 |             self._reID(self.uid_field)
 64 | 
 65 |     def _load_item_feats(self, paths, source):
 66 |         item_feat = {}
 67 |         for item_type, item_path in paths.items():
 68 |             if item_type not in self.types:
 69 |                 continue
 70 |             if os.path.isfile(item_path):
 71 |                 feat = pd.read_csv(item_path)
 72 |                 item_feat[item_type] = feat
 73 |             else:
 74 |                 raise ValueError("Dataset file not fountd.")
 75 |         return item_feat
 76 | 
 77 |     def _init_setting(self):
 78 |         self.logger = logging.getLogger()
 79 |         self.name = self.config['name']
 80 |         print(self.config)
 81 |         self.uid_field = self.config["USER_ID_FIELD"]
 82 |         self.iid_field = self.config["ITEM_ID_FIELD"]
 83 |         self.label_field = self.config["LABEL_FIELD"]
 84 |         self.itype_field = self.config["TYPE_FIELD"]
 85 |         self.types = self.config["type"]
 86 |         self.field2type = {}
 87 |         self.field2source = {}
 88 |         self.field2id_token = defaultdict(dict)
 89 |         self.field2token_id = defaultdict(dict)
 90 |         self.user_feat_fields = []
 91 |         self.item_feat_fields = defaultdict(list)
 92 | 
 93 |         for feat_name, feat_value in self.config['feat'].items():
 94 |             source = feat_value['source']
 95 |             self.field2type[feat_name] = feat_value['type']
 96 |             self.field2source[feat_name] = feat_value['source']
 97 |             if source == 'user' and feat_name != self.uid_field:
 98 |                 self.user_feat_fields.append(feat_name)
 99 |             if source.startswith('item') and feat_name != self.iid_field:
100 |                 item_type = source.split("_")[1]
101 |                 if item_type in self.types:
102 |                     self.item_feat_fields[item_type].append(feat_name)
103 | 
104 |     def num(self, field):
105 | 
106 |         if field == self.uid_field:
107 |             return self.user_num
108 |         if field == self.iid_field:
109 |             return self.item_num
110 |         if field not in self.field2type:
111 |             raise ValueError('field {} not in dataset'.format(field))
112 |         # if field not in self.field2token_id:
113 |         # raise ValueError('field {} is not token type'.format(field))
114 |         if len(self.field2token_id[field]) == 0:
115 |             if field in self.user_feat_fields:
116 |                 return len(self.user_feat[field].unique())
117 |             else:
118 |                 for item_type, item_feat_fields in self.item_feat_fields.items(
119 |                 ):
120 |                     if field in item_feat_fields:
121 |                         return len(self.item_feat[item_type][field].unique())
122 |         return len(self.field2token_id[field])
123 | 
124 |     def _reID(self, field):
125 |         """
126 |         Re-ID the token-type feature, save the id map in self.field2token_id
127 |         """
128 |         self.logger.info(f'ReID field {field}.')
129 |         ftype = self.field2type.get(field)
130 |         assert ftype == 'token'
131 |         source = self.field2source.get(field)
132 |         if type(source) is str and source.startswith("item_"):
133 |             item_type = source.split("_")[1]
134 |             dataframe = self.item_feat[item_type]
135 |         elif source is FS.ITEM_ID or source == "item":
136 |             dataframe = pd.concat(list(self.item_feat.values()), join='inner')
137 |         elif source == 'user' or source is FS.USER_ID:
138 |             dataframe = self.user_feat
139 |         else:
140 |             dataframe = self.inter_feat
141 |         id_map = {v: k for k, v in enumerate(dataframe[field].unique())}
142 |         self.field2token_id[field].update(id_map)
143 |         dataframe[field] = dataframe[field].map(id_map)
144 |         if source in ['item', 'user', FS.ITEM_ID, FS.USER_ID]:
145 |             if field in self.inter_feat:
146 |                 self.inter_feat[field] = self.inter_feat[field].map(id_map)
147 |             for item_type, item_feat in self.item_feat.items():
148 |                 if field in item_feat:
149 |                     item_feat[field] = item_feat[field].map(id_map)
150 | 
151 |     def join(self, df):
152 |         """
153 |         Join user/item features to interactions.
154 |         """
155 |         if self.user_feat is not None and self.uid_field in df:
156 |             df = pd.merge(df,
157 |                           self.user_feat,
158 |                           on=self.uid_field,
159 |                           how='left',
160 |                           suffixes=('_inter', '_user'))
161 |         if self.item_feat is not None and self.iid_field in df:
162 |             for item_type, item_feat in self.item_feat.items():
163 |                 df = pd.merge(df,
164 |                               item_feat,
165 |                               on=self.iid_field,
166 |                               how='left',
167 |                               suffixes=(f'_{item_type}', '_inter'))
168 |             type_c = [i for i in df.columns if i.startswith(self.itype_field)]
169 |             df[self.itype_field] = df[type_c].agg(sum, axis=1)
170 |         return df
171 | 
172 |     def join_interaction(self):
173 |         self.inter_feat = self.join(self.inter_feat)
174 |         if 'sample' in self.config:
175 |             sample_ratio = self.config['sample']
176 |             sampled = []
177 |             for kind in self.types:
178 |                 ratio = sample_ratio.get(kind, 1.0)
179 |                 kind_id = item_type_dict[kind]
180 |                 # preverse the data for val & test
181 |                 new_df = self.inter_feat[self.inter_feat['type'] ==
182 |                                          kind_id].sample(frac=ratio * 0.7 +
183 |                                                          0.3,
184 |                                                          random_state=16)
185 |                 print(kind, kind_id, ratio, new_df.shape)
186 |                 sampled.append(new_df)
187 |             self.inter_feat = pd.concat(sampled, ignore_index=True)
188 |             self.inter_feat = self.inter_feat.sample(frac=1.).reset_index(
189 |                 drop=True)
190 | 
191 |     def train_val_test_split(self,
192 |                              ratios=[0.7, 0.2, 0.1],
193 |                              group_by=None,
194 |                              **kwargs):
195 |         assert len(ratios) == 3
196 |         if 'sample' in self.config:
197 |             train, val, test = self.split_by_ratio_sampled(
198 |                 ratios, create_new_dataset=False)
199 |         else:
200 |             train, val, test = self.split_by_ratio(ratios,
201 |                                                    group_by=group_by,
202 |                                                    create_new_dataset=False)
203 |         user_fs = self.user_feat_fields
204 |         item_fs = self.item_feat_fields
205 |         type_field = self.itype_field
206 |         self.train_inter_subset = {}
207 |         self.val_inter_subset = {}
208 |         self.test_inter_subset = {}
209 |         for item_type in self.types:
210 |             item_type_id = item_type_dict[item_type]
211 |             self.train_inter_subset[item_type] = SubSet(
212 |                 train[train[type_field] == item_type_id], self.uid_field,
213 |                 self.iid_field, self.itype_field, self.label_field, user_fs,
214 |                 item_fs[item_type])
215 |             self.val_inter_subset[item_type] = SubSet(
216 |                 val[val[type_field] == item_type_id], self.uid_field,
217 |                 self.iid_field, self.itype_field, self.label_field, user_fs,
218 |                 item_fs[item_type])
219 |             self.test_inter_subset[item_type] = SubSet(
220 |                 test[test[type_field] == item_type_id], self.uid_field,
221 |                 self.iid_field, self.itype_field, self.label_field, user_fs,
222 |                 item_fs[item_type])
223 |         self.all_inter_feat = self.inter_feat
224 |         self.logger.info(
225 |             "Replace interaction features with train interaction fatures.")
226 |         self.logger.info(
227 |             "Interaction features are stored in self.all_inter_feat")
228 |         self.inter_feat = train
229 | 
230 |     def init_data_loader(self, batch_size=256, num_workers=1):
231 |         self.train_data_loader = {}
232 |         self.val_data_loader = {}
233 |         self.test_data_loader = {}
234 |         for item_type in self.types:
235 |             self.train_data_loader[item_type] = DataLoader(
236 |                 self.train_inter_subset[item_type],
237 |                 batch_size=batch_size,
238 |                 # pin_memory=True,
239 |                 num_workers=num_workers)
240 |             self.val_data_loader[item_type] = DataLoader(
241 |                 self.val_inter_subset[item_type],
242 |                 batch_size=batch_size,
243 |                 num_workers=num_workers)
244 |             self.test_data_loader[item_type] = DataLoader(
245 |                 self.test_inter_subset[item_type],
246 |                 batch_size=batch_size,
247 |                 num_workers=num_workers)
248 | 
249 | 
class HSubSet(Dataset):
    """Holds one interaction dataframe per item type.

    NOTE(review): no ``__getitem__`` is defined in this class, so it only
    reports a length for now.
    """
    def __init__(self, dataframes, uid_field, iid_field, label_field,
                 u_feat_fields, i_feat_fields):
        """
        Args:
            dataframes: mapping ``item_type -> dataframe``.
            uid_field: name of the user id column.
            iid_field: name of the item id column.
            label_field: name of the label column.
            u_feat_fields: user feature field names.
            i_feat_fields: item feature field names.
        """
        self.types = dataframes.keys()
        self.dfs = dataframes
        self.uid = uid_field
        self.iid = iid_field
        self.label = label_field
        # Keep the feature field lists instead of silently dropping them.
        self.u_feat_fields = u_feat_fields
        self.i_feat_fields = i_feat_fields

    def __len__(self):
        """Return the row count of the smallest dataframe (0 when empty)."""
        # Iterate the dataframes themselves; iterating the mapping directly
        # would yield its keys.
        return min((len(df.index) for df in self.dfs.values()), default=0)
261 | 


--------------------------------------------------------------------------------
/HRec/models/__init__.py:
--------------------------------------------------------------------------------
 1 | from .deepfm import DeepFM
 2 | from .deepmf import DMF
 3 | from .fism import FISM
 4 | from .ncf import NCF
 5 | from .xdeepfm import xDeepFM
 6 | from .dssm import DSSM
 7 | from .afm import AFM
 8 | from .dcn import DCN
 9 | from .widedeep import WideDeep
10 | from .nais import NAIS
11 | from .cccf import CCCFNet
12 | from .ddtcdr import DDTCDR
13 | from .autoint import AutoInt
14 | from .duration import DURation
15 | 
16 | from .utils import ModelType
17 | 
# Registry mapping a model name string to the model class imported above.
model_map = {
    # General Model
    "DMF": DMF,
    "FISM": FISM,
    "NCF": NCF,
    # Context Model
    "DeepFM": DeepFM,
    "xDeepFM": xDeepFM,
    "DCN": DCN,
    "AFM": AFM,
    "DSSM": DSSM,
    "WideDeep": WideDeep,
    "NAIS": NAIS,
    "AutoInt": AutoInt,
    # Heterogeneous Model
    "CCCF": CCCFNet,
    "DDTCDR": DDTCDR,
    "DURation": DURation,
}
37 | 


--------------------------------------------------------------------------------
/HRec/models/afm.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: afm.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2020-12-31 17:31:43
  7 | # ###########################
  8 | r"""
  9 | AFM
 10 | ################################################
 11 | Reference:
 12 |     Jun Xiao et al. "Attentional Factorization Machines: Learning the Weight of Feature Interactions via
 13 |     Attention Networks" in IJCAI 2017.
 14 | """
 15 | 
 16 | import torch
 17 | import torch.nn as nn
 18 | from torch.nn.init import xavier_normal_, constant_
 19 | 
 20 | from .layers import AttLayer
 21 | from .base import ContextModel
 22 | 
 23 | 
 24 | class AFM(ContextModel):
 25 |     """ AFM is a attention based FM model that predict the final score with the attention of input feature.
 26 | 
 27 |     """
 28 |     def __init__(self, config, dataset):
 29 |         super(AFM, self).__init__(config, dataset)
 30 | 
 31 |         # load parameters info
 32 |         self.attention_size = config['attention_size']
 33 |         self.dropout_prob = config['dropout_prob']
 34 |         self.reg_weight = config['reg_weight']
 35 |         self.num_pair = self.num_feature_field * (self.num_feature_field -
 36 |                                                   1) / 2
 37 | 
 38 |         # define layers and loss
 39 |         self.attlayer = AttLayer(self.embedding_size, self.attention_size)
 40 |         self.p = nn.Parameter(torch.randn(self.embedding_size),
 41 |                               requires_grad=True)
 42 |         self.dropout_layer = nn.Dropout(p=self.dropout_prob)
 43 |         self.sigmoid = nn.Sigmoid()
 44 |         self.loss = nn.BCELoss()
 45 | 
 46 |         # parameters initialization
 47 |         self.apply(self._init_weights)
 48 | 
 49 |     def _init_weights(self, module):
 50 |         if isinstance(module, nn.Embedding):
 51 |             xavier_normal_(module.weight.data)
 52 |         elif isinstance(module, nn.Linear):
 53 |             xavier_normal_(module.weight.data)
 54 |             if module.bias is not None:
 55 |                 constant_(module.bias.data, 0)
 56 | 
 57 |     def build_cross(self, feat_emb):
 58 |         """ Build the cross feature columns of feature columns
 59 | 
 60 |         Args:
 61 |             feat_emb (torch.FloatTensor): input feature embedding tensor. shape of [batch_size, field_size, embed_dim].
 62 | 
 63 |         Returns:
 64 |             tuple:
 65 |                 - torch.FloatTensor: Left part of the cross feature. shape of [batch_size, num_pairs, emb_dim].
 66 |                 - torch.FloatTensor: Right part of the cross feature. shape of [batch_size, num_pairs, emb_dim].
 67 |         """
 68 |         # num_pairs = num_feature_field * (num_feature_field-1) / 2
 69 |         row = []
 70 |         col = []
 71 |         for i in range(self.num_feature_field - 1):
 72 |             for j in range(i + 1, self.num_feature_field):
 73 |                 row.append(i)
 74 |                 col.append(j)
 75 |         p = feat_emb[:, row]  # [batch_size, num_pairs, emb_dim]
 76 |         q = feat_emb[:, col]  # [batch_size, num_pairs, emb_dim]
 77 |         return p, q
 78 | 
 79 |     def afm_layer(self, infeature):
 80 |         """ Get the attention-based feature interaction score
 81 | 
 82 |         Args:
 83 |             infeature (torch.FloatTensor): input feature embedding tensor. shape of [batch_size, field_size, embed_dim].
 84 | 
 85 |         Returns:
 86 |             torch.FloatTensor: Result of score. shape of [batch_size, 1].
 87 |         """
 88 |         p, q = self.build_cross(infeature)
 89 |         pair_wise_inter = torch.mul(p, q)  # [batch_size, num_pairs, emb_dim]
 90 | 
 91 |         # [batch_size, num_pairs, 1]
 92 |         att_signal = self.attlayer(pair_wise_inter).unsqueeze(dim=2)
 93 | 
 94 |         att_inter = torch.mul(
 95 |             att_signal, pair_wise_inter)  # [batch_size, num_pairs, emb_dim]
 96 |         att_pooling = torch.sum(att_inter, dim=1)  # [batch_size, emb_dim]
 97 |         att_pooling = self.dropout_layer(att_pooling)  # [batch_size, emb_dim]
 98 | 
 99 |         att_pooling = torch.mul(att_pooling, self.p)  # [batch_size, emb_dim]
100 |         att_pooling = torch.sum(att_pooling, dim=1,
101 |                                 keepdim=True)  # [batch_size, 1]
102 | 
103 |         return att_pooling
104 | 
105 |     def forward(self, interaction):
106 |         # sparse_embedding shape: [batch_size, num_token_seq_field+num_token_field, embed_dim] or None
107 |         # dense_embedding shape: [batch_size, num_float_field] or [batch_size, num_float_field, embed_dim] or None
108 |         sparse_embedding, dense_embedding = self.embed_input_fields(
109 |             interaction)
110 |         all_embeddings = []
111 |         if sparse_embedding is not None:
112 |             all_embeddings.append(sparse_embedding)
113 |         if dense_embedding is not None and len(dense_embedding.shape) == 3:
114 |             all_embeddings.append(dense_embedding)
115 |         afm_all_embeddings = torch.cat(
116 |             all_embeddings, dim=1)  # [batch_size, num_field, embed_dim]
117 | 
118 |         output = self.sigmoid(
119 |             self.first_order_linear(interaction) +
120 |             self.afm_layer(afm_all_embeddings))
121 |         return output.squeeze()
122 | 
123 |     def calculate_loss(self, interaction):
124 |         label = interaction[self.LABEL].float()
125 | 
126 |         output = self.forward(interaction)
127 |         l2_loss = self.reg_weight * torch.norm(self.attlayer.w.weight, p=2)
128 |         return self.loss(output, label) + l2_loss
129 | 
130 |     def predict(self, interaction):
131 |         return self.forward(interaction)
132 | 


--------------------------------------------------------------------------------
/HRec/models/autoint.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: autoint.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2021-01-26 21:01:14
  7 | # ###########################
  8 | r"""
  9 | AutoInt
 10 | ################################################
 11 | Reference:
 12 |     Weiping Song et al. "AutoInt: Automatic Feature Interaction Learning via Self-Attentive Neural Networks"
 13 |     in CIKM 2018.
 14 | """
 15 | 
 16 | import torch
 17 | import torch.nn.functional as F
 18 | import torch.nn as nn
 19 | from torch.nn.init import xavier_normal_, constant_
 20 | 
 21 | from .layers import MLPLayers
 22 | from .base import ContextModel
 23 | 
 24 | 
 25 | class AutoInt(ContextModel):
 26 |     """ AutoInt is a novel CTR prediction model based on self-attention mechanism,
 27 |     which can automatically learn high-order feature interactions in an explicit fashion.
 28 | 
 29 |     """
 30 |     def __init__(self, config, dataset):
 31 |         super(AutoInt, self).__init__(config, dataset)
 32 | 
 33 |         # load parameters info
 34 |         self.attention_size = config['attention_size']
 35 |         self.dropout_probs = config['dropout_probs']
 36 |         self.n_layers = config['n_layers']
 37 |         self.num_heads = config['num_heads']
 38 |         self.mlp_hidden_size = config['mlp_hidden_size']
 39 | 
 40 |         # define layers and loss
 41 |         self.att_embedding = nn.Linear(self.embedding_size,
 42 |                                        self.attention_size)
 43 |         self.embed_output_dim = self.num_feature_field * self.embedding_size
 44 |         self.atten_output_dim = self.num_feature_field * self.attention_size
 45 |         size_list = [self.embed_output_dim] + self.mlp_hidden_size
 46 |         self.mlp_layers = MLPLayers(size_list, dropout=self.dropout_probs[1])
 47 |         # multi-head self-attention network
 48 |         self.self_attns = nn.ModuleList([
 49 |             nn.MultiheadAttention(self.attention_size,
 50 |                                   self.num_heads,
 51 |                                   dropout=self.dropout_probs[0])
 52 |             for _ in range(self.n_layers)
 53 |         ])
 54 |         self.attn_fc = torch.nn.Linear(self.atten_output_dim, 1)
 55 |         self.deep_predict_layer = nn.Linear(self.mlp_hidden_size[-1], 1)
 56 | 
 57 |         self.dropout_layer = nn.Dropout(p=self.dropout_probs[2])
 58 |         self.sigmoid = nn.Sigmoid()
 59 |         self.loss = nn.BCELoss()
 60 | 
 61 |         # parameters initialization
 62 |         self.apply(self._init_weights)
 63 | 
 64 |     def _init_weights(self, module):
 65 |         if isinstance(module, nn.Embedding):
 66 |             xavier_normal_(module.weight.data)
 67 |         elif isinstance(module, nn.Linear):
 68 |             xavier_normal_(module.weight.data)
 69 |             if module.bias is not None:
 70 |                 constant_(module.bias.data, 0)
 71 | 
 72 |     def autoint_layer(self, infeature):
 73 |         """ Get the attention-based feature interaction score
 74 | 
 75 |         Args:
 76 |             infeature (torch.FloatTensor): input feature embedding tensor. shape of[batch_size,field_size,embed_dim].
 77 | 
 78 |         Returns:
 79 |             torch.FloatTensor: Result of score. shape of [batch_size,1] .
 80 |         """
 81 | 
 82 |         att_infeature = self.att_embedding(infeature)
 83 |         cross_term = att_infeature.transpose(0, 1)
 84 |         for self_attn in self.self_attns:
 85 |             cross_term, _ = self_attn(cross_term, cross_term, cross_term)
 86 |         cross_term = cross_term.transpose(0, 1)
 87 |         # Interacting layer
 88 |         cross_term = F.relu(cross_term).contiguous().view(
 89 |             -1, self.atten_output_dim)
 90 |         batch_size = infeature.shape[0]
 91 |         att_output = self.attn_fc(cross_term) + self.deep_predict_layer(
 92 |             self.mlp_layers(infeature.view(batch_size, -1)))
 93 |         return att_output
 94 | 
 95 |     def forward(self, interaction):
 96 |         # sparse_embedding shape: [batch_size, num_token_seq_field+num_token_field, embed_dim] or None
 97 |         # dense_embedding shape: [batch_size, num_float_field] or [batch_size, num_float_field, embed_dim] or None
 98 |         sparse_embedding, dense_embedding = self.embed_input_fields(
 99 |             interaction)
100 |         all_embeddings = []
101 |         if sparse_embedding is not None:
102 |             all_embeddings.append(sparse_embedding)
103 |         if dense_embedding is not None and len(dense_embedding.shape) == 3:
104 |             all_embeddings.append(dense_embedding)
105 |         autoint_all_embeddings = torch.cat(
106 |             all_embeddings, dim=1)  # [batch_size, num_field, embed_dim]
107 |         output = self.first_order_linear(interaction) + self.autoint_layer(
108 |             autoint_all_embeddings)
109 |         return self.sigmoid(output.squeeze(1))
110 | 
111 |     def calculate_loss(self, interaction):
112 |         label = interaction[self.LABEL].float()
113 |         output = self.forward(interaction)
114 |         return self.loss(output, label)
115 | 
116 |     def predict(self, interaction):
117 |         return self.forward(interaction)
118 | 


--------------------------------------------------------------------------------
/HRec/models/cccf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: cccf.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2021-01-01 16:10:50
  7 | # ###########################
  8 | 
  9 | import torch
 10 | import logging
 11 | import numpy as np
 12 | import torch.nn as nn
 13 | from .layers import MLPLayers
 14 | from .base import HModel
 15 | from torch.nn.init import normal_
 16 | from collections import Counter, defaultdict
 17 | 
 18 | 
 19 | class CCCFNet(HModel):
 20 |     """CCCFNet
 21 |     CCCFNet: A Content-Boosted Collaborative Filtering Neural Network for Cross Domain Recommender Systems
 22 |     """
 23 |     def __init__(self, config, dataset):
 24 |         super().__init__(config, dataset)
 25 |         self.logger = logging.getLogger()
 26 | 
 27 |         self.LABEL = dataset.config['LABEL_FIELD']
 28 | 
 29 |         self.user_emb_size = config['user_emb_size']
 30 |         self.item_emb_size = config['item_emb_size']
 31 |         self.token_emb_size = config['token_emb_size']
 32 | 
 33 |         self.user_cf_embedding = nn.Embedding(self.n_users, self.user_emb_size)
 34 |         self.item_cf_embedding = nn.Embedding(self.n_items, self.item_emb_size)
 35 | 
 36 |         self.P = len(dataset.config['item_feat_path'])
 37 |         self.item_size = dataset.item_nums
 38 | 
 39 |         self.device = config['device']
 40 |         self.user_hidden_size_list = config['user_hidden_size_list']
 41 |         self.item_hidden_size_list = config['item_hidden_size_list']
 42 | 
 43 |         assert self.user_hidden_size_list[-1] == self.item_hidden_size_list[-1]
 44 | 
 45 |         self.item_nn_dict = nn.ModuleDict()
 46 | 
 47 |         for item_type, item_feats in dataset.item_feat_fields.items():
 48 |             item_feat_type_count = Counter(
 49 |                 [dataset.field2type[i] for i in item_feats])
 50 |             input_dim = (item_feat_type_count['token'] + 1) * self.token_emb_size + \
 51 |                 item_feat_type_count['float']
 52 |             self.item_nn_dict[item_type] = MLPLayers(
 53 |                 [input_dim + self.user_emb_size] + self.item_hidden_size_list,
 54 |                 activation='tanh').to(self.device)
 55 | 
 56 |         self.user_fc_layers = MLPLayers([self.user_emb_size] +
 57 |                                         self.user_hidden_size_list).to(
 58 |                                             self.device)
 59 | 
 60 |         self.bce_loss = nn.BCELoss()
 61 |         self.sigmoid = nn.Sigmoid()
 62 |         # Save the item embedding before dot product layer to speed up evaluation
 63 |         self.i_embedding = None
 64 | 
 65 |         # parameters initialization
 66 |         self.apply(self._init_weights)
 67 | 
 68 |     def _init_weights(self, module):
 69 |         # We just initialize the module with normal distribution as the paper said
 70 |         if isinstance(module, nn.Linear):
 71 |             normal_(module.weight.data, 0, 0.01)
 72 |             if module.bias is not None:
 73 |                 module.bias.data.fill_(0.0)
 74 |         elif isinstance(module, nn.Embedding):
 75 |             normal_(module.weight.data, 0, 0.01)
 76 | 
 77 |     def agg_item_feature(self, item_type, item_data):
 78 | 
 79 |         token_embeddings = []
 80 |         float_feats = []
 81 |         for feat_name, feat_value in item_data.items():
 82 |             if feat_name in self.token_embedding_table and feat_name != self.USER_ID:
 83 |                 emb = self.token_embedding_table[feat_name](feat_value.long())
 84 |                 token_embeddings.append(emb)
 85 |             if feat_name in self.float_field_names:
 86 |                 float_feat = feat_value.float()
 87 |                 if float_feat.dim() == 1:
 88 |                     float_feat = float_feat.unsqueeze(-1)
 89 |                 float_feats.append(float_feat)
 90 |         all_emb = torch.cat(token_embeddings + float_feats, dim=-1)
 91 |         return all_emb
 92 | 
 93 |     def forward(self, item_type, data):
 94 | 
 95 |         user = data[self.USER_ID]
 96 |         item_id = data[self.ITEM_ID]
 97 |         user_emb = self.user_cf_embedding(user)
 98 |         item_cf_emb = self.item_cf_embedding(item_id)
 99 | 
100 |         item_layer = self.item_nn_dict[item_type]
101 |         item_content_emb = self.agg_item_feature(item_type, data)
102 |         item_emb = torch.cat([item_cf_emb, item_content_emb], dim=-1)
103 |         item_emb = item_layer(item_emb)
104 | 
105 |         user_emb = self.user_fc_layers(user_emb)
106 | 
107 |         vector = torch.mul(user_emb, item_emb).sum(dim=1)
108 |         vector = self.sigmoid(vector)
109 |         return vector
110 | 
111 |     def calculate_loss(self, data):
112 |         losses = []
113 |         losses_dict = defaultdict(int)
114 | 
115 |         for item_type, item_data in data.items():
116 | 
117 |             output = self.forward(item_type, item_data)
118 | 
119 |             label = item_data[self.LABEL].float()
120 |             tmp_loss = self.bce_loss(output, label)
121 |             losses.append(tmp_loss)
122 |             losses_dict['total'] += tmp_loss.item()
123 | 
124 |         loss = torch.sum(torch.stack(losses))
125 |         return loss, losses_dict
126 | 
127 |     def predict(self, h, data):
128 |         return self.forward(h, data)
129 | 


--------------------------------------------------------------------------------
/HRec/models/dcn.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: dcn.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2020-12-31 17:32:58
  7 | # ###########################
  8 | r"""
  9 | ################################################
 10 | Reference:
 11 |     Ruoxi Wang et al. "Deep & Cross Network for Ad Click Predictions." in ADKDD 2017.
 12 | """
 13 | 
 14 | import torch
 15 | import torch.nn as nn
 16 | from torch.nn.init import xavier_normal_, constant_
 17 | 
 18 | from .layers import MLPLayers, RegLoss
 19 | from .base import ContextModel
 20 | 
 21 | 
 22 | class DCN(ContextModel):
 23 |     """Deep & Cross Network replaces the wide part in Wide&Deep with cross network,
 24 |     automatically construct limited high-degree cross features, and learns the corresponding weights.
 25 | 
 26 |     """
 27 |     def __init__(self, config, dataset):
 28 |         super(DCN, self).__init__(config, dataset)
 29 | 
 30 |         # load parameters info
 31 |         self.mlp_hidden_size = config['mlp_hidden_size']
 32 |         self.cross_layer_num = config['cross_layer_num']
 33 |         self.reg_weight = config['reg_weight']
 34 |         self.dropout_prob = config['dropout_prob']
 35 | 
 36 |         # define layers and loss
 37 |         # init weight and bias of each cross layer
 38 |         self.cross_layer_parameter = [
 39 |             nn.Parameter(
 40 |                 torch.empty(self.embedding_size * len(self.token_field_names) +
 41 |                             len(self.float_field_names),
 42 |                             device=self.device))
 43 |             for _ in range(self.cross_layer_num * 2)
 44 |         ]
 45 |         self.cross_layer_w = nn.ParameterList(
 46 |             self.cross_layer_parameter[:self.cross_layer_num])
 47 |         self.cross_layer_b = nn.ParameterList(
 48 |             self.cross_layer_parameter[self.cross_layer_num:])
 49 | 
 50 |         # size of mlp hidden layer
 51 |         size_list = [
 52 |             self.embedding_size * len(self.token_field_names) +
 53 |             len(self.float_field_names)
 54 |         ] + self.mlp_hidden_size
 55 |         # size of cross network output
 56 |         in_feature_num = self.mlp_hidden_size[-1] + \
 57 |             self.embedding_size * len(self.token_field_names) + \
 58 |             len(self.float_field_names)
 59 | 
 60 |         self.mlp_layers = MLPLayers(size_list,
 61 |                                     dropout=self.dropout_prob,
 62 |                                     bn=True)
 63 |         self.predict_layer = nn.Linear(in_feature_num, 1)
 64 |         self.reg_loss = RegLoss()
 65 |         self.sigmoid = nn.Sigmoid()
 66 |         self.loss = nn.BCELoss()
 67 | 
 68 |         # parameters initialization
 69 |         self.apply(self._init_weights)
 70 | 
 71 |     def _init_weights(self, module):
 72 |         if isinstance(module, nn.Embedding):
 73 |             xavier_normal_(module.weight.data)
 74 |         elif isinstance(module, nn.Linear):
 75 |             xavier_normal_(module.weight.data)
 76 |             if module.bias is not None:
 77 |                 constant_(module.bias.data, 0)
 78 | 
 79 |     def cross_network(self, x_0):
 80 |         r"""Cross network is composed of cross layers, with each layer having the following formula.
 81 | 
 82 |         .. math:: x_{l+1} = x_0 {x_l^T} w_l + b_l + x_l
 83 | 
 84 |         :math:`x_l`, :math:`x_{l+1}` are column vectors denoting the outputs from the l -th and
 85 |         (l + 1)-th cross layers, respectively.
 86 |         :math:`w_l`, :math:`b_l` are the weight and bias parameters of the l -th layer.
 87 | 
 88 |         Args:
 89 |             x_0(torch.Tensor): Embedding vectors of all features, input of cross network.
 90 | 
 91 |         Returns:
 92 |             torch.Tensor:output of cross network, [batch_size, num_feature_field * embedding_size]
 93 | 
 94 |         """
 95 |         x_l = x_0
 96 |         for i in range(self.cross_layer_num):
 97 |             xl_w = torch.tensordot(x_l, self.cross_layer_w[i], dims=([1], [0]))
 98 |             xl_dot = (x_0.transpose(0, 1) * xl_w).transpose(0, 1)
 99 |             x_l = xl_dot + self.cross_layer_b[i] + x_l
100 |         return x_l
101 | 
102 |     def forward(self, interaction):
103 |         # sparse_embedding shape: [batch_size, num_token_seq_field+num_token_field, embed_dim] or None
104 |         # dense_embedding shape: [batch_size, num_float_field] or [batch_size, num_float_field, embed_dim] or None
105 |         sparse_embedding, dense_embedding = self.embed_input_fields(
106 |             interaction)
107 |         batch_size = sparse_embedding.shape[0]
108 |         all_embeddings = []
109 |         if sparse_embedding is not None:
110 |             all_embeddings.append(sparse_embedding.view(batch_size, -1))
111 |         if dense_embedding is not None and len(dense_embedding.shape) == 3:
112 |             all_embeddings.append(dense_embedding.view(batch_size, -1))
113 | 
114 |         dcn_all_embeddings = torch.cat(
115 |             all_embeddings, dim=1)  # [batch_size, num_field, embed_dim]
116 |         dcn_all_embeddings = dcn_all_embeddings.view(batch_size, -1)
117 | 
118 |         # DNN
119 |         deep_output = self.mlp_layers(dcn_all_embeddings)
120 |         # Cross Network
121 |         cross_output = self.cross_network(dcn_all_embeddings)
122 |         stack = torch.cat([cross_output, deep_output], dim=-1)
123 |         output = self.sigmoid(self.predict_layer(stack))
124 | 
125 |         return output.squeeze(1)
126 | 
127 |     def calculate_loss(self, interaction):
128 |         label = interaction[self.LABEL]
129 |         output = self.forward(interaction)
130 |         l2_loss = self.reg_weight * self.reg_loss(self.cross_layer_w)
131 |         return self.loss(output, label) + l2_loss
132 | 
133 |     def predict(self, interaction):
134 |         return self.forward(interaction)
135 | 


--------------------------------------------------------------------------------
/HRec/models/ddtcdr.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | # ###########################
 3 | # File Name: ddtcdr.py
 4 | # Author: geekinglcq
 5 | # Mail: lcqgeek@live.com
 6 | # Created Time: 2021-02-03 01:38:58
 7 | # ###########################
 8 | 
 9 | import torch
10 | import logging
11 | import torch.nn as nn
12 | from .base import HModel
13 | from collections import Counter
14 | 
15 | 
class DDTCDR(HModel):
    """ DDTCDR
    DDTCDR: Deep Dual Transfer Cross Domain Recommendation.
    """
    def __init__(self, config, dataset, item_type):
        """Build the network for one item type.

        Args:
            config: Mapping providing ``latent_dim``, ``layers`` (hidden layer
                sizes) and ``token_emb_size``.
            dataset: Dataset exposing ``config``, ``item_feat_fields`` and
                ``field2type``.
            item_type: Key into ``dataset.item_feat_fields`` selecting the
                item features this model consumes.
        """
        super().__init__(config, dataset)
        self.logger = logging.getLogger()

        self.LABEL = dataset.config['LABEL_FIELD']
        # self.RATING = dataset.config['RATING_FIELD']

        self.user_emb_size = config['latent_dim']
        self.item_emb_size = config['latent_dim']

        self.token_emb_size = config['token_emb_size']
        self.user_cf_embedding = nn.Embedding(self.n_users, self.user_emb_size)
        self.item_cf_embedding = nn.Embedding(self.n_items, self.item_emb_size)

        self.latent_dim = config['latent_dim']
        self.fc_layers = torch.nn.ModuleList()

        item_feats = dataset.item_feat_fields[item_type]
        item_feat_type_count = Counter(
            [dataset.field2type[i] for i in item_feats])
        input_dim = (item_feat_type_count['token'] + 1) * self.token_emb_size + \
            item_feat_type_count['float'] + self.user_emb_size + self.item_emb_size

        # Build a NEW list: inserting into ``config['layers']`` in place would
        # leak this model's input width into every other model created from
        # the same config object.
        self.layers = [input_dim] + list(config['layers'])
        for in_size, out_size in zip(self.layers[:-1], self.layers[1:]):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))

        self.affine_output = torch.nn.Linear(in_features=self.layers[-1],
                                             out_features=1)
        self.logistic = torch.nn.Sigmoid()
        self.bridge = torch.nn.Linear(config['latent_dim'],
                                      config['latent_dim'])
        torch.nn.init.orthogonal_(self.bridge.weight)

    def agg_item_feature(self, item_type, item_data):
        """Concatenate token embeddings and float features of a batch.

        Every field of ``item_data`` found in ``self.token_embedding_table``
        (except the user id) is embedded; every field listed in
        ``self.float_field_names`` is cast to float and, when 1-D, given a
        trailing dimension.  ``item_type`` is not used.
        """
        token_embeddings = []
        float_feats = []
        for feat_name, feat_value in item_data.items():
            if feat_name in self.token_embedding_table and feat_name != self.USER_ID:
                emb = self.token_embedding_table[feat_name](feat_value.long())
                token_embeddings.append(emb)
            if feat_name in self.float_field_names:
                float_feat = feat_value.float()
                if float_feat.dim() == 1:
                    float_feat = float_feat.unsqueeze(-1)
                float_feats.append(float_feat)
        all_emb = torch.cat(token_embeddings + float_feats, dim=-1)
        return all_emb

    def forward(self, item_type, data, dual=False):
        """Predict a rating in (0, 1) for every row of ``data``.

        Args:
            item_type: Forwarded to ``agg_item_feature``.
            data: Mapping holding the user-id and item-id fields plus the item
                features.
            dual: When true, the user embedding is first passed through the
                ``bridge`` linear layer.

        Returns:
            torch.Tensor: Sigmoid output of the final linear layer.
        """
        user = data[self.USER_ID]
        item_id = data[self.ITEM_ID]
        user_emb = self.user_cf_embedding(user)
        if dual:
            user_emb = self.bridge(user_emb)
        item_cf_emb = self.item_cf_embedding(item_id)

        item_content_emb = self.agg_item_feature(item_type, data)
        item_emb = torch.cat([item_cf_emb, item_content_emb], dim=-1)
        vector = torch.cat([user_emb, item_emb], dim=-1)
        vector = vector.float()

        for fc in self.fc_layers:
            vector = fc(vector)
            # Functional dropout tied to ``self.training``: a Dropout module
            # created on the fly is always in training mode and would keep
            # dropping activations during evaluation.
            vector = nn.functional.dropout(vector,
                                           p=0.1,
                                           training=self.training)
            vector = torch.relu(vector)
        rating = self.affine_output(vector)
        rating = self.logistic(rating)
        return rating

    def calculate_loss(self):
        """Placeholder; no loss is computed here."""
        pass

    def predict(self, h, data):
        """Return the scores of ``forward`` for item type ``h`` and batch ``data``."""
        return self.forward(h, data)
98 | 


--------------------------------------------------------------------------------
/HRec/models/deepfm.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | # ###########################
 3 | # File Name: deepfm.py
 4 | # Author: geekinglcq
 5 | # Mail: lcqgeek@live.com
 6 | # Created Time: 2020-12-31 11:09:13
 7 | # ###########################
 8 | """
 9 | DeepFM
10 | ################################################
11 | Reference:
12 |     Huifeng Guo et al. "DeepFM: A Factorization-Machine based Neural Network for CTR Prediction." in IJCAI 2017.
13 | """
14 | 
15 | import torch
16 | import torch.nn as nn
17 | from torch.nn.init import xavier_normal_, constant_
18 | 
19 | from .base import ContextModel
20 | from .layers import BaseFactorizationMachine, MLPLayers
21 | 
22 | 
class DeepFM(ContextModel):
    """DeepFM is a DNN enhanced FM which both use a DNN and a FM to calculate feature interaction.
    Also DeepFM can be seen as a combination of FNN and FM.

    """
    def __init__(self, config, dataset):
        """Build the FM component, the deep MLP and the output layer.

        Args:
            config: Mapping providing ``mlp_hidden_size`` and ``dropout_prob``.
            dataset: Dataset object forwarded to the base class.
        """
        super(DeepFM, self).__init__(config, dataset)

        # load parameters info
        self.mlp_hidden_size = config['mlp_hidden_size']
        self.dropout_prob = config['dropout_prob']

        # define layers and loss
        self.fm = BaseFactorizationMachine(reduce_sum=True)
        # MLP input: one embedding per token field plus one scalar per float field.
        size_list = [
            self.embedding_size * len(self.token_field_names) +
            len(self.float_field_names)
        ] + self.mlp_hidden_size
        self.mlp_layers = MLPLayers(size_list, self.dropout_prob)
        self.deep_predict_layer = nn.Linear(
            self.mlp_hidden_size[-1], 1)  # Linear product to the final score
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()

        # parameters initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-initialise embeddings and linear layers; zero linear biases."""
        if isinstance(module, nn.Embedding):
            xavier_normal_(module.weight.data)
        elif isinstance(module, nn.Linear):
            xavier_normal_(module.weight.data)
            if module.bias is not None:
                constant_(module.bias.data, 0)

    def forward(self, interaction):
        """Return the predicted probability for every sample of ``interaction``.

        The deep part consumes the sparse and dense outputs of
        ``embed_input_fields`` flattened to ``[batch_size, -1]``; dense
        features are included whatever their rank because ``size_list`` in
        ``__init__`` reserves one input column per float field.  The FM part
        uses the sparse embeddings only.
        """
        sparse_embedding, dense_embedding = self.embed_input_fields(
            interaction)
        # Either part may be None; each tensor supplies its own batch size.
        all_embeddings = [
            emb.reshape(emb.shape[0], -1)
            for emb in (sparse_embedding, dense_embedding)
            if emb is not None
        ]
        deepfm_all_embeddings = torch.cat(all_embeddings, dim=1)

        y_fm = self.first_order_linear(interaction) + self.fm(sparse_embedding)
        y_deep = self.deep_predict_layer(
            self.mlp_layers(deepfm_all_embeddings))
        y = self.sigmoid(y_fm + y_deep)
        # Drop only the trailing unit dimension produced by the final linear
        # layer; a bare squeeze() would also collapse a batch of size one to
        # a 0-d tensor that no longer matches the label shape.
        return y.squeeze(-1)

    def calculate_loss(self, interaction):
        """Binary cross-entropy between the prediction and the float label."""
        label = interaction[self.LABEL]
        output = self.forward(interaction)
        return self.loss(output, label.float())

    def predict(self, interaction):
        """Return the scores of ``forward`` for ``interaction``."""
        return self.forward(interaction)
87 | 


--------------------------------------------------------------------------------
/HRec/models/deepmf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | 
  3 | import torch
  4 | import logging
  5 | import numpy as np
  6 | import torch.nn as nn
  7 | from .layers import MLPLayers
  8 | from .base import GeneralModel
  9 | from torch.nn.init import normal_
 10 | """
 11 | DMF
 12 | ################################################
 13 | Reference:
 14 |     Hong-Jian Xue et al. "Deep Matrix Factorization Models for Recommender Systems." in IJCAI 2017.
 15 | """
 16 | 
 17 | 
 18 | class DMF(GeneralModel):
 19 |     """Deep MF"""
 20 |     def __init__(self, config, dataset):
 21 |         super().__init__(config, dataset)
 22 |         self.logger = logging.getLogger()
 23 | 
 24 |         self.LABEL = dataset.config['LABEL_FIELD']
 25 |         # self.RATING = dataset.config['RATING_FIELD']
 26 | 
 27 |         self.user_emb_size = config['user_emb_size']
 28 |         self.item_emb_size = config['item_emb_size']
 29 | 
 30 |         self.device = config['device']
 31 |         self.user_hidden_size_list = config['user_hidden_size_list']
 32 |         self.item_hidden_size_list = config['item_hidden_size_list']
 33 | 
 34 |         assert self.user_hidden_size_list[-1] == self.item_hidden_size_list[-1]
 35 |         self.inter_matrix_type = dataset.config['inter_matrix_type']
 36 | 
 37 |         # generate intermediate data
 38 |         if self.inter_matrix_type == '01':
 39 |             self.history_user_id, self.history_user_value, _ = dataset.history_user_matrix(
 40 |             )
 41 |             self.history_item_id, self.history_item_value, _ = dataset.history_item_matrix(
 42 |             )
 43 |             self.interaction_matrix = dataset.inter_matrix(form='csr').astype(
 44 |                 np.float32)
 45 |         elif self.inter_matrix_type == 'rating':
 46 |             self.history_user_id, self.history_user_value, _ = dataset.history_user_matrix(
 47 |                 value_field=self.RATING)
 48 |             self.history_item_id, self.history_item_value, _ = dataset.history_item_matrix(
 49 |                 value_field=self.RATING)
 50 |             self.interaction_matrix = dataset.inter_matrix(
 51 |                 form='csr', value_field=self.RATING).astype(np.float32)
 52 | 
 53 |         self.max_rating = self.history_user_value.max()
 54 |         # tensor of shape [n_items, H] where H is max length of history interaction.
 55 |         self.history_user_id = self.history_user_id.to(self.device)
 56 |         self.history_user_value = self.history_user_value.to(self.device)
 57 |         self.history_item_id = self.history_item_id.to(self.device)
 58 |         self.history_item_value = self.history_item_value.to(self.device)
 59 | 
 60 |         # define layers
 61 |         self.user_linear = nn.Linear(in_features=self.n_items,
 62 |                                      out_features=self.user_emb_size,
 63 |                                      bias=False)
 64 |         self.item_linear = nn.Linear(in_features=self.n_users,
 65 |                                      out_features=self.item_emb_size,
 66 |                                      bias=False)
 67 |         self.user_fc_layers = MLPLayers([self.user_emb_size] +
 68 |                                         self.user_hidden_size_list)
 69 |         self.item_fc_layers = MLPLayers([self.item_emb_size] +
 70 |                                         self.item_hidden_size_list)
 71 |         self.sigmoid = nn.Sigmoid()
 72 |         self.bce_loss = nn.BCELoss()
 73 | 
 74 |         # Save the item embedding before dot product layer to speed up evaluation
 75 |         self.i_embedding = None
 76 | 
 77 |         # parameters initialization
 78 |         self.apply(self._init_weights)
 79 | 
 80 |     def _init_weights(self, module):
 81 |         # We just initialize the module with normal distribution as the paper said
 82 |         if isinstance(module, nn.Linear):
 83 |             normal_(module.weight.data, 0, 0.01)
 84 |             if module.bias is not None:
 85 |                 module.bias.data.fill_(0.0)
 86 |         elif isinstance(module, nn.Embedding):
 87 |             normal_(module.weight.data, 0, 0.01)
 88 | 
 89 |     def forward(self, user, item):
 90 | 
 91 |         user = user.long()
 92 |         item = item.long()
 93 |         user = self.get_user_embedding(user)
 94 | 
 95 |         # Following lines construct tensor of shape [B,n_users] using the tensor of shape [B,H]
 96 |         col_indices = self.history_user_id[item].flatten()
 97 |         row_indices = torch.arange(item.shape[0]).to(
 98 |             self.device).repeat_interleave(self.history_user_id.shape[1],
 99 |                                            dim=0)
100 |         matrix_01 = torch.zeros(1).to(self.device).repeat(
101 |             item.shape[0], self.n_users)
102 |         matrix_01.index_put_((row_indices, col_indices),
103 |                              self.history_user_value[item].flatten())
104 |         item = self.item_linear(matrix_01)
105 | 
106 |         user = self.user_fc_layers(user)
107 |         item = self.item_fc_layers(item)
108 | 
109 |         # cosine distance is replaced by dot product according the result of our experiments.
110 |         vector = torch.mul(user, item).sum(dim=1)
111 |         vector = self.sigmoid(vector)
112 | 
113 |         return vector
114 | 
115 |     def calculate_loss(self, interaction):
116 |         # when starting a new epoch, the item embedding we saved must be cleared.
117 |         if self.training:
118 |             self.i_embedding = None
119 | 
120 |         user = interaction[self.USER_ID]
121 |         item = interaction[self.ITEM_ID]
122 |         if self.inter_matrix_type == '01':
123 |             label = interaction[self.LABEL].float()
124 |         elif self.inter_matrix_type == 'rating':
125 |             label = interaction[self.RATING] * interaction[self.LABEL]
126 |         output = self.forward(user, item)
127 | 
128 | 
129 |         label = label / self.max_rating  # normalize the label to calculate BCE loss.
130 |         loss = self.bce_loss(output, label)
131 |         return loss
132 | 
133 |     def predict(self, interaction):
134 |         user = interaction[self.USER_ID]
135 |         item = interaction[self.ITEM_ID]
136 |         return self.forward(user, item)
137 | 
138 |     def get_user_embedding(self, user):
139 |         r"""Get a batch of user's embedding with the user's id and history interaction matrix.
140 | 
141 |         Args:
142 |             user (torch.LongTensor): The input tensor that contains user's id, shape: [batch_size, ]
143 | 
144 |         Returns:
145 |             torch.FloatTensor: The embedding tensor of a batch of user, shape: [batch_size, emb_size]
146 |         """
147 |         # Following lines construct tensor of shape [B,n_items] using the tensor of shape [B,H]
148 |         col_indices = self.history_item_id[user].flatten()
149 |         row_indices = torch.arange(user.shape[0]).to(
150 |             self.device).repeat_interleave(self.history_item_id.shape[1],
151 |                                            dim=0)
152 |         matrix_01 = torch.zeros(1).to(self.device).repeat(
153 |             user.shape[0], self.n_items)
154 |         matrix_01.index_put_((row_indices, col_indices),
155 |                              self.history_item_value[user].flatten())
156 |         user = self.user_linear(matrix_01)
157 | 
158 |         return user
159 | 
160 |     def get_item_embedding(self):
161 |         r"""Get all item's embedding with history interaction matrix.
162 | 
163 |         Considering the RAM of device, we use matrix multiply on sparse tensor for generalization.
164 | 
165 |         Returns:
166 |             torch.FloatTensor: The embedding tensor of all item, shape: [n_items, emb_size]
167 |         """
168 |         interaction_matrix = self.interaction_matrix.tocoo()
169 |         row = interaction_matrix.row
170 |         col = interaction_matrix.col
171 |         i = torch.LongTensor([row, col])
172 |         data = torch.FloatTensor(interaction_matrix.data)
173 |         item_matrix = torch.sparse.FloatTensor(
174 |             i, data,
175 |             torch.Size(interaction_matrix.shape)).to(self.device).transpose(
176 |                 0, 1)
177 |         item = torch.sparse.mm(item_matrix, self.item_linear.weight.t())
178 | 
179 |         item = self.item_fc_layers(item)
180 |         return item
181 | 
182 |     def full_sort_predict(self, interaction):
183 |         user = interaction[self.USER_ID]
184 |         u_embedding = self.get_user_embedding(user)
185 |         u_embedding = self.user_fc_layers(u_embedding)
186 | 
187 |         if self.i_embedding is None:
188 |             self.i_embedding = self.get_item_embedding()
189 | 
190 |         similarity = torch.mm(u_embedding, self.i_embedding.t())
191 |         similarity = self.sigmoid(similarity)
192 |         return similarity.view(-1)
193 | 
194 | 
if __name__ == '__main__':

    # NOTE(review): DMF.__init__ requires ``config`` and ``dataset``, so this
    # bare call raises TypeError; the module is not usable as a script as is.
    model = DMF()
198 | 


--------------------------------------------------------------------------------
/HRec/models/dssm.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: dssm.py
  4 | # ###########################
  5 | """
  6 | DSSM
  7 | ################################################
  8 | Reference:
  9 |     PS Huang et al. "Learning Deep Structured Semantic Models for Web Search using Clickthrough Data" in CIKM 2013.
 10 | """
 11 | 
 12 | import torch
 13 | import torch.nn as nn
 14 | from torch.nn.init import xavier_normal_, constant_
 15 | 
 16 | from .layers import MLPLayers
 17 | from .base import ContextModel
 18 | 
 19 | 
 20 | class DSSM(ContextModel):
 21 |     """ DSSM respectively expresses user and item as low dimensional vectors with mlp layers,
 22 |     and uses cosine distance to calculate the distance between the two semantic vectors.
 23 | 
 24 |     """
 25 |     def __init__(self, config, dataset):
 26 |         super(DSSM, self).__init__(config, dataset)
 27 | 
 28 |         # load parameters info
 29 |         self.mlp_hidden_size = config['mlp_hidden_size']
 30 |         self.dropout_prob = config['dropout_prob']
 31 | 
 32 |         self.user_feature_num = self.user_token_field_num + self.user_float_field_num + self.user_token_seq_field_num
 33 |         self.item_feature_num = self.item_token_field_num + self.item_float_field_num + self.item_token_seq_field_num
 34 |         user_size_list = [self.embedding_size * self.user_feature_num
 35 |                           ] + self.mlp_hidden_size
 36 |         item_size_list = [self.embedding_size * self.item_feature_num
 37 |                           ] + self.mlp_hidden_size
 38 | 
 39 |         # define layers and loss
 40 |         self.user_mlp_layers = MLPLayers(user_size_list,
 41 |                                          self.dropout_prob,
 42 |                                          activation='tanh',
 43 |                                          bn=True)
 44 |         self.item_mlp_layers = MLPLayers(item_size_list,
 45 |                                          self.dropout_prob,
 46 |                                          activation='tanh',
 47 |                                          bn=True)
 48 | 
 49 |         self.loss = nn.BCELoss()
 50 |         self.sigmod = nn.Sigmoid()
 51 | 
 52 |         # parameters initialization
 53 |         self.apply(self._init_weights)
 54 | 
 55 |     def _init_weights(self, module):
 56 |         if isinstance(module, nn.Embedding):
 57 |             xavier_normal_(module.weight.data)
 58 |         elif isinstance(module, nn.Linear):
 59 |             xavier_normal_(module.weight.data)
 60 |             if module.bias is not None:
 61 |                 constant_(module.bias.data, 0)
 62 | 
 63 |     def forward(self, interaction):
 64 |         embed_result = self.double_tower_embed_input_fields(interaction)
 65 |         user_sparse_embedding, user_dense_embedding = embed_result[:2]
 66 |         item_sparse_embedding, item_dense_embedding = embed_result[2:]
 67 | 
 68 |         user = []
 69 |         if user_sparse_embedding is not None:
 70 |             user.append(user_sparse_embedding)
 71 |         if user_dense_embedding is not None and len(
 72 |                 user_dense_embedding.shape) == 3:
 73 |             user.append(user_dense_embedding)
 74 | 
 75 |         embed_user = torch.cat(user, dim=1)
 76 | 
 77 |         item = []
 78 |         if item_sparse_embedding is not None:
 79 |             item.append(item_sparse_embedding)
 80 |         if item_dense_embedding is not None and len(
 81 |                 item_dense_embedding.shape) == 3:
 82 |             item.append(item_dense_embedding)
 83 | 
 84 |         embed_item = torch.cat(item, dim=1)
 85 | 
 86 |         batch_size = embed_item.shape[0]
 87 |         user_dnn_out = self.user_mlp_layers(embed_user.view(batch_size, -1))
 88 |         item_dnn_out = self.item_mlp_layers(embed_item.view(batch_size, -1))
 89 |         score = torch.cosine_similarity(user_dnn_out, item_dnn_out, dim=1)
 90 | 
 91 |         sig_score = self.sigmod(score)
 92 |         return sig_score.squeeze()
 93 | 
 94 |     def calculate_loss(self, interaction):
 95 |         label = interaction[self.LABEL]
 96 |         output = self.forward(interaction)
 97 |         return self.loss(output, label.float())
 98 | 
 99 |     def predict(self, interaction):
100 |         return self.forward(interaction)
101 | 


--------------------------------------------------------------------------------
/HRec/models/duration.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: duration.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2021-01-07 15:28:53
  7 | # ###########################
  8 | 
  9 | import torch
 10 | import random
 11 | import logging
 12 | import numpy as np
 13 | import torch.nn as nn
 14 | from .layers import MLPLayers, set_kernel_layer
 15 | from .base import HModel
 16 | from torch.nn.init import normal_
 17 | from collections import Counter, defaultdict
 18 | from itertools import combinations_with_replacement
 19 | 
 20 | 
 21 | class DURation(HModel):
 22 |     """ Deep Unified Representation for Heterogeneous Recommendation"""
 23 |     def __init__(self, config, dataset):
 24 |         super().__init__(config, dataset)
 25 |         self.logger = logging.getLogger()
 26 | 
 27 |         self.LABEL = dataset.config['LABEL_FIELD']
 28 |         # self.RATING = dataset.config['RATING_FIELD']
 29 | 
 30 |         self.user_emb_size = config['user_emb_size']
 31 |         self.item_emb_size = config['item_emb_size']
 32 |         self.token_emb_size = config['token_emb_size']
 33 | 
 34 |         # The number of item types
 35 |         self.P = len(dataset.config['item_feat_path'])
 36 |         self.item_size = dataset.item_nums
 37 | 
 38 |         self.device = config['device']
 39 |         self.user_hidden_size_list = config['user_hidden_size_list']
 40 |         self.item_hidden_size_list = config['item_hidden_size_list']
 41 |         self.item_map_hidden_size_list = config['item_map_hidden_size_list']
 42 |         self.kernel = set_kernel_layer(config.get('kernel', 'gaussian'))
 43 | 
 44 |         assert self.user_hidden_size_list[-1] == self.item_hidden_size_list[-1]
 45 |         self.inter_matrix_type = dataset.config['inter_matrix_type']
 46 | 
 47 |         # generate intermediate data
 48 |         if self.inter_matrix_type == '01':
 49 |             self.history_user_id, self.history_user_value, _ = dataset.history_user_matrix(
 50 |             )
 51 |             self.history_item_id, self.history_item_value, _ = dataset.history_item_matrix(
 52 |             )
 53 |             self.interaction_matrix = dataset.inter_matrix(form='csr').astype(
 54 |                 np.float32)
 55 |         elif self.inter_matrix_type == 'rating':
 56 |             self.history_user_id, self.history_user_value, _ = dataset.history_user_matrix(
 57 |                 value_field=self.RATING)
 58 |             self.history_item_id, self.history_item_value, _ = dataset.history_item_matrix(
 59 |                 value_field=self.RATING)
 60 |             self.interaction_matrix = dataset.inter_matrix(
 61 |                 form='csr', value_field=self.RATING).astype(np.float32)
 62 | 
 63 |         self.max_rating = self.history_user_value.max()
 64 |         # tensor of shape [n_items, H] where H is max length of history interaction.
 65 | 
 66 |         # Keep the user matrix in cpu to save gpu mem
 67 |         # self.history_user_id = self.history_user_id.to(self.device)
 68 |         # self.history_user_value = self.history_user_value.to(self.device)
 69 | 
 70 |         self.history_item_id = self.history_item_id.to(self.device)
 71 |         self.history_item_value = self.history_item_value.to(self.device)
 72 | 
 73 |         # define layers
 74 |         self.user_linear = nn.Linear(in_features=self.n_items,
 75 |                                      out_features=self.user_emb_size,
 76 |                                      bias=False)
 77 | 
 78 |         self.map_func_dict = nn.ModuleDict()
 79 | 
 80 |         self.pdist = nn.PairwiseDistance(p=2)
 81 | 
 82 |         for item_type, item_feats in dataset.item_feat_fields.items():
 83 | 
 84 |             item_feat_type_count = Counter(
 85 |                 [dataset.field2type[i] for i in item_feats])
 86 |             input_dim = (item_feat_type_count['token'] + 1) * self.token_emb_size + \
 87 |                 item_feat_type_count['float']
 88 |             self.map_func_dict[item_type] = MLPLayers(
 89 |                 [input_dim] + self.item_map_hidden_size_list).to(self.device)
 90 | 
 91 |         self.item_linear = nn.Linear(in_features=self.n_users,
 92 |                                      out_features=self.item_emb_size,
 93 |                                      bias=False)
 94 |         self.user_fc_layers = MLPLayers([self.user_emb_size] +
 95 |                                         self.user_hidden_size_list).to(
 96 |                                             self.device)
 97 |         self.item_fc_layers = MLPLayers(
 98 |             [self.item_map_hidden_size_list[-1] + self.item_emb_size] +
 99 |             self.item_hidden_size_list).to(self.device)
100 |         self.sigmoid = nn.Sigmoid()
101 |         self.bce_loss = nn.BCELoss()
102 | 
103 |         # Save the item embedding before dot product layer to speed up evaluation
104 |         self.i_embedding = None
105 | 
106 |         # parameters initialization
107 |         self.apply(self._init_weights)
108 | 
109 |     def _init_weights(self, module):
110 |         # We just initialize the module with normal distribution as the paper said
111 |         if isinstance(module, nn.Linear):
112 |             normal_(module.weight.data, 0, 0.01)
113 |             if module.bias is not None:
114 |                 module.bias.data.fill_(0.0)
115 |         elif isinstance(module, nn.Embedding):
116 |             normal_(module.weight.data, 0, 0.01)
117 | 
118 |     def agg_item_feature(self, item_type, item_data):
119 | 
120 |         token_embeddings = []
121 |         float_feats = []
122 |         for feat_name, feat_value in item_data.items():
123 |             if feat_name in self.token_embedding_table and feat_name != self.USER_ID:
124 |                 emb = self.token_embedding_table[feat_name](feat_value.long())
125 |                 token_embeddings.append(emb)
126 |             if feat_name in self.float_field_names:
127 |                 float_feat = feat_value.float()
128 |                 if float_feat.dim() == 1:
129 |                     float_feat = float_feat.unsqueeze(-1)
130 |                 float_feats.append(float_feat)
131 |         all_emb = torch.cat(token_embeddings + float_feats, dim=-1)
132 |         return all_emb
133 | 
134 |     def get_item_embedding(self, item_type, data):
135 |         item_id = data[self.ITEM_ID].long()
136 | 
137 |         # Following lines construct tensor of shape [B,n_users] using the tensor of shape [B,H]
138 |         row_indices = torch.arange(item_id.shape[0],
139 |                                    device=self.device).repeat_interleave(
140 |                                        self.history_user_id.shape[1], dim=0)
141 |         col_indices = self.history_user_id[item_id].flatten().to(self.device)
142 |         matrix_01 = torch.zeros(1, device=self.device).repeat(
143 |             item_id.shape[0], self.n_users)
144 |         matrix_01.index_put_(
145 |             (row_indices, col_indices),
146 |             self.history_user_value[item_id].flatten().to(self.device))
147 |         item_inter_feat = self.item_linear(matrix_01)
148 | 
149 |         map_layers = self.map_func_dict[item_type]
150 |         item_emb = self.agg_item_feature(item_type, data)
151 |         item_transformed_emb = map_layers(item_emb)
152 |         item_feat = torch.cat([item_inter_feat, item_transformed_emb], dim=-1)
153 | 
154 |         return item_emb, item_transformed_emb, item_feat
155 | 
156 |     def forward(self, item_type, data, return_item_emb=False):
157 | 
158 |         # Interaction-related features
159 |         user = data[self.USER_ID]
160 |         item_id = data[self.ITEM_ID]
161 |         user = self.get_user_embedding(user)
162 | 
163 |         # Following lines construct tensor of shape [B,n_users] using the tensor of shape [B,H]
164 |         col_indices = self.history_user_id[item_id].flatten().to(self.device)
165 |         row_indices = torch.arange(item_id.shape[0],
166 |                                    device=self.device).repeat_interleave(
167 |                                        self.history_user_id.shape[1], dim=0)
168 |         matrix_01 = torch.zeros(1, device=self.device).repeat(
169 |             item_id.shape[0], self.n_users)
170 |         matrix_01.index_put_(
171 |             (row_indices, col_indices),
172 |             self.history_user_value[item_id].flatten().to(self.device))
173 |         item_inter_feat = self.item_linear(matrix_01)
174 | 
175 |         # Context-related features
176 | 
177 |         # Map heterogeneous raw feature to unified feature space
178 | 
179 |         map_layers = self.map_func_dict[item_type]
180 |         item_emb = self.agg_item_feature(item_type, data)
181 |         item_transformed_emb = map_layers(item_emb)
182 |         item_feat = torch.cat([item_inter_feat, item_transformed_emb], dim=-1)
183 | 
184 |         user = self.user_fc_layers(user)
185 |         item = self.item_fc_layers(item_feat)
186 | 
187 |         vector = torch.mul(user, item).sum(dim=1)
188 |         vector = self.sigmoid(vector)
189 | 
190 |         if return_item_emb:
191 |             return vector, item_emb, item_transformed_emb
192 |         else:
193 |             return vector
194 | 
195 |     def calculate_topo_loss(self, raw_emb, emb):
196 |         """
197 |         Calculate the topology loss, for every pair of items sampled from given batch,
198 |         calculate the
199 |             |x_i, x_j|^2 * W(r_i, r_j)
200 |         x is the tranformed representation
201 |         r is the raw representation
202 |         W is the similarity function
203 | 
204 |         Input:
205 |             raw_emb: [bs, dim] raw features
206 |             emb: [bs, new_dim] embedding in transformed feature space
207 | 
208 |         """
209 | 
210 |         d = emb.shape[0]
211 |         n_r = raw_emb.shape[1]
212 |         n_x = emb.shape[1]
213 |         # r_one = torch.ones((1, n_r), device=self.device) @ raw_emb.T
214 |         # x_one = torch.ones((1, n_x), device=self.device) @ emb.T
215 |         c_r = 1 / (n_r - 1) * torch.matmul(raw_emb, raw_emb.T)
216 |         # (1 / n_r) * torch.matmul(r_one.T, r_one))
217 |         c_x = 1 / (n_x - 1) * torch.matmul(emb, emb.T)
218 |         # (1 / n_x) * torch.matmul(x_one.T, x_one))
219 | 
220 |         loss = 1 / (4 * d**2) * (c_r - c_x).pow(2).sum().sqrt()
221 |         return loss
222 | 
223 |     def calculate_align_loss(self, data):
224 |         """
225 |         Calculate the alignment loss. For each batch, sample a number of pairs to minimize
226 |         the alignment loss.
227 |         """
228 |         size = 128
229 |         item_size = self.item_size
230 | 
231 |         min_size = min([i.shape[0] for i in data.values()])
232 |         if min_size < size:
233 |             return None
234 |         losses = []
235 |         for type_i, type_j in combinations_with_replacement(data.keys(), r=2):
236 | 
237 |             if type_i == type_j:
238 |                 factor = (self.P - 1) / (self.P**2 * item_size[type_i]**2)
239 |             else:
240 |                 factor = -1 / (self.P**2 * item_size[type_i] *
241 |                                item_size[type_j])
242 |             indice_i = random.sample(range(min_size), size)
243 |             indice_i = torch.tensor(indice_i, device=self.device)
244 |             sample_i = data[type_i][indice_i]
245 |             indice_j = random.sample(range(min_size), size)
246 |             indice_j = torch.tensor(indice_j, device=self.device)
247 |             sample_j = data[type_j][indice_j]
248 | 
249 |             res = self.kernel(sample_i, sample_j)
250 |             loss = factor * res
251 |             losses.append(loss)
252 | 
253 |         align_loss = torch.sum(torch.stack(losses))
254 |         return align_loss
255 | 
256 |     def calculate_loss(self, data):
257 |         # when starting a new epoch, the item embedding we saved must be cleared.
258 |         # The
259 |         if self.training:
260 |             self.i_embedding = None
261 | 
262 |         losses = []
263 |         losses_dict = defaultdict(int)
264 | 
265 |         item_emb_dict = {}
266 |         for item_type, item_data in data.items():
267 | 
268 |             if self.inter_matrix_type == '01':
269 |                 label = item_data[self.LABEL].float()
270 |             elif self.inter_matrix_type == 'rating':
271 |                 label = item_data[self.RATING] * item_data[self.LABEL]
272 | 
273 |             output, item_raw_emb, item_emb = self.forward(item_type,
274 |                                                           item_data,
275 |                                                           return_item_emb=True)
276 | 
277 |             item_emb_dict[item_type] = item_emb
278 | 
279 |             topo_loss = 0.001 * self.calculate_topo_loss(
280 |                 item_raw_emb, item_emb)
281 |             losses_dict['topo'] += topo_loss.item()
282 |             losses.append(topo_loss)
283 | 
284 |             label = label / self.max_rating  # normalize the label to calculate BCE loss.
285 | 
286 |             cls_loss = self.bce_loss(output, label)
287 |             losses_dict['cls'] += cls_loss.item()
288 | 
289 |             losses.append(cls_loss)
290 | 
291 |         align_loss = 5e8 * self.calculate_align_loss(item_emb_dict)
292 |         if align_loss is not None:
293 |             losses.append(align_loss)
294 |             losses_dict['align'] += align_loss.item()
295 | 
296 |         loss = torch.sum(torch.stack(losses))
297 |         return loss, losses_dict
298 | 
299 |     def predict(self, h, data):
300 |         return self.forward(h, data)
301 | 
302 |     def get_user_embedding(self, user):
303 |         r"""Get a batch of user's embedding with the user's id and history interaction matrix.
304 | 
305 |         Args:
306 |             user (torch.LongTensor): The input tensor that contains user's id, shape: [batch_size, ]
307 | 
308 |         Returns:
309 |             torch.FloatTensor: The embedding tensor of a batch of user, shape: [batch_size, emb_size]
310 |         """
311 |         # Following lines construct tensor of shape [B,n_items] using the tensor of shape [B,H]
312 |         col_indices = self.history_item_id[user].flatten()
313 |         row_indices = torch.arange(user.shape[0],
314 |                                    device=self.device).repeat_interleave(
315 |                                        self.history_item_id.shape[1], dim=0)
316 |         matrix_01 = torch.zeros(1, device=self.device).repeat(
317 |             user.shape[0], self.n_items)
318 |         matrix_01.index_put_((row_indices, col_indices),
319 |                              self.history_item_value[user].flatten())
320 |         user = self.user_linear(matrix_01)
321 | 
322 |         return user
323 | 
324 | 
# Script entry point kept from the original file.
if __name__ == '__main__':

    # NOTE(review): DURation.__init__ takes (config, dataset) with no
    # defaults, so this bare call raises TypeError when executed.
    model = DURation()
328 | 


--------------------------------------------------------------------------------
/HRec/models/fism.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: fism.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2020-12-25 18:34:53
  7 | # ###########################
  8 | 
  9 | import torch
 10 | import logging
 11 | import torch.nn as nn
 12 | from .base import GeneralModel
 13 | from torch.nn.init import normal_
 14 | """
 15 | FISM
 16 | #######################################
 17 | Reference:
 18 |     S. Kabbur et al. "FISM: Factored item similarity models for top-n recommender systems" in KDD 2013
 19 | """
 20 | 
 21 | 
 22 | class FISM(GeneralModel):
 23 |     """FISM is an item-based model for generating top-N recommendations that learns the
 24 |     item-item similarity matrix as the product of two low dimensional latent factor matrices.
 25 |     These matrices are learned using a structural equation modeling approach, where in the
 26 |     value being estimated is not used for its own estimation.
 27 | 
 28 |     """
 29 |     def __init__(self, config, dataset):
 30 |         super(FISM, self).__init__(config, dataset)
 31 | 
 32 |         # load dataset info
 33 |         self.LABEL = dataset.config['LABEL_FIELD']
 34 |         self.logger = logging.getLogger()
 35 | 
 36 |         # get all users's history interaction information.the history item
 37 |         # matrix is padding by the maximum number of a user's interactions
 38 |         self.history_item_matrix, self.history_lens, self.mask_mat = self.get_history_info(
 39 |             dataset)
 40 | 
 41 |         # load parameters info
 42 |         self.embedding_size = config['embedding_size']
 43 |         self.reg_weights = config['reg_weights']
 44 |         self.alpha = config['alpha']
 45 |         self.split_to = config['split_to']
 46 | 
 47 |         # split the too large dataset into the specified pieces
 48 |         if self.split_to > 0:
 49 |             self.group = torch.chunk(
 50 |                 torch.arange(self.n_items).to(self.device), self.split_to)
 51 |         else:
 52 |             self.logger.warning(
 53 |                 'Pay Attetion!! the `split_to` is set to 0. If you catch a OMM error in this case, '
 54 |                 +
 55 |                 'you need to increase it \n\t\t\tuntil the error disappears. For example, '
 56 |                 +
 57 |                 'you can append it in the command line such as `--split_to=5`')
 58 | 
 59 |         # define layers and loss
 60 |         # construct source and destination item embedding matrix
 61 |         self.item_src_embedding = nn.Embedding(self.n_items,
 62 |                                                self.embedding_size,
 63 |                                                padding_idx=0)
 64 |         self.item_dst_embedding = nn.Embedding(self.n_items,
 65 |                                                self.embedding_size,
 66 |                                                padding_idx=0)
 67 |         self.user_bias = nn.Parameter(torch.zeros(self.n_users))
 68 |         self.item_bias = nn.Parameter(torch.zeros(self.n_items))
 69 |         self.bceloss = nn.BCELoss()
 70 | 
 71 |         # parameters initialization
 72 |         self.apply(self._init_weights)
 73 | 
 74 |     def get_history_info(self, dataset):
 75 |         """get the user history interaction information
 76 | 
 77 |         Args:
 78 |             dataset (DataSet): train dataset
 79 | 
 80 |         Returns:
 81 |             tuple: (history_item_matrix, history_lens, mask_mat)
 82 | 
 83 |         """
 84 |         history_item_matrix, _, history_lens = dataset.history_item_matrix()
 85 |         history_item_matrix = history_item_matrix.to(self.device)
 86 |         history_lens = history_lens.to(self.device)
 87 |         arange_tensor = torch.arange(history_item_matrix.shape[1]).to(
 88 |             self.device)
 89 |         mask_mat = (arange_tensor < history_lens.unsqueeze(1)).float()
 90 |         return history_item_matrix, history_lens, mask_mat
 91 | 
 92 |     def reg_loss(self):
 93 |         """calculate the reg loss for embedding layers
 94 | 
 95 |         Returns:
 96 |             torch.Tensor: reg loss
 97 | 
 98 |         """
 99 |         reg_1, reg_2 = self.reg_weights
100 |         loss_1 = reg_1 * self.item_src_embedding.weight.norm(2)
101 |         loss_2 = reg_2 * self.item_dst_embedding.weight.norm(2)
102 | 
103 |         return loss_1 + loss_2
104 | 
105 |     def _init_weights(self, module):
106 |         """Initialize the module's parameters
107 | 
108 |         Note:
109 |             It's a little different from the source code, because pytorch has no function to initialize
110 |             the parameters by truncated normal distribution, so we replace it with xavier normal distribution
111 | 
112 |         """
113 |         if isinstance(module, nn.Embedding):
114 |             normal_(module.weight.data, 0, 0.01)
115 | 
116 |     def inter_forward(self, user, item):
117 |         """forward the model by interaction
118 | 
119 |         """
120 |         user_inter = self.history_item_matrix[user]
121 |         item_num = self.history_lens[user].unsqueeze(1)
122 |         batch_mask_mat = self.mask_mat[user]
123 |         user_history = self.item_src_embedding(
124 |             user_inter)  # batch_size x max_len x embedding_size
125 |         target = self.item_dst_embedding(item)  # batch_size x embedding_size
126 |         user_bias = self.user_bias[user]  # batch_size x 1
127 |         item_bias = self.item_bias[item]
128 |         similarity = torch.bmm(user_history, target.unsqueeze(2)).squeeze(
129 |             2)  # batch_size x max_len
130 |         similarity = batch_mask_mat * similarity
131 |         coeff = torch.pow(item_num.squeeze(1), -self.alpha)
132 |         scores = torch.sigmoid(coeff.float() * torch.sum(similarity, dim=1) +
133 |                                user_bias + item_bias)
134 |         return scores
135 | 
136 |     def user_forward(self,
137 |                      user_input,
138 |                      item_num,
139 |                      user_bias,
140 |                      repeats=None,
141 |                      pred_slc=None):
142 |         """forward the model by user
143 | 
144 |         Args:
145 |             user_input (torch.Tensor): user input tensor
146 |             item_num (torch.Tensor): user hitory interaction lens
147 |             repeats (int, optional): the number of items to be evaluated
148 |             pred_slc (torch.Tensor, optional): continuous index which controls the current evaluation items,
149 |                                               if pred_slc is None, it will evaluate all items
150 | 
151 |         Returns:
152 |             torch.Tensor: result
153 | 
154 |         """
155 |         item_num = item_num.repeat(repeats, 1)
156 |         user_history = self.item_src_embedding(
157 |             user_input)  # inter_num x embedding_size
158 |         user_history = user_history.repeat(
159 |             repeats, 1, 1)  # target_items x inter_num x embedding_size
160 |         if pred_slc is None:
161 |             targets = self.item_dst_embedding.weight  # target_items x embedding_size
162 |             item_bias = self.item_bias
163 |         else:
164 |             targets = self.item_dst_embedding(pred_slc)
165 |             item_bias = self.item_bias[pred_slc]
166 |         similarity = torch.bmm(user_history, targets.unsqueeze(2)).squeeze(
167 |             2)  # inter_num x target_items
168 |         coeff = torch.pow(item_num.squeeze(1), -self.alpha)
169 |         scores = torch.sigmoid(coeff.float() * torch.sum(similarity, dim=1) +
170 |                                user_bias + item_bias)
171 |         return scores
172 | 
173 |     def forward(self, user, item):
174 |         user = user.long()
175 |         item = item.long()
176 |         return self.inter_forward(user, item)
177 | 
178 |     def calculate_loss(self, interaction):
179 |         user = interaction[self.USER_ID].long()
180 |         item = interaction[self.ITEM_ID].long()
181 |         label = interaction[self.LABEL].float()
182 |         output = self.forward(user, item)
183 |         loss = self.bceloss(output, label) + self.reg_loss()
184 |         return loss
185 | 
186 |     def full_sort_predict(self, interaction):
187 |         user = interaction[self.USER_ID]
188 |         batch_user_bias = self.user_bias[user]
189 |         user_inters = self.history_item_matrix[user]
190 |         item_nums = self.history_lens[user]
191 |         scores = []
192 | 
193 |         # test users one by one, if the number of items is too large, we will split it to some pieces
194 |         for user_input, item_num, user_bias in zip(user_inters,
195 |                                                    item_nums.unsqueeze(1),
196 |                                                    batch_user_bias):
197 |             if self.split_to <= 0:
198 |                 output = self.user_forward(user_input[:item_num],
199 |                                            item_num,
200 |                                            user_bias,
201 |                                            repeats=self.n_items)
202 |             else:
203 |                 output = []
204 |                 for mask in self.group:
205 |                     tmp_output = self.user_forward(user_input[:item_num],
206 |                                                    item_num,
207 |                                                    user_bias,
208 |                                                    repeats=len(mask),
209 |                                                    pred_slc=mask)
210 |                     output.append(tmp_output)
211 |                 output = torch.cat(output, dim=0)
212 |             scores.append(output)
213 |         result = torch.cat(scores, dim=0)
214 |         return result
215 | 
216 |     def predict(self, interaction):
217 |         user = interaction[self.USER_ID]
218 |         item = interaction[self.ITEM_ID]
219 |         output = self.forward(user, item)
220 |         return output
221 | 


--------------------------------------------------------------------------------
/HRec/models/layers.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | 
  3 | import torch
  4 | import numpy as np
  5 | import torch.nn as nn
  6 | import torch.nn.functional as fn
  7 | 
  8 | from torch.nn.init import normal_
  9 | 
 10 | 
 11 | class MLPLayers(nn.Module):
 12 |     r""" MLPLayers
 13 |     Args:
 14 |         - layers(list): a list contains the size of each layer in mlp layers
 15 |         - dropout(float): probability of an element to be zeroed. Default: 0
 16 |         - activation(str): activation function after each layer in mlp layers. Default: 'relu'
 17 |                       candidates: 'sigmoid', 'tanh', 'relu', 'leekyrelu', 'none'
 18 |     Shape:
 19 | 
 20 |         - Input: (:math:`N`, \*, :math:`H_{in}`) where \* means any number of additional dimensions
 21 |           :math:`H_{in}` must equal to the first value in `layers`
 22 |         - Output: (:math:`N`, \*, :math:`H_{out}`) where :math:`H_{out}` equals to the last value in `layers`
 23 | 
 24 |     Examples::
 25 | 
 26 |         >>> m = MLPLayers([64, 32, 16], 0.2, 'relu')
 27 |         >>> input = torch.randn(128, 64)
 28 |         >>> output = m(input)
 29 |         >>> print(output.size())
 30 |         >>> torch.Size([128, 16])
 31 |     """
 32 |     def __init__(self,
 33 |                  layers,
 34 |                  dropout=0,
 35 |                  activation='relu',
 36 |                  bn=False,
 37 |                  init_method=None):
 38 |         super(MLPLayers, self).__init__()
 39 |         self.layers = layers
 40 |         self.dropout = dropout
 41 |         self.activation = activation
 42 |         self.use_bn = bn
 43 |         self.init_method = init_method
 44 | 
 45 |         mlp_modules = []
 46 |         for idx, (input_size, output_size) in enumerate(
 47 |                 zip(self.layers[:-1], self.layers[1:])):
 48 |             mlp_modules.append(nn.Dropout(p=self.dropout))
 49 |             mlp_modules.append(nn.Linear(input_size, output_size))
 50 |             if self.use_bn:
 51 |                 mlp_modules.append(nn.BatchNorm1d(num_features=output_size))
 52 |             activation_func = activation_layer(self.activation, output_size)
 53 |             if activation_func is not None:
 54 |                 mlp_modules.append(activation_func)
 55 | 
 56 |         self.mlp_layers = nn.Sequential(*mlp_modules)
 57 |         if self.init_method is not None:
 58 |             self.apply(self.init_weights)
 59 | 
 60 |     def init_weights(self, module):
 61 |         # We just initialize the module with normal distribution as the paper said
 62 |         if isinstance(module, nn.Linear):
 63 |             if self.init_method == 'norm':
 64 |                 normal_(module.weight.data, 0, 0.01)
 65 |             if module.bias is not None:
 66 |                 module.bias.data.fill_(0.0)
 67 | 
 68 |     def forward(self, input_feature):
 69 |         return self.mlp_layers(input_feature)
 70 | 
 71 | 
 72 | def activation_layer(activation_name='relu', emb_dim=None):
 73 |     """Construct activation layers
 74 | 
 75 |     Args:
 76 |         activation_name: str, name of activation function
 77 |         emb_dim: int, used for Dice activation
 78 | 
 79 |     Return:
 80 |         activation: activation layer
 81 |     """
 82 |     if activation_name is None:
 83 |         activation = None
 84 |     elif isinstance(activation_name, str):
 85 |         if activation_name.lower() == 'sigmoid':
 86 |             activation = nn.Sigmoid()
 87 |         elif activation_name.lower() == 'tanh':
 88 |             activation = nn.Tanh()
 89 |         elif activation_name.lower() == 'relu':
 90 |             activation = nn.ReLU()
 91 |         elif activation_name.lower() == 'leakyrelu':
 92 |             activation = nn.LeakyReLU()
 93 |         elif activation_name.lower() == 'none':
 94 |             activation = None
 95 |     elif issubclass(activation_name, nn.Module):
 96 |         activation = activation_name()
 97 |     else:
 98 |         raise NotImplementedError(
 99 |             "activation function {} is not implemented".format(
100 |                 activation_name))
101 |     return activation
102 | 
103 | 
class BaseFactorizationMachine(nn.Module):
    r"""Second-order factorization-machine interaction over field embeddings.

    Evaluates ``0.5 * ((sum_f x_f)^2 - sum_f x_f^2)`` along the field axis.

    Args:
        reduce_sum: bool, whether to also sum over the embedding axis,
            default is True.

    Input:
        input_x: tensor, A 3D tensor with shape:``(batch_size,field_size,embed_dim)``.

    Output
        output: tensor, A 2D tensor with shape: ``(batch_size,1)`` when
        ``reduce_sum`` is set, otherwise ``(batch_size, embed_dim)``.
    """
    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, input_x):
        """Return the pairwise-interaction term of ``input_x``."""
        field_total = input_x.sum(dim=1)
        interaction = field_total.pow(2) - input_x.pow(2).sum(dim=1)
        if self.reduce_sum:
            interaction = interaction.sum(dim=1, keepdim=True)
        return 0.5 * interaction
128 | 
129 | 
class FMEmbedding(nn.Module):
    r""" Embedding for token fields.

    All token fields share a single embedding table; each field's ids are
    shifted by that field's offset before the lookup.

    Args:
        field_dims: list, the number of tokens in each token fields
        offsets: list, the dimension offset of each token field
        embed_dim: int, the dimension of output embedding vectors

    Input:
        input_x: tensor, A 2D tensor with shape:``(batch_size,field_size)``.

    Return:
        output: tensor,  A 3D tensor with shape: ``(batch_size,field_size,embed_dim)``.
    """
    def __init__(self, field_dims, offsets, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(sum(field_dims), embed_dim)
        self.offsets = offsets

    def forward(self, input_x):
        """Look up the offset-shifted ids of ``input_x`` in the shared table."""
        # new_tensor() puts the offsets on input_x's device with its dtype
        shift = input_x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(input_x + shift)
153 | 
154 | 
155 | class FMFirstOrderLinear(nn.Module):
156 |     """Calculate the first order score of the input features.
157 |     This class is a member of ContextRecommender, you can call it easily when inherit ContextRecommender.
158 | 
159 |     """
160 |     def __init__(self, config, dataset, output_dim=1, embed=True):
161 | 
162 |         super(FMFirstOrderLinear, self).__init__()
163 |         self.field_names = dataset.fields()
164 |         self.LABEL = dataset.config['LABEL_FIELD']
165 |         self.device = config['device']
166 |         self.embed = embed
167 |         self.token_field_names = []
168 |         self.token_field_dims = []
169 |         self.float_field_names = []
170 |         self.float_field_dims = []
171 |         self.token_seq_field_names = []
172 |         self.token_seq_field_dims = []
173 |         for field_name in self.field_names:
174 |             if field_name == self.LABEL:
175 |                 continue
176 |             if dataset.field2type[field_name] == "token":
177 |                 self.token_field_names.append(field_name)
178 |                 self.token_field_dims.append(dataset.num(field_name))
179 |             elif dataset.field2type[field_name] == "token_seq":
180 |                 self.token_seq_field_names.append(field_name)
181 |                 self.token_seq_field_dims.append(dataset.num(field_name))
182 |             else:
183 |                 self.float_field_names.append(field_name)
184 |                 self.float_field_dims.append(dataset.num(field_name))
185 |         if len(self.token_field_dims) > 0:
186 |             self.token_field_offsets = np.array(
187 |                 (0, *np.cumsum(self.token_field_dims)[:-1]), dtype=np.long)
188 |             self.token_embedding_table = FMEmbedding(self.token_field_dims,
189 |                                                      self.token_field_offsets,
190 |                                                      output_dim)
191 |         if len(self.float_field_dims) > 0:
192 |             self.float_embedding_table = nn.Embedding(
193 |                 np.sum(self.float_field_dims, dtype=np.int32), output_dim)
194 |         if len(self.token_seq_field_dims) > 0:
195 |             self.token_seq_embedding_table = nn.ModuleList()
196 |             for token_seq_field_dim in self.token_seq_field_dims:
197 |                 self.token_seq_embedding_table.append(
198 |                     nn.Embedding(token_seq_field_dim, output_dim))
199 | 
200 |         self.bias = nn.Parameter(torch.zeros((output_dim, )),
201 |                                  requires_grad=True)
202 | 
203 |     def embed_float_fields(self, float_fields, embed=True):
204 |         """Calculate the first order score of float feature columns
205 | 
206 |         Args:
207 |             float_fields (torch.FloatTensor): The input tensor. shape of [batch_size, num_float_field]
208 | 
209 |         Returns:
210 |             torch.FloatTensor: The first order score of float feature columns
211 |         """
212 |         # input Tensor shape : [batch_size, num_float_field]
213 |         if float_fields is None:
214 |             return float_fields
215 |         if not embed:
216 |             if float_fields.dim() == 2:
217 |                 return float_fields.unsqueeze(1)
218 | 
219 |         num_float_field = float_fields.shape[1]
220 |         # [batch_size, num_float_field]
221 |         index = torch.arange(
222 |             0, num_float_field).unsqueeze(0).expand_as(float_fields).long().to(
223 |                 self.device)
224 | 
225 |         # [batch_size, num_float_field, output_dim]
226 |         float_embedding = self.float_embedding_table(index)
227 |         float_embedding = torch.mul(float_embedding, float_fields.unsqueeze(2))
228 | 
229 |         # [batch_size, 1, output_dim]
230 |         float_embedding = torch.sum(float_embedding, dim=1, keepdim=True)
231 | 
232 |         return float_embedding
233 | 
234 |     def embed_token_fields(self, token_fields):
235 |         """Calculate the first order score of token feature columns
236 | 
237 |         Args:
238 |             token_fields (torch.LongTensor): The input tensor. shape of [batch_size, num_token_field]
239 | 
240 |         Returns:
241 |             torch.FloatTensor: The first order score of token feature columns
242 |         """
243 |         # input Tensor shape : [batch_size, num_token_field]
244 |         if token_fields is None:
245 |             return None
246 |         # [batch_size, num_token_field, embed_dim]
247 |         token_embedding = self.token_embedding_table(token_fields)
248 |         # [batch_size, 1, output_dim]
249 |         token_embedding = torch.sum(token_embedding, dim=1, keepdim=True)
250 | 
251 |         return token_embedding
252 | 
253 |     def embed_token_seq_fields(self, token_seq_fields):
254 |         """Calculate the first order score of token sequence feature columns
255 | 
256 |         Args:
257 |             token_seq_fields (torch.LongTensor): The input tensor. shape of [batch_size, seq_len]
258 | 
259 |         Returns:
260 |             torch.FloatTensor: The first order score of token sequence feature columns
261 |         """
262 |         # input is a list of Tensor shape of [batch_size, seq_len]
263 |         fields_result = []
264 |         for i, token_seq_field in enumerate(token_seq_fields):
265 |             embedding_table = self.token_seq_embedding_table[i]
266 |             mask = token_seq_field != 0  # [batch_size, seq_len]
267 |             mask = mask.float()
268 |             # value_cnt = torch.sum(mask, dim=1, keepdim=True)  # [batch_size, 1]
269 | 
270 |             token_seq_embedding = embedding_table(
271 |                 token_seq_field)  # [batch_size, seq_len, output_dim]
272 | 
273 |             mask = mask.unsqueeze(2).expand_as(
274 |                 token_seq_embedding)  # [batch_size, seq_len, output_dim]
275 |             pdb.set_trace()
276 |             masked_token_seq_embedding = token_seq_embedding * mask.float()
277 |             result = torch.sum(masked_token_seq_embedding, dim=1,
278 |                                keepdim=True)  # [batch_size, 1, output_dim]
279 | 
280 |             fields_result.append(result)
281 |         if len(fields_result) == 0:
282 |             return None
283 |         else:
284 |             return torch.sum(torch.cat(fields_result, dim=1),
285 |                              dim=1,
286 |                              keepdim=True)  # [batch_size, 1, output_dim]
287 | 
288 |     def forward(self, interaction):
289 |         total_fields_embedding = []
290 |         float_fields = []
291 |         for field_name in self.float_field_names:
292 |             float_fields.append(interaction[field_name]
293 |                                 if len(interaction[field_name].shape) ==
294 |                                 2 else interaction[field_name].unsqueeze(1))
295 | 
296 |         if len(float_fields) > 0:
297 |             float_fields = torch.cat(float_fields,
298 |                                      dim=1)  # [batch_size, num_float_field]
299 |         else:
300 |             float_fields = None
301 | 
302 |         # [batch_size, 1, output_dim] or None
303 |         float_fields_embedding = self.embed_float_fields(float_fields,
304 |                                                          embed=self.embed)
305 | 
306 |         if float_fields_embedding is not None:
307 |             total_fields_embedding.append(float_fields_embedding.float())
308 | 
309 |         token_fields = []
310 |         for field_name in self.token_field_names:
311 |             token_fields.append(interaction[field_name].unsqueeze(1))
312 |         if len(token_fields) > 0:
313 |             token_fields = torch.cat(token_fields,
314 |                                      dim=1)  # [batch_size, num_token_field]
315 |         else:
316 |             token_fields = None
317 |         # [batch_size, 1, output_dim] or None
318 |         token_fields_embedding = self.embed_token_fields(token_fields)
319 |         if token_fields_embedding is not None:
320 |             total_fields_embedding.append(token_fields_embedding)
321 | 
322 |         token_seq_fields = []
323 |         for field_name in self.token_seq_field_names:
324 |             token_seq_fields.append(interaction[field_name])
325 |         # [batch_size, 1, output_dim] or None
326 |         token_seq_fields_embedding = self.embed_token_seq_fields(
327 |             token_seq_fields)
328 |         if token_seq_fields_embedding is not None:
329 |             total_fields_embedding.append(token_seq_fields_embedding)
330 | 
331 |         if self.embed:
332 |             return torch.sum(torch.cat(total_fields_embedding, dim=1),
333 |                              dim=1) + self.bias  # [batch_size, output_dim]
334 |         else:
335 |             return torch.sum(torch.cat(total_fields_embedding, dim=2),
336 |                              dim=2) + self.bias
337 | 
338 | 
class AttLayer(nn.Module):
    """Turn a set of feature vectors into attention weights.

    The input is projected to ``att_dim`` by a bias-free linear layer, passed
    through ReLU, scored against the learned vector ``h`` and normalised with
    a softmax over axis 1.

    Args:
        in_dim (int): Size of the last axis of the input.
        att_dim (int): Size of the hidden attention space.

    Forward input:
        infeatures (torch.FloatTensor): A 3D tensor, expected shape
        [batch_size, M, in_dim].

    Forward output:
        torch.FloatTensor: Attention weights of shape [batch_size, M]; each
        row sums to 1.
    """
    def __init__(self, in_dim, att_dim):
        super(AttLayer, self).__init__()
        self.in_dim = in_dim
        self.att_dim = att_dim
        self.w = torch.nn.Linear(in_features=in_dim,
                                 out_features=att_dim,
                                 bias=False)
        self.h = nn.Parameter(torch.randn(att_dim), requires_grad=True)

    def forward(self, infeatures):
        # [batch_size, M, att_dim]
        hidden = fn.relu(self.w(infeatures))
        # weight each hidden unit by h and fold the att_dim axis -> [batch_size, M]
        scores = torch.sum(torch.mul(hidden, self.h), dim=2)
        # normalise across the M candidates
        return fn.softmax(scores, dim=1)
366 | 
367 | 
class RegLoss(nn.Module):
    """Regularization term: the sum of the L2 norms of the given parameters.

    Note that the norms themselves are added up (not their squares).
    """
    def __init__(self):
        super(RegLoss, self).__init__()

    def forward(self, parameters):
        """Accumulate ``W.norm(2)`` over ``parameters``.

        Returns:
            torch.Tensor or None: The accumulated norm, or ``None`` when
            ``parameters`` yields nothing.
        """
        total = None
        for weight in parameters:
            norm = weight.norm(2)
            total = norm if total is None else total + norm
        return total
383 | 
384 | 
def meshgrid(x, y=None):
    """Build a pair of index grids from two 1-D sequences.

    Args:
        x: 1-D tensor (or anything ``torch.as_tensor`` accepts) of length m.
        y: 1-D tensor-like of length n; defaults to ``x``.

    Returns:
        tuple: ``(grid_x, grid_y)``, both of shape [n, m]. ``grid_x`` repeats
        ``x`` along the rows and ``grid_y`` repeats ``y`` along the columns.
        Both are expanded views, not copies.
    """
    x = torch.as_tensor(x)
    y = x if y is None else torch.as_tensor(y)
    rows, cols = y.size(0), x.size(0)
    return x.unsqueeze(0).expand(rows, cols), y.unsqueeze(1).expand(rows, cols)
394 | 
395 | 
def get_all_combination(x, dim=0, r=2, device='cpu'):
    """Select every r-element combination of slices of ``x`` along ``dim``.

    Args:
        x (torch.Tensor): Source tensor.
        dim (int): Axis whose slices are combined.
        r (int): Number of elements per combination.
        device: Device on which the index tensor is created; it has to match
            the device of ``x`` for ``index_select`` to work.

    Returns:
        tuple of torch.Tensor: ``r`` tensors. The k-th tensor holds, for every
        combination (in the order produced by ``torch.combinations``), the
        slice of ``x`` that is the k-th member of that combination. For the
        default ``r=2`` this is the pair ``(a, b)``.
    """
    xs = torch.arange(x.shape[dim], device=device)
    # [num_combinations, r] index pairs/triples/... in lexicographic order
    idx = torch.combinations(xs, r=r)
    # return one tensor per combination slot instead of only the first two
    return tuple(x.index_select(dim, idx[:, k]) for k in range(r))
409 | 
410 | 
def combinations(x, y, dim, all=True, n=None):
    """Draw random (x-slice, y-slice) pairs from the cartesian product.

    Args:
        x, y (torch.Tensor): Tensors that are indexed along ``dim``.
        dim (int): Axis to sample slices from.
        all: Currently unused (see the TODO in the original description);
            the result is governed by ``n`` alone.
        n (int, optional): Number of pairs to keep. With ``None`` every pair
            of the cartesian product is returned, in random order.

    Returns:
        tuple: ``(x_selected, y_selected)``; slice ``i`` of the first tensor is
        paired with slice ``i`` of the second.
    """
    x_idx = torch.arange(x.shape[dim])
    y_idx = torch.arange(y.shape[dim])
    grid_x, grid_y = meshgrid(x_idx, y_idx)
    # shuffle the flattened cartesian product and keep the first n entries
    picks = torch.randperm(grid_x.numel())[:n]
    left = x.index_select(dim, grid_x.take(picks))
    right = y.index_select(dim, grid_y.take(picks))
    return left, right
431 | 
432 | 
def set_kernel_layer(name):
    """Look up a kernel function by name.

    Returns ``gaussian_rbf_layer`` for ``'gaussian'`` and ``None`` for any
    other name.
    """
    return gaussian_rbf_layer if name == 'gaussian' else None
437 | 
438 | 
def gaussian_rbf_layer(x, y):
    """Gaussian RBF kernel ``exp(-0.5 * d^2)`` of paired rows.

    ``d`` is the 2-norm distance computed by ``torch.pairwise_distance``
    between corresponding entries of ``x`` and ``y``.
    """
    squared = torch.pairwise_distance(x, y, 2).pow(2)
    return torch.exp(-0.5 * squared)
442 | 


--------------------------------------------------------------------------------
/HRec/models/model_map.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekinglcq/HRec/f13a685dd593154d4887ed18bd444e588484d014/HRec/models/model_map.py


--------------------------------------------------------------------------------
/HRec/models/nais.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: nais.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2021-01-25 19:18:52
  7 | # ###########################
  8 | """
  9 | NAIS
 10 | ######################################
 11 | Reference:
 12 |     Xiangnan He et al. "NAIS: Neural Attentive Item Similarity Model for Recommendation." in TKDE 2018.
 13 | """
 14 | 
 15 | import torch
 16 | import logging
 17 | import torch.nn as nn
 18 | from .layers import MLPLayers
 19 | from .base import GeneralModel
 20 | from torch.nn.init import constant_, normal_, xavier_normal_
 21 | 
 22 | 
 23 | class NAIS(GeneralModel):
 24 |     """NAIS is an attention network, which is capable of distinguishing which historical items
 25 |     in a user profile are more important for a prediction. We just implement the model following
 26 |     the original author with a pointwise training mode.
 27 |     Note:
 28 |         instead of forming a minibatch as all training instances of a randomly sampled user which is
 29 |         mentioned in the original paper, we still train the model by a randomly sampled interactions.
 30 | 
 31 |     """
 32 |     def __init__(self, config, dataset):
 33 |         super(NAIS, self).__init__(config, dataset)
 34 | 
 35 |         # load dataset info
 36 |         self.LABEL = config['LABEL_FIELD']
 37 |         self.logger = logging.getLogger()
 38 | 
 39 |         # get all users's history interaction information.the history item
 40 |         # matrix is padding by the maximum number of a user's interactions
 41 |         self.history_item_matrix, self.history_lens, self.mask_mat = self.get_history_info(
 42 |             dataset)
 43 | 
 44 |         # load parameters info
 45 |         self.embedding_size = config['embedding_size']
 46 |         self.weight_size = config['weight_size']
 47 |         self.algorithm = config['algorithm']
 48 |         self.reg_weights = config['reg_weights']
 49 |         self.alpha = config['alpha']
 50 |         self.beta = config['beta']
 51 |         self.split_to = config['split_to']
 52 |         self.pretrain_path = config['pretrain_path']
 53 | 
 54 |         # split the too large dataset into the specified pieces
 55 |         if self.split_to > 0:
 56 |             self.logger.info('split the n_items to {} pieces'.format(
 57 |                 self.split_to))
 58 |             self.group = torch.chunk(
 59 |                 torch.arange(self.n_items).to(self.device), self.split_to)
 60 |         else:
 61 |             self.logger.warning(
 62 |                 'Pay Attetion!! the `split_to` is set to 0. If you catch a OMM error in this case, '
 63 |                 +
 64 |                 'you need to increase it \n\t\t\tuntil the error disappears. For example, '
 65 |                 +
 66 |                 'you can append it in the command line such as `--split_to=5`')
 67 | 
 68 |         # define layers and loss
 69 |         # construct source and destination item embedding matrix
 70 |         self.item_src_embedding = nn.Embedding(self.n_items,
 71 |                                                self.embedding_size,
 72 |                                                padding_idx=0)
 73 |         self.item_dst_embedding = nn.Embedding(self.n_items,
 74 |                                                self.embedding_size,
 75 |                                                padding_idx=0)
 76 |         self.bias = nn.Parameter(torch.zeros(self.n_items))
 77 |         if self.algorithm == 'concat':
 78 |             self.mlp_layers = MLPLayers(
 79 |                 [self.embedding_size * 2, self.weight_size])
 80 |         elif self.algorithm == 'prod':
 81 |             self.mlp_layers = MLPLayers(
 82 |                 [self.embedding_size, self.weight_size])
 83 |         else:
 84 |             raise ValueError(
 85 |                 "NAIS just support attention type in ['concat', 'prod'] but get {}"
 86 |                 .format(self.algorithm))
 87 |         self.weight_layer = nn.Parameter(torch.ones(self.weight_size, 1))
 88 |         self.bceloss = nn.BCELoss()
 89 | 
 90 |         # parameters initialization
 91 |         if self.pretrain_path is not None:
 92 |             self.logger.info('use pretrain from [{}]...'.format(
 93 |                 self.pretrain_path))
 94 |             self._load_pretrain()
 95 |         else:
 96 |             self.logger.info('unuse pretrain...')
 97 |             self.apply(self._init_weights)
 98 | 
 99 |     def _init_weights(self, module):
100 |         """Initialize the module's parameters
101 | 
102 |         Note:
103 |             It's a little different from the source code, because pytorch has no function to initialize
104 |             the parameters by truncated normal distribution, so we replace it with xavier normal distribution
105 | 
106 |         """
107 |         if isinstance(module, nn.Embedding):
108 |             normal_(module.weight.data, 0, 0.01)
109 |         elif isinstance(module, nn.Linear):
110 |             xavier_normal_(module.weight.data)
111 |             if module.bias is not None:
112 |                 constant_(module.bias.data, 0)
113 | 
114 |     def _load_pretrain(self):
115 |         """A simple implementation of loading pretrained parameters.
116 | 
117 |         """
118 |         fism = torch.load(self.pretrain_path)['state_dict']
119 |         self.item_src_embedding.weight.data.copy_(
120 |             fism['item_src_embedding.weight'])
121 |         self.item_dst_embedding.weight.data.copy_(
122 |             fism['item_dst_embedding.weight'])
123 |         for name, parm in self.mlp_layers.named_parameters():
124 |             if name.endswith('weight'):
125 |                 xavier_normal_(parm.data)
126 |             elif name.endswith('bias'):
127 |                 constant_(parm.data, 0)
128 | 
129 |     def get_history_info(self, dataset):
130 |         """get the user history interaction information
131 | 
132 |         Args:
133 |             dataset (DataSet): train dataset
134 | 
135 |         Returns:
136 |             tuple: (history_item_matrix, history_lens, mask_mat)
137 | 
138 |         """
139 |         history_item_matrix, _, history_lens = dataset.history_item_matrix()
140 |         history_item_matrix = history_item_matrix.to(self.device)
141 |         history_lens = history_lens.to(self.device)
142 |         arange_tensor = torch.arange(history_item_matrix.shape[1]).to(
143 |             self.device)
144 |         mask_mat = (arange_tensor < history_lens.unsqueeze(1)).float()
145 |         return history_item_matrix, history_lens, mask_mat
146 | 
147 |     def reg_loss(self):
148 |         """calculate the reg loss for embedding layers and mlp layers
149 | 
150 |         Returns:
151 |             torch.Tensor: reg loss
152 | 
153 |         """
154 |         reg_1, reg_2, reg_3 = self.reg_weights
155 |         loss_1 = reg_1 * self.item_src_embedding.weight.norm(2)
156 |         loss_2 = reg_2 * self.item_dst_embedding.weight.norm(2)
157 |         loss_3 = 0
158 |         for name, parm in self.mlp_layers.named_parameters():
159 |             if name.endswith('weight'):
160 |                 loss_3 = loss_3 + reg_3 * parm.norm(2)
161 |         return loss_1 + loss_2 + loss_3
162 | 
163 |     def attention_mlp(self, inter, target):
164 |         """layers of attention which support `prod` and `concat`
165 | 
166 |         Args:
167 |             inter (torch.Tensor): the embedding of history items
168 |             target (torch.Tensor): the embedding of target items
169 | 
170 |         Returns:
171 |             torch.Tensor: the result of attention
172 | 
173 |         """
174 |         if self.algorithm == 'prod':
175 |             mlp_input = inter * target.unsqueeze(
176 |                 1)  # batch_size x max_len x embedding_size
177 |         else:
178 |             mlp_input = torch.cat(
179 |                 [inter, target.unsqueeze(1).expand_as(inter)],
180 |                 dim=2)  # batch_size x max_len x embedding_size*2
181 |         mlp_output = self.mlp_layers(
182 |             mlp_input)  # batch_size x max_len x weight_size
183 | 
184 |         logits = torch.matmul(mlp_output, self.weight_layer).squeeze(
185 |             2)  # batch_size x max_len
186 |         return logits
187 | 
188 |     def mask_softmax(self, similarity, logits, bias, item_num, batch_mask_mat):
189 |         """softmax the unmasked user history items and get the final output
190 | 
191 |         Args:
192 |             similarity (torch.Tensor): the similarity between the histoy items and target items
193 |             logits (torch.Tensor): the initial weights of the history items
194 |             item_num (torch.Tensor): user hitory interaction lengths
195 |             bias (torch.Tensor): bias
196 |             batch_mask_mat (torch.Tensor): the mask of user history interactions
197 | 
198 |         Returns:
199 |             torch.Tensor: final output
200 | 
201 |         """
202 |         exp_logits = torch.exp(logits)  # batch_size x max_len
203 | 
204 |         exp_logits = batch_mask_mat * exp_logits  # batch_size x max_len
205 |         exp_sum = torch.sum(exp_logits, dim=1, keepdim=True)
206 |         exp_sum = torch.pow(exp_sum, self.beta)
207 |         weights = torch.div(exp_logits, exp_sum)
208 | 
209 |         coeff = torch.pow(item_num.squeeze(1), -self.alpha)
210 |         output = torch.sigmoid(coeff.float() *
211 |                                torch.sum(weights * similarity, dim=1) + bias)
212 | 
213 |         return output
214 | 
215 |     def softmax(self, similarity, logits, item_num, bias):
216 |         """softmax the user history features and get the final output
217 | 
218 |         Args:
219 |             similarity (torch.Tensor): the similarity between the histoy items and target items
220 |             logits (torch.Tensor): the initial weights of the history items
221 |             item_num (torch.Tensor): user hitory interaction lengths
222 |             bias (torch.Tensor): bias
223 | 
224 |         Returns:
225 |             torch.Tensor: final output
226 | 
227 |         """
228 |         exp_logits = torch.exp(logits)  # batch_size x max_len
229 |         exp_sum = torch.sum(exp_logits, dim=1, keepdim=True)
230 |         exp_sum = torch.pow(exp_sum, self.beta)
231 |         weights = torch.div(exp_logits, exp_sum)
232 |         coeff = torch.pow(item_num.squeeze(1), -self.alpha)
233 |         output = torch.sigmoid(coeff.float() *
234 |                                torch.sum(weights * similarity, dim=1) + bias)
235 | 
236 |         return output
237 | 
238 |     def inter_forward(self, user, item):
239 |         """forward the model by interaction
240 | 
241 |         """
242 |         user_inter = self.history_item_matrix[user]
243 |         item_num = self.history_lens[user].unsqueeze(1)
244 |         batch_mask_mat = self.mask_mat[user]
245 |         user_history = self.item_src_embedding(
246 |             user_inter)  # batch_size x max_len x embedding_size
247 |         target = self.item_dst_embedding(item)  # batch_size x embedding_size
248 |         bias = self.bias[item]  # batch_size x 1
249 |         similarity = torch.bmm(user_history, target.unsqueeze(2)).squeeze(
250 |             2)  # batch_size x max_len
251 |         logits = self.attention_mlp(user_history, target)
252 |         scores = self.mask_softmax(similarity, logits, bias, item_num,
253 |                                    batch_mask_mat)
254 |         return scores
255 | 
256 |     def user_forward(self, user_input, item_num, repeats=None, pred_slc=None):
257 |         """forward the model by user
258 | 
259 |         Args:
260 |             user_input (torch.Tensor): user input tensor
261 |             item_num (torch.Tensor): user hitory interaction lens
262 |             repeats (int, optional): the number of items to be evaluated
263 |             pred_slc (torch.Tensor, optional): continuous index which controls the current evaluation items,
264 |                                               if pred_slc is None, it will evaluate all items
265 | 
266 |         Returns:
267 |             torch.Tensor: result
268 | 
269 |         """
270 |         item_num = item_num.repeat(repeats, 1)
271 |         user_history = self.item_src_embedding(
272 |             user_input)  # inter_num x embedding_size
273 |         user_history = user_history.repeat(
274 |             repeats, 1, 1)  # target_items x inter_num x embedding_size
275 |         if pred_slc is None:
276 |             targets = self.item_dst_embedding.weight  # target_items x embedding_size
277 |             bias = self.bias
278 |         else:
279 |             targets = self.item_dst_embedding(pred_slc)
280 |             bias = self.bias[pred_slc]
281 |         similarity = torch.bmm(user_history, targets.unsqueeze(2)).squeeze(
282 |             2)  # inter_num x target_items
283 |         logits = self.attention_mlp(user_history, targets)
284 |         scores = self.softmax(similarity, logits, item_num, bias)
285 |         return scores
286 | 
287 |     def forward(self, user, item):
288 |         return self.inter_forward(user.long(), item.long())
289 | 
290 |     def calculate_loss(self, interaction):
291 |         user = interaction[self.USER_ID]
292 |         item = interaction[self.ITEM_ID]
293 |         label = interaction[self.LABEL].float()
294 |         output = self.forward(user, item)
295 |         loss = self.bceloss(output, label) + self.reg_loss()
296 |         return loss
297 | 
298 |     def full_sort_predict(self, interaction):
299 |         user = interaction[self.USER_ID]
300 |         user_inters = self.history_item_matrix[user]
301 |         item_nums = self.history_lens[user]
302 |         scores = []
303 | 
304 |         # test users one by one, if the number of items is too large, we will split it to some pieces
305 |         for user_input, item_num in zip(user_inters, item_nums.unsqueeze(1)):
306 |             if self.split_to <= 0:
307 |                 output = self.user_forward(user_input[:item_num],
308 |                                            item_num,
309 |                                            repeats=self.n_items)
310 |             else:
311 |                 output = []
312 |                 for mask in self.group:
313 |                     tmp_output = self.user_forward(user_input[:item_num],
314 |                                                    item_num,
315 |                                                    repeats=len(mask),
316 |                                                    pred_slc=mask)
317 |                     output.append(tmp_output)
318 |                 output = torch.cat(output, dim=0)
319 |             scores.append(output)
320 |         result = torch.cat(scores, dim=0)
321 |         return result
322 | 
323 |     def predict(self, interaction):
324 |         user = interaction[self.USER_ID]
325 |         item = interaction[self.ITEM_ID]
326 |         output = self.forward(user, item)
327 |         return output
328 | 


--------------------------------------------------------------------------------
/HRec/models/ncf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: ncf.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2020-12-25 20:54:34
  7 | # ###########################
  8 | 
  9 | import torch
 10 | import logging
 11 | import torch.nn as nn
 12 | from .layers import MLPLayers
 13 | from .base import GeneralModel
 14 | from torch.nn.init import normal_
 15 | """
 16 | NCF
 17 | #######################################
 18 | Reference:
 19 |   Xiangnan He et al. "Neural Collaborative Filtering." in WWW 2017.
 20 | """
 21 | 
 22 | 
 23 | class NCF(GeneralModel):
 24 |     def __init__(self, config, dataset):
 25 |         super().__init__(config, dataset)
 26 | 
 27 |         # load dataset info
 28 |         self.LABEL = dataset.config['LABEL_FIELD']
 29 |         self.logger = logging.getLogger()
 30 | 
 31 |         # load parameters info
 32 |         self.mf_embedding_size = config['mf_embedding_size']
 33 |         self.mlp_embedding_size = config['mlp_embedding_size']
 34 |         self.mlp_hidden_size = config['mlp_hidden_size']
 35 |         self.dropout_prob = config['dropout_prob']
 36 |         self.mf_train = config['mf_train']
 37 |         self.mlp_train = config['mlp_train']
 38 |         self.use_pretrain = config['use_pretrain']
 39 |         self.mf_pretrain_path = config['mf_pretrain_path']
 40 |         self.mlp_pretrain_path = config['mlp_pretrain_path']
 41 | 
 42 |         # define layers and loss
 43 |         self.user_mf_embedding = nn.Embedding(self.n_users,
 44 |                                               self.mf_embedding_size)
 45 |         self.item_mf_embedding = nn.Embedding(self.n_items,
 46 |                                               self.mf_embedding_size)
 47 |         self.user_mlp_embedding = nn.Embedding(self.n_users,
 48 |                                                self.mlp_embedding_size)
 49 |         self.item_mlp_embedding = nn.Embedding(self.n_items,
 50 |                                                self.mlp_embedding_size)
 51 |         self.mlp_layers = MLPLayers([2 * self.mlp_embedding_size] +
 52 |                                     self.mlp_hidden_size)
 53 |         self.mlp_layers.logger = None  # remove logger to use torch.save()
 54 |         if self.mf_train and self.mlp_train:
 55 |             self.predict_layer = nn.Linear(
 56 |                 self.mf_embedding_size + self.mlp_hidden_size[-1], 1,
 57 |                 self.dropout_prob)
 58 |         elif self.mf_train:
 59 |             self.predict_layer = nn.Linear(self.mf_embedding_size, 1)
 60 |         elif self.mlp_train:
 61 |             self.predict_layer = nn.Linear(self.mlp_hidden_size[-1], 1)
 62 |         self.sigmoid = nn.Sigmoid()
 63 |         self.loss = nn.BCELoss()
 64 | 
 65 |         # parameters initialization
 66 |         if self.use_pretrain:
 67 |             self.load_pretrain()
 68 |         else:
 69 |             self.apply(self._init_weights)
 70 | 
 71 |     def load_pretrain(self):
 72 |         r"""A simple implementation of loading pretrained parameters.
 73 | 
 74 |         """
 75 |         mf = torch.load(self.mf_pretrain_path)
 76 |         mlp = torch.load(self.mlp_pretrain_path)
 77 |         self.user_mf_embedding.weight.data.copy_(mf.user_mf_embedding.weight)
 78 |         self.item_mf_embedding.weight.data.copy_(mf.item_mf_embedding.weight)
 79 |         self.user_mlp_embedding.weight.data.copy_(
 80 |             mlp.user_mlp_embedding.weight)
 81 |         self.item_mlp_embedding.weight.data.copy_(
 82 |             mlp.item_mlp_embedding.weight)
 83 | 
 84 |         for (m1, m2) in zip(self.mlp_layers.mlp_layers,
 85 |                             mlp.mlp_layers.mlp_layers):
 86 |             if isinstance(m1, nn.Linear) and isinstance(m2, nn.Linear):
 87 |                 m1.weight.data.copy_(m2.weight)
 88 |                 m1.bias.data.copy_(m2.bias)
 89 | 
 90 |         predict_weight = torch.cat(
 91 |             [mf.predict_layer.weight, mlp.predict_layer.weight], dim=1)
 92 |         predict_bias = mf.predict_layer.bias + mlp.predict_layer.bias
 93 | 
 94 |         self.predict_layer.weight.data.copy_(0.5 * predict_weight)
 95 |         self.predict_layer.weight.data.copy_(0.5 * predict_bias)
 96 | 
 97 |     def _init_weights(self, module):
 98 |         if isinstance(module, nn.Embedding):
 99 |             normal_(module.weight.data, mean=0.0, std=0.01)
100 | 
101 |     def forward(self, user, item):
102 |         user = user.long()
103 |         item = item.long()
104 |         user_mf_e = self.user_mf_embedding(user)
105 |         item_mf_e = self.item_mf_embedding(item)
106 |         user_mlp_e = self.user_mlp_embedding(user)
107 |         item_mlp_e = self.item_mlp_embedding(item)
108 |         if self.mf_train:
109 |             mf_output = torch.mul(user_mf_e,
110 |                                   item_mf_e)  # [batch_size, embedding_size]
111 |         if self.mlp_train:
112 |             mlp_output = self.mlp_layers(
113 |                 torch.cat((user_mlp_e, item_mlp_e),
114 |                           -1))  # [batch_size, layers[-1]]
115 |         if self.mf_train and self.mlp_train:
116 |             output = self.sigmoid(
117 |                 self.predict_layer(torch.cat((mf_output, mlp_output), -1)))
118 |         elif self.mf_train:
119 |             output = self.sigmoid(self.predict_layer(mf_output))
120 |         elif self.mlp_train:
121 |             output = self.sigmoid(self.predict_layer(mlp_output))
122 |         else:
123 |             raise RuntimeError(
124 |                 'mf_train and mlp_train can not be False at the same time')
125 |         return output.squeeze()
126 | 
127 |     def calculate_loss(self, interaction):
128 |         user = interaction[self.USER_ID]
129 |         item = interaction[self.ITEM_ID]
130 |         label = interaction[self.LABEL].float()
131 | 
132 |         output = self.forward(user, item)
133 |         return self.loss(output, label)
134 | 
135 |     def predict(self, interaction):
136 |         user = interaction[self.USER_ID]
137 |         item = interaction[self.ITEM_ID]
138 |         return self.forward(user, item)
139 | 
140 |     def dump_parameters(self):
141 |         r"""A simple implementation of dumping model parameters for pretrain.
142 | 
143 |         """
144 |         if self.mf_train and not self.mlp_train:
145 |             save_path = self.mf_pretrain_path
146 |             torch.save(self, save_path)
147 |         elif self.mlp_train and not self.mf_train:
148 |             save_path = self.mlp_pretrain_path
149 |             torch.save(self, save_path)
150 | 


--------------------------------------------------------------------------------
/HRec/models/nfm.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | # ###########################
 3 | # File Name: nfm.py
 4 | # Author: geekinglcq
 5 | # Mail: lcqgeek@live.com
 6 | # Created Time: 2020-12-31 17:30:38
 7 | # ###########################
 8 | r"""
 9 | NFM
10 | ################################################
11 | Reference:
12 |     He X, Chua T S. "Neural factorization machines for sparse predictive analytics" in SIGIR 2017
13 | """
14 | 
15 | import torch
16 | import torch.nn as nn
17 | from torch.nn.init import xavier_normal_, constant_
18 | 
19 | from .layers import BaseFactorizationMachine, MLPLayers
20 | from .base import ContextModel
21 | 
22 | 
class NFM(ContextModel):
    """Neural Factorization Machine (He & Chua, SIGIR 2017).

    The field embeddings go through a factorization-machine interaction layer
    (``reduce_sum=False``), batch normalisation and an MLP. The MLP score is
    added to the first-order linear term and squashed with a sigmoid.
    """
    def __init__(self, config, dataset):
        """Build the layers from ``config``.

        Args:
            config: mapping providing ``mlp_hidden_size`` (list of ints) and
                ``dropout_prob``; also forwarded to ``ContextModel``.
            dataset: forwarded to ``ContextModel``.
        """
        super(NFM, self).__init__(config, dataset)

        # load parameters info
        self.mlp_hidden_size = config['mlp_hidden_size']
        self.dropout_prob = config['dropout_prob']

        # define layers and loss
        size_list = [self.embedding_size] + self.mlp_hidden_size
        self.fm = BaseFactorizationMachine(reduce_sum=False)
        self.bn = nn.BatchNorm1d(num_features=self.embedding_size)
        self.mlp_layers = MLPLayers(size_list,
                                    self.dropout_prob,
                                    activation='sigmoid',
                                    bn=True)
        self.predict_layer = nn.Linear(self.mlp_hidden_size[-1], 1, bias=False)
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()

        # parameters initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-normal init for Embedding/Linear weights, zero Linear bias."""
        if isinstance(module, nn.Embedding):
            xavier_normal_(module.weight.data)
        elif isinstance(module, nn.Linear):
            xavier_normal_(module.weight.data)
            if module.bias is not None:
                constant_(module.bias.data, 0)

    def forward(self, interaction):
        """Return the predicted probability for every row of ``interaction``.

        Only 3-D embeddings take part in the FM interaction; a 2-D dense
        embedding is skipped here.
        """
        # sparse_embedding shape: [batch_size, num_token_seq_field+num_token_field, embed_dim] or None
        # dense_embedding shape: [batch_size, num_float_field] or [batch_size, num_float_field, embed_dim] or None
        sparse_embedding, dense_embedding = self.embed_input_fields(
            interaction)
        all_embeddings = []
        if sparse_embedding is not None:
            all_embeddings.append(sparse_embedding)
        if dense_embedding is not None and len(dense_embedding.shape) == 3:
            all_embeddings.append(dense_embedding)
        nfm_all_embeddings = torch.cat(
            all_embeddings, dim=1)  # [batch_size, num_field, embed_dim]
        bn_nfm_all_embeddings = self.bn(self.fm(nfm_all_embeddings))

        output = self.sigmoid(
            self.predict_layer(self.mlp_layers(bn_nfm_all_embeddings)) +
            self.first_order_linear(interaction))
        return output.squeeze()

    def calculate_loss(self, interaction):
        """Binary cross-entropy between the predictions and the labels."""
        label = interaction[self.LABEL]
        output = self.forward(interaction)
        # BCELoss needs a floating-point target; labels may arrive as
        # integers, so cast explicitly (a no-op when already float).
        return self.loss(output, label.float())

    def predict(self, interaction):
        """Alias of :meth:`forward` used at inference time."""
        return self.forward(interaction)
83 | 


--------------------------------------------------------------------------------
/HRec/models/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | 
 3 | import gc
 4 | import torch
 5 | from enum import Enum
 6 | 
 7 | 
class ModelType(Enum):
    """Type of models:
        - GENERAL: General model, treat different type item as the same.
        - CONTEXT: Context-based model (presumably the ``ContextModel``
          family such as NFM / WideDeep -- TODO confirm against base.py).
        - HETERO: Heterogeneous model
    """
    GENERAL = 1
    CONTEXT = 2
    HETERO = 3
16 | 
17 | 
18 | ## MEM utils ##
def mem_report():
    '''Report the memory usage of the tensor.storage in pytorch
    Both on CPUs and GPUs are reported.

    Every tensor currently tracked by the garbage collector is inspected and
    one line per distinct storage block is printed to stdout, followed by
    totals. Nothing is returned.
    '''
    LEN = 65  # width of the separator rules

    def _mem_report(tensors, mem_type):
        '''Print the selected tensors of type

        There are two major storage types in our major concern:
            - GPU: tensors transferred to CUDA devices
            - CPU: tensors remaining on the system memory (usually unimportant)

        Args:
            - tensors: the tensors of specified type
            - mem_type: 'CPU' or 'GPU' in current implementation '''
        print('Storage on %s' % (mem_type))
        print('-' * LEN)
        total_numel = 0
        total_mem = 0
        # Storage addresses already counted; a set keeps the lookup O(1).
        visited_data = set()
        for tensor in tensors:
            if tensor.is_sparse:
                continue
            storage = tensor.storage()
            # a data_ptr indicates a memory block allocated; tensors that
            # share one storage (views) must only be counted once
            data_ptr = storage.data_ptr()
            if data_ptr in visited_data:
                continue
            visited_data.add(data_ptr)

            numel = storage.size()
            total_numel += numel
            element_size = storage.element_size()
            mem = numel * element_size / 1024 / 1024  # 32bit=4Byte, MByte
            total_mem += mem
            element_type = type(tensor).__name__
            size = tuple(tensor.size())

            print('%s\t\t%s\t\t%.2f' % (element_type, size, mem))
        print('-' * LEN)
        # NOTE: the first figure is the summed number of storage elements.
        print('Total Tensors: %d \tUsed Memory Space: %.2f MBytes' %
              (total_numel, total_mem))
        print('-' * LEN)

    print('=' * LEN)
    objects = gc.get_objects()
    print('%s\t%s\t\t\t%s' % ('Element type', 'Size', 'Used MEM(MBytes)'))
    tensors = [obj for obj in objects if torch.is_tensor(obj)]
    cuda_tensors = [t for t in tensors if t.is_cuda]
    host_tensors = [t for t in tensors if not t.is_cuda]
    _mem_report(cuda_tensors, 'GPU')
    _mem_report(host_tensors, 'CPU')
    print('=' * LEN)
72 | 


--------------------------------------------------------------------------------
/HRec/models/widedeep.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | # ###########################
 3 | # File Name: widedeep.py
 4 | # Author: geekinglcq
 5 | # Mail: lcqgeek@live.com
 6 | # Created Time: 2020-12-31 17:31:01
 7 | # ###########################
 8 | 
 9 | # -*- coding: utf-8 -*-
10 | # @Time   : 2020/08/30
11 | # @Author : Xinyan Fan
12 | # @Email  : xinyan.fan@ruc.edu.cn
13 | # @File   : widedeep.py
14 | r"""
15 | WideDeep
16 | #####################################################
17 | Reference:
18 |     Heng-Tze Cheng et al. "Wide & Deep Learning for Recommender Systems." in RecSys 2016.
19 | """
20 | 
21 | import torch
22 | import torch.nn as nn
23 | from torch.nn.init import xavier_normal_, constant_
24 | 
25 | from .layers import MLPLayers
26 | from .base import ContextModel
27 | 
28 | 
class WideDeep(ContextModel):
    r"""WideDeep is a context-based recommendation model.
    It jointly trains wide linear models and deep neural networks to combine the benefits
    of memorization and generalization for recommender systems. The wide component is a generalized linear model
    of the form :math:`y = w^Tx + b`. The deep component is a feed-forward neural network. The wide component
    and deep component are combined using a weighted sum of their output log odds as the prediction,
    which is then fed to one common logistic loss function for joint training.
    """
    def __init__(self, config, dataset):
        """Build the deep tower from ``config``.

        Args:
            config: mapping providing ``mlp_hidden_size`` (list of ints) and
                ``dropout_prob``; also forwarded to ``ContextModel``.
            dataset: forwarded to ``ContextModel``.
        """
        super(WideDeep, self).__init__(config, dataset)

        # load parameters info
        self.mlp_hidden_size = config['mlp_hidden_size']
        self.dropout_prob = config['dropout_prob']

        # define layers and loss
        size_list = [
            self.embedding_size * len(self.token_field_names) +
            len(self.float_field_names)
        ] + self.mlp_hidden_size

        self.mlp_layers = MLPLayers(size_list, self.dropout_prob)
        self.deep_predict_layer = nn.Linear(self.mlp_hidden_size[-1], 1)
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()

        # parameters initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-normal init for Embedding/Linear weights, zero Linear bias."""
        if isinstance(module, nn.Embedding):
            xavier_normal_(module.weight.data)
        elif isinstance(module, nn.Linear):
            xavier_normal_(module.weight.data)
            if module.bias is not None:
                constant_(module.bias.data, 0)

    def forward(self, interaction):
        """Return the predicted probability for every row of ``interaction``.

        The wide score (``first_order_linear``) and the deep score (MLP over
        the flattened embeddings) are summed before the sigmoid.
        """
        # sparse_embedding shape: [batch_size, num_token_seq_field+num_token_field, embed_dim] or None
        # dense_embedding shape: [batch_size, num_float_field] or [batch_size, num_float_field, embed_dim] or None
        sparse_embedding, dense_embedding = self.embed_input_fields(
            interaction)
        all_embeddings = []
        # The batch size is read from each tensor *after* its None check, so
        # a missing sparse embedding no longer raises AttributeError.
        if sparse_embedding is not None:
            all_embeddings.append(
                sparse_embedding.view(sparse_embedding.shape[0], -1))
        if dense_embedding is not None and len(dense_embedding.shape) == 3:
            all_embeddings.append(
                dense_embedding.view(dense_embedding.shape[0], -1))
        widedeep_all_embeddings = torch.cat(
            all_embeddings, dim=1)  # [batch_size, sum of flattened dims]
        fm_output = self.first_order_linear(interaction)

        deep_output = self.deep_predict_layer(
            self.mlp_layers(widedeep_all_embeddings))
        output = self.sigmoid(fm_output + deep_output)
        return output.squeeze()

    def calculate_loss(self, interaction):
        """Binary cross-entropy between the predictions and the float labels."""
        label = interaction[self.LABEL]
        output = self.forward(interaction)
        return self.loss(output, label.float())

    def predict(self, interaction):
        """Alias of :meth:`forward` used at inference time."""
        return self.forward(interaction)
93 | 


--------------------------------------------------------------------------------
/HRec/models/xdeepfm.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: xdeepfm.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2020-12-31 17:31:23
  7 | # ###########################
  8 | r"""
  9 | xDeepFM
 10 | ################################################
 11 | Reference:
 12 |     Jianxun Lian at al. "xDeepFM: Combining Explicit and Implicit Feature Interactions for Recommender Systems."
 13 |     in SIGKDD 2018.
 14 | """
 15 | 
 16 | import torch
 17 | import torch.nn as nn
 18 | from torch.nn.init import xavier_normal_, constant_
 19 | from logging import getLogger
 20 | 
 21 | from .layers import MLPLayers, activation_layer
 22 | from .base import ContextModel
 23 | 
 24 | 
 25 | class xDeepFM(ContextModel):
 26 |     """xDeepFM combines a CIN (Compressed Interaction Network) with a classical DNN.
 27 |     The model is able to learn certain bounded-degree feature interactions explicitly;
 28 |     Besides, it can also learn arbitrary low- and high-order feature interactions implicitly.
 29 |     """
 30 |     def __init__(self, config, dataset):
 31 |         super(xDeepFM, self).__init__(config, dataset)
 32 | 
 33 |         # load parameters info
 34 |         self.mlp_hidden_size = config['mlp_hidden_size']
 35 |         self.reg_weight = config['reg_weight']
 36 |         self.dropout_prob = config['dropout_prob']
 37 |         self.direct = config['direct']
 38 |         self.eize = temp_cin_size = list(config['cin_layer_size'])
 39 | 
 40 |         # Check whether the size of the CIN layer is legal.
 41 |         if not self.direct:
 42 |             self.cin_layer_size = list(
 43 |                 map(lambda x: int(x // 2 * 2), temp_cin_size))
 44 |             if self.cin_layer_size[:-1] != temp_cin_size[:-1]:
 45 |                 logger = getLogger()
 46 |                 logger.warning(
 47 |                     'Layer size of CIN should be even except for the last layer when direct is True.'
 48 |                     'It is changed to {}'.format(self.cin_layer_size))
 49 | 
 50 |         # Create a convolutional layer for each CIN layer
 51 |         self.conv1d_list = []
 52 |         self.field_nums = [self.num_feature_field]
 53 |         for i, layer_size in enumerate(self.cin_layer_size):
 54 |             conv1d = nn.Conv1d(self.field_nums[-1] * self.field_nums[0],
 55 |                                layer_size, 1).to(self.device)
 56 |             self.conv1d_list.append(conv1d)
 57 |             if self.direct:
 58 |                 self.field_nums.append(layer_size)
 59 |             else:
 60 |                 self.field_nums.append(layer_size // 2)
 61 | 
 62 |         # Create MLP layer
 63 |         size_list = [
 64 |             self.embedding_size * len(self.token_field_names) +
 65 |             len(self.float_field_names)
 66 |         ] + self.mlp_hidden_size + [1]
 67 |         self.mlp_layers = MLPLayers(size_list, dropout=self.dropout_prob)
 68 | 
 69 |         # Get the output size of CIN
 70 |         if self.direct:
 71 |             self.final_len = sum(self.cin_layer_size)
 72 |         else:
 73 |             self.final_len = sum(
 74 |                 self.cin_layer_size[:-1]) // 2 + self.cin_layer_size[-1]
 75 | 
 76 |         self.cin_linear = nn.Linear(self.final_len, 1, bias=False)
 77 |         self.sigmoid = nn.Sigmoid()
 78 |         self.loss = nn.BCELoss()
 79 |         self.apply(self._init_weights)
 80 | 
 81 |     def _init_weights(self, module):
 82 |         if isinstance(module, nn.Embedding):
 83 |             xavier_normal_(module.weight.data)
 84 |         elif isinstance(module, nn.Linear):
 85 |             xavier_normal_(module.weight.data)
 86 |             if module.bias is not None:
 87 |                 constant_(module.bias.data, 0)
 88 | 
 89 |     def reg_loss(self, parameters):
 90 |         """Calculate the L2 normalization loss of parameters in a certain layer.
 91 | 
 92 |         Returns:
 93 |             loss(torch.FloatTensor): The L2 Loss tensor. shape of [1,]
 94 |         """
 95 |         reg_loss = 0
 96 |         for name, parm in parameters:
 97 |             if name.endswith('weight'):
 98 |                 reg_loss = reg_loss + parm.norm(2)
 99 |         return reg_loss
100 | 
101 |     def calculate_reg_loss(self):
102 |         """Calculate the final L2 normalization loss of model parameters.
103 |         Including weight matrixes of mlp layers, linear layer and convolutional layers.
104 | 
105 |         Returns:
106 |             loss(torch.FloatTensor): The L2 Loss tensor. shape of [1,]
107 |         """
108 |         l2_reg = 0
109 |         l2_reg = l2_reg + self.reg_loss(self.mlp_layers.named_parameters())
110 |         l2_reg = l2_reg + self.reg_loss(
111 |             self.first_order_linear.named_parameters())
112 |         for conv1d in self.conv1d_list:
113 |             l2_reg += self.reg_loss(conv1d.named_parameters())
114 |         return l2_reg
115 | 
116 |     def compressed_interaction_network(self,
117 |                                        input_features,
118 |                                        activation='identity'):
119 |         r"""For k-th CIN layer, the output :math:`X_k` is calculated via
120 | 
121 |         .. math::
122 |             x_{h,*}^{k} = \sum_{i=1}^{H_k-1} \sum_{j=1}^{m}W_{i,j}^{k,h}(X_{i,*}^{k-1} \circ x_{j,*}^0)
123 | 
124 |         :math:`H_k` donates the number of feature vectors in the k-th layer,
125 |         :math:`1 \le h \le H_k`.
126 |         :math:`\circ` donates the Hadamard product.
127 | 
128 |         And Then, We apply sum pooling on each feature map of the hidden layer.
129 |         Finally, All pooling vectors from hidden layers are concatenated.
130 | 
131 |         Args:
132 |             input_features(torch.Tensor): [batch_size, field_num, embed_dim]. Embedding vectors of all features.
133 |             activation(str): name of activation function.
134 | 
135 |         Returns:
136 |             torch.Tensor: [batch_size, num_feature_field * embedding_size]. output of CIN layer.
137 |         """
138 |         batch_size, _, embedding_size = input_features.shape
139 |         hidden_nn_layers = [input_features]
140 |         final_result = []
141 |         for i, layer_size in enumerate(self.cin_layer_size):
142 |             z_i = torch.einsum('bmd,bhd->bhmd', hidden_nn_layers[0],
143 |                                hidden_nn_layers[-1])
144 |             z_i = z_i.view(batch_size, self.field_nums[0] * self.field_nums[i],
145 |                            embedding_size)
146 |             z_i = self.conv1d_list[i](z_i)
147 | 
148 |             # Pass the CIN intermediate result through the activation function.
149 |             if activation.lower() == 'identity':
150 |                 output = z_i
151 |             else:
152 |                 activate_func = activation_layer(activation)
153 |                 if activate_func is None:
154 |                     output = z_i
155 |                 else:
156 |                     output = activate_func(z_i)
157 | 
158 |             # Get the output of the hidden layer.
159 |             if self.direct:
160 |                 direct_connect = output
161 |                 next_hidden = output
162 |             else:
163 |                 if i != len(self.cin_layer_size) - 1:
164 |                     next_hidden, direct_connect = torch.split(
165 |                         output, 2 * [layer_size // 2], 1)
166 |                 else:
167 |                     direct_connect = output
168 |                     next_hidden = 0
169 | 
170 |             final_result.append(direct_connect)
171 |             hidden_nn_layers.append(next_hidden)
172 |         result = torch.cat(final_result, dim=1)
173 |         result = torch.sum(result, -1)
174 |         return result
175 | 
176 |     def forward(self, interaction):
177 |         sparse_embedding, dense_embedding = self.embed_input_fields(
178 |             interaction)
179 |         all_embeddings = []
180 |         batch_size = sparse_embedding.shape[0]
181 |         if sparse_embedding is not None:
182 |             all_embeddings.append(sparse_embedding.view(batch_size, -1))
183 |         if dense_embedding is not None and len(dense_embedding.shape) == 3:
184 |             all_embeddings.append(dense_embedding.view(batch_size, -1))
185 | 
186 |         # Get the output of CIN.
187 |         xdeepfm_input = torch.cat(all_embeddings,
188 |                                   dim=1)  # [batch_size, num_field, embed_dim]
189 |         cin_output = self.compressed_interaction_network(sparse_embedding)
190 |         cin_output = self.cin_linear(cin_output)
191 | 
192 |         # Get the output of MLP layer.
193 |         batch_size = xdeepfm_input.shape[0]
194 |         dnn_output = self.mlp_layers(xdeepfm_input.view(batch_size, -1))
195 | 
196 |         # Get predicted score.
197 |         y_p = self.first_order_linear(interaction) + cin_output + dnn_output
198 |         y = self.sigmoid(y_p)
199 | 
200 |         return y.squeeze(1)
201 | 
202 |     def calculate_loss(self, interaction):
203 |         label = interaction[self.LABEL]
204 |         output = self.forward(interaction)
205 |         l2_reg = self.calculate_reg_loss()
206 |         return self.loss(output, label) + self.reg_weight * l2_reg
207 | 
208 |     def predict(self, interaction):
209 |         return self.forward(interaction)
210 | 


--------------------------------------------------------------------------------
/HRec/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 | from .process import Process
2 | from .hprocess import HProcess
3 | from .dprocess import DProcess
4 | from .utils import get_free_gpu
5 | 


--------------------------------------------------------------------------------
/HRec/pipeline/configure.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | 
 3 | import json
 4 | class Config(object):
 5 | 
 6 |     """Config class that control all config in the experiment.
 7 |     """
 8 |     def __init__(self, config_path):
 9 | 
10 |         self.dict = json.load(open(config_path))
11 |         for key in ['data', 'model', 'opt']:
12 |             self.dict.update(self.dict[key])
13 | 
14 |     
15 |     def __getitem__(self, arg):
16 | 
17 |         if arg in self.dict:
18 |             return self.get(arg)
19 |         else:
20 |             raise ValueError(f'No [{arg}] value in this config.')
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/HRec/pipeline/dprocess.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: dprocess.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2021-02-03 02:14:47
  7 | # ###########################
  8 | 
  9 | import os
 10 | from tqdm import tqdm
 11 | from .optimizer import Optimizer
 12 | import torch
 13 | from ..models import model_map
 14 | from .hprocess import HProcess
 15 | from collections import defaultdict
 16 | from torch.autograd import Variable
 17 | 
 18 | 
 19 | class DProcess(HProcess):
 20 |     """
 21 |     Process for the DDTCDR model
 22 |     """
 23 |     def __init__(self, config):
 24 |         self.config = config
 25 |         self._path_config(config['path'])
 26 |         self._logger_config()
 27 |         self._set_device(config)
 28 |         self._prepare_data(config['data'])
 29 |         self._prepare_model(config['model'])
 30 |         self._prepare_optimizer(config['opt'])
 31 |         self._prepare_evaluator(config)
 32 | 
 33 |     def _prepare_model(self, model_config):
 34 |         self.models = {}
 35 |         name = model_config['name']
 36 |         for item_type in self.types:
 37 |             model = model_map[name](model_config, self.dataset, item_type)
 38 |             self.models[item_type] = model.to(self.device)
 39 |         self.model = self.models[item_type]
 40 |         self.crit = torch.nn.BCELoss()
 41 |         self.alpha = model_config['alpha']
 42 | 
 43 |     def _prepare_optimizer(self, opt_config):
 44 |         self.opts = {}
 45 |         for item_type in self.types:
 46 |             opt = Optimizer(opt_config, self.models[item_type].parameters())
 47 |             self.opts[item_type] = opt
 48 |         self.start_epoch = 0
 49 |         self.best_val_score = -1
 50 |         self.epochs = opt_config['epochs']
 51 |         self.eval_step = opt_config['eval_step']
 52 |         self.save_step = opt_config['save_step']
 53 |         self.train_loss_dict = {}
 54 |         self.val_loss_dict = {}
 55 |         if 'early_stop' in opt_config.keys():
 56 |             self.early_stop = True
 57 |             config = opt_config['early_stop']
 58 |             self.eval_metric = config.get('metric', 'auc')
 59 |             self.eval_mode = config.get('mode', 'max')
 60 |             self.stop_step = config.get('stop_step', 5)
 61 |         else:
 62 |             self.early_stop = False
 63 | 
 64 |     def train_one_batch(self, hdata):
 65 | 
 66 |         for opt in self.opts.values():
 67 |             opt.zero_grad()
 68 | 
 69 |         preds = defaultdict(dict)
 70 |         losses = defaultdict(dict)
 71 | 
 72 |         for item_model_type in self.types:
 73 |             for item_type in self.types:
 74 |                 if item_type == item_model_type:
 75 |                     preds[item_model_type][item_type] = self.models[
 76 |                         item_model_type](item_type, hdata[item_type])
 77 |                 else:
 78 |                     preds[item_model_type][item_type] = self.models[
 79 |                         item_model_type](item_type,
 80 |                                          hdata[item_type],
 81 |                                          dual=True)
 82 |                 label = hdata[item_type][self.LABEL].reshape((-1, 1)).float()
 83 |                 losses[item_model_type][item_type] = self.crit(
 84 |                     preds[item_model_type][item_type], label)
 85 | 
 86 |         # wighted loss
 87 |         w_loss = defaultdict(list)
 88 |         for item_type in self.types:
 89 |             for item_model_type in self.types:
 90 |                 if item_type == item_model_type:
 91 |                     loss = (1 -
 92 |                             self.alpha) * losses[item_model_type][item_type]
 93 |                 else:
 94 |                     # change variable to Tensor if error
 95 |                     loss = self.alpha * Variable(
 96 |                         losses[item_model_type][item_type].data,
 97 |                         requires_grad=False)
 98 |                 w_loss[item_type].append(loss)
 99 |         t_loss = {}
100 |         for k, v in w_loss.items():
101 |             t_loss[k] = torch.sum(torch.stack(w_loss[k], dim=0))
102 |             t_loss[k].backward(retain_graph=True)
103 |         orth_loss = {}
104 |         for item_type in self.types:
105 |             orth_loss[item_type] = torch.zeros(1, device=self.device)
106 |         reg = 1e-6
107 | 
108 |         for item_type, model in self.models.items():
109 |             for name, param in model.bridge.named_parameters():
110 |                 if 'bias' not in name:
111 |                     param_flat = param.view(param.shape[0], -1)
112 |                     sym = torch.mm(param_flat, torch.t(param_flat))
113 |                     sym -= torch.eye(param_flat.shape[0], device=self.device)
114 |                     orth_loss[item_type] += reg * sym.abs().sum()
115 |             orth_loss[item_type].backward()
116 |         for item_type in self.types:
117 |             self.opts[item_type].step()
118 | 
119 |         r_loss = 0
120 |         for _, loss in t_loss.items():
121 |             r_loss += loss.data.cpu().item()
122 |         for _, l in orth_loss.items():
123 |             r_loss += loss.data.cpu().item()
124 | 
125 |         return r_loss
126 | 
127 |     def train_one_epoch(self, data_loader=None):
128 |         """Train one epoch using given data"""
129 |         if data_loader is None:
130 |             data_loader = self.dataset.train_data_loader
131 |         max_len = max([len(dl) for dl in data_loader.values()])
132 |         m = self.model
133 | 
134 |         m.train()
135 |         losses_dict = defaultdict(float)
136 | 
137 |         iters = {}
138 |         hdata = {}
139 |         for idx in tqdm(range(max_len), total=max_len):
140 | 
141 |             for item_type, dl in data_loader.items():
142 |                 try:
143 |                     data_iter = iters.get(item_type, None)
144 |                     if data_iter is None:
145 |                         iters[item_type] = iter(dl)
146 |                         data_iter = iters[item_type]
147 |                     data = next(data_iter)
148 |                 except:
149 |                     iters[item_type] = iter(dl)
150 |                     data = next(iters[item_type])
151 | 
152 |                 # self.modelsize(self.model, data)
153 |                 if type(data) is dict:
154 |                     for key, value in data.items():
155 |                         data[key] = value.to(self.device)
156 |                 hdata[item_type] = data
157 | 
158 |             r_loss = self.train_one_batch(hdata)
159 | 
160 |         losses_dict['total'] = r_loss
161 | 
162 |         return losses_dict
163 | 
164 |     def validate(self, data_loader=None):
165 |         """
166 |         Run model in validation dataset and calculate the
167 |         score using evaluator.
168 |         Return:
169 |             result: a dict store metrics name-value pair.
170 |         """
171 |         if data_loader is None:
172 |             data_loader = self.dataset.val_data_loader
173 |         ms = self.models
174 |         for m in ms.values():
175 |             m.eval()
176 | 
177 |         lens = [len(dl) for dl in data_loader.values()]
178 |         batch_matrix_list = []
179 |         with tqdm(total=sum(lens)) as pbar:
180 |             for item_type, dl in data_loader.items():
181 |                 for data in dl:
182 | 
183 |                     if type(data) is dict:
184 |                         for key, value in data.items():
185 |                             data[key] = value.to(self.device)
186 |                     pred = ms[item_type].predict(item_type, data).reshape(
187 |                         (-1, ))
188 |                     batch_matrix = self.evaluator.collect(data, pred)
189 |                     batch_matrix_list.append(batch_matrix)
190 |                     pbar.update(1)
191 | 
192 |         result = self.evaluator.evaluate(batch_matrix_list, groupby=True)
193 |         return result
194 | 
195 |     def save_checkpoint(self, epoch, name='last', path=None):
196 |         if path is None:
197 |             path = self.ckp_path
198 |         model_dict = {}
199 |         opt_dict = {}
200 |         for item_type in self.models:
201 |             model_dict[item_type] = self.models[item_type].state_dict()
202 |             opt_dict[item_type] = self.opts[item_type].opt.state_dict()
203 |         state = {
204 |             'epoch': epoch,
205 |             'state_dict': model_dict,
206 |             'optimizer': opt_dict
207 |         }
208 |         if name == 'last':
209 |             file_name = os.path.join(self.ckp_path,
210 |                                      f'{name}-{epoch}-model.pth')
211 |         else:
212 |             file_name = os.path.join(self.ckp_path, f'{name}-model.pth')
213 |         self.last_model_path = file_name
214 |         torch.save(state, file_name)
215 |         if name == 'best':
216 |             self.best_ckp_path = file_name
217 |         elif name == 'last':
218 |             self.last_ckp_path = file_name
219 | 
220 |     def load_checkpoint(self, file_name=None, mode=None):
221 |         if file_name is None:
222 |             if mode == 'last':
223 |                 file_name = self.last_model_path
224 |             elif mode == 'best':
225 |                 file_name = self.best_model_path
226 |             else:
227 |                 raise ValueError("No checkpoint path provided.")
228 |         ckp = torch.load(file_name)
229 |         self.start_epoch = ckp['epoch'] + 1
230 |         for item_type in self.models:
231 |             self.models[item_type].load_state_dict(
232 |                 ckp['state_dict'][item_type])
233 |             self.opts[item_type].opt.load_state_dict(
234 |                 ckp['optimizer'][item_type])
235 |         self.logger.info(f"Load ckp from {file_name}.")
236 | 


--------------------------------------------------------------------------------
/HRec/pipeline/evaluator.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name:
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2020-12-18 15:34:30
  7 | # ###########################
  8 | 
  9 | import numpy as np
 10 | import torch
 11 | from .metrics import metrics_dict
 12 | 
 13 | # These metrics are typical in loss recommendations
 14 | loss_metrics = {
 15 |     metric.lower(): metric
 16 |     for metric in ['AUC', 'RMSE', 'MAE', 'LOGLOSS']
 17 | }
 18 | 
 19 | 
 20 | class Evaluator(object):
 21 |     r"""Loss Evaluator is mainly used in rating prediction and click through rate prediction. Now, we support four
 22 |     loss metrics which contain `'AUC', 'RMSE', 'MAE', 'LOGLOSS'`.
 23 |     """
 24 |     def __init__(self, config):
 25 |         super().__init__()
 26 | 
 27 |         self.metrics = config['metrics']
 28 | 
 29 |         self.label_field = config['data']['LABEL_FIELD']
 30 |         self.type_field = config['data']['TYPE_FIELD']
 31 |         self._check_args()
 32 | 
 33 |     def collect(self, interaction, pred_scores):
 34 |         """collect the loss intermediate result of one batch, this function mainly
 35 |         implements concatenating preds and trues. It is called at the end of each batch
 36 | 
 37 |         Args:
 38 |             interaction (Interaction): :class:`AbstractEvaluator` of the batch
 39 |             pred_scores (tensor): the tensor of model output with a size of `(N, )`
 40 | 
 41 |         Returns:
 42 |             tensor : a batch of socres with a size of `(N, 2)`
 43 | 
 44 |         """
 45 |         true_scores = interaction[self.label_field].to(pred_scores.device)
 46 |         types = interaction[self.type_field].to(pred_scores.device).float()
 47 |         assert len(true_scores) == len(pred_scores)
 48 |         return torch.stack((true_scores.float(), pred_scores.detach(), types),
 49 |                            dim=1)
 50 | 
 51 |     def evaluate(self, batch_matrix_list, groupby=False, *args):
 52 |         """calculate the metrics of all batches. It is called at the end of each epoch
 53 | 
 54 |         Args:
 55 |             batch_matrix_list (list): the results of all batches
 56 | 
 57 |         Returns:
 58 |             dict: such as {'AUC': 0.83}
 59 | 
 60 |         """
 61 |         concat = torch.cat(batch_matrix_list, dim=0).cpu().numpy()
 62 | 
 63 |         metric_dict = {}
 64 |         if groupby:
 65 |             types = concat[:, 2]
 66 |             for t in np.unique(types):
 67 |                 trues = concat[types == t][:, 0]
 68 |                 preds = concat[types == t][:, 1]
 69 |                 result_list = self._calculate_metrics(trues, preds)
 70 |                 for metric, value in zip(self.metrics, result_list):
 71 |                     key = str(t) + "-" + str(metric)
 72 |                     metric_dict[key] = round(value, 4)
 73 | 
 74 |         trues = concat[:, 0]
 75 |         preds = concat[:, 1]
 76 |         # get metrics
 77 |         result_list = self._calculate_metrics(trues, preds)
 78 |         for metric, value in zip(self.metrics, result_list):
 79 |             key = str(metric)
 80 |             metric_dict[key] = round(value, 4)
 81 | 
 82 |         return metric_dict
 83 | 
 84 |     def _check_args(self):
 85 | 
 86 |         # Check metrics
 87 |         if isinstance(self.metrics, (str, list)):
 88 |             if isinstance(self.metrics, str):
 89 |                 self.metrics = [self.metrics]
 90 |         else:
 91 |             raise TypeError('metrics must be str or list')
 92 | 
 93 |         # Convert metric to lowercase
 94 |         for m in self.metrics:
 95 |             if m.lower() not in loss_metrics:
 96 |                 raise ValueError("There is no loss metric named {}!".format(m))
 97 |         self.metrics = [metric.lower() for metric in self.metrics]
 98 | 
 99 |     def metrics_info(self, trues, preds):
100 |         """get metrics result
101 | 
102 |         Args:
103 |             trues (np.ndarray): the true scores' list
104 |             preds (np.ndarray): the predict scores' list
105 | 
106 |         Returns:
107 |             list: a list of metrics result
108 | 
109 |         """
110 |         result_list = []
111 |         for metric in self.metrics:
112 |             metric_fuc = metrics_dict[metric.lower()]
113 |             result = metric_fuc(trues, preds)
114 |             result_list.append(result)
115 |         return result_list
116 | 
117 |     def _calculate_metrics(self, trues, preds):
118 |         return self.metrics_info(trues, preds)
119 | 
120 |     def __str__(self):
121 |         mesg = 'The Loss Evaluator Info:\n' + '\tMetrics:[' + ', '.join(
122 |             [loss_metrics[metric.lower()] for metric in self.metrics]) + ']'
123 |         return mesg
124 | 


--------------------------------------------------------------------------------
/HRec/pipeline/hprocess.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: hprocess
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2021-01-06 16:24:08
  7 | # ###########################
  8 | 
  9 | from tqdm import tqdm
 10 | import numpy as np
 11 | import torch.nn as nn
 12 | from ..datasets import HDataSet, SubSet
 13 | from .process import Process
 14 | from collections import defaultdict
 15 | from torch.utils.data import DataLoader
 16 | 
 17 | 
 18 | class HProcess(Process):
 19 |     """
 20 |     Process for Heterogeneous Recommendation
 21 |     """
 22 |     def __init__(self, config):
 23 |         self.config = config
 24 |         self._path_config(config['path'])
 25 |         self._logger_config()
 26 |         self._set_device(config)
 27 |         self._prepare_data(config['data'])
 28 |         self._prepare_model(config['model'])
 29 |         self._prepare_optimizer(config['opt'], self.model.parameters())
 30 |         self._prepare_evaluator(config)
 31 | 
 32 |     def train_one_epoch(self, data_loader=None):
 33 |         """Train one epoch using given data"""
 34 |         if data_loader is None:
 35 |             data_loader = self.dataset.train_data_loader
 36 |         max_len = max([len(dl) for dl in data_loader.values()])
 37 |         m = self.model
 38 |         loss_fn = self.model.calculate_loss
 39 | 
 40 |         m.train()
 41 |         losses = None
 42 |         losses_dict = defaultdict(float)
 43 |         opt = self.opt.opt
 44 |         if hasattr(self.opt, 'scheduler'):
 45 |             pass
 46 |             # TODO: scheduler
 47 |             # sch = self.opt.scheduler
 48 |         else:
 49 |             pass
 50 |             # sch = None
 51 |         iters = {}
 52 |         hdata = {}
 53 |         for idx in tqdm(range(max_len), total=max_len):
 54 | 
 55 |             for item_type, dl in data_loader.items():
 56 |                 try:
 57 |                     data_iter = iters.get(item_type, None)
 58 |                     if data_iter is None:
 59 |                         iters[item_type] = iter(dl)
 60 |                         data_iter = iters[item_type]
 61 |                     data = next(data_iter)
 62 |                 except:
 63 |                     iters[item_type] = iter(dl)
 64 |                     data = next(iters[item_type])
 65 | 
 66 |                 # self.modelsize(self.model, data)
 67 |                 if type(data) is dict:
 68 |                     for key, value in data.items():
 69 |                         data[key] = value.to(self.device)
 70 |                 hdata[item_type] = data
 71 | 
 72 |             opt.zero_grad()
 73 |             loss, loss_dict = loss_fn(hdata)
 74 |             loss.backward()
 75 |             opt.step()
 76 | 
 77 |             losses = loss.item() if losses is None else losses + loss.item()
 78 |             for k, v in loss_dict.items():
 79 |                 losses_dict[k] += v
 80 |         losses_dict['total'] = losses
 81 | 
 82 |         return losses_dict
 83 | 
 84 |     def get_item_embeddings(self, item_kind):
 85 |         item_feat = self.dataset.item_feat[item_kind]
 86 |         item_set = SubSet(item_feat, None, self.dataset.iid_field,
 87 |                           self.dataset.itype_field, None, None,
 88 |                           self.dataset.item_feat_fields[item_kind])
 89 |         dl = DataLoader(item_set)
 90 |         id2mapemb = {}
 91 |         id2emb = {}
 92 |         id2rawemb = {}
 93 |         for data in dl:
 94 |             if type(data) is dict:
 95 |                 for k, v in data.items():
 96 |                     data[k] = v.to(self.device)
 97 |                 rawembs, mapembs, embs = self.model.get_item_embedding(
 98 |                     item_kind, data)
 99 |                 ids = data['item_id'].cpu().detach().numpy()
100 |                 embs = embs.cpu().detach().numpy()
101 |                 mapembs = mapembs.cpu().detach().numpy()
102 |                 rawembs = rawembs.cpu().detach().numpy()
103 |                 for idx, mapemb, emb, rawemb in zip(ids, mapembs, embs,
104 |                                                     rawembs):
105 |                     id2mapemb[idx] = mapemb
106 |                     id2emb[idx] = emb
107 |                     id2rawemb[idx] = rawemb
108 | 
109 |         return id2emb, id2mapemb, id2rawemb
110 | 
111 |     def validate(self, data_loader=None):
112 |         """
113 |         Run model in validation dataset and calculate the
114 |         score using evaluator.
115 |         Return:
116 |             result: a dict store metrics name-value pair.
117 |         """
118 |         if data_loader is None:
119 |             data_loader = self.dataset.val_data_loader
120 |         m = self.model
121 |         m.eval()
122 | 
123 |         lens = [len(dl) for dl in data_loader.values()]
124 |         batch_matrix_list = []
125 |         with tqdm(total=sum(lens)) as pbar:
126 |             for item_type, dl in data_loader.items():
127 |                 for data in dl:
128 | 
129 |                     if type(data) is dict:
130 |                         for key, value in data.items():
131 |                             data[key] = value.to(self.device)
132 |                     pred = m.predict(item_type, data)
133 |                     batch_matrix = self.evaluator.collect(data, pred)
134 |                     batch_matrix_list.append(batch_matrix)
135 |                     pbar.update(1)
136 | 
137 |         result = self.evaluator.evaluate(batch_matrix_list, groupby=True)
138 |         return result
139 | 
140 |     def test(self, data_loader=None):
141 |         """
142 |         Test
143 |         """
144 |         if data_loader is None:
145 |             data_loader = self.dataset.test_data_loader
146 |         return self.validate(data_loader=data_loader)
147 | 
148 |     def _prepare_data(self, data_config):
149 |         self.dataset = HDataSet(data_config)
150 |         self.LABEL = self.dataset.config['LABEL_FIELD']
151 |         self.types = self.dataset.types
152 | 
153 |     def modelsize(self, model, input, type_size=4):
154 |         para = sum([np.prod(list(p.size())) for p in model.parameters()])
155 |         print('Model {} : params: {:4f}M'.format(
156 |             model._get_name(), para * type_size / 1000 / 1000))
157 |         input_ = input
158 |         input_.requires_grad_(requires_grad=False)
159 |         mods = list(model.modules())
160 |         out_sizes = []
161 | 
162 |         for i in range(1, len(mods)):
163 |             m = mods[i]
164 |             if isinstance(m, nn.ReLU):
165 |                 if m.inplace:
166 |                     continue
167 |             out = m(input_)
168 |             out_sizes.append(np.array(out.size()))
169 |             input_ = out
170 | 
171 |         total_nums = 0
172 |         for i in range(len(out_sizes)):
173 |             s = out_sizes[i]
174 |             nums = np.prod(np.array(s))
175 |             total_nums += nums
176 |         print('Model {} : intermedite variables: {:3f} M (without backward)'.
177 |               format(model._get_name(), total_nums * type_size / 1000 / 1000))
178 |         print(
179 |             'Model {} : intermedite variables: {:3f} M (with backward)'.format(
180 |                 model._get_name(), total_nums * type_size * 2 / 1000 / 1000))
181 | 


--------------------------------------------------------------------------------
/HRec/pipeline/metrics.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name:
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2020-12-18 15:25:19
  7 | # ###########################
  8 | 
  9 | import numpy as np
 10 | from sklearn.metrics import auc as sk_auc
 11 | from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
 12 | 
 13 | #    TopK Metrics    #
 14 | 
 15 | 
 16 | def hit_(pos_index, pos_len):
 17 |     r"""Hit_ (also known as hit ratio at :math:`N`) is a way of calculating how many 'hits' you have
 18 |     in an n-sized list of ranked items.
 19 | 
 20 |     .. _Hit: https://medium.com/@rishabhbhatia315/recommendation-system-evaluation-metrics-3f6739288870
 21 | 
 22 |     .. math::
 23 |         \mathrm {HR@K} =\frac{Number \space of \space Hits @K}{|GT|}
 24 | 
 25 |     :math:`HR` is the number of users with a positive sample in the recommendation list.
 26 |     :math:`GT` is the total number of samples in the test set.
 27 | 
 28 |     """
 29 |     result = np.cumsum(pos_index, axis=1)
 30 |     return (result > 0).astype(int)
 31 | 
 32 | 
 33 | def mrr_(pos_index, pos_len):
 34 |     r"""The MRR_ (also known as mean reciprocal rank) is a statistic measure for evaluating any process
 35 |     that produces a list of possible responses to a sample of queries, ordered by probability of correctness.
 36 | 
 37 |     .. _MRR: https://en.wikipedia.org/wiki/Mean_reciprocal_rank
 38 | 
 39 |     .. math::
 40 |         \mathrm {MRR} = \frac{1}{|{U}|} \sum_{i=1}^{|{U}|} \frac{1}{rank_i}
 41 | 
 42 |     :math:`U` is the number of users, :math:`rank_i` is the rank of the first item in the recommendation list
 43 |     in the test set results for user :math:`i`.
 44 | 
 45 |     """
 46 |     idxs = pos_index.argmax(axis=1)
 47 |     result = np.zeros_like(pos_index, dtype=np.float)
 48 |     for row, idx in enumerate(idxs):
 49 |         if pos_index[row, idx] > 0:
 50 |             result[row, idx:] = 1 / (idx + 1)
 51 |         else:
 52 |             result[row, idx:] = 0
 53 |     return result
 54 | 
 55 | 
 56 | def map_(pos_index, pos_len):
 57 |     r"""MAP_ (also known as Mean Average Precision) The MAP is meant to calculate Avg. Precision for the relevant items.
 58 | 
 59 |     Note:
 60 |         In this case the normalization factor used is :math:`\frac{1}{\min (m,N)}`, which prevents your AP score from
 61 |         being unfairly suppressed when your number of recommendations couldn't possibly capture all the correct ones.
 62 | 
 63 |     .. _map: http://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html#MAP-for-Recommender-Algorithms
 64 | 
 65 |     .. math::
 66 |         \begin{align*}
 67 |         \mathrm{AP@N} &= \frac{1}{\mathrm{min}(m,N)}\sum_{k=1}^N P(k) \cdot rel(k) \\
 68 |         \mathrm{MAP@N}& = \frac{1}{|U|}\sum_{u=1}^{|U|}(\mathrm{AP@N})_u
 69 |         \end{align*}
 70 | 
 71 |     """
 72 |     pre = precision_(pos_index, pos_len)
 73 |     sum_pre = np.cumsum(pre * pos_index.astype(np.float), axis=1)
 74 |     len_rank = np.full_like(pos_len, pos_index.shape[1])
 75 |     actual_len = np.where(pos_len > len_rank, len_rank, pos_len)
 76 |     result = np.zeros_like(pos_index, dtype=np.float)
 77 |     for row, lens in enumerate(actual_len):
 78 |         ranges = np.arange(1, pos_index.shape[1] + 1)
 79 |         ranges[lens:] = ranges[lens - 1]
 80 |         result[row] = sum_pre[row] / ranges
 81 |     return result
 82 | 
 83 | 
 84 | def recall_(pos_index, pos_len):
 85 |     r"""Recall_ (also known as sensitivity) is the fraction of the total amount of relevant instances
 86 |     that were actually retrieved
 87 | 
 88 |     .. _recall: https://en.wikipedia.org/wiki/Precision_and_recall#Recall
 89 | 
 90 |     .. math::
 91 |         \mathrm {Recall@K} = \frac{|Rel_u\cap Rec_u|}{Rel_u}
 92 | 
 93 |     :math:`Rel_u` is the set of items relavent to user :math:`U`,
 94 |     :math:`Rec_u` is the top K items recommended to users.
 95 |     We obtain the result by calculating the average :math:`Recall@K` of each user.
 96 | 
 97 |     """
 98 |     return np.cumsum(pos_index, axis=1) / pos_len.reshape(-1, 1)
 99 | 
100 | 
def ndcg_(pos_index, pos_len):
    r"""Normalized discounted cumulative gain at every cut-off of a ranked list.

    .. _NDCG: https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG

    .. math::
        \begin{gather}
            \mathrm {DCG@K}=\sum_{i=1}^{K} \frac{2^{rel_i}-1}{\log_{2}{(i+1)}}\\
            \mathrm {IDCG@K}=\sum_{i=1}^{K}\frac{1}{\log_{2}{(i+1)}}\\
            \mathrm {NDCG_u@K}=\frac{DCG_u@K}{IDCG_u@K}
        \end{gather}

    Relevance is binary here: a truthy entry of ``pos_index`` counts as a hit.

    Args:
        pos_index (np.ndarray): 2-D hit matrix, one row per ranked list.
        pos_len (np.ndarray): per-row count used to cap the ideal DCG
            (an integer array, since its entries are used as slice bounds).

    Returns:
        np.ndarray: float matrix with the same shape as ``pos_index``.
    """
    n_rows, n_cols = pos_index.shape
    len_rank = np.full_like(pos_len, n_cols)
    idcg_len = np.where(pos_len > len_rank, len_rank, pos_len)

    # Discount 1 / log2(rank + 1) for ranks 1..K, shared by every row.
    discounts = 1.0 / np.log2(np.arange(1, n_cols + 1, dtype=np.float64) + 1)

    # Ideal DCG: all hits ranked first; it stops growing once the row's
    # ``idcg_len`` hits have been placed.
    idcg = np.tile(np.cumsum(discounts), (n_rows, 1))
    for row, idx in enumerate(idcg_len):
        idcg[row, idx:] = idcg[row, idx - 1]

    dcg = np.cumsum(np.where(pos_index, discounts, 0), axis=1)

    return dcg / idcg
138 | 
139 | 
def precision_(pos_index, pos_len):
    r"""Precision at every cut-off :math:`K` of a ranked list.

    .. _precision: https://en.wikipedia.org/wiki/Precision_and_recall#Precision

    .. math::
        \mathrm {Precision@K} = \frac{|Rel_u \cap Rec_u|}{Rec_u}

    The running sum of ``pos_index`` along each row is divided by the
    cut-off ``1..K``. ``pos_len`` is unused; it is part of the common metric
    signature.
    """
    list_len = pos_index.shape[1]
    cutoffs = np.arange(1, list_len + 1)
    hits_so_far = pos_index.cumsum(axis=1)
    return hits_so_far / cutoffs
155 | 
156 | 
157 | #    CTR Metrics    #
158 | 
159 | 
def auc_(trues, preds):
    r"""Area under the ROC curve of a binary scorer.

    .. _AUC: https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve

    Note:
        This is not a group-based AUC averaged across users and it is not
        limited to the top k: it is computed over all prediction results at
        once.

    The cumulative false/true positive counts from ``_binary_clf_curve`` are
    turned into rates and integrated with ``sklearn.metrics.auc``. When there
    is no negative (or no positive) sample the matching rate is all NaN.
    """
    fps, tps = _binary_clf_curve(trues, preds)

    if len(fps) > 2:
        # Keep the end points plus every point where the second difference of
        # either count is non-zero.
        bends = np.logical_or(np.diff(fps, 2), np.diff(tps, 2))
        keep = np.flatnonzero(np.r_[True, bends, True])
        fps, tps = fps[keep], tps[keep]

    # Start the curve at the origin.
    fps = np.r_[0, fps]
    tps = np.r_[0, tps]

    if fps[-1] > 0:
        fpr = fps / fps[-1]
    else:
        fpr = np.repeat(np.nan, fps.shape)

    if tps[-1] > 0:
        tpr = tps / tps[-1]
    else:
        tpr = np.repeat(np.nan, tps.shape)

    return sk_auc(fpr, tpr)
203 | 
204 | 
205 | # Loss based Metrics #
206 | 
207 | 
def mae_(trues, preds):
    r"""Mean absolute error between true and predicted scores.

    .. __: https://en.wikipedia.org/wiki/Mean_absolute_error

    .. math::
        \mathrm{MAE}=\frac{1}{|{T}|} \sum_{(u, i) \in {T}}\left|\hat{r}_{u i}-r_{u i}\right|

    :math:`T` is the evaluated set, :math:`\hat{r}_{u i}` the predicted score
    and :math:`r_{u i}` the actual score. Delegates to
    ``sklearn.metrics.mean_absolute_error``.
    """
    error = mean_absolute_error(trues, preds)
    return error
221 | 
222 | 
def rmse_(trues, preds):
    r"""Root mean squared error between true and predicted scores.

    .. __: https://en.wikipedia.org/wiki/Root-mean-square_deviation

    .. math::
        \mathrm{RMSE} = \sqrt{\frac{1}{|{T}|} \sum_{(u, i) \in {T}}(\hat{r}_{u i}-r_{u i})^{2}}

    :math:`T` is the evaluated set, :math:`\hat{r}_{u i}` the predicted score
    and :math:`r_{u i}` the actual score. Takes the square root of
    ``sklearn.metrics.mean_squared_error``.
    """
    mse = mean_squared_error(trues, preds)
    return np.sqrt(mse)
236 | 
237 | 
238 | def log_loss_(trues, preds):
239 |     r"""`Log loss`__, aka logistic loss or cross-entropy loss
240 | 
241 |     .. __: http://wiki.fast.ai/index.php/Log_Loss
242 | 
243 |     .. math::
244 |         -\log {P(y_t|y_p)} = -(({y_t}\ \log{y_p}) + {(1-y_t)}\ \log{(1 - y_p)})
245 | 
246 |     For a single sample, :math:`y_t` is true label in :math:`\{0,1\}`.
247 |     :math:`y_p` is the estimated probability that :math:`y_t = 1`.
248 | 
249 |     """
250 |     eps = 1e-15
251 |     preds = np.float64(preds)
252 |     preds = np.clip(preds, eps, 1 - eps)
253 |     loss = np.sum(-trues * np.log(preds) - (1 - trues) * np.log(1 - preds))
254 | 
255 |     return loss / len(preds)
256 | 
257 | 
258 | # Item based Metrics #
259 | 
260 | 
def _binary_clf_curve(trues, preds):
    """Count false and true positives at every distinct score threshold.

    Args:
        trues (numpy.ndarray): the true scores; entries equal to 1 are the
            positives
        preds (numpy.ndarray): the predicted scores

    Returns:
        fps (np.ndarray): at index i, the number of non-positive samples with
        a score >= the i-th threshold (thresholds in decreasing order)
        tps (np.ndarray): at index i, the number of positive samples with a
        score >= the i-th threshold

    Note:
        Adapted from the implementation behind sklearn.metrics.roc_curve.

    """
    is_positive = (trues == 1)

    # Sort samples by decreasing score.
    order = np.argsort(preds)[::-1]
    sorted_preds = preds[order]
    sorted_positive = is_positive[order]

    # Last index of every run of equal scores, plus the final sample.
    run_ends = np.where(np.diff(sorted_preds))[0]
    threshold_idxs = np.r_[run_ends, sorted_positive.size - 1]

    tps = np.cumsum(sorted_positive)[threshold_idxs]
    # Samples seen so far (index + 1) minus the positives among them.
    fps = 1 + threshold_idxs - tps
    return fps, tps
291 | 
292 | 
# Metric name -> metric function mapper.
# Useful when evaluation metric names have to be serialized and the functions
# are looked up again from the deserialized names.
metrics_dict = dict(
    ndcg=ndcg_,
    hit=hit_,
    precision=precision_,
    map=map_,
    recall=recall_,
    mrr=mrr_,
    rmse=rmse_,
    mae=mae_,
    logloss=log_loss_,
    auc=auc_,
)
309 | 


--------------------------------------------------------------------------------
/HRec/pipeline/optimizer.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | # ###########################
 3 | # File Name: optimizer.py
 4 | # Author: geekinglcq
 5 | # Mail: lcqgeek@live.com
 6 | # Created Time: 2020-12-18 11:49:59
 7 | # ###########################
 8 | 
 9 | import torch.optim as optim
10 | 
opt_map = {
    "Adam": optim.Adam,
    "Adadelta": optim.Adadelta,
    "Adagrad": optim.Adagrad,
    "AdamW": optim.AdamW,
    "RMSprop": optim.RMSprop,
    "SGD": optim.SGD
}


class Optimizer(object):
    """Thin wrapper that builds a torch optimizer (and optional LR scheduler)
    from a configuration mapping.

    Attributes set by ``__init__``:
        opt: the ``torch.optim`` optimizer instance.
        adjust_lr: value of ``config['adjust_lr']`` (``False`` when absent).
        scheduler: only present when ``adjust_lr`` is truthy.
    """
    def __init__(self, config, params):
        """
        Args:
            config (dict): ``name`` selects an entry of ``opt_map``;
                ``hyper_params`` (optional) are passed to the optimizer as
                keyword arguments; ``adjust_lr`` / ``scheduler`` optionally
                configure a learning-rate scheduler.
            params: the parameters to optimize, as accepted by torch
                optimizers.

        Raises:
            KeyError: if ``config['name']`` is not a key of ``opt_map``.
            TypeError: if the optimizer rejects the given hyper parameters.
            ValueError: if ``adjust_lr`` is set but no scheduler is configured.
        """
        opt_fn = opt_map[config["name"]]
        hyper_params = config.get("hyper_params") or {}
        try:
            self.opt = opt_fn(params, **hyper_params)
        except TypeError as err:
            # Fail here with context instead of printing and leaving
            # ``self.opt`` undefined for a later, more confusing failure.
            raise TypeError(
                "Unexpected hyper_params for optimizer {!r}: {}".format(
                    config["name"], err)) from err
        self.adjust_lr = config.get("adjust_lr", False)
        if self.adjust_lr:
            self.scheduler = self.get_scheduler(config.get("scheduler"))

    def get_scheduler(self, config):
        """Build the learning-rate scheduler described by ``config``.

        Args:
            config (dict): ``name`` of the scheduler plus optional
                ``hyper_params`` keyword arguments.

        Raises:
            ValueError: if ``config`` is ``None``.
            NotImplementedError: for any scheduler other than
                ``ReduceLROnPlateau``.
        """
        if config is None:
            raise ValueError(
                "adjust_lr is enabled but no scheduler config was provided")
        if config["name"] == "ReduceLROnPlateau":
            return optim.lr_scheduler.ReduceLROnPlateau(
                self.opt, **(config.get("hyper_params") or {}))
        else:
            # TODO: Other schedulers
            raise NotImplementedError

    def zero_grad(self):
        """Clear the gradients held by the wrapped optimizer."""
        self.opt.zero_grad()

    def step(self):
        """Perform one step of the wrapped optimizer."""
        self.opt.step()
45 | 


--------------------------------------------------------------------------------
/HRec/pipeline/process.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | import json
  3 | import os
  4 | import time
  5 | import logging
  6 | from datetime import datetime
  7 | from collections import defaultdict
  8 | 
  9 | import torch
 10 | from tqdm import tqdm
 11 | 
 12 | from ..datasets import DataSet
 13 | from ..models import model_map, ModelType
 14 | from .evaluator import Evaluator
 15 | from .optimizer import Optimizer
 16 | from .utils import get_free_gpu, EarlyStopping
 17 | 
 18 | 
 19 | class Process(object):
    def __init__(self, config):
        """Build the whole pipeline from ``config``.

        Reads the ``'path'``, ``'data'``, ``'model'`` and ``'opt'`` sections
        of ``config``. The helpers below must run in this order because later
        ones read attributes set by earlier ones.
        """
        self.config = config
        self._path_config(config['path'])
        # The logger is created before device selection because
        # ``_set_device`` may log the chosen device through ``self.logger``.
        self._logger_config()
        self._set_device(config)
        # The model is constructed from ``self.dataset``, so data comes first.
        self._prepare_data(config['data'])
        self._prepare_model(config['model'])
        # The optimizer is bound to the parameters of the model built above.
        self._prepare_optimizer(config['opt'], self.model.parameters())
        self._prepare_evaluator(config)
 29 | 
 30 |     def _set_device(self, config):
 31 |         device = config.get('device', None)
 32 |         if device is not None:
 33 |             self.device = device
 34 |         else:
 35 |             device_list = get_free_gpu(mode="memory", memory_need=5000)
 36 |             if len(device_list) < 1:
 37 |                 raise ValueError("No GPU available now.")
 38 |             else:
 39 |                 self.device = f'cuda:{device_list[0]}'
 40 |                 self.logger.info(f'Use device {self.device}')
 41 |         config['data']['device'] = self.device
 42 |         config['model']['device'] = self.device
 43 | 
 44 |     def _logger_config(self):
 45 |         """
 46 |         Set the logger
 47 |         """
 48 |         model_name = self.config['model']['name']
 49 |         data_name = self.config['data']['name']
 50 |         if self.config['data'].get('single', False):
 51 |             single_type = self.config['data'].get('single_type')
 52 |             data_name = f'{data_name}_{single_type}'
 53 |         logfile_name = os.path.join(self.log_path,
 54 |                                     f'{model_name}-{data_name}.log')
 55 |         fmt = "%(asctime)-15s %(levelname)s %(message)s"
 56 |         filedatefmt = "%a %d %b %Y %H:%M:%S"
 57 |         fileformatter = logging.Formatter(fmt, filedatefmt)
 58 | 
 59 |         sdatefmt = "%d %b %H:%M"
 60 |         sformatter = logging.Formatter(fmt, sdatefmt)
 61 | 
 62 |         fh = logging.FileHandler(logfile_name)
 63 |         fh.setLevel(logging.INFO)
 64 |         fh.setFormatter(fileformatter)
 65 |         sh = logging.StreamHandler()
 66 |         sh.setLevel(logging.INFO)
 67 |         sh.setFormatter(sformatter)
 68 |         logging.basicConfig(level=logging.INFO, handlers=[fh, sh])
 69 |         self.logger = logging.getLogger()
 70 | 
 71 |     def _prepare_model(self, model_config):
 72 |         name = model_config['name']
 73 |         self.model = model_map[name](model_config, self.dataset)
 74 |         self.model.to(self.device)
 75 | 
 76 |     def _prepare_data(self, data_config):
 77 |         self.dataset = DataSet(data_config)
 78 |         self.LABEL = self.dataset.config['LABEL_FIELD']
 79 |         self.single = self.dataset.single
 80 | 
 81 |     def _prepare_optimizer(self, opt_config, params):
 82 |         self.opt = Optimizer(opt_config, params)
 83 |         self.start_epoch = 0
 84 |         self.best_val_score = -1
 85 |         self.epochs = opt_config['epochs']
 86 |         self.eval_step = opt_config['eval_step']
 87 |         self.save_step = opt_config['save_step']
 88 |         self.train_loss_dict = {}
 89 |         self.val_loss_dict = {}
 90 |         if 'early_stop' in opt_config.keys():
 91 |             self.early_stop = True
 92 |             config = opt_config['early_stop']
 93 |             self.eval_metric = config.get('metric', 'auc')
 94 |             self.eval_mode = config.get('mode', 'max')
 95 |             self.stop_step = config.get('stop_step', 5)
 96 |         else:
 97 |             self.early_stop = False
 98 | 
 99 |     def _prepare_evaluator(self, config):
100 |         self.evaluator = Evaluator(config)
101 | 
102 |     def train_one_epoch(self, data_loader=None):
103 |         """Train one epoch using given data"""
104 |         if data_loader is None:
105 |             data_loader = self.dataset.train_data_loader
106 | 
107 |         m = self.model
108 |         loss_fn = self.model.calculate_loss
109 | 
110 |         m.train()
111 |         losses = None
112 |         opt = self.opt.opt
113 |         if hasattr(self.opt, 'scheduler'):
114 |             pass
115 |             # TODO: scheduler
116 |             # sch = self.opt.scheduler
117 |         else:
118 |             pass
119 |             # sch = None
120 |         for idx, data in tqdm(enumerate(data_loader), total=len(data_loader)):
121 |             if type(data) is dict:
122 |                 for key, value in data.items():
123 |                     data[key] = value.to(self.device)
124 |             opt.zero_grad()
125 |             loss = loss_fn(data)
126 |             loss.backward()
127 |             opt.step()
128 |             losses = loss.item() if losses is None else losses + loss.item()
129 |         return losses
130 | 
131 |     def validate(self, data_loader=None):
132 |         """
133 |         Run model in validation dataset and calculate the
134 |         score using evaluator.
135 |         Return:
136 |             result: a dict store metrics name-value pair.
137 |         """
138 |         if data_loader is None:
139 |             data_loader = self.dataset.val_data_loader
140 |         m = self.model
141 |         m.eval()
142 | 
143 |         batch_matrix_list = []
144 |         for idx, data in tqdm(enumerate(data_loader), total=len(data_loader)):
145 |             if type(data) is dict:
146 |                 for key, value in data.items():
147 |                     data[key] = value.to(self.device)
148 |             pred = m.predict(data)
149 |             batch_matrix = self.evaluator.collect(data, pred)
150 |             batch_matrix_list.append(batch_matrix)
151 | 
152 |         if self.single:
153 |             result = self.evaluator.evaluate(batch_matrix_list, groupby=False)
154 |         else:
155 |             result = self.evaluator.evaluate(batch_matrix_list, groupby=True)
156 |         return result
157 | 
158 |     def test(self, data_loader=None):
159 | 
160 |         if data_loader is None:
161 |             data_loader = self.dataset.test_data_loader
162 |         return self.validate(data_loader=data_loader)
163 | 
164 |     def save_checkpoint(self, epoch, name='last', path=None):
165 |         if path is None:
166 |             path = self.ckp_path
167 |         state = {
168 |             'epoch': epoch,
169 |             'state_dict': self.model.state_dict(),
170 |             'optimizer': self.opt.opt.state_dict()
171 |         }
172 |         if name == 'last':
173 |             file_name = os.path.join(self.ckp_path,
174 |                                      f'{name}-{epoch}-model.pth')
175 |         else:
176 |             file_name = os.path.join(self.ckp_path, f'{name}-model.pth')
177 |         self.last_model_path = file_name
178 |         torch.save(state, file_name)
179 |         if name == 'best':
180 |             self.best_ckp_path = file_name
181 |         elif name == 'last':
182 |             self.last_ckp_path = file_name
183 | 
184 |     def load_checkpoint(self, file_name=None, mode=None):
185 |         if file_name is None:
186 |             if mode == 'last':
187 |                 file_name = self.last_model_path
188 |             elif mode == 'best':
189 |                 file_name = self.best_model_path
190 |             else:
191 |                 raise ValueError("No checkpoint path provided.")
192 |         ckp = torch.load(file_name)
193 |         self.start_epoch = ckp['epoch'] + 1
194 |         self.model.load_state_dict(ckp['state_dict'])
195 |         self.opt.opt.load_state_dict(ckp['optimizer'])
196 |         self.logger.info(f"Load ckp from {file_name}.")
197 | 
198 |     def fit(self,
199 |             train_data=None,
200 |             val_data=None,
201 |             test_data=None,
202 |             verbose=True):
203 | 
204 |         if self.model.model_type == ModelType.CONTEXT:
205 |             self.dataset.join_interaction()
206 |             self.dataset.train_val_test_split(context=True)
207 |         elif self.model.model_type == ModelType.HETERO:
208 |             self.dataset.join_interaction()
209 |             self.dataset.train_val_test_split()
210 |         else:
211 |             self.dataset.train_val_test_split(context=False)
212 | 
213 |         batch_size = self.config['opt'].get('batch_size', 256)
214 |         num_workers = self.dataset.config.get('num_workers', 2)
215 |         self.dataset.init_data_loader(batch_size=batch_size,
216 |                                       num_workers=num_workers)
217 |         for epoch_idx in range(self.start_epoch, self.epochs):
218 | 
219 |             # Train
220 |             st = time.time()
221 |             train_loss = self.train_one_epoch(train_data)
222 |             self.train_loss_dict[epoch_idx] = train_loss
223 |             ed = time.time()
224 | 
225 |             if verbose:
226 |                 if type(train_loss) is dict or type(train_loss) is defaultdict:
227 |                     train_loss = '\t'.join(
228 |                         [f'{k}: {v}' for k, v in train_loss.items()])
229 |                 self.logger.info(
230 |                     f'[TRAIN] Epoch: {epoch_idx} cost time: {ed - st:.1f}, train loss: {train_loss}'
231 |                 )
232 | 
233 |             # Eval
234 |             if not ((epoch_idx + 1) % self.eval_step):
235 |                 st = time.time()
236 |                 result = self.validate(val_data)
237 |                 ed = time.time()
238 |                 self.logger.info(
239 |                     f'[EVAL] Epoch: {epoch_idx} cost time: {ed - st:.1f}')
240 |                 result_str = '[EVAL] ' + '\t'.join(
241 |                     [f'{k}: {v} ' for k, v in result.items()])
242 |                 stop_flag, better = EarlyStopping.update(
243 |                     result, epoch_idx, self.eval_metric, self.eval_mode,
244 |                     self.stop_step)
245 |                 self.logger.info(result_str)
246 | 
247 |                 # Save the best model
248 |                 if better:
249 |                     self.save_checkpoint(epoch_idx, 'best')
250 |                 if self.early_stop and stop_flag:
251 |                     self.logger.info(f'Early Stop in {epoch_idx} epoch. ')
252 |                     break
253 | 
254 |             if not ((epoch_idx + 1) % self.save_step):
255 |                 self.save_checkpoint(epoch_idx, 'last')
256 | 
257 |         # Test
258 |         self.logger.info(
259 |             'Finish training. Start to evaluate in the test set using the best model in val set.'
260 |         )
261 |         if hasattr(self, 'best_ckp_path'):
262 |             self.load_checkpoint(self.best_ckp_path)
263 |         result = self.test(test_data)
264 |         result_str = '[TEST] ' + '\t'.join(
265 |             [f'{k}: {v:.3f} ' for k, v in result.items()])
266 |         self.logger.info(result_str)
267 |         self.config['result'] = result
268 |         # Save the result to config file
269 |         json.dump(self.config,
270 |                   open(os.path.join(self.output_path, "config.json"), "w"),
271 |                   indent='\t')
272 | 
273 |     def _path_config(self, config):
274 |         now = str(datetime.now()).replace(" ", "_").split(".")[0]
275 |         model_name = self.config['model']['name']
276 |         data_name = self.config['data']['name']
277 |         output_path = os.path.join(config["output"],
278 |                                    f'{model_name}-{data_name}-{now}')
279 |         self.output_path = output_path
280 |         if os.path.isdir(output_path):
281 |             raise ValueError("Output dir already exist")
282 |         else:
283 |             os.makedirs(output_path)
284 |         # Save config files
285 |         json.dump(self.config,
286 |                   open(os.path.join(output_path, "config.json"), "w"),
287 |                   indent='\t')
288 |         print(f"Config is saved in {output_path}.")
289 |         for sub_dir in ["log", "ckp"]:
290 |             path = os.path.join(output_path, sub_dir)
291 |             if not os.path.exists(path):
292 |                 os.mkdir(path)
293 |             setattr(self, f'{sub_dir}_path', path)
294 |             print(f'{sub_dir} is saved in {path}')
295 | 


--------------------------------------------------------------------------------
/HRec/pipeline/utils.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # ###########################
  3 | # File Name: utils.py
  4 | # Author: geekinglcq
  5 | # Mail: lcqgeek@live.com
  6 | # Created Time: 2020-12-20 21:32:39
  7 | # ###########################
  8 | 
  9 | from gpustat import GPUStatCollection
 10 | 
 11 | 
 12 | def get_free_gpu(mode="memory", memory_need=10000) -> list:
 13 |     r"""Get free gpu according to mode (process-free or memory-free).
 14 |     Args:
 15 |         mode (str, optional): memory-free or process-free. Defaults to "memory".
 16 |         memory_need (int): The memory you need, used if mode=='memory'. Defaults to 10000.
 17 |     Returns:
 18 |         list: free gpu ids sorting by free memory
 19 |     """
 20 |     assert mode in ["memory", "process"], "mode must be 'memory' or 'process'"
 21 |     if mode == "memory":
 22 |         assert memory_need is not None, \
 23 |             "'memory_need' if None, 'memory' mode must give the free memory you want to apply for"
 24 |         memory_need = int(memory_need)
 25 |         assert memory_need > 0, "'memory_need' you want must be positive"
 26 |     gpu_stats = GPUStatCollection.new_query()
 27 |     gpu_free_id_list = []
 28 | 
 29 |     for idx, gpu_stat in enumerate(gpu_stats):
 30 |         if gpu_check_condition(gpu_stat, mode, memory_need):
 31 |             gpu_free_id_list.append([idx, gpu_stat.memory_free])
 32 |             print("gpu[{}]: {}MB".format(idx, gpu_stat.memory_free))
 33 | 
 34 |     if gpu_free_id_list:
 35 |         gpu_free_id_list = sorted(gpu_free_id_list,
 36 |                                   key=lambda x: x[1],
 37 |                                   reverse=True)
 38 |         gpu_free_id_list = [i[0] for i in gpu_free_id_list]
 39 |     return gpu_free_id_list
 40 | 
 41 | 
 42 | def gpu_check_condition(gpu_stat, mode, memory_need) -> bool:
 43 |     r"""Check gpu is free or not.
 44 |     Args:
 45 |         gpu_stat (gpustat.core): gpustat to check
 46 |         mode (str): memory-free or process-free.
 47 |         memory_need (int): The memory you need, used if mode=='memory'
 48 |     Returns:
 49 |         bool: gpu is free or not
 50 |     """
 51 |     if mode == "memory":
 52 |         return gpu_stat.memory_free > memory_need
 53 |     elif mode == "process":
 54 |         for process in gpu_stat.processes:
 55 |             if process["command"] == "python":
 56 |                 return False
 57 |         return True
 58 |     else:
 59 |         return False
 60 | 
 61 | 
 62 | class EarlyStopping(object):
 63 |     """
 64 |     The class control the info to decide whether to do earlystopping
 65 |     """
 66 |     best_score = None
 67 |     best_epoch = None
 68 |     steps = 0
 69 | 
 70 |     @classmethod
 71 |     def update(self, scores, epoch, metric='auc', mode='max', stop_step=5):
 72 |         """
 73 |         Update current suitation after each epoch.
 74 |         Args:
 75 |             scores: a dict store metrice name-value pair
 76 |             metric: which metric to use in earlystopping
 77 |             mode: 'max' or 'min'
 78 |             stop_step: if after the given num of epochs, the
 79 |                 model does not improve then stop the training
 80 |         Return:
 81 |             stop_flag: if or not stop
 82 |             better: if current version is the best
 83 |         """
 84 |         assert metric in scores.keys()
 85 |         assert mode in ['max', 'min']
 86 | 
 87 |         def _set_best(score, epoch):
 88 |             EarlyStopping.best_score = score
 89 |             EarlyStopping.best_epoch = epoch
 90 |             EarlyStopping.steps = 0
 91 | 
 92 |         def _compare(score, mode):
 93 |             comp_fn = {"max": lambda a, b: a > b, "min": lambda a, b: a < b}
 94 |             return comp_fn[mode](score, EarlyStopping.best_score)
 95 | 
 96 |         score = scores[metric]
 97 |         better = False
 98 |         if EarlyStopping.best_score is None:
 99 |             _set_best(score, epoch)
100 |             better = True
101 |         elif _compare(score, mode):
102 |             _set_best(score, epoch)
103 |             better = True
104 |         else:
105 |             EarlyStopping.steps += 1
106 |         if EarlyStopping.steps >= stop_step:
107 |             return True, better
108 |         return False, better
109 | 
110 | 
111 | if __name__ == '__main__':
112 |     get_free_gpu()
113 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2021 <COPYRIGHT HOLDER>
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <p align="center">
 2 |   <img src="framework.png" width="800">
 3 |   <br />
 4 |   <br />
 5 | </p>   
 6 | 
 7 | # HR  
 8 | 
 9 | Official implementation of the paper [Deep Unified Representation for Heterogeneous Recommendation](https://arxiv.org/abs/2201.05861).   
10 | Accepted by the ACM Web Conference 2022 (WWW '22)  
11 | [中文版算法介绍](https://zhuanlan.zhihu.com/p/474148693)
12 | 
13 | ## Dataset
14 | In this paper, we use the Douban dataset stored in `data.tar.xz`. 
15 | Please uncompress it (`tar -xf data.tar.xz`) and put it in the working directory.  
16 | 
17 | ## Usage 
18 | 
19 | 
 20 | Before running the code, please make sure that you have installed the dependencies. You can install them with 
21 | ```
22 | pip install -r requirements.txt 
23 | ``` 
24 | Our code is tested on `python 3.7`.  
25 | 
 26 | The next step is to prepare the configuration file. We provide the configurations of our proposed model (DURation) and the baselines in the `configs` folder as examples. To reproduce the results reported in our paper, you only need to change the paths in the configuration to your local paths.  
27 | 
 28 | Then, you can run the program with a simple one-line command. Take the DURation model as an example: there is a `duration.json` file in `configs`.  
29 | 
30 | ```
31 | python train_hete.py duration
32 | ```
33 | 
 34 | It is worth noting that the configuration file must be placed in `configs`. To test the homogeneous models, just replace `train_hete.py` with `train_homo.py`. The program prints the results to the screen and saves the log under the output path set in the configuration.  
35 | 
36 | ## Models
37 | 
38 | Currently, we support the following models:
39 | 
40 | + **DeepMF**(2017): Deep Matrix Factorization Models for Recommender Systems
41 | + **FISM**(2013): Fism: factored item similarity models for top-n recommender systems.
42 | + **NAIS**(2018): Nais: Neural attentive item similarity model for recommendation. 
43 | + **DeepFM**(2017): DeepFM: a factorization-machine based neural network for CTR prediction
44 | + **xDeepFM**(2018): xdeepfm: Combining explicit and implicit feature interactions for recommender systems
45 | + **AFM**(2017): Attentional factorization machines: Learning the weight of feature interactions via attention networks
46 | + **DSSM**(2013): Learning deep structured semantic models for web search using clickthrough data
47 | + **Wide & Deep**(2016): Wide & deep learning for recommender systems
 48 | + **autoInt**(2019): Autoint: Automatic feature interaction learning via self-attentive neural networks
49 | + **CCCFNet**(2012): Cross-domain collaboration recommendation
50 | + **DDTCDR**(2020): DDTCDR: Deep dual transfer cross domain recommendation
51 | 
52 | ## Cite
53 | 
54 | ```
55 | @inproceedings{lu2022deep,
56 |   title={Deep Unified Representation for Heterogeneous Recommendation},
57 |   author={Lu, Chengqiang and Yin, Mingyang and Shen, Shuheng and Ji, Luo and Liu, Qi and Yang, Hongxia},
58 |   booktitle={Proceedings of the ACM Web Conference 2022},
59 |   pages={2141--2152},
60 |   year={2022}
61 | }
62 | ```
63 | 


--------------------------------------------------------------------------------
/configs/afm.json:
--------------------------------------------------------------------------------
 1 | 
 2 | {
 3 |   "data" : {
 4 |     "name" : "douban",
 5 |     "USER_ID_FIELD" : "user_id",
 6 |     "ITEM_ID_FIELD" : "item_id",
 7 |     "LABEL_FIELD" : "label",
 8 |     "TYPE_FIELD" : "type",
 9 |     "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv",
10 |     "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv",
11 |     "item_feat_path" : "/Your/Local/Path/data/douban/all_item_token.csv",
12 |     "feat" : {
13 | 
14 |       "user_id" : {"type" : "token", "source" : "user"},
15 | 
16 |       "item_id" : {"type" : "token", "source" : "item"},
17 |       "type" : {"type" : "token", "source" : "item"},
18 |       "douban_score" : {"type" : "token", "source" : "item"},
19 |       "douban_votes" : {"type" : "token", "source" : "item"}
20 |     },
21 |     "inter_matrix_type" : "01"
22 |   },
23 |            "model" : {
24 |              "name" : "AFM",
25 |              "attention_size" : 25,
26 |              "embedding_size" : 10,
27 |              "dropout_prob" : 0.3,
28 |              "reg_weight" : 2
29 | 
30 |            },
31 |                      "opt"
32 |       : {
33 |         "name" : "Adam",
34 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
35 |         "adjust_lr" : false,
36 |         "scheduler" :
37 |             {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}},
38 |         "epochs" : 100,
39 |         "eval_step" : 2,
40 |         "batch_size" : 128,
41 |         "save_step" : 5,
42 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
43 |       },
44 |         "path" : {"output" : "/Your/Local/Path/output/"},
45 |                  "metrics" : ["AUC"]
46 | }
47 | 


--------------------------------------------------------------------------------
/configs/autoint.json:
--------------------------------------------------------------------------------
 1 | 
 2 | {
 3 |   "data" : {
 4 |     "name" : "douban",
 5 |     "USER_ID_FIELD" : "user_id",
 6 |     "ITEM_ID_FIELD" : "item_id",
 7 |     "LABEL_FIELD" : "label",
 8 |     "TYPE_FIELD" : "type",
 9 |     "user_feat_path" :
10 |         "/Your/Local/Path/data/douban/all_users.csv",
11 |     "inter_feat_path" :
12 |         "/Your/Local/Path/data/douban/all_rate.csv",
13 |     "item_feat_path" :
14 |         "/Your/Local/Path/data/douban/all_item_token.csv",
15 |     "feat" : {
16 | 
17 |       "user_id" : {"type" : "token", "source" : "user"},
18 | 
19 |       "item_id" : {"type" : "token", "source" : "item"},
20 | 
21 |       "type" : {"type" : "token", "source" : "item"},
22 |       "douban_score" : {"type" : "token", "source" : "item"},
23 | 
24 |       "douban_votes" : {"type" : "token", "source" : "item"}
25 |  
26 |     },
27 |     "inter_matrix_type" : "01"
28 |   },
29 |            "model" : {
30 |              "name" : "AutoInt",
31 |              "embedding_size" : 64,
32 |              "attention_size" : 16,
33 |                "n_layers": 3,
34 |                "num_heads":2,
35 |                "dropout_probs": [0.2, 0.2, 0.2],
36 |                "mlp_hidden_size": [128,128]
37 |            },
38 |                      "opt"
39 |       : {
40 |         "name" : "Adam",
41 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
42 |         "adjust_lr" : false,
43 |         "scheduler" :
44 |             {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}},
45 |         "epochs" : 100,
46 |         "eval_step" : 2,
47 |         "batch_size" : 1024,
48 |         "save_step" : 5,
49 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
50 |       },
51 |         "path" : {"output" : "/Your/Local/Path/output/"},
52 |                  "metrics" : ["AUC"]
53 | }
54 | 


--------------------------------------------------------------------------------
/configs/cccf.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "data" : {
 3 |     "name" : "douban",
 4 |     "USER_ID_FIELD" : "user_id",
 5 |     "ITEM_ID_FIELD" : "item_id",
 6 |     "LABEL_FIELD" : "label",
 7 |     "TYPE_FIELD" : "type",
 8 |     "pin_mem" : true,
 9 |     "type" : [ "book", "music", "movie" ],
10 |     "user_feat_path" :
11 |         "/Your/Local/Path/data/douban/all_users.csv",
12 |     "inter_feat_path" :
13 |         "/Your/Local/Path/data/douban/all_rate.csv",
14 |     "item_feat_path" : {
15 |       "book" : "/Your/Local/Path/data/douban/book_item.csv",
16 |       "music" : "/Your/Local/Path/data/douban/music_item.csv",
17 |       "movie" : "/Your/Local/Path/data/douban/movie_item.csv"
18 |     },
19 |     "feat" : {
20 | 
21 |       "user_id" : {"type" : "token", "source" : "user"},
22 | 
23 |       "item_id" : {"type" : "token", "source" : "item"},
24 | 
25 |       "authors" : {"type" : "token", "source" : "item_book"},
26 |       "series" : {"type" : "token", "source" : "item_book"},
27 |       "publisher" : {"type" : "token", "source" : "item_book"},
28 |       "binding" : {"type" : "token", "source" : "item_book"},
29 |       "publish_year" : {"type" : "token", "source" : "item_book"},
30 |       "price" : {"type" : "float", "source" : "item_book"},
31 |       "pages" : {"type" : "float", "source" : "item_book"},
32 |       "book_douban_score" : {"type" : "token", "source" : "item_book"},
33 |       "book_douban_votes" : {"type" : "token", "source" : "item_book"},
34 |       "performer_ids" : {"type" : "token", "source" : "item_music"},
35 |       "style" : {"type" : "token", "source" : "item_music"},
36 |       "medium" : {"type" : "token", "source" : "item_music"},
37 |       "music_douban_score" : {"type" : "token", "source" : "item_music"},
38 |       "music_douban_votes" : {"type" : "token", "source" : "item_music"},
39 |       "publish_time" : {"type" : "token", "source" : "item_music"},
40 |       "regions" : {"type" : "token", "source" : "item_movie"},
41 |       "genres" : {"type" : "token", "source" : "item_movie"},
42 |       "languages" : {"type" : "token", "source" : "item_movie"},
43 |       "directors" : {"type" : "token", "source" : "item_movie"},
44 |       "year" : {"type" : "token", "source" : "item_movie"},
45 |       "mins" : {"type" : "float", "source" : "item_movie"},
46 |       "movie_douban_score" : {"type" : "token", "source" : "item_movie"},
47 |       "movie_douban_votes" : {"type" : "token", "source" : "item_movie"}
48 |     },
49 |     "inter_matrix_type" : "01"
50 |   },
51 |            "model" : {
52 |              "name" : "CCCF",
53 |              "user_emb_size" : 64,
54 |              "item_emb_size" : 64,
55 |              "token_emb_size" : 32,
56 |              "user_hidden_size_list" : [ 64, 64 ],
57 |              "item_hidden_size_list" : [ 64, 64 ]
58 | 
59 |            },
60 |                      "opt"
61 |       : {
62 |         "name" : "Adam",
63 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
64 |         "adjust_lr" : false,
65 |         "scheduler" :
66 |             {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}},
67 |         "epochs" : 100,
68 |         "eval_step" : 2,
69 |         "batch_size" : 1024,
70 |         "save_step" : 5,
71 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
72 |       },
73 |         "path" : {"output" : "/Your/Local/Path/output/"},
74 |                  "metrics" : ["AUC"]
75 | }
76 | 


--------------------------------------------------------------------------------
/configs/ddtcdr.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "data" : {
 3 |     "name" : "douban",
 4 |     "USER_ID_FIELD" : "user_id",
 5 |     "ITEM_ID_FIELD" : "item_id",
 6 |     "LABEL_FIELD" : "label",
 7 |     "TYPE_FIELD" : "type",
 8 |     "pin_mem" : true,
 9 |     "type" : [ "book", "music", "movie" ],
10 |     "user_feat_path" :
11 |         "/Your/Local/Path/data/douban/all_users.csv",
12 |     "inter_feat_path" :
13 |         "/Your/Local/Path/data/douban/all_rate.csv",
14 |     "item_feat_path" : {
15 |       "book" : "/Your/Local/Path/data/douban/book_item.csv",
16 |       "music" : "/Your/Local/Path/data/douban/music_item.csv",
17 |       "movie" : "/Your/Local/Path/data/douban/movie_item.csv"
18 |     },
19 |     "feat" : {
20 | 
21 |       "user_id" : {"type" : "token", "source" : "user"},
22 |       "item_id" : {"type" : "token", "source" : "item"},
23 |       "authors" : {"type" : "token", "source" : "item_book"},
24 |       "publish_year" : {"type" : "token", "source" : "item_book"},
25 |       "book_douban_score" : {"type" : "token", "source" : "item_book"},
26 |       "book_douban_votes" : {"type" : "token", "source" : "item_book"},
27 |       "performer_ids" : {"type" : "token", "source" : "item_music"},
28 |       "music_douban_score" : {"type" : "token", "source" : "item_music"},
29 |       "music_douban_votes" : {"type" : "token", "source" : "item_music"},
30 |       "publish_time" : {"type" : "token", "source" : "item_music"},
31 |       "directors" : {"type" : "token", "source" : "item_movie"},
32 |       "year" : {"type" : "token", "source" : "item_movie"},
33 |       "movie_douban_score" : {"type" : "token", "source" : "item_movie"},
34 |       "movie_douban_votes" : {"type" : "token", "source" : "item_movie"}
35 |     },
36 |     "inter_matrix_type" : "01"
37 |   },
38 |            "model" : {
39 |              "name" : "DDTCDR",
40 |                "latent_dim": 32,
41 |              "token_emb_size" : 32,
42 |                "layers": [64, 32],
43 |                "alpha": 0.03
44 |            },
45 |                      "opt"
46 |       : {
47 |         "name" : "Adam",
48 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
49 |         "adjust_lr" : false,
50 |         "scheduler" :
51 |             {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}},
52 |         "epochs" : 100,
53 |         "eval_step" : 2,
54 |         "batch_size" : 1024,
55 |         "save_step" : 5,
56 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
57 |       },
58 |         "path" : {"output" : "/Your/Local/Path/output/"},
59 |                  "metrics" : ["AUC"]
60 | }
61 | 


--------------------------------------------------------------------------------
/configs/deepfm.json:
--------------------------------------------------------------------------------
 1 | 
 2 | {
 3 |   "data" : {
 4 |     "name" : "douban",
 5 |     "USER_ID_FIELD" : "user_id",
 6 |     "ITEM_ID_FIELD" : "item_id",
 7 |     "LABEL_FIELD" : "label",
 8 |     "TYPE_FIELD" : "type",
 9 |     "user_feat_path" :
10 |         "/Your/Local/Path/data/douban/all_users.csv",
11 |     "inter_feat_path" :
12 |         "/Your/Local/Path/data/douban/all_rate.csv",
13 |     "item_feat_path" :
14 |         "/Your/Local/Path/data/douban/all_item_token.csv",
15 |     "feat" : {
16 | 
17 |       "user_id" : {"type" : "token", "source" : "user"},
18 | 
19 |       "item_id" : {"type" : "token", "source" : "item"},
20 | 
21 |       "type" : {"type" : "token", "source" : "item"},
22 |       "douban_score" : {"type" : "token", "source" : "item"},
23 | 
24 |       "douban_votes" : {"type" : "token", "source" : "item"}
25 |     },
26 |     "inter_matrix_type" : "01"
27 |   },
28 |            "model" : {
29 |              "name" : "DeepFM",
30 |              "user_emb_size" : 64,
31 |              "item_emb_size" : 64,
32 |              "embedding_size" : 64,
33 |              "mlp_hidden_size" : [ 128, 128, 128 ],
34 |              "dropout_prob" : 0.2
35 | 
36 |            },
37 |                      "opt"
38 |       : {
39 |         "name" : "Adam",
40 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
41 |         "adjust_lr" : false,
42 |         "scheduler" :
43 |             {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}},
44 |         "epochs" : 100,
45 |         "eval_step" : 2,
46 |         "batch_size" : 1024,
47 |         "save_step" : 5,
48 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
49 |       },
50 |         "path" : {"output" : "/Your/Local/Path/output/"},
51 |                  "metrics" : ["AUC"]
52 | }
53 | 


--------------------------------------------------------------------------------
/configs/deepmf.json:
--------------------------------------------------------------------------------
 1 | 
 2 | {
 3 |   "data" : {
 4 |     "name" : "douban",
 5 |     "USER_ID_FIELD" : "user_id",
 6 |     "ITEM_ID_FIELD" : "item_id",
 7 |     "LABEL_FIELD" : "label",
 8 |     "TYPE_FIELD" : "type",
 9 |     "user_feat_path" :
10 |         "/Your/Local/Path/data/douban/all_users.csv",
11 |     "inter_feat_path" :
12 |         "/Your/Local/Path/data/douban/all_rate.csv",
13 |     "item_feat_path" :
14 |         "/Your/Local/Path/data/douban/all_item.csv",
15 |     "feat" : {
16 | 
17 |       "user_id" : {"type" : "token", "source" : "user"},
18 | 
19 |       "item_id" : {"type" : "token", "source" : "item"},
20 | 
21 |       "douban_score" : {"type" : "float", "source" : "item"},
22 | 
23 |       "douban_votes" : {"type" : "float", "source" : "item"}
24 |     },
25 |     "inter_matrix_type" : "01"
26 |   },
27 |            "model" : {
28 |              "name" : "DMF",
29 |              "user_emb_size" : 64,
30 |              "item_emb_size" : 64,
31 |              "user_hidden_size_list" : [ 64, 64 ],
32 |              "item_hidden_size_list" : [ 64, 64 ]
33 | 
34 |            },
35 |                      "opt"
36 |       : {
37 |         "name" : "Adam",
38 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
39 |         "adjust_lr" : false,
40 |         "scheduler" :
41 |             {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}},
42 |         "epochs" : 100,
43 |         "eval_step" : 2,
44 |         "batch_size" : 16,
45 |         "save_step" : 5,
46 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
47 |       },
48 |         "path" : {"output" : "/Your/Local/Path/output/"},
49 |                  "metrics" : ["AUC"]
50 | }
51 | 


--------------------------------------------------------------------------------
/configs/dssm.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "data" : {
 3 |     "name" : "douban",
 4 |     "USER_ID_FIELD" : "user_id",
 5 |     "ITEM_ID_FIELD" : "item_id",
 6 |     "LABEL_FIELD" : "label",
 7 |     "TYPE_FIELD" : "type",
 8 |     "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv",
 9 |     "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv",
10 |     "item_feat_path" : "/Your/Local/Path/data/douban/all_item_token.csv",
11 |     "feat" : {
12 |       "user_id" : {"type" : "token", "source" : "user"},
13 |       "item_id" : {"type" : "token", "source" : "item"},
14 |       "type" : {"type" : "token", "source" : "item"},
15 |       "douban_score" : {"type" : "token", "source" : "item"},
16 |       "douban_votes" : {"type" : "token", "source" : "item"}
17 |     },
18 |     "inter_matrix_type" : "01"
19 |   },
20 |            "model" : {
21 |              "name" : "DSSM",
22 |              "user_emb_size" : 64,
23 |              "item_emb_size" : 64,
24 |              "embedding_size" : 10,
25 |              "mlp_hidden_size" : [ 256, 256, 256 ],
26 |              "dropout_prob" : 0.3,
27 |              "double_tower" : true
28 |            },
29 |                      "opt"
30 |       : {
31 |         "name" : "Adam",
32 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
33 |         "adjust_lr" : false,
34 |         "scheduler" :
35 |             {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}},
36 |         "epochs" : 100,
37 |         "eval_step" : 2,
38 |         "batch_size" : 128,
39 |         "save_step" : 5,
40 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
41 |       },
42 |         "path" : {"output" : "/Your/Local/Path/output/"},
43 |                  "metrics" : ["AUC"]
44 | }
45 | 


--------------------------------------------------------------------------------
/configs/duration.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "data" : {
 3 |     "name" : "douban",
 4 |     "USER_ID_FIELD" : "user_id",
 5 |     "ITEM_ID_FIELD" : "item_id",
 6 |     "LABEL_FIELD" : "label",
 7 |     "TYPE_FIELD" : "type",
 8 |     "pin_mem" : true,
 9 |     "type" : [ "book", "music", "movie" ],
10 |     "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv",
11 |     "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv",
12 |     "item_feat_path" : {
13 |       "book" : "/Your/Local/Path/data/douban/book_item.csv",
14 |       "music" : "/Your/Local/Path/data/douban/music_item.csv",
15 |       "movie" : "/Your/Local/Path/data/douban/movie_item.csv"
16 |     },
17 |     "feat" : {
18 | 
19 |       "user_id" : {"type" : "token", "source" : "user"},
20 | 
21 |       "item_id" : {"type" : "token", "source" : "item"},
22 | 
23 |       "authors" : {"type" : "token", "source" : "item_book"},
24 |       "series" : {"type" : "token", "source" : "item_book"},
25 |       "publisher" : {"type" : "token", "source" : "item_book"},
26 |       "binding" : {"type" : "token", "source" : "item_book"},
27 |       "publish_year" : {"type" : "token", "source" : "item_book"},
28 |       "price" : {"type" : "float", "source" : "item_book"},
29 |       "pages" : {"type" : "float", "source" : "item_book"},
30 |       "book_douban_score" : {"type" : "token", "source" : "item_book"},
31 |       "book_douban_votes" : {"type" : "token", "source" : "item_book"},
32 |       "performer_ids" : {"type" : "token", "source" : "item_music"},
33 |       "style" : {"type" : "token", "source" : "item_music"},
34 |       "medium" : {"type" : "token", "source" : "item_music"},
35 |       "music_douban_score" : {"type" : "token", "source" : "item_music"},
36 |       "music_douban_votes" : {"type" : "token", "source" : "item_music"},
37 |       "publish_time" : {"type" : "token", "source" : "item_music"},
38 |       "regions" : {"type" : "token", "source" : "item_movie"},
39 |       "genres" : {"type" : "token", "source" : "item_movie"},
40 |       "languages" : {"type" : "token", "source" : "item_movie"},
41 |       "directors" : {"type" : "token", "source" : "item_movie"},
42 |       "year" : {"type" : "token", "source" : "item_movie"},
43 |       "mins" : {"type" : "float", "source" : "item_movie"},
44 |       "movie_douban_score" : {"type" : "token", "source" : "item_movie"},
45 |       "movie_douban_votes" : {"type" : "token", "source" : "item_movie"}
46 |     },
47 |     "inter_matrix_type" : "01"
48 |   },
49 |            "model" : {
50 |              "name" : "DURation",
51 |              "user_emb_size" : 64,
52 |              "item_emb_size" : 64,
53 |              "token_emb_size" : 32,
54 |              "user_hidden_size_list" : [ 64, 64 ],
55 |              "item_hidden_size_list" : [ 64, 64 ],
56 |              "item_map_hidden_size_list" : [ 256, 64 ],
57 |              "mlp_hidden_size" : [ 128, 128, 128 ],
58 |              "dropout_prob" : 0.2,
59 |              "kernel" : "gaussian",
60 |              "align_sample_size" : 128
61 | 
62 |            },
63 |                      "opt"
64 |       : {
65 |         "name" : "Adam",
66 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
67 |         "adjust_lr" : false,
68 |         "scheduler" :
69 |             {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}},
70 |         "epochs" : 100,
71 |         "eval_step" : 2,
72 |         "batch_size" : 1024,
73 |         "save_step" : 5,
74 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
75 |       },
76 |         "path" : {"output" : "/Your/Local/Path/output/"},
77 |                  "metrics" : ["AUC"]
78 | }
79 | 


--------------------------------------------------------------------------------
/configs/fism.json:
--------------------------------------------------------------------------------
 1 | 
 2 | {
 3 |   "data" : {
 4 |     "name" : "douban",
 5 |     "USER_ID_FIELD" : "user_id",
 6 |     "ITEM_ID_FIELD" : "item_id",
 7 |     "LABEL_FIELD" : "label",
 8 |     "TYPE_FIELD" : "type",
 9 |     "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv",
10 |     "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv",
11 |     "item_feat_path" : "/Your/Local/Path/data/douban/all_item_token.csv",
12 |     "feat" : {
13 | 
14 |       "user_id" : {"type" : "token", "source" : "user"},
15 | 
16 |       "item_id" : {"type" : "token", "source" : "item"},
17 | 
18 |       "type" : {"type" : "token", "source" : "item"},
19 |       "douban_score" : {"type" : "token", "source" : "item"},
20 | 
21 |       "douban_votes" : {"type" : "token", "source" : "item"}
22 |     },
23 |     "inter_matrix_type" : "01"
24 |   },
25 |            "model" : {
26 |              "name" : "FISM",
27 |              "embedding_size" : 64,
28 |              "split_to" : 0,
29 |              "reg_weights" : [ 1e-2, 1e-2 ],
30 |              "alpha" : 0
31 | 
32 |            },
33 |                      "opt"
34 |       : {
35 |         "name" : "Adam",
36 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
37 |         "adjust_lr" : false,
38 |         "scheduler" :
39 |             {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}},
40 |         "epochs" : 100,
41 |         "eval_step" : 2,
42 |         "batch_size" : 512,
43 |         "save_step" : 5,
44 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
45 |       },
46 |         "path" : {"output" : "/Your/Local/Path/output/"},
47 |                  "metrics" : ["AUC"]
48 | }
49 | 


--------------------------------------------------------------------------------
/configs/nais.json:
--------------------------------------------------------------------------------
 1 | 
 2 | {
 3 |   "data" : {
 4 |     "name" : "douban",
 5 |     "USER_ID_FIELD" : "user_id",
 6 |     "ITEM_ID_FIELD" : "item_id",
 7 |     "LABEL_FIELD" : "label",
 8 |     "TYPE_FIELD" : "type",
 9 |     "user_feat_path" :
10 |         "/Your/Local/Path/data/douban/all_users.csv",
11 |     "inter_feat_path" :
12 |         "/Your/Local/Path/data/douban/all_rate.csv",
13 |     "item_feat_path" :
14 |         "/Your/Local/Path/data/douban/all_item.csv",
15 |     "feat" : {
16 | 
17 |       "user_id" : {"type" : "token", "source" : "user"},
18 | 
19 |       "item_id" : {"type" : "token", "source" : "item"},
20 | 
21 |       "douban_score" : {"type" : "token", "source" : "item"},
22 | 
23 |       "douban_votes" : {"type" : "token", "source" : "item"}
24 |     },
25 |     "inter_matrix_type" : "01"
26 |   },
27 |            "model" : {
28 |              "name" : "NAIS",
29 |                "algorithm": "prod",
30 | 
31 |              "embedding_size" : 64,
32 |                "weight_size": 64,
33 |                "split_to": 0,
34 |                "reg_weights": [1e-7, 1e-7, 1e-5],
35 |                "alpha": 0,
36 |                "beta": 0.5,
37 |              "pretrain_path" : null
38 |            },
39 |                      "opt"
40 |       : {
41 |         "name" : "Adam",
42 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
43 |         "adjust_lr" : false,
44 |         "scheduler" :
45 |             {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}},
46 |         "epochs" : 100,
47 |         "eval_step" : 2,
48 |         "batch_size" : 1024,
49 |         "save_step" : 5,
50 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
51 |       },
52 |         "path" : {"output" : "/Your/Local/Path/output/"},
53 |                  "metrics" : ["AUC"]
54 | }
55 | 


--------------------------------------------------------------------------------
/configs/widedeep.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "data" : {
 3 |     "name" : "douban",
 4 |     "USER_ID_FIELD" : "user_id",
 5 |     "ITEM_ID_FIELD" : "item_id",
 6 |     "LABEL_FIELD" : "label",
 7 |     "TYPE_FIELD" : "type",
 8 |     "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv",
 9 |     "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv",
10 |     "item_feat_path" : "/Your/Local/Path/data/douban/all_item_token.csv",
11 |     "feat" : {
12 |       "user_id" : {"type" : "token", "source" : "user"},
13 |       "item_id" : {"type" : "token", "source" : "item"},
14 |       "type" : {"type" : "token", "source" : "item"},
15 |       "douban_score" : {"type" : "token", "source" : "item"},
16 |       "douban_votes" : {"type" : "token", "source" : "item"}
17 |     },
18 |     "inter_matrix_type" : "01"
19 |   },
20 |            "model" : {
21 |              "name" : "WideDeep",
22 |              "user_emb_size" : 64,
23 |              "item_emb_size" : 64,
24 |              "embedding_size" : 64,
25 |              "mlp_hidden_size" : [ 256, 64, 8 ],
26 |              "dropout_prob" : 0.1
27 |            },
28 |                      "opt"
29 |       : {
30 |         "name" : "Adam",
31 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
32 |         "adjust_lr" : false,
33 |         "scheduler" :
34 |             {"name" : "ReduceLROnPlateau", "hyper_params" : {"mode" : "min"}},
35 |         "epochs" : 100,
36 |         "eval_step" : 2,
37 |         "batch_size" : 1024,
38 |         "save_step" : 5,
39 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
40 |       },
41 |         "path" : {"output" : "/Your/Local/Path/output/"},
42 |                  "metrics" : ["AUC"]
43 | }
44 | 


--------------------------------------------------------------------------------
/configs/xdeepfm.json:
--------------------------------------------------------------------------------
 1 | 
 2 | {
 3 |   "data" : {
 4 |     "name" : "douban",
 5 |     "USER_ID_FIELD" : "user_id",
 6 |     "ITEM_ID_FIELD" : "item_id",
 7 |     "LABEL_FIELD" : "label",
 8 |     "TYPE_FIELD" : "type",
 9 |     "user_feat_path" : "/Your/Local/Path/data/douban/all_users.csv",
10 |     "inter_feat_path" : "/Your/Local/Path/data/douban/all_rate.csv",
11 |     "item_feat_path" : "/Your/Local/Path/data/douban/all_item_token.csv",
12 |     "feat" : {
13 | 
14 |       "user_id" : {"type" : "token", "source" : "user"},
15 | 
16 |       "item_id" : {"type" : "token", "source" : "item"},
17 | 
18 |       "type" : {"type" : "token", "source" : "item"},
19 |       "douban_score" : {"type" : "token", "source" : "item"},
20 | 
21 |       "douban_votes" : {"type" : "token", "source" : "item"}
22 | 
23 |     },
24 |     "inter_matrix_type" : "01"
25 |   },
26 |            "model" : {
27 |              "name" : "xDeepFM",
28 |              "user_emb_size" : 64,
29 |              "item_emb_size" : 64,
30 |              "embedding_size" : 10,
31 |              "mlp_hidden_size" : [ 128, 128, 128 ],
32 |              "reg_weight" : 0.0005,
33 |              "dropout_prob" : 0.2,
34 |              "direct" : false,
35 |              "cin_layer_size" : [ 100, 100, 100 ]
36 |            },
37 |                      "opt"
38 |       : {
39 |         "name" : "Adam",
40 |         "hyper_params" : {"lr" : 0.001, "weight_decay" : 0.01},
41 |         "adjust_lr" : false,
42 |         "epochs" : 100,
43 |         "eval_step" : 2,
44 |         "batch_size" : 1024,
45 |         "save_step" : 5,
46 |         "early_stop" : {"metric" : "auc", "stop_step" : 5, "mode" : "max"}
47 |       },
48 |         "path" : {"output" : "/Your/Local/Path/output/"},
49 |                  "metrics" : ["AUC"]
50 | }
51 | 


--------------------------------------------------------------------------------
/data.tar.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekinglcq/HRec/f13a685dd593154d4887ed18bd444e588484d014/data.tar.xz


--------------------------------------------------------------------------------
/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekinglcq/HRec/f13a685dd593154d4887ed18bd444e588484d014/framework.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Automatically generated by https://github.com/damnever/pigar.
 2 | 
 3 | gpustat == 0.6.0
 4 | numpy == 1.22.0
 5 | pandas == 1.1.5
 6 | scipy == 1.5.4
 7 | sklearn == 0.0
 8 | torch == 1.5.0
 9 | tqdm == 4.54.1
10 | 


--------------------------------------------------------------------------------
/train_hete.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | 
 3 | import json
 4 | import argparse
 5 | from HRec import pipeline
 6 | 
 7 | parser = argparse.ArgumentParser()
 8 | parser.add_argument('model', type=str)
 9 | 
10 | args = parser.parse_args()
11 | model_name = args.model
12 | print(args.model)
13 | config = json.load(open("./configs/%s.json" % (model_name)))
14 | 
15 | p = pipeline.HProcess(config)
16 | p.fit()
17 | 


--------------------------------------------------------------------------------
/train_homo.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | 
 3 | import json
 4 | import argparse
 5 | from HRec import pipeline
 6 | 
 7 | parser = argparse.ArgumentParser()
 8 | parser.add_argument('model', type=str)
 9 | 
10 | args = parser.parse_args()
11 | model_name = args.model
12 | print(args.model)
13 | config = json.load(open("./configs/%s.json" % (model_name)))
14 | 
15 | p = pipeline.Process(config)
16 | p.fit()
17 | break
18 | 


--------------------------------------------------------------------------------